In [0]:
%sql
-- NOTE(review): this ALTER TABLE names the table but has no action clause after it,
-- so the statement is incomplete as written. Confirm the intended change or remove the cell.
alter table fq_dev_catalog.silver.wastage

In [0]:
# Notebook parameters exposed as Databricks widgets.
# "environment" is a fixed dropdown; it is used below to build catalog and
# external-location names.
dbutils.widgets.dropdown(
    name="environment",
    defaultValue="fq_dev",
    choices=["fq_dev", "fq_test", "fq_prod"],
    label="Select environment"
)

# "source" is used below as a path segment of the checkpoint locations.
dbutils.widgets.combobox(
    name="source",
    defaultValue="netsuite",
    choices=["posist", "netsuite", "other"],
    label="Source"
)

# "domain" is used below in checkpoint paths, the query name and the
# rejected-rows table name.
dbutils.widgets.combobox(
    name="domain",
    defaultValue="wastage",
    choices=["discount", "sales", "cost","wastage"],
    label="Domain"
)

# Read the current widget values into module-level variables for later cells.
environment = dbutils.widgets.get("environment")
source = dbutils.widgets.get("source")
domain = dbutils.widgets.get("domain")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, concat_ws, last_day, expr, explode, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

def get_external_location(name: str) -> str:
    """Return the ``url`` of the external location called ``name``.

    Runs ``DESCRIBE EXTERNAL LOCATION`` through the notebook's ``spark``
    session and returns the first value of the first row of the ``url``
    column.

    Args:
        name: External location name; it is interpolated between backticks.

    Returns:
        The value found in the ``url`` column of the first result row.
    """
    describe_stmt = f"DESCRIBE EXTERNAL LOCATION `{name}`"
    url_rows = spark.sql(describe_stmt).select("url").collect()
    first_row = url_rows[0]
    return first_row[0]

# Look up the URL of one external location per layer for the selected
# environment; names follow the pattern "<environment>_extloc_<layer>".
bronze_path = get_external_location(f"{environment}_extloc_bronze")
silver_path = get_external_location(f"{environment}_extloc_silver")
gold_path = get_external_location(f"{environment}_extloc_gold")
# checkpoint_path is the base of the streaming checkpoint locations built below.
checkpoint_path = get_external_location(f"{environment}_extloc_checkpoint")
staging_path = get_external_location(f"{environment}_extloc_staging")

In [0]:
def read_bronze_for_silver(table_name: str):
    """Open a streaming change-data-feed read over the Delta table ``table_name``.

    Only change rows whose ``_change_type`` is ``insert`` or
    ``update_postimage`` are kept.

    Args:
        table_name: Name of the table to stream from.

    Returns:
        The filtered streaming DataFrame.
    """
    wanted_changes = "_change_type IN ('insert', 'update_postimage')"
    cdf_reader = spark.readStream.format("delta").option("readChangeFeed", "true")
    change_stream = cdf_reader.table(table_name)
    return change_stream.filter(wanted_changes)
# Stream from the bronze table of the selected environment. The catalog name is
# derived from the "environment" widget, like the silver tables and the
# external locations, instead of being pinned to the dev catalog.
df_bronze = read_bronze_for_silver(f"{environment}_catalog.bronze.wastage")



In [0]:
# df_bronze.display()

In [0]:
# Add a "result" column produced by exploding "results": one output row per
# element of "results" (presumably an array of record structs; confirm
# against the bronze schema).
df_exploded = (
    df_bronze
    .withColumn("result", explode(col("results")))
)


In [0]:
# Flatten the exploded "result" struct into snake_case business columns and
# carry the row metadata / change-feed columns through unchanged.
df_silver_ready = df_exploded.select(
    # business fields: (field inside "result", output column name)
    *[
        col(f"result.{source_field}").alias(output_name)
        for source_field, output_name in (
            ("accountName", "account_name"),
            ("accountNumber", "account_number"),
            ("accountType", "account_type"),
            ("amount", "amount"),
            ("credit", "credit"),
            ("debit", "debit"),
            ("location", "location"),
            ("subsidiary", "subsidiary"),
            ("subsidiaryId", "subsidiary_id"),
            ("year", "year"),
            ("month", "month"),
            ("period", "period"),
        )
    ],
    # metadata
    *[
        col(metadata_name)
        for metadata_name in (
            "sys_id",
            "file_path",
            "ingestion_ts",
            "_commit_version",
            "_commit_timestamp",
            "_change_type",
        )
    ],
)


In [0]:
%sql
-- Ad-hoc inspection of the bronze wastage table; the dev catalog is hard-coded here.
select * from fq_dev_catalog.bronze.wastage

In [0]:
from pyspark.sql.functions import count, col

# Sanity check: count of "location" values on rows whose month is "NOV",
# taken before the store join.
(
    df_silver_ready
    .where(col("month") == "NOV")
    .agg(count("location").alias("location_count"))
    .display()
)


In [0]:
from pyspark.sql.functions import (
    col, lit, concat_ws, to_date, last_day
)

# Business columns only; the metadata and change-feed columns of
# df_silver_ready are not selected here.
wastage_base = df_silver_ready.select(
    *[
        "account_name",
        "account_number",
        "account_type",
        "amount",
        "credit",
        "debit",
        "location",
        "subsidiary",
        "subsidiary_id",
        "year",
        "month",
        "period",
    ]
)
# Batch read of the store dimension for the selected environment, reduced to
# the location name used as join key and the deployment name.
store_df = spark.read.table(f"{environment}_catalog.silver.dim_store").select(
    "netsuite_location_name",
    "deployment_name",
)

In [0]:
# Inner join of the wastage rows with the store dimension on location name.
# Because the join is inner and uses plain equality, rows whose location has no
# equal netsuite_location_name in dim_store (null locations included) are absent
# from the result.
df_joined = (
        wastage_base
        .join(
            store_df,
            wastage_base.location == store_df.netsuite_location_name,
            "inner"
        )
        
    )

In [0]:
# Same NOV sanity check, now after the join with dim_store.
(
    df_joined
    .where(col("month") == "NOV")
    .agg(count("location").alias("location_count"))
    .display()
)

In [0]:
def transform_wastage_base(df_base):
    """Derive the period dates and project the silver wastage columns.

    The function works on the DataFrame it is given rather than on a
    notebook-level variable, so its result depends only on its argument.

    Args:
        df_base: DataFrame that carries ``deployment_name`` (the wastage rows
            joined to the store dimension) together with ``year``, ``month``,
            ``location``, ``debit``, ``account_name``, ``account_type`` and
            ``account_number``.

    Returns:
        DataFrame with ``deployment_name``, ``month``, ``year``, ``location``,
        ``effective_from_date``, ``effective_to_date``, ``wastage_amount``
        (the ``debit`` column renamed), the three ``account_*`` columns and
        ``source_system``.
    """
    df_enriched = (
        df_base
        # First day of the period, parsed from "<year>-<month>-01"; the
        # "MMM" pattern expects an abbreviated month name (the notebook
        # filters on values such as "NOV").
        .withColumn(
            "effective_from_date",
            to_date(
                concat_ws("-", col("year"), col("month"), lit("01")),
                "yyyy-MMM-dd"
            )
        )
        # Last calendar day of that same month.
        .withColumn(
            "effective_to_date",
            last_day(col("effective_from_date"))
        )
        .withColumn("source_system", lit("NETSUITE"))
    )

    # Final transformed projection (aggregation-ready)
    df_final = df_enriched.select(
        "deployment_name",
        "month",
        "year",
        "location",
        "effective_from_date",
        "effective_to_date",
        col("debit").alias("wastage_amount"),
        "account_name",
        "account_type",
        "account_number",
        "source_system",
    )

    return df_final


# Pass the store-joined frame: the projection needs deployment_name, which
# df_silver_ready does not have.
wastage_enriched = transform_wastage_base(df_joined)

In [0]:
# NOV sanity check on the enriched rows.
(
    wastage_enriched
    .where(col("month") == "NOV")
    .agg(count("location").alias("location_count"))
    .display()
)

In [0]:
from pyspark.sql.functions import trim
# Keep rows with a usable location: non-null and, once trimmed, not equal to
# the literal text "None".
wastage_final = (
    wastage_enriched
    .filter(
        col("location").isNotNull() &
        (trim(col("location")) != "None")
    )
)

In [0]:
# NOV sanity check after the location filter.
(
    wastage_final
    .where(col("month") == "NOV")
    .agg(count("location").alias("location_count"))
    .display()
)

In [0]:
# Total wastage amount for NOV.
# The Spark aggregate is imported under an alias in this cell: the bare name
# `sum` would resolve to Python's builtin, which cannot build a column
# aggregate from a column name and raises TypeError on a string.
from pyspark.sql.functions import sum as _sum

(
    wastage_final
    .filter(col("month") == "NOV")
    # Labelled as an amount: this is a sum of wastage_amount, not a count.
    .agg(_sum("wastage_amount").alias("total_wastage_amount"))
    .display()
)

In [0]:
from pyspark.sql.functions import  sum as _sum
# Sum wastage_amount per combination of the descriptive columns below; the
# summed column keeps the name wastage_amount.
df_aggregated = wastage_final.groupBy(
    *[
        "deployment_name",
        "location",
        "month",
        "year",
        "effective_from_date",
        "effective_to_date",
        "account_name",
        "account_type",
        "account_number",
        "source_system",
    ]
).agg(_sum("wastage_amount").alias("wastage_amount"))

In [0]:
# Attach a sys_id generated by the SQL uuid() function to every aggregated row
# and fix the output column order with sys_id first.
df_aggregated_final = df_aggregated.withColumn("sys_id", expr("uuid()")).select(
    "sys_id",
    "deployment_name",
    "location",
    "month",
    "year",
    "effective_from_date",
    "effective_to_date",
    "wastage_amount",
    "account_name",
    "account_type",
    "account_number",
    "source_system",
)

In [0]:
# %sql
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.wastagetrial11 (
#     sys_id STRING NOT NULL,
#     deployment_name STRING NOT NULL,
#     location STRING NOT NULL,
#     month STRING NOT NULL,
#     year STRING NOT NULL,
#     effective_from_date DATE NOT NULL,
#     effective_to_date DATE NOT NULL,
#     wastage_amount DOUBLE,
#     account_name STRING,
#     account_type STRING,
#     account_number STRING,
#     source_system STRING NOT NULL
# )
# USING DELTA
# TBLPROPERTIES (
#     delta.enableChangeDataFeed = true,
#     delta.autoOptimize.optimizeWrite = true,
#     delta.autoOptimize.autoCompact = true,
#     delta.columnMapping.mode = 'name'
# );

In [0]:
from delta.tables import DeltaTable
import sys

def upsert_to_silver(
    df,
    table_name,
    checkpoint_path,
    business_keys
):
    """Stream ``df`` into a Delta table, merging each micro-batch on ``business_keys``.

    Matched rows get their ``wastage_amount`` and ``sys_id`` overwritten from
    the batch; unmatched rows are inserted. The stream runs with an
    ``availableNow`` trigger and this call blocks until it terminates.

    Args:
        df: Streaming DataFrame to write.
        table_name: Name of the target Delta table (resolved with
            ``DeltaTable.forName``).
        checkpoint_path: Checkpoint location of the streaming query.
        business_keys: Column names that identify a target row; they are
            combined into the MERGE condition.

    Raises:
        SystemExit: with code 1 when the streaming write raises; the error is
            printed first.
    """

    def foreach_batch_function(batch_df, batch_id):
        """Merge one micro-batch into the target table."""

        # Skip empty micro-batches
        if batch_df.isEmpty():
            return

        # Load target Delta table
        silver_table = DeltaTable.forName(spark, table_name)

        # Build the merge condition from the business keys. The null-safe
        # operator <=> is used so that rows whose key columns are NULL still
        # match their existing target row; with plain "=" a NULL key never
        # matches and the row would be inserted again on every batch.
        merge_condition = " AND ".join(
            f"t.{k} <=> s.{k}" for k in business_keys
        )

        (
            silver_table.alias("t")
            .merge(
                batch_df.alias("s"),
                merge_condition
            )
            # NOTE(review): sys_id is overwritten on every matched update, so
            # an existing row's id changes when its amount changes; confirm
            # this is intended.
            .whenMatchedUpdate(set={
                "wastage_amount": "s.wastage_amount",
                "sys_id": "s.sys_id"
            })
            .whenNotMatchedInsert(values={
                "sys_id": "s.sys_id",
                "deployment_name": "s.deployment_name",
                "location" :"s.location",
                "month": "s.month",
                "year": "s.year",
                "effective_from_date": "s.effective_from_date",
                "effective_to_date": "s.effective_to_date",
                "wastage_amount": "s.wastage_amount",
                "account_name": "s.account_name",
                "account_type": "s.account_type",
                "account_number": "s.account_number",
                "source_system": "s.source_system"
            })
            .execute()
        )

    try:
        (
           df.writeStream
          .foreachBatch(foreach_batch_function)
          .option("checkpointLocation", checkpoint_path)
          .outputMode("update")
          .trigger(availableNow=True)
          .start()
          .awaitTermination()
        )

    except Exception as e:
        # Report the failure and stop the notebook with a non-zero exit code.
        print(f" Upsert to Silver failed: {e}")
        sys.exit(1)

In [0]:
# Target table and checkpoint location for the aggregated wastage upsert.
silver_table_name = f"{environment}_catalog.silver.wastagetrial11"

silver_checkpoint_path = (
    f"{checkpoint_path}/{source}/{domain}/dim_wastage_checkpointtrial"
)


# Columns that together identify one target row; upsert_to_silver combines
# them into the MERGE match condition.
business_keys = [
        "deployment_name",
        "location",
        "month",
        "year",
        "effective_from_date",
        "effective_to_date",
        "account_name",
        "account_type",
        "account_number",
        "source_system"
]

# Run the streaming upsert; the call returns once the availableNow run has terminated.
upsert_to_silver(
    df_aggregated_final,        
    silver_table_name,  
    silver_checkpoint_path,    
    business_keys        
)

In [0]:
%sql
-- Ad-hoc check: number of NOV rows in the trial silver table (dev catalog hard-coded).
-- NOTE(review): "*" is written in double quotes; count(*) is presumably what is meant.
select count("*") from fq_dev_catalog.silver.wastagetrial11
where month='NOV'

In [0]:
from pyspark.sql.functions import when, current_timestamp, trim
# Rows of wastage_enriched that should not reach silver, tagged with the first
# matching reason and a rejection timestamp.
# Only columns that wastage_enriched actually carries are inspected: it has no
# store_id / company_id / brand_id / currency_code, so referencing them fails
# column resolution. deployment_name is the one attribute taken from dim_store,
# so a null there is used to mark a store that could not be resolved.
wastage_rejected = (
    wastage_enriched
    .withColumn(
        "rejection_reason",
        when(col("location").isNull(), "LOCATION_NULL")
        .when(trim(col("location")) == "None", "LOCATION_IS_NONE")
        .when(col("deployment_name").isNull(), "STORE_NOT_FOUND_IN_DIM_STORE")
    )
    # when() without otherwise() yields null for rows that match no rule, so
    # this keeps only the rows that were given a reason.
    .filter(col("rejection_reason").isNotNull())
    .withColumn("rejection_ts", current_timestamp())
)
# wastage_rejected.display()

In [0]:
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.wastage_rejected_stores
# USING DELTA
# TBLPROPERTIES (delta.enableChangeDataFeed = true, delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = true, delta.columnMapping.mode = 'name')
# """)

In [0]:
def write_wastage_rejected_to_silver(df):
    """Start an append-mode stream that writes ``df`` to the rejected-rows table.

    The target is ``<environment>_catalog.silver.<domain>_rejected_stores`` and
    the checkpoint lives under ``<checkpoint_path>/<source>/<domain>``; those
    values are read from the notebook-level variables of the same names.

    Args:
        df: Streaming DataFrame holding the rejected rows to persist.

    Returns:
        The value returned by ``toTable`` (the started streaming query). The
        function does not wait for the query to finish.
    """
    query = (
            # Write the DataFrame that was passed in, not a notebook global.
            df.writeStream
                .format("delta")
                .outputMode("append")
                .trigger(availableNow=True)
                .queryName(f"{domain}_silver_rejected_stores")
                .option("checkpointLocation", f"{checkpoint_path}/{source}/{domain}/checkpoint_wastage_rejected")
                # Allow new columns in df to be added to the target table schema.
                .option("mergeSchema", "true")
                .toTable(f"`{environment}_catalog`.`silver`.`{domain}_rejected_stores`"
                )
    )
    return query
# Start the rejected-rows stream; the returned query is not assigned to a variable.
write_wastage_rejected_to_silver(wastage_rejected)

In [0]:
# %sql
# select * from fq_dev_catalog.silver.wastage_rejected_stores

In [0]:
# from pyspark.sql.functions import count
# wastage_final.agg(count("store_id")).display()

In [0]:
# wastage_enriched.printSchema()

In [0]:
# wastage_final.display()

In [0]:
# from pyspark.sql.functions import sum
# wastage_final.agg(sum("wastage_amount").alias("wastage_amount")).display()



In [0]:
# df_aggregated_final.display()

In [0]:
# %sql
# drop table if exists fq_dev_catalog.silver.dim_wastage;

In [0]:
# %sql
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.wastage (
#     sys_id STRING NOT NULL,
#     store_id STRING,
#     company_id STRING,
#     brand_id STRING,
#     currency_code STRING,
#     month STRING NOT NULL,
#     year STRING NOT NULL,
#     effective_from_date DATE NOT NULL,
#     effective_to_date DATE NOT NULL,
#     wastage_amount DOUBLE,
#     account_name STRING,
#     account_type STRING,
#     account_number STRING,
#     source_system STRING NOT NULL
# )
# USING DELTA
# TBLPROPERTIES (
#     delta.enableChangeDataFeed = true,
#     delta.autoOptimize.optimizeWrite = true,
#     delta.autoOptimize.autoCompact = true,
#     delta.columnMapping.mode = 'name'
# );


In [0]:
from delta.tables import DeltaTable
import sys

def upsert_to_silver(
    df,
    table_name,
    checkpoint_path,
    business_keys
):
    """Stream ``df`` into a Delta table, merging each micro-batch on ``business_keys``.

    This definition rebinds the name ``upsert_to_silver`` that an earlier cell
    of this notebook also defines; it inserts the store_id / company_id /
    brand_id / currency_code column set instead of deployment_name / location.

    Matched rows get their ``wastage_amount`` and ``sys_id`` overwritten from
    the batch; unmatched rows are inserted. The stream runs with an
    ``availableNow`` trigger and this call blocks until it terminates.

    Args:
        df: Streaming DataFrame to write; it must provide every column named
            in the insert mapping below.
        table_name: Name of the target Delta table (resolved with
            ``DeltaTable.forName``).
        checkpoint_path: Checkpoint location of the streaming query.
        business_keys: Column names that identify a target row; they are
            combined into the MERGE condition.

    Raises:
        SystemExit: with code 1 when the streaming write raises; the error is
            printed first.
    """

    def foreach_batch_function(batch_df, batch_id):
        """Merge one micro-batch into the target table."""

        # Skip empty micro-batches
        if batch_df.isEmpty():
            return

        # Load target Delta table
        silver_table = DeltaTable.forName(spark, table_name)

        # Build the merge condition from the business keys. The null-safe
        # operator <=> is used so that rows whose key columns are NULL still
        # match their existing target row; with plain "=" a NULL key never
        # matches and the row would be inserted again on every batch.
        merge_condition = " AND ".join(
            f"t.{k} <=> s.{k}" for k in business_keys
        )

        (
            silver_table.alias("t")
            .merge(
                batch_df.alias("s"),
                merge_condition
            )
            .whenMatchedUpdate(set={
                "wastage_amount": "s.wastage_amount",
                "sys_id": "s.sys_id",
            })
            .whenNotMatchedInsert(values={
                "sys_id": "s.sys_id",
                "store_id": "s.store_id",
                "company_id": "s.company_id",
                "brand_id": "s.brand_id",
                "currency_code": "s.currency_code",
                "month" : "s.month",
                "year" : "s.year",
                "effective_from_date": "s.effective_from_date",
                "effective_to_date": "s.effective_to_date",
                "wastage_amount": "s.wastage_amount",
                "account_name": "s.account_name",
                "account_type": "s.account_type",
                "account_number": "s.account_number",
                "source_system": "s.source_system"
            })
            .execute()
        )

    try:
        (
           df.writeStream
          .foreachBatch(foreach_batch_function)
          .option("checkpointLocation", checkpoint_path)
          .outputMode("update")
          .trigger(availableNow=True)
          .start()
          .awaitTermination()
        )

    except Exception as e:
        # Report the failure and stop the notebook with a non-zero exit code.
        print(f" Upsert to Silver failed: {e}")
        sys.exit(1)


In [0]:
# Target table and checkpoint location for the upsert into silver.wastage.
silver_table_name = f"{environment}_catalog.silver.wastage"

silver_checkpoint_path = (
    f"{checkpoint_path}/{source}/{domain}/dim_wastage_checkpoint"
)

# Columns combined into the MERGE match condition.
# NOTE(review): df_aggregated_final, as built earlier in this notebook, selects
# deployment_name and location and has no store_id, company_id, brand_id or
# currency_code column, so these keys cannot be resolved against it. Confirm
# which schema this call is meant to use.
business_keys = [
    "store_id",
        "company_id",
        "brand_id",
        "currency_code",
        "month",
        "year",
        "effective_from_date",
        "effective_to_date",
        "account_name",
        "account_type",
        "account_number",
        "source_system"
]

# Run the streaming upsert; the call returns once the availableNow run has terminated.
upsert_to_silver(
    df_aggregated_final,        
    silver_table_name,      
    silver_checkpoint_path, 
    business_keys        
)


In [0]:
# df = spark.read.table("fq_dev_catalog.silver.wastage")
# df.display()

In [0]:
# %sql
# SELECT
#     SUM(wastage_amount) AS total_debit
# FROM fq_dev_catalog.silver.wastage


In [0]:
# %sql
# SELECT
#     SUM(wastage_amount) AS total_debit
# FROM fq_dev_catalog.silver.wastage
# WHERE effective_from_date >= DATE '2025-09-01'
#   OR effective_from_date   <= DATE '2025-12-01';


In [0]:
# %sql
# select * from fq_dev_catalog.silver.wastage

In [0]:
# from pyspark.sql.functions import col, to_date, lit

# df = spark.read.table("fq_dev_catalog.silver.wastage")
# df.filter(
#     (col("effective_from_date") >= to_date(lit("2025-11-01"))) &
#     (col("effective_from_date") <= to_date(lit("2025-12-31")))
# ).display()

In [0]:
# cdf_df = (
#     spark.read
#          .format("delta")
#          .option("readChangeFeed", "true")
#          .option("startingVersion", 0)
#          .table("fq_dev_catalog.silver.wastage")
# )

# cdf_df.display()