In [0]:
# Notebook parameters. Each widget is declared and then read straight into a
# plain variable of the same name.

# Environment prefix; later cells use it to build catalog and
# external-location names.
dbutils.widgets.dropdown(
    name="environment",
    defaultValue="fq_dev",
    choices=["fq_dev", "fq_test", "fq_prod"],
    label="Select environment",
)
environment = dbutils.widgets.get("environment")

# Source system; a combobox, so values outside `choices` can also be typed in.
dbutils.widgets.combobox(
    name="source",
    defaultValue="netsuite",
    choices=["posist", "netsuite", "other"],
    label="Source",
)
source = dbutils.widgets.get("source")

# Business domain handled by this run.
dbutils.widgets.combobox(
    name="domain",
    defaultValue="wastage",
    choices=["discount", "sales", "cost","wastage"],
    label="Domain",
)
domain = dbutils.widgets.get("domain")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, concat_ws, last_day, expr, explode, row_number,count, sum
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

def get_external_location(name: str) -> str:
    """Return the URL registered for an external location.

    Runs ``DESCRIBE EXTERNAL LOCATION`` for ``name`` and returns the ``url``
    value of the first row of the result.

    Args:
        name: External location name. It is interpolated between backticks
            without any escaping.

    Returns:
        The ``url`` column value of the first result row.

    Raises:
        IndexError: If the query returns no rows.
    """
    described = spark.sql(f"DESCRIBE EXTERNAL LOCATION `{name}`")
    url_rows = described.select("url").collect()
    first_row = url_rows[0]
    return first_row[0]

# Resolve the storage URL of each external location of the selected
# environment, named "<environment>_extloc_<area>". Lookups run in the order
# listed.
bronze_path, silver_path, gold_path, checkpoint_path, staging_path = (
    get_external_location(f"{environment}_extloc_{area}")
    for area in ("bronze", "silver", "gold", "checkpoint", "staging")
)

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col, current_timestamp, max as _max

# Source and watermark tables are taken from the catalog of the selected
# environment (the `environment` widget), the same way the silver tables are
# addressed in this notebook. Previously both were pinned to the dev catalog,
# so a test/prod run would have read dev data and the dev watermark.
SOURCE_TABLE = f"{environment}_catalog.bronze.wastage"
VERSION_TABLE = f"{environment}_catalog.bronze.version_control"
CONSUMER_NAME = "dim_wastage_clone"

# 1. Latest committed version of the source table.
#    history(1) returns only the most recent commit, which carries the
#    highest version number, so the full history need not be scanned.
delta_table = DeltaTable.forName(spark, SOURCE_TABLE)

latest_version = (
    delta_table.history(1)
    .select("version")
    .collect()[0]["version"]
)

# 2. Last version already processed by this consumer, if any.
if spark.catalog.tableExists(VERSION_TABLE):
    last_processed_version = (
        spark.table(VERSION_TABLE)
        .filter(col("source_table") == SOURCE_TABLE)
        .filter(col("consumer_name") == CONSUMER_NAME)
        .agg(_max("last_processed_version").alias("last_version"))
        .collect()[0]["last_version"]
    )
else:
    last_processed_version = None

# No watermark table, or no row for this consumer: treat as "nothing
# processed yet" so that version 0 is included.
if last_processed_version is None:
    last_processed_version = -1

# 3. Decide whether there is anything to process.
if last_processed_version >= latest_version:
    print(
        f"[INFO] No new data to process | "
        f"Latest version: {latest_version}, "
        f"Last processed: {last_processed_version}"
    )
    # Sentinel checked by the early-exit cell that follows.
    df_bronze_wastage = None
else:
    start_version = last_processed_version + 1
    print(
        f"[INFO] Processing versions from {start_version} to {latest_version}"
    )

    # NOTE(review): `startingVersion` is documented for streaming and
    # change-data-feed reads. On this plain batch read it may have no effect,
    # in which case the whole current table is loaded rather than only
    # versions >= start_version -- confirm. The silver upsert overwrites
    # wastage_amount per business key, so it appears to rely on seeing every
    # row for a key; revisit that logic before making this read incremental.
    df_bronze_wastage = (
        spark.read
        .format("delta")
        .option("startingVersion", start_version)
        .table(SOURCE_TABLE)
    )


In [0]:
# Stop the notebook early when the version check produced no DataFrame, or
# when the DataFrame it produced holds no rows.
has_new_rows = df_bronze_wastage is not None and not df_bronze_wastage.isEmpty()
if not has_new_rows:
    print("[INFO] No new wastage data to process — exiting job")
    dbutils.notebook.exit("NO_NEW_DATA")


In [0]:
# df_bronze_wastage.display()

In [0]:

from pyspark.sql.functions import explode_outer

# One output row per element of `results`, exposed as the new column `result`.
# explode_outer (unlike explode) still emits a row, with a null `result`,
# when `results` is null or empty.
df_exploded = df_bronze_wastage.withColumn(
    "result",
    explode_outer(col("results")),
)



In [0]:
# Flatten the `result` struct into top-level snake_case columns.
# Each pair below is (field inside `result`, output column name); the output
# column order follows the order of the pairs.
df_silver_ready = df_exploded.select(
    *[
        col(f"result.{field}").alias(column)
        for field, column in (
            ("accountName", "account_name"),
            ("accountNumber", "account_number"),
            ("accountType", "account_type"),
            ("amount", "amount"),
            ("credit", "credit"),
            ("debit", "debit"),
            ("location", "location"),
            ("subsidiary", "subsidiary"),
            ("subsidiaryId", "subsidiary_id"),
            ("year", "year"),
            ("month", "month"),
            ("period", "period"),
        )
    ]
)


In [0]:
# df_silver_ready.display()

In [0]:
# df_silver_ready.agg(count("*")).display()

In [0]:
# df_silver_ready.agg(sum("debit")).display()

In [0]:
from pyspark.sql.functions import (
    col, lit, concat_ws, to_date, last_day
)

# Explicit projection that pins the set and order of columns handed to the
# store lookup and rejection steps.
wastage_base = df_silver_ready.select(
    [
        "account_name",
        "account_number",
        "account_type",
        "amount",
        "credit",
        "debit",
        "location",
        "subsidiary",
        "subsidiary_id",
        "year",
        "month",
        "period",
    ]
)
# wastage_base.agg(count("location")).display()

In [0]:
# Store lookup from the selected environment's silver catalog. Only the
# NetSuite location name (used as the join key) and the deployment name are
# kept.
store_df = spark.read.table(f"{environment}_catalog.silver.dim_store").select(
    "netsuite_location_name",
    "deployment_name",
)

In [0]:
from pyspark.sql.functions import lit, current_timestamp, trim

# Wastage rows that cannot be tied to a store in dim_store. They are kept
# (rather than dropped) and tagged with a reason and a timestamp.
rejected_wastage_df = (
    wastage_base.alias("w")
    .join(
        store_df.select("netsuite_location_name").alias("s"),
        on=col("w.location") == col("s.netsuite_location_name"),
        how="left",
    )
    .where(
        # no matching store row came back from the left join
        col("s.netsuite_location_name").isNull()
        # the wastage row has no location at all
        | col("w.location").isNull()
        # the location is the literal text "None"
        | (trim(col("w.location")) == "None")
    )
    # keep only the wastage-side columns
    .select("w.*")
    .withColumn("rejection_reason", lit("STORE_LOCATION_NOT_FOUND_IN_MASTER_dim_store"))
    .withColumn("rejection_ts", current_timestamp())
)

In [0]:
# rejected_wastage_df.agg(sum("debit")).display()
# rejected_wastage_df.agg(count("*")).display()


In [0]:
# rejected_wastage_df.select("location").distinct().display()

In [0]:
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.wastage_rejected
# USING DELTA
# TBLPROPERTIES (delta.enableChangeDataFeed = true, delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = true, delta.columnMapping.mode = 'name')
# """)

In [0]:
# Rejected rows are written to the silver catalog of the selected environment.
# This used to be pinned to the dev catalog, which would send test/prod
# rejections into dev.
target_table = f"{environment}_catalog.silver.wastage_rejected"


def write_wastage_rejected_to_silver(df):
    """Append rejected wastage rows to the table named by ``target_table``.

    Does nothing (apart from printing a message) when ``df`` has no rows.
    Otherwise the rows are appended as Delta with ``mergeSchema`` enabled, so
    columns missing from the table are added to it rather than failing the
    write.

    Args:
        df: DataFrame of rejected rows to append.

    Returns:
        None.
    """
    if df.isEmpty():
        print("No rejected data to write")
        return

    (
        df.write
          .format("delta")
          .mode("append")
          .option("mergeSchema", "true")
          .saveAsTable(target_table)
    )


# NOTE(review): this is a plain append, so rows rejected again on a later run
# are stored again -- confirm that duplicates here are acceptable.
write_wastage_rejected_to_silver(rejected_wastage_df)


In [0]:
# %sql
# select sum(debit) as wastage_amount
# from fq_dev_catalog.silver.wastage_rejected

In [0]:
# Keep only wastage rows whose location exists in dim_store, picking up the
# store's deployment_name along the way.
# NOTE(review): if dim_store can hold several rows for one
# netsuite_location_name, this join repeats the wastage row once per match --
# confirm the lookup is unique on that column.
df_joined = wastage_base.join(
    store_df,
    on=wastage_base["location"] == store_df["netsuite_location_name"],
    how="inner",
)
# df_joined.agg(count("location")).display()

In [0]:
# df_joined.display()

In [0]:
# from pyspark.sql.functions import count
# df_joined.agg(count("*")).display()

In [0]:
from pyspark.sql.functions import (
    col, lit, to_date, concat_ws, last_day
)

def enrich_wastage_base(df_base):
    """Add the validity window and source-system tag to wastage rows.

    Three columns are appended, in this order:

    * ``effective_from_date`` - ``"<year>-<month>-01"`` parsed with the
      pattern ``yyyy-MMM-dd``.
    * ``effective_to_date`` - last day of the month of ``effective_from_date``.
    * ``source_system`` - the constant ``"NETSUITE"``.

    Args:
        df_base: DataFrame with ``year`` and ``month`` columns.

    Returns:
        ``df_base`` with the three columns above added.
    """
    # First day of the period, built from the row's own year and month values.
    period_start = to_date(
        concat_ws("-", col("year"), col("month"), lit("01")),
        "yyyy-MMM-dd",
    )

    with_start = df_base.withColumn("effective_from_date", period_start)
    with_end = with_start.withColumn(
        "effective_to_date",
        last_day(col("effective_from_date")),
    )
    return with_end.withColumn("source_system", lit("NETSUITE"))


In [0]:
# Add effective_from_date, effective_to_date and source_system to the
# store-matched wastage rows.
df_enriched = enrich_wastage_base(df_joined)
# df_enriched.display()

In [0]:
# df_enriched.agg(count("*")).display()

In [0]:
def finalize_wastage(df_enrich):
    """Project enriched wastage rows onto the silver column layout.

    Keeps the listed columns in the order given and exposes ``debit`` under
    the name ``wastage_amount``. Columns not listed are dropped.

    Args:
        df_enrich: Enriched wastage DataFrame containing every column named
            in the projection below.

    Returns:
        A DataFrame holding only the projected columns.
    """
    # The debit value is what gets reported as the wastage amount.
    wastage_amount = col("debit").alias("wastage_amount")

    projection = [
        "deployment_name",
        "month",
        "year",
        "location",
        "effective_from_date",
        "effective_to_date",
        wastage_amount,
        "account_name",
        "account_type",
        "account_number",
        "source_system",
    ]
    return df_enrich.select(*projection)


In [0]:
# Reduce the enriched rows to the silver column set (debit exposed as
# wastage_amount).
wastage_enriched = finalize_wastage(df_enriched)
# wastage_enriched.display()

In [0]:
# wastage_enriched.agg(count("*")).display()

In [0]:
# wastage_enriched.filter(col("location")=="ALB-146-Mall of Emirates New").display()

In [0]:
# wastage_enriched.agg(sum("wastage_amount")).display()

In [0]:
# wastage_enriched.display()

In [0]:
# Drop rows without a usable location: NULL, or the literal text "None"
# (compared after trimming surrounding whitespace).
wastage_final = (
    wastage_enriched
    .where(col("location").isNotNull())
    .where(trim(col("location")) != "None")
)


In [0]:
# wastage_final.agg(count("*")).display()

In [0]:
# from pyspark.sql.functions import sum;
# wastage_final.agg(sum("wastage_amount")).display()

In [0]:
from pyspark.sql.functions import sum as _sum, max as _max, first

# Collapse to one row per combination of the descriptive columns below,
# summing wastage_amount across the rows that share it.
df_aggregated = wastage_final.groupBy(
    [
        "deployment_name",
        "location",
        "month",
        "year",
        "effective_from_date",
        "effective_to_date",
        "account_name",
        "account_type",
        "account_number",
        "source_system",
    ]
).agg(_sum("wastage_amount").alias("wastage_amount"))




In [0]:
# df_aggregated.display()

In [0]:
# Attach a generated identifier (SQL uuid()) as `sys_id` and arrange the
# columns in the order used by the silver upsert.
df_aggregated_final = df_aggregated.select(
    expr("uuid()").alias("sys_id"),
    "deployment_name",
    "location",
    "month",
    "year",
    "effective_from_date",
    "effective_to_date",
    "wastage_amount",
    "account_name",
    "account_type",
    "account_number",
    "source_system",
)
# df_aggregated_final.display()

In [0]:
# df_aggregated_final.agg(sum("wastage_amount")).display()

In [0]:
# from pyspark.sql.functions import col, count

# df_nov_count = (
#     df_aggregated_final
#     .filter(col("month") == "NOV")
#     .agg(count("*").alias("row_count"))
# )

# df_nov_count.display()


In [0]:
# df_aggregated_final.printSchema()

In [0]:
# %sql
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.wastage (
#     sys_id STRING NOT NULL,
#     deployment_name STRING NOT NULL,
#     location STRING NOT NULL,
#     month STRING NOT NULL,
#     year STRING NOT NULL,
#     effective_from_date DATE NOT NULL,
#     effective_to_date DATE NOT NULL,
#     wastage_amount DOUBLE,
#     account_name STRING,
#     account_type STRING,
#     account_number STRING,
#     source_system STRING NOT NULL
# )
# USING DELTA
# TBLPROPERTIES (
#     delta.enableChangeDataFeed = true,
#     delta.autoOptimize.optimizeWrite = true,
#     delta.autoOptimize.autoCompact = true,
#     delta.columnMapping.mode = 'name'
# );


In [0]:
from delta.tables import DeltaTable
import sys

def upsert_to_silver_batch(
    df,
    table_name,
    business_keys
):
    """Merge a batch of aggregated wastage rows into a Delta table.

    Rows of ``df`` are matched to rows of ``table_name`` on every column in
    ``business_keys``. A matched target row gets its ``wastage_amount`` and
    ``sys_id`` overwritten from the source row; an unmatched source row is
    inserted with the full wastage column set.

    Keys are compared with the null-safe operator ``<=>`` so that a NULL key
    on both sides counts as a match. With plain ``=`` a NULL never equals
    anything, so rows with a NULL in any key column would never match and
    would be inserted again on every run.

    Args:
        df: Source DataFrame; must expose the columns referenced in the
            update/insert maps below plus every business key.
        table_name: Name of the target Delta table.
        business_keys: Column names that together identify a target row.

    Returns:
        None. Returns without touching the table when ``df`` is empty.

    Raises:
        SystemExit: With exit code 1 when anything in the merge fails; the
            error is printed first.
    """
    try:
        # Nothing to merge.
        if df.isEmpty():
            print("No data to upsert")
            return

        # Load target Delta table
        silver_table = DeltaTable.forName(spark, table_name)

        # Null-safe equality on every business key (see docstring).
        merge_condition = " AND ".join(
            f"t.{key} <=> s.{key}" for key in business_keys
        )

        (
            silver_table.alias("t")
            .merge(
                df.alias("s"),
                merge_condition
            )
            # NOTE(review): sys_id is replaced on every matched update, so the
            # identifier of an existing row changes whenever it is refreshed.
            # Confirm this is intended.
            .whenMatchedUpdate(set={
                "wastage_amount": "s.wastage_amount",
                "sys_id": "s.sys_id"
            })
            .whenNotMatchedInsert(values={
                "sys_id": "s.sys_id",
                "deployment_name": "s.deployment_name",
                "location" :"s.location",
                "month": "s.month",
                "year": "s.year",
                "effective_from_date": "s.effective_from_date",
                "effective_to_date": "s.effective_to_date",
                "wastage_amount": "s.wastage_amount",
                "account_name": "s.account_name",
                "account_type": "s.account_type",
                "account_number": "s.account_number",
                "source_system": "s.source_system"
            })
            .execute()
        )

        print("Batch upsert to silver completed successfully")

    except Exception as e:
        # Report the failure and stop with a non-zero exit code.
        print(f"Upsert to Silver failed: {e}")
        sys.exit(1)


In [0]:
# Merge target: the wastage table in the selected environment's silver catalog.
silver_table_name = f"{environment}_catalog.silver.wastage"

# Columns that together identify one silver row. They are the same columns
# the aggregation groups by.
business_keys = [
    "deployment_name",
    "location",
    "month",
    "year",
    "effective_from_date",
    "effective_to_date",
    "account_name",
    "account_type",
    "account_number",
    "source_system",
]

upsert_to_silver_batch(
    df=df_aggregated_final,
    table_name=silver_table_name,
    business_keys=business_keys,
)


In [0]:
# %sql
# select sum(wastage_amount) from fq_dev_catalog.silver.wastage

In [0]:
# spark.sql(f"""
# INSERT INTO {VERSION_TABLE} (source_table, consumer_name, last_processed_version, updated_at)
# SELECT
#   '{SOURCE_TABLE}',
#   '{CONSUMER_NAME}',
#   {latest_version},
#   current_timestamp()
# WHERE NOT EXISTS (
#   SELECT 1
#   FROM {VERSION_TABLE}
#   WHERE
#     source_table = '{SOURCE_TABLE}'
#     AND consumer_name = '{CONSUMER_NAME}'
# )
# """)

In [0]:
# Record the version just processed as this consumer's watermark.
# A MERGE is used instead of a bare UPDATE: when no row exists yet for this
# (source_table, consumer_name) pair, an UPDATE changes zero rows without
# error, the watermark is never stored, and every later run starts again
# from version 0. Existing rows are updated exactly as before.
spark.sql(f"""
MERGE INTO {VERSION_TABLE} AS t
USING (
  SELECT
    '{SOURCE_TABLE}' AS source_table,
    '{CONSUMER_NAME}' AS consumer_name,
    {latest_version} AS last_processed_version
) AS s
ON t.source_table = s.source_table
  AND t.consumer_name = s.consumer_name
WHEN MATCHED THEN UPDATE SET
  t.last_processed_version = s.last_processed_version,
  t.updated_at = current_timestamp()
WHEN NOT MATCHED THEN INSERT (source_table, consumer_name, last_processed_version, updated_at)
  VALUES (s.source_table, s.consumer_name, s.last_processed_version, current_timestamp())
""")

In [0]:
# %sql
# select * from fq_dev_catalog.silver.wastage
# where location ="ALB-102-Mall of the Emirates"

In [0]:
# %sql
# SELECT
#     SUM(debit) AS total_debit
# FROM fq_dev_catalog.silver.wastage_rejected


In [0]:
# cdf_df = (
#     spark.read
#          .format("delta")
#          .option("readChangeFeed", "true")
#          .option("startingVersion", 0)
#          .table("fq_dev_catalog.silver.wastage")
# )

# cdf_df.display()

In [0]:
# %sql
# select * from fq_dev_catalog.bronze.version_control

In [0]:
# %sql
# delete from fq_dev_catalog.bronze.version_control
# where consumer_name="dim_wastage_clone"

In [0]:
# from pyspark.sql.functions import col, sum as _sum

# df_cdf = (
#     spark.read
#         .format("delta")
#         .option("readChangeFeed", "true")
#         .option("startingVersion", 9)
#         .table("fq_dev_catalog.silver.wastage_rejected")
# )

# df_valid = df_cdf.filter(
#     col("_change_type").isin("insert", "update_postimage")
# )

# df_sum = df_valid.agg(
#     _sum("debit").alias("total_debit")
# )

# df_sum.display()


In [0]:
# cdf_df = (
#     spark.read
#          .format("delta")
#          .option("readChangeFeed", "true")
#          .option("startingVersion", 0)
#          .table("fq_dev_catalog.silver.wastage_rejected")
# )

# cdf_df.display()