In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, concat_ws, last_day, expr, explode, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

In [0]:
# Notebook parameters: one dropdown ("environment") and two comboboxes
# ("source", "domain"), created in that order.
dbutils.widgets.dropdown(
    name="environment",
    defaultValue="fq_dev",
    choices=["fq_dev", "fq_test", "fq_prod"],
    label="Select environment",
)

# (name, default value, choices, label) for each combobox widget.
_combobox_specs = [
    ("source", "excel_sheet", ["posist", "netsuite", "other", "excel_sheet"], "Source"),
    ("domain", "budget", ["discount", "sales", "cost", "wastage", "budget"], "Domain"),
]
for _name, _default, _choices, _label in _combobox_specs:
    dbutils.widgets.combobox(
        name=_name,
        defaultValue=_default,
        choices=_choices,
        label=_label,
    )

# Read the selected values once; later cells interpolate them into table
# names and checkpoint paths.
environment, source, domain = (
    dbutils.widgets.get(_widget) for _widget in ("environment", "source", "domain")
)



In [0]:
def get_external_location(name: str) -> str:
    """Return the ``url`` reported by ``DESCRIBE EXTERNAL LOCATION`` for ``name``.

    Args:
        name: External location name; it is wrapped in backticks inside the
            SQL statement.

    Returns:
        The ``url`` value taken from the first row of the result.
    """
    description = spark.sql(f"DESCRIBE EXTERNAL LOCATION `{name}`")
    url_rows = description.select("url").collect()
    first_row = url_rows[0]
    return first_row[0]

# Resolve the storage URL of every layer for the selected environment.
# The lookups run in the listed order: bronze, silver, gold, checkpoint, staging.
(
    bronze_path,
    silver_path,
    gold_path,
    checkpoint_path,
    staging_path,
) = (
    get_external_location(f"{environment}_extloc_{_layer}")
    for _layer in ("bronze", "silver", "gold", "checkpoint", "staging")
)


In [0]:
def read_bronze_for_silver(table_name: str):
    """Open a streaming change-feed read of a Delta table.

    Args:
        table_name: Name of the table to stream from.

    Returns:
        A streaming DataFrame restricted to rows whose ``_change_type`` is
        ``insert`` or ``update_postimage``.
    """
    change_feed = (
        spark.readStream
        .format("delta")
        .option("readChangeFeed", "true")
        .table(table_name)
    )
    wanted_changes = "_change_type IN ('insert', 'update_postimage')"
    return change_feed.filter(wanted_changes)
# Read from the catalog of the selected environment (previously hard-coded to
# fq_dev_catalog, so test/prod runs streamed dev data), consistent with how the
# store dimension and silver targets are addressed elsewhere in this notebook.
df_bronze = read_bronze_for_silver(f"{environment}_catalog.bronze.budget")

In [0]:
# df_bronze.display()

In [0]:
# Keep only the business columns; every other column of the change-feed read
# (including _change_type) is dropped here.
_bronze_budget_columns = ["store_code", "budget", "remarks", "dateorg"]
df_bronze = df_bronze.select(*_bronze_budget_columns)

In [0]:
# from pyspark.sql.functions import count
# df_bronze.agg(count("budget")).display()

In [0]:
from pyspark.sql.functions import (
    lit, expr, coalesce, try_to_date, col,
    trunc, year, date_format
)

def transform_budget_df(df):
    """Rename, tag and date-parse the raw budget rows.

    Steps applied, in order:
      * ``store_code`` -> ``store_id`` and ``budget`` -> ``budget_amount``
      * add ``source_system`` with the constant ``EXCEL_SHEET``
      * add ``sys_id`` from the SQL expression ``uuid()``
      * replace ``dateorg`` with the first non-null result of parsing it
        with each supported date format

    Args:
        df: DataFrame holding ``store_code``, ``budget`` and ``dateorg``.

    Returns:
        The transformed DataFrame.
    """
    # Formats are attempted in this order; coalesce keeps the first non-null parse.
    date_formats = ("MM-dd-yyyy", "M-d-yyyy", "MM/dd/yyyy", "M/d/yyyy")
    parsed_date = coalesce(
        *[try_to_date(col("dateorg"), fmt) for fmt in date_formats]
    )

    renamed = (
        df.withColumnRenamed("store_code", "store_id")
          .withColumnRenamed("budget", "budget_amount")
    )
    tagged = (
        renamed.withColumn("source_system", lit("EXCEL_SHEET"))
               .withColumn("sys_id", expr("uuid()"))
    )
    return tagged.withColumn("dateorg", parsed_date)

# Apply the renames, source tag, generated sys_id and date parsing to the
# projected bronze rows.
budget_df = transform_budget_df(df_bronze)
   



In [0]:
# budget_df.display()

In [0]:
def read_store_dimension():
    """Load the store dimension of the selected environment as a batch DataFrame.

    Returns:
        ``{environment}_catalog.silver.dim_store`` restricted to ``brand_id``,
        ``store_id``, ``company_id``, ``country_code`` and ``currency_code``.
    """
    store_table = f"{environment}_catalog.silver.dim_store"
    wanted_columns = [
        "brand_id",
        "store_id",
        "company_id",
        "country_code",
        "currency_code",
    ]
    return spark.read.table(store_table).select(*wanted_columns)

# Batch (non-streaming) read of the store dimension; it is the right-hand side
# of the store joins in the cells below.
store_df = read_store_dimension()
# store_df.display()

In [0]:
from pyspark.sql.functions import lit, current_timestamp, col

# Budget rows whose store_id has no match in the store dimension.
_budget_side = budget_df.alias("b")
_store_side = store_df.alias("s")

# Left join, then keep only the rows where the store side came back empty.
_unmatched_budget = (
    _budget_side
    .join(_store_side, col("b.store_id") == col("s.store_id"), "left")
    .filter(col("s.store_id").isNull())
)

# Keep the budget columns only and stamp each row with why and when it was rejected.
budget_rejected_stores = (
    _unmatched_budget
    .select("b.*")
    .withColumn("rejection_reason", lit("STORE_NOT_FOUND_IN_MASTER_TABLE_dim_store"))
    .withColumn("rejection_ts", current_timestamp())
)

# budget_rejected_stores.display()

In [0]:
# %sql
# drop table fq_dev_catalog.silver.budget_rejected_stores

In [0]:
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.budget_rejected_stores
# USING DELTA
# TBLPROPERTIES (delta.enableChangeDataFeed = true, delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = true, delta.columnMapping.mode = 'name')
# """)

In [0]:
def write_budget_rejected_to_silver(df):
    """Start an append-only streaming write of rejected budget rows to silver.

    The stream uses the ``availableNow`` trigger, the query name
    ``{domain}_silver_rejected_stores`` and a checkpoint under
    ``{checkpoint_path}/{source}/{domain}/checkpoint_budget_rejected``; the
    target table is ``{environment}_catalog.silver.{domain}_rejected_stores``.

    Args:
        df: Streaming DataFrame of rejected rows to persist.

    Returns:
        The query handle returned by ``toTable``. The function does not wait
        for the query to finish; the caller decides whether to block on it.
    """
    target_table = f"`{environment}_catalog`.`silver`.`{domain}_rejected_stores`"
    checkpoint_location = (
        f"{checkpoint_path}/{source}/{domain}/checkpoint_budget_rejected"
    )

    # Write the DataFrame that was passed in. The previous version ignored
    # `df` and always wrote the notebook-level `budget_rejected_stores`.
    query = (
        df.writeStream
          .format("delta")
          .outputMode("append")
          .trigger(availableNow=True)
          .queryName(f"{domain}_silver_rejected_stores")
          .option("checkpointLocation", checkpoint_location)
          .option("mergeSchema", "true")
          .toTable(target_table)
    )
    return query

In [0]:
# Block until the availableNow run has finished so that a failure in the
# rejected-rows stream fails this cell instead of going unnoticed, and so the
# stream is not still running when the following cells execute.
write_budget_rejected_to_silver(budget_rejected_stores).awaitTermination()

In [0]:
# %sql
# select * from fq_dev_catalog.silver.budget_rejected_stores

In [0]:
def create_final_budget(budget_df, store_df):
    """Attach store attributes to budget rows and project the output columns.

    Args:
        budget_df: Budget rows; joined on its ``store_id``.
        store_df: Store dimension; joined on its ``store_id``, which is then
            dropped so only the budget side's ``store_id`` remains.

    Returns:
        The left-joined DataFrame limited to the output columns, with
        ``remarks`` exposed as ``budget_remarks``.
    """
    same_store = budget_df.store_id == store_df.store_id
    joined = budget_df.join(store_df, same_store, "left").drop(store_df.store_id)

    output_columns = [
        "sys_id",
        "store_id",
        "company_id",
        "brand_id",
        "currency_code",
        "dateorg",
        "budget_amount",
        col("remarks").alias("budget_remarks"),
        "source_system",
    ]
    return joined.select(*output_columns)

# Left-join the budget rows to the store dimension and project the output
# columns; rows without a matching store keep null store attributes here.
budget_final = create_final_budget(budget_df, store_df)
   



In [0]:
# budget_final.display()

In [0]:
# budget_final.agg(count("sys_id")).display()

In [0]:
# Keep one row per (store_id, dateorg) — the same pair passed as business keys
# to the merge further down.
# NOTE(review): budget_final derives from spark.readStream, so this is presumably
# a streaming de-duplication without a watermark: every key seen would be kept in
# state across micro-batches, and a later update_postimage for an already-seen
# (store_id, dateorg) would be dropped before it reaches the merge — confirm.
# De-duplicating per micro-batch inside foreachBatch may be what is intended.
budget_final=budget_final.dropDuplicates(["store_id","dateorg"])

In [0]:
# budget_final.display()

In [0]:
# from pyspark.sql.functions import sum
# budget_final.agg(sum("budget_amount")).display()

In [0]:
# %sql
# select * from fq_dev_catalog.silver.dim_store

In [0]:
# budget_final.select("brand_id").distinct().display()

In [0]:
# Discard rows with a null in any of these columns. Because the store join is
# a left join, this also removes budget rows whose store was not found (their
# company_id / brand_id / currency_code are null).
_required_columns = ['store_id', 'company_id', 'brand_id', 'currency_code', 'dateorg']
budget_final = budget_final.na.drop(subset=_required_columns)

In [0]:
# budget_final.display()

In [0]:
# from pyspark.sql.functions import sum
# budget_final.agg(sum("budget_amount")).display()


In [0]:
# budget_final.display()

In [0]:
# budget_final.printSchema()

In [0]:

# %sql
# CREATE TABLE IF NOT EXISTS fq_dev_catalog.silver.dim_budget (
#     sys_id STRING NOT NULL,
#     store_id STRING NOT NULL,
#     company_id STRING NOT NULL,
#     brand_id STRING NOT NULL,
#     currency_code STRING NOT NULL,
#     dateorg DATE NOT NULL,
#     budget_amount DOUBLE,
#     budget_remarks STRING,
#     source_system STRING NOT NULL
# )
# USING DELTA
# TBLPROPERTIES (
#     -- Enable Change Data Feed
#     delta.enableChangeDataFeed = true,
#     delta.columnMapping.mode = 'name',


#     -- Optimization properties
#     delta.autoOptimize.optimizeWrite = true,
#     delta.autoOptimize.autoCompact = true
# );


In [0]:
# Ad-hoc look at the silver budget table of the selected environment.
# The previous %sql cell was pinned to fq_dev_catalog, so test/prod runs
# queried the dev table.
spark.read.table(f"{environment}_catalog.silver.dim_budget").display()

In [0]:
from delta.tables import DeltaTable
import sys

def upsert_to_silver(
    df,
    table_name,
    checkpoint_path,
    business_keys
):
    """Stream ``df`` into a Delta table, running one MERGE per micro-batch.

    Rows are matched on ``business_keys``. Matched target rows have their
    non-key columns overwritten from the source; unmatched source rows are
    inserted. The stream runs with the ``availableNow`` trigger and this
    function blocks until it terminates.

    Args:
        df: Streaming DataFrame carrying the target table's columns.
        table_name: Name of the target Delta table.
        checkpoint_path: Checkpoint location for the streaming query.
        business_keys: Column names that identify a row; they form the
            MERGE condition and the per-batch de-duplication key.

    Raises:
        ValueError: If ``business_keys`` is empty (checked before the stream
            starts, instead of failing inside the first non-empty batch).
        SystemExit: With code 1 if the streaming query raises; the error is
            printed first.
    """
    business_keys = list(business_keys)
    if not business_keys:
        raise ValueError("business_keys must contain at least one column name")

    # The condition does not depend on the batch, so build it once.
    merge_condition = " AND ".join(
        f"t.{key} = s.{key}" for key in business_keys
    )

    def foreach_batch_function(batch_df, batch_id):
        """Merge a single micro-batch into the target table."""
        # Skip empty micro-batches
        if batch_df.isEmpty():
            return

        # MERGE fails when several source rows match the same target row, so
        # guarantee at most one source row per business key in this batch.
        # Which duplicate survives is arbitrary: no ordering column is
        # available at this point.
        unique_batch = batch_df.dropDuplicates(business_keys)

        # Use the session attached to the micro-batch rather than the
        # notebook-level `spark` global.
        silver_table = DeltaTable.forName(batch_df.sparkSession, table_name)

        (
            silver_table.alias("t")
            .merge(
                unique_batch.alias("s"),
                merge_condition
            )
            .whenMatchedUpdate(set={
                "sys_id": "s.sys_id",
                "company_id": "s.company_id",
                "brand_id": "s.brand_id",
                "currency_code": "s.currency_code",
                "budget_amount": "s.budget_amount",
                "budget_remarks": "s.budget_remarks",
                "source_system": "s.source_system"
            })
            .whenNotMatchedInsert(values={
                "sys_id": "s.sys_id",
                "store_id": "s.store_id",
                "company_id": "s.company_id",
                "brand_id": "s.brand_id",
                "currency_code": "s.currency_code",
                "dateorg": "s.dateorg",
                "budget_amount": "s.budget_amount",
                "budget_remarks": "s.budget_remarks",
                "source_system": "s.source_system"
            })
            .execute()
        )

    try:
        (
            df.writeStream
            .foreachBatch(foreach_batch_function)
            .option("checkpointLocation", checkpoint_path)
            .outputMode("update")
            .trigger(availableNow=True)
            .start()
            .awaitTermination()
        )

    except Exception as e:
        # Kept as-is for callers: report the failure and stop with exit code 1.
        print(f" Upsert to Silver failed: {e}")
        sys.exit(1)

In [0]:
# Target table and checkpoint location for the budget upsert stream.
silver_table_name = f"{environment}_catalog.silver.dim_budget"
silver_checkpoint_path = f"{checkpoint_path}/{source}/{domain}/dim_budget_checkpoint"

# Columns that identify one budget row; the MERGE condition is built from them.
business_keys = ["store_id", "dateorg"]

# Run the upsert: streaming budget rows into the silver table, keyed by the
# business keys above.
upsert_to_silver(
    df=budget_final,
    table_name=silver_table_name,
    checkpoint_path=silver_checkpoint_path,
    business_keys=business_keys,
)


In [0]:
# %sql
# -- select count("*") from fq_dev_catalog.silver.dim_budget

In [0]:
# df= spark.read.table("fq_dev_catalog.silver.dim_budget")


In [0]:
# from pyspark.sql.functions import col
# df.select(col("store_id")).distinct().display()


In [0]:
# %sql
# select * from fq_dev_catalog.silver.dim_budget

In [0]:
# %sql
# SELECT
#     SUM(budget_amount) AS total_debit
# FROM fq_dev_catalog.silver.dim_budget

In [0]:
# %sql
# SELECT
#     COUNT(budget_amount) AS total_debit
# FROM fq_dev_catalog.silver.dim_budget
# WHERE dateorg >= DATE '2026-01-01'
#   -- OR effective_from_date  <= DATE '2026-12-31';

In [0]:
# df_new=df.filter(
#     (col("effective_from_date") >= "2026-01-01") &
#     (col("effective_from_date") <= "2026-12-31")
# ).display()

In [0]:
# df_new.display()

In [0]:
# %sql
# describe history fq_dev_catalog.silver.dim_budget

In [0]:
# cdf_df = (
#     spark.read
#          .format("delta")
#          .option("readChangeFeed", "true")
#          .option("startingVersion", 0)
#          .table("fq_dev_catalog.silver.dim_budget")
# )

# cdf_df.display()