In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [0]:
%run /Workspace/FMCG_Project/01_setup/02_schema_utilities

In [0]:
dbutils.widgets.text("catalog", "fmcg", "catalog")
dbutils.widgets.text("data_source", "gross_price")

catalog = dbutils.widgets.get("catalog")
data_source = dbutils.widgets.get("data_source")

print(f"{catalog} - {data_source}")

fmcg - gross_price


In [0]:
base_path = f"/Volumes/fmcg/bronze/souce_data/chaild_company/full_load/{data_source}/*.csv"
base_path

'/Volumes/fmcg/bronze/souce_data/chaild_company/full_load/gross_price/*.csv'

### Bronze layer

In [0]:
df_bronze = spark.read.format("csv")\
                    .option("inferSchama", True)\
                    .option("header", True)\
                    .load(base_path)\
                    .withColumn("read_timestamp", F.current_timestamp())\
                    .select("*", "_metadata.file_name", "_metadata.file_size")


In [0]:
df_bronze.write.format("delta")\
                .mode("overwrite")\
                .option("enableChangeDataFeed", True)\
                .option("mergeSchema", True)\
                .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")

### Silver layer

In [0]:
df_silver = spark.read.table(f"{catalog}.{bronze_schema}.{data_source}")

In [0]:
display(
    df_silver.select("month").distinct()
)

month
2025/07/01
01/08/2025
2025/09/01
2025-10-01
2025-11-01
2025-12-01
2025-07-01
2025-08-01
2025-09-01
2025/11/01


In [0]:
df_silver = df_silver.withColumn(
                "month",
                F.coalesce(
                    F.try_to_date(F.col("month"), "yyyy/MM/dd"),
                    F.try_to_date(F.col("month"), "dd/MM/yyyy"),
                    F.try_to_date(F.col("month"), "dd-MM-yyyy"),
                    F.try_to_date(F.col("month"), "yyyy-MM-dd")
                )
            )

In [0]:
df_silver = df_silver.withColumn(
                    "gross_price",
                    F.when(
                        F.col("gross_price").rlike(r"^-?[0-9]+(\.[0-9]+)?$"),
                        F.abs(F.col("gross_price").cast("double"))
                    ).otherwise(F.lit(0.0))
                )

In [0]:
df_products = spark.read.table(f"{catalog}.{silver_schema}.products")

df_joined = df_silver.join(df_products.select("product_id", "product_code"), on="product_id", how="inner")

df_joined = df_joined.select("product_id", "product_code", "month", "gross_price", "read_timestamp", "file_name", "file_size")

In [0]:
df_joined.write.format("delta")\
            .mode("overwrite")\
            .option("enableChangeDataFeed", True)\
            .option("mergeSchema", True)\
            .saveAsTable(f"{catalog}.{silver_schema}.{data_source}")

## Gold leayer

In [0]:
df_gold = spark.read.table(f"{catalog}.{silver_schema}.{data_source}")

df_gold = df_gold.select("product_code", "gross_price", "month")

In [0]:
df_gold.write.format("delta")\
        .mode("overwrite")\
        .option("enableChangeDataFeed", True)\
        .option("mergeSchema", True)\
        .saveAsTable(f"{catalog}.{gold_schema}.sb_dim_{data_source}")

# Merge dataframe to Parent dataframe

In [0]:
df_gold_src = spark.read.table(f"{catalog}.{gold_schema}.sb_dim_{data_source}")

In [0]:
df_gold_src = df_gold_src.withColumn("year", F.year(F.col("month")))\
                        .withColumn("is_zero", F.when(F.col("gross_price") == 0, 1).otherwise(0))

In [0]:
wd = Window.partitionBy(F.col("product_code"), F.col("year")).orderBy(F.col("is_zero"), F.col("month").desc())

df_gold_src = df_gold_src.withColumn("rnk", F.row_number().over(wd))\
                        .filter(F.col("rnk") == 1)

In [0]:
df_gold_src = df_gold_src.select("product_code", "gross_price", "year").withColumnRenamed("gross_price", "price_inr").select("product_code", "price_inr", "year")

product_code,price_inr,year
062f5574bbdf4386b2c7c6075483b417b4a00b172fcba919dbba7dae1b774379,281.0,2025
0cb7b2f42657b625f754e833aa1cf6a967be26f17415f5342302ebb0e90c8a28,100.0,2025
102628255d24304d6bbe0438b1ac992054f262e0814d306d0a34d7356cef3268,86.0,2025
2e387cef1424d6e7b162b45622d4b1a788d11776e33d05cc8552f4ecd2ea1896,108.0,2025
3cab59f05924285270313afcfe40a08983bb03dd88f432e34fc6336914c14345,493.0,2025
451f7167b28a25bde73995910e31c07dfa26411f1db47847f19e16747effbdaa,187.0,2025
716fa4e54b7894c910180276e0535d49afb25cdcfac09533fb74ae00689e5742,440.0,2025
778c2a7aa27bfdb211fd5ece048de80d00fbf3d6924bd908d91054796ba16ab6,296.0,2025
77b6f538a9d0e0cf845db5c2cbecec46fdd30303b501e06f64baf1d4dc0e66f9,50.0,2025
889c67757ece9c973791dfbc2d47b026a3342cc7255e47a3170329d158e897c2,138.0,2025


In [0]:
delta_trg = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_{data_source}")

delta_trg.alias("trg").merge(
    df_gold_src.alias("src"),
    condition="trg.product_code == src.product_code"
).whenMatchedUpdate(
    set={
        "trg.price_inr" :  "src.price_inr",
        "trg.year" : "src.year"
    }
).whenNotMatchedInsert(
    values={
        "trg.product_code" : "src.product_code",
        "trg.price_inr": "src.price_inr",
        "trg.year" : "src.year"
    }
)

<delta.connect.tables.DeltaMergeBuilder at 0xff97b8bc4980>