### Import libraries

In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

In [0]:
%run /Workspace/consolidated_pipeline/1_setup/utilities

In [0]:
# Sanity check of the three layer schema names used to build table names below.
# They are not defined in this notebook — presumably they come from the
# utilities notebook pulled in via %run; confirm there.
print(bronze_schema,silver_schema,gold_schema)

bronze silver gold


In [0]:
# Declare the notebook parameters as text widgets: (name, default value, label).
for widget_name, default_value, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "products", "Data Source"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [0]:
# Read the current widget values; they drive the source path and every table name below.
catalog, data_source = (
    dbutils.widgets.get("catalog"),
    dbutils.widgets.get("data_source"),
)

print(catalog, data_source)

fmcg products


In [0]:
# Glob matching every CSV file under the bucket folder named after the data source.
base_path = f's3://pranshu-sports-bar/{data_source}/*.csv'
print(base_path)

s3://pranshu-sports-bar/products/*.csv


## Bronze

In [0]:
# CSV reader configured to take column names from the header row and infer column types.
csv_reader = spark.read.format("csv").options(inferSchema="true", header="true")

# Load every file matched by base_path, stamp the rows with the read time and
# append the source file's name and size from the _metadata column for lineage.
df = (
    csv_reader.load(base_path)
    .withColumn("read_timestamp", current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

# Preview only the first 10 rows.
display(df.limit(10))

product_name,product_id,category,read_timestamp,file_name,file_size
SportsBar Energy Bar Choco Fudge (60g),25891101,energy bars,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (40g),25891102,energy bars,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (25g),25891103,energy bars,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (45g),25891201,protien bars,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (55g),25891202,protien bars,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (65g),25891203,protien bars,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (400g),25891301,granola & cereals,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (300g),25891302,granola & cereals,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (200g),25891303,granola & cereals,2026-01-18T21:07:49.366Z,products.csv,1388
SportsBar Greek Yogurt Pro Vanilla (200g),25891401,recovery dairy,2026-01-18T21:07:49.366Z,products.csv,1388


In [0]:
# Show the column types Spark inferred for the freshly read CSV data.
df.printSchema()

root
 |-- product_name: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = false)
 |-- file_name: string (nullable = false)
 |-- file_size: long (nullable = false)



In [0]:
# Persist the raw data as the bronze Delta table, replacing any previous contents.
bronze_table = f"{catalog}.{bronze_schema}.{data_source}"
(
    df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(bronze_table)
)

## Silver Processing

In [0]:
# Read the bronze table back so silver processing starts from what was persisted.
bronze_query = f"SELECT * FROM {catalog}.{bronze_schema}.{data_source};"
df_bronze = spark.sql(bronze_query)

display(df_bronze.limit(10))

product_name,product_id,category,read_timestamp,file_name,file_size
SportsBar Energy Bar Choco Fudge (60g),25891101,energy bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (40g),25891102,energy bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (25g),25891103,energy bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (45g),25891201,protien bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (55g),25891202,protien bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (65g),25891203,protien bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (400g),25891301,granola & cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (300g),25891302,granola & cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (200g),25891303,granola & cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Greek Yogurt Pro Vanilla (200g),25891401,recovery dairy,2026-01-18T21:08:02.862Z,products.csv,1388


In [0]:
# Show the schema of the bronze table as read back from the catalog.
df_bronze.printSchema()

root
 |-- product_name: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)



### Transformation

In [0]:
# Identify duplicates: list every product_id that appears more than once in bronze.
# This cell only inspects the data; nothing is removed here.
product_id_counts = df_bronze.groupBy("product_id").count()
df_duplicates = product_id_counts.where("count > 1")
display(df_duplicates)

product_id,count
25891101,2
25891102,2


In [0]:
# Keep a single row per product_id and report the row counts on either side.
rows_before = df_bronze.count()
print("Rows before duplicates dropped:", rows_before)

df_silver = df_bronze.dropDuplicates(["product_id"])

rows_after = df_silver.count()
print("Rows after duplicates dropped:", rows_after)

Rows before duplicates dropped: 20
Rows after duplicates dropped: 18


In [0]:
# List the distinct category values present after de-duplication.
df_silver.select("category").distinct().show()

+-----------------+
|         category|
+-----------------+
|      energy bars|
|     protien bars|
|granola & cereals|
|   recovery dairy|
|   healthy snacks|
|  electrolyte mix|
+-----------------+



In [0]:
# Title-case the category values. A `when` with no `otherwise` yields NULL for
# rows that fail the condition, so NULL categories stay NULL.
category_is_present = col("category").isNotNull()
df_silver = df_silver.withColumn(
    "category",
    when(category_is_present, initcap("category")),
)

In [0]:
# df_silver.display()

In [0]:
# Re-list the distinct category values to check the casing after normalisation.
df_silver.select("category").distinct().show()

+-----------------+
|         category|
+-----------------+
|      Energy Bars|
|     Protien Bars|
|Granola & Cereals|
|   Recovery Dairy|
|   Healthy Snacks|
|  Electrolyte Mix|
+-----------------+



In [0]:
# Render the silver DataFrame for visual inspection at this stage.
display(df_silver)

product_name,product_id,category,read_timestamp,file_name,file_size
SportsBar Energy Bar Choco Fudge (60g),25891101,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (40g),25891102,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (25g),25891103,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (45g),25891201,Protien Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (55g),25891202,Protien Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protien Bar Peanut Crunch (65g),25891203,Protien Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (400g),25891301,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (300g),25891302,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (200g),25891303,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Greek Yogurt Pro Vanilla (200g),25891401,Recovery Dairy,2026-01-18T21:08:02.862Z,products.csv,1388


In [0]:
# Correct the misspelling "Protien" (matched case-insensitively) to "Protein"
# in both the product name and the category.
for text_column in ("product_name", "category"):
    df_silver = df_silver.withColumn(
        text_column,
        regexp_replace(col(text_column), "(?i)Protien", "Protein"),
    )


In [0]:
# Render the silver DataFrame to check the spelling corrections.
df_silver.display()

product_name,product_id,category,read_timestamp,file_name,file_size
SportsBar Energy Bar Choco Fudge (60g),25891101,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (40g),25891102,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Energy Bar Choco Fudge (25g),25891103,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protein Bar Peanut Crunch (45g),25891201,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protein Bar Peanut Crunch (55g),25891202,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Protein Bar Peanut Crunch (65g),25891203,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (400g),25891301,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (300g),25891302,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Granola Crunch Honey Almond (200g),25891303,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388
SportsBar Greek Yogurt Pro Vanilla (200g),25891401,Recovery Dairy,2026-01-18T21:08:02.862Z,products.csv,1388


### Matching child columns to parent columns

In [0]:
# 1. create division

# Roll each cleaned category up to its parent division. Categories with no
# mapping (including NULL) fall through to "Other".
df_silver = df_silver.withColumn(
    "division",
    when(col("category").isin("Energy Bars", "Protein Bars"), "Nutrition Bars")
    .when(col("category") == "Granola & Cereals", "Breakfast Foods")
    .when(col("category") == "Recovery Dairy", "Dairy & Recovery")
    .when(col("category") == "Healthy Snacks", "Healthy Snacks")
    # Label spelling corrected from "Hyration".
    # NOTE(review): confirm nothing downstream matches on the old misspelled label.
    .when(col("category") == "Electrolyte Mix", "Hydration & Electrolytes")
    .otherwise(lit("Other")),
)

In [0]:
# Render the silver DataFrame to check the new division column.
df_silver.display()

product_name,product_id,category,read_timestamp,file_name,file_size,division
SportsBar Energy Bar Choco Fudge (60g),25891101,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars
SportsBar Energy Bar Choco Fudge (40g),25891102,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars
SportsBar Energy Bar Choco Fudge (25g),25891103,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars
SportsBar Protein Bar Peanut Crunch (45g),25891201,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars
SportsBar Protein Bar Peanut Crunch (55g),25891202,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars
SportsBar Protein Bar Peanut Crunch (65g),25891203,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars
SportsBar Granola Crunch Honey Almond (400g),25891301,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388,Breakfast Foods
SportsBar Granola Crunch Honey Almond (300g),25891302,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388,Breakfast Foods
SportsBar Granola Crunch Honey Almond (200g),25891303,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388,Breakfast Foods
SportsBar Greek Yogurt Pro Vanilla (200g),25891401,Recovery Dairy,2026-01-18T21:08:02.862Z,products.csv,1388,Dairy & Recovery


In [0]:
# 2. variant extraction

# Capture the text inside the first pair of parentheses in the product name,
# e.g. "(60g)" -> "60g".
variant_pattern = r"\((.*?)\)"
variant_expr = regexp_extract(col("product_name"), variant_pattern, 1)
df_silver = df_silver.withColumn("variant", variant_expr)

In [0]:
# 3. product code

# Derive the product code as the SHA-256 hex digest of the (cleaned) product name.
product_name_text = col("product_name").cast("string")
df_silver = df_silver.withColumn("product_code", sha2(product_name_text, 256))

In [0]:
# 4. invalid product_id

# Keep product_id when it consists solely of digits; every other value is
# replaced with the placeholder "999999".
product_id_text = col("product_id").cast("string")
is_all_digits = product_id_text.rlike("^[0-9]+$")
placeholder_id = lit(999999).cast("string")

df_silver = df_silver.withColumn(
    "product_id",
    when(is_all_digits, product_id_text).otherwise(placeholder_id),
)

In [0]:
# Render the silver DataFrame to check the variant, product_code and product_id columns.
display(df_silver)

product_name,product_id,category,read_timestamp,file_name,file_size,division,variant,product_code
SportsBar Energy Bar Choco Fudge (60g),25891101,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars,60g,e91ba9d665f90254da5809bfdebe3db2be01a52f50b6fd96b57eed238392b843
SportsBar Energy Bar Choco Fudge (40g),25891102,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars,40g,e92c739a8d78cd6cbe954648c2f9dd75ed61fcfd99b03e10dca65c3082d0728e
SportsBar Energy Bar Choco Fudge (25g),25891103,Energy Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars,25g,102628255d24304d6bbe0438b1ac992054f262e0814d306d0a34d7356cef3268
SportsBar Protein Bar Peanut Crunch (45g),25891201,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars,45g,2e387cef1424d6e7b162b45622d4b1a788d11776e33d05cc8552f4ecd2ea1896
SportsBar Protein Bar Peanut Crunch (55g),25891202,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars,55g,0cb7b2f42657b625f754e833aa1cf6a967be26f17415f5342302ebb0e90c8a28
SportsBar Protein Bar Peanut Crunch (65g),25891203,Protein Bars,2026-01-18T21:08:02.862Z,products.csv,1388,Nutrition Bars,65g,889c67757ece9c973791dfbc2d47b026a3342cc7255e47a3170329d158e897c2
SportsBar Granola Crunch Honey Almond (400g),25891301,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388,Breakfast Foods,400g,3cab59f05924285270313afcfe40a08983bb03dd88f432e34fc6336914c14345
SportsBar Granola Crunch Honey Almond (300g),25891302,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388,Breakfast Foods,300g,d9ebd1ca64d23951a6310af93b1c5ac27d831ac842e89aea59a9e8b38621faa5
SportsBar Granola Crunch Honey Almond (200g),25891303,Granola & Cereals,2026-01-18T21:08:02.862Z,products.csv,1388,Breakfast Foods,200g,c68834ceaff15846bc1892c2185dc4e4f471d64fe3796b1a8ecc39a5a48c614f
SportsBar Greek Yogurt Pro Vanilla (200g),25891401,Recovery Dairy,2026-01-18T21:08:02.862Z,products.csv,1388,Dairy & Recovery,200g,da6bfc596c1360ca07bda4e0ae6bfe3b8456517fc6e8ddc265630ff940f9ab05


In [0]:
# 5. rename product_name -> product

df_silver = df_silver.withColumnRenamed("product_name", "product")

In [0]:
# Fix the final column order of the silver table: business columns first,
# ingestion metadata last.
silver_columns = [
    "product_code",
    "division",
    "category",
    "product",
    "variant",
    "product_id",
    "read_timestamp",
    "file_name",
    "file_size",
]
df_silver = df_silver.select(*silver_columns)

In [0]:
# Persist the cleaned data as the silver Delta table, replacing previous contents.
# NOTE(review): mergeSchema is passed together with overwrite mode — confirm this
# is the intended way to handle schema changes on this table.
silver_table = f"{catalog}.{silver_schema}.{data_source}"
(
    df_silver.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(silver_table)
)

## Gold Processing

In [0]:
# Read the persisted silver table back and keep only the business columns for
# gold; the ingestion metadata columns are left out.
silver_query = f"SELECT * FROM {catalog}.{silver_schema}.{data_source};"
df_silver = spark.sql(silver_query)

gold_columns = ["product_code", "division", "category", "product", "variant", "product_id"]
df_gold = df_silver.select(gold_columns)

In [0]:
# Render the gold DataFrame for inspection before it is written.
df_gold.display()

product_code,division,category,product,variant,product_id
2e387cef1424d6e7b162b45622d4b1a788d11776e33d05cc8552f4ecd2ea1896,Nutrition Bars,Protein Bars,SportsBar Protein Bar Peanut Crunch (45g),45g,25891201
fe5a8036be4b9a787b7c0ae013fc752a8cfb6c55a2f7b2fd152a6380925e9c49,Dairy & Recovery,Recovery Dairy,SportsBar Greek Yogurt Pro Vanilla (120g),120g,25891402
da6bfc596c1360ca07bda4e0ae6bfe3b8456517fc6e8ddc265630ff940f9ab05,Dairy & Recovery,Recovery Dairy,SportsBar Greek Yogurt Pro Vanilla (200g),200g,25891401
e91ba9d665f90254da5809bfdebe3db2be01a52f50b6fd96b57eed238392b843,Nutrition Bars,Energy Bars,SportsBar Energy Bar Choco Fudge (60g),60g,25891101
0cb7b2f42657b625f754e833aa1cf6a967be26f17415f5342302ebb0e90c8a28,Nutrition Bars,Protein Bars,SportsBar Protein Bar Peanut Crunch (55g),55g,25891202
451f7167b28a25bde73995910e31c07dfa26411f1db47847f19e16747effbdaa,Hyration & Electrolytes,Electrolyte Mix,SportsBar Electrolyte Mix Lemon-Lime (5 Sachets),5 Sachets,25891603
c68834ceaff15846bc1892c2185dc4e4f471d64fe3796b1a8ecc39a5a48c614f,Breakfast Foods,Granola & Cereals,SportsBar Granola Crunch Honey Almond (200g),200g,25891303
d9ebd1ca64d23951a6310af93b1c5ac27d831ac842e89aea59a9e8b38621faa5,Breakfast Foods,Granola & Cereals,SportsBar Granola Crunch Honey Almond (300g),300g,25891302
77b6f538a9d0e0cf845db5c2cbecec46fdd30303b501e06f64baf1d4dc0e66f9,Dairy & Recovery,Recovery Dairy,SportsBar Greek Yogurt Pro Vanilla (80g),80g,25891403
5931334e4cbe6b3792c209e8394e87aa21b83816b47d99375e4ff25e651ce63a,Healthy Snacks,Healthy Snacks,SportsBar Oats Cookie Bites ChocoChip (350g),350g,999999


In [0]:
# Persist the gold projection as the child dimension table (prefixed "sb_dim_"),
# replacing previous contents.
gold_child_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
(
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(gold_child_table)
)

## Merging child and parent data

In [0]:
# Build both gold table names from the same widget/schema values used by the
# writes above, so the merge reads the child dimension this run just produced
# even when the catalog widget is changed. With the default widget values these
# resolve to fmcg.gold.dim_products and fmcg.gold.sb_dim_products.
parent_dim_table = f"{catalog}.{gold_schema}.dim_{data_source}"
child_dim_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"

# Handle on the existing parent dimension table that the merge will modify.
delta_table = DeltaTable.forName(spark, parent_dim_table)

# Source rows for the merge; product_id is left out of the projection.
df_child_products = spark.sql(
  f"SELECT product_code ,division,category,product,variant FROM {child_dim_table};")

df_child_products.show(5)

+--------------------+----------------+--------------+--------------------+-------+
|        product_code|        division|      category|             product|variant|
+--------------------+----------------+--------------+--------------------+-------+
|2e387cef1424d6e7b...|  Nutrition Bars|  Protein Bars|SportsBar Protein...|    45g|
|fe5a8036be4b9a787...|Dairy & Recovery|Recovery Dairy|SportsBar Greek Y...|   120g|
|da6bfc596c1360ca0...|Dairy & Recovery|Recovery Dairy|SportsBar Greek Y...|   200g|
|e91ba9d665f90254d...|  Nutrition Bars|   Energy Bars|SportsBar Energy ...|    60g|
|0cb7b2f42657b625f...|  Nutrition Bars|  Protein Bars|SportsBar Protein...|    55g|
+--------------------+----------------+--------------+--------------------+-------+
only showing top 5 rows


In [0]:
# Upsert the child products into the parent dimension, keyed on product_code:
# matched rows get their descriptive columns refreshed, unmatched rows are inserted.
descriptive_columns = {
    "division": "source.division",
    "category": "source.category",
    "product": "source.product",
    "variant": "source.variant",
}
insert_columns = {"product_code": "source.product_code", **descriptive_columns}

(
    delta_table.alias("target")
    .merge(
        source=df_child_products.alias("source"),
        condition="target.product_code = source.product_code",
    )
    .whenMatchedUpdate(set=descriptive_columns)
    .whenNotMatchedInsert(values=insert_columns)
    .execute()
)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]