In [0]:
from pyspark.sql.functions import *

In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run /Workspace/consolidated_pipeline/1_setup/utilities

In [0]:
# Echo the layer schema names (presumably defined by the utilities notebook run above — verify).
print(bronze_schema, silver_schema, gold_schema)

bronze silver gold


In [0]:
# Notebook parameters, declared as (widget name, default value, label).
for widget_name, default_value, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "customers", "Data Source"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [0]:
# Pull the current widget values into plain Python variables used by every later cell.
catalog, data_source = (
    dbutils.widgets.get(widget) for widget in ("catalog", "data_source")
)

print(catalog, data_source)

fmcg customers


In [0]:
# Glob matching every CSV file in the selected data source's folder of the bucket.
base_path = f"s3://pranshu-sports-bar/{data_source}/*.csv"
print(base_path)

s3://pranshu-sports-bar/customers/*.csv


## Bronze Processing

In [0]:
# CSV reader: first line is the header, column types are inferred from the data.
csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)

# Stamp each row with the load time and with the name/size of its source file.
df = (
    csv_reader.load(base_path)
    .withColumn("read_timestamp", F.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)
display(df.limit(10))

customer_id,customer_name,city,read_timestamp,file_name,file_size
789201,FitFuel Market,Bengaluru,2026-01-18T20:50:18.182Z,customers.csv,1404
789202,FitFuel Market,Hyderabad,2026-01-18T20:50:18.182Z,customers.csv,1404
789203,FitFuel Market,New Delhi,2026-01-18T20:50:18.182Z,customers.csv,1404
789301,Athlete's Choice Store,Bengaluru,2026-01-18T20:50:18.182Z,customers.csv,1404
789303,Athlete's Choice Store,New Delhi,2026-01-18T20:50:18.182Z,customers.csv,1404
789101,Endurance Foods,Bengalore,2026-01-18T20:50:18.182Z,customers.csv,1404
789102,Endurance Foods,Hyderabad,2026-01-18T20:50:18.182Z,customers.csv,1404
789103,Endurance Foods,New Delhi,2026-01-18T20:50:18.182Z,customers.csv,1404
789121,HydroBoost Nutrition,Hyderabad,2026-01-18T20:50:18.182Z,customers.csv,1404
789122,HydroBoost Nutrition,New Delhi,2026-01-18T20:50:18.182Z,customers.csv,1404


In [0]:
# Inspect the inferred column types together with the added audit columns.
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = false)
 |-- file_name: string (nullable = false)
 |-- file_size: long (nullable = false)



In [0]:
# Persist the raw load as the bronze Delta table, replacing any previous contents.
bronze_table = f"{catalog}.{bronze_schema}.{data_source}"
(
    df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(bronze_table)
)

## Silver Processing

In [0]:
# Reload the bronze table as the starting point for the silver clean-up.
df_bronze = spark.table(f"{catalog}.{bronze_schema}.{data_source}")

# Preview the first 10 rows. The row cap is applied with limit(), as in the
# bronze read cell; a positional argument to .display() is not a row limit.
display(df_bronze.limit(10))

customer_id,customer_name,city,read_timestamp,file_name,file_size
789201,FitFuel Market,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404
789202,FitFuel Market,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404
789203,FitFuel Market,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404
789301,Athlete's Choice Store,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404
789303,Athlete's Choice Store,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404
789101,Endurance Foods,Bengalore,2026-01-18T20:50:24.017Z,customers.csv,1404
789102,Endurance Foods,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404
789103,Endurance Foods,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404
789121,HydroBoost Nutrition,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404
789122,HydroBoost Nutrition,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404


In [0]:
# Confirm the schema as read back from the bronze table.
df_bronze.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)



In [0]:
# customer_ids that appear on more than one bronze row.
rows_per_customer = df_bronze.groupBy("customer_id").count()
df_duplicates = rows_per_customer.filter(F.col("count") > 1)
display(df_duplicates)

customer_id,count
789321,2
789503,2
789522,2
789603,2


In [0]:
# df_duplicates = df_bronze.groupBy("customer_id").count().filter(F.col("count") > 1)
# display(df_duplicates)

In [0]:
rows_before = df_bronze.count()
print("Rows before duplicates dropped:", rows_before)

# Keep one row per customer_id.
# NOTE(review): dropDuplicates does not define which of the duplicate rows is kept — confirm that is acceptable.
df_silver = df_bronze.dropDuplicates(["customer_id"])

rows_after = df_silver.count()
print("Rows after duplicates dropped:", rows_after)

Rows before duplicates dropped: 39
Rows after duplicates dropped: 35


In [0]:
# Rows whose customer_name differs from its trimmed form (leading/trailing whitespace).
name_has_padding = F.col("customer_name") != F.trim(F.col("customer_name"))
display(df_silver.where(name_has_padding))

customer_id,customer_name,city,read_timestamp,file_name,file_size
789121,HydroBoost Nutrition,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404
789401,SprintX nutrition,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404
789420,ZenAthlete foods,,2026-01-18T20:50:24.017Z,customers.csv,1404
789421,ZenAthlete Foods,Hyderbad,2026-01-18T20:50:24.017Z,customers.csv,1404
789521,PrimeFuel Nutrition,,2026-01-18T20:50:24.017Z,customers.csv,1404
789702,StaminaX Store,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404


In [0]:
# Strip leading and trailing whitespace from customer_name.
trimmed_name = F.trim(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", trimmed_name)

In [0]:
# Verification: after trimming, no row should still differ from its trimmed form.
still_padded = F.col("customer_name") != F.trim(F.col("customer_name"))
display(df_silver.where(still_padded))

customer_id,customer_name,city,read_timestamp,file_name,file_size


In [0]:
# List the distinct raw city spellings to see which ones need correcting.
display(df_silver.select("city").distinct())

city
Bengaluru
Hyderabad
New Delhi
Bengalore
Hyderabadd
""
Hyderbad
NewDelhee
NewDelhi
Bengaluruu


In [0]:
# Fix the typos: map each known misspelling to its canonical city name.
city_mapping = {
    "Bengaluruu": "Bengaluru",
    "Bengalore": "Bengaluru",
    "Hyderabadd": "Hyderabad",
    "Hyderbad": "Hyderabad",
    "NewDelhi": "New Delhi",
    "NewDheli": "New Delhi",
    "NewDelhee": "New Delhi",
}

# The only city values allowed to survive into silver.
allowed = ["Bengaluru", "New Delhi", "Hyderabad"]

df_city_corrected = df_silver.replace(city_mapping, subset=["city"])

# when() without otherwise() yields NULL for every non-matching row, which covers
# both cities outside the allowed list and cities that were already NULL.
df_silver = df_city_corrected.withColumn(
    "city",
    F.when(F.col("city").isin(allowed), F.col("city")),
)

In [0]:
# Only the allowed cities and NULL should remain.
display(df_silver.select("city").distinct())

city
Bengaluru
Hyderabad
New Delhi
""


In [0]:
# List the distinct customer names to spot case-only variants.
display(df_silver.select("customer_name").distinct())

customer_name
FitFuel Market
Athlete's Choice Store
Endurance Foods
HydroBoost Nutrition
MacroBite Superfoods
MacroBite superfoods
PowerSnack Hub
PowerSnack hub
SprintX nutrition
SprintX Nutrition


In [0]:
# Initial-cap every word of customer_name so case-only variants share one spelling.
# initcap() returns NULL for NULL input, so NULL names stay NULL.
df_silver = df_silver.withColumn(
    "customer_name",
    F.initcap(F.col("customer_name")),
)

In [0]:
# Each customer name should now appear with a single capitalisation.
display(df_silver.select("customer_name").distinct())

customer_name
Fitfuel Market
Athlete's Choice Store
Endurance Foods
Hydroboost Nutrition
Macrobite Superfoods
Powersnack Hub
Sprintx Nutrition
Zenathlete Foods
Peak Performance Store
Primefuel Nutrition


In [0]:
# Rows left without a city after normalisation.
city_missing = F.col("city").isNull()
df_silver.where(city_missing).show(truncate=False)

+-----------+-------------------+----+--------------------------+-------------+---------+
|customer_id|customer_name      |city|read_timestamp            |file_name    |file_size|
+-----------+-------------------+----+--------------------------+-------------+---------+
|789403     |Sprintx Nutrition  |NULL|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789420     |Zenathlete Foods   |NULL|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789521     |Primefuel Nutrition|NULL|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789603     |Recovery Lane      |NULL|2026-01-18 20:50:24.017016|customers.csv|1404     |
+-----------+-------------------+----+--------------------------+-------------+---------+



In [0]:
# Customers that have at least one row with a missing city; list all of their rows
# to see which cities each customer already covers.
null_customer_names = [
    "Sprintx Nutrition",
    "Recovery Lane",
    "Zenathlete Foods",
    "Primefuel Nutrition",
]
name_in_scope = F.col("customer_name").isin(null_customer_names)
df_silver.where(name_in_scope).show(truncate=False)

+-----------+-------------------+---------+--------------------------+-------------+---------+
|customer_id|customer_name      |city     |read_timestamp            |file_name    |file_size|
+-----------+-------------------+---------+--------------------------+-------------+---------+
|789401     |Sprintx Nutrition  |Bengaluru|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789402     |Sprintx Nutrition  |Hyderabad|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789403     |Sprintx Nutrition  |NULL     |2026-01-18 20:50:24.017016|customers.csv|1404     |
|789420     |Zenathlete Foods   |NULL     |2026-01-18 20:50:24.017016|customers.csv|1404     |
|789421     |Zenathlete Foods   |Hyderabad|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789422     |Zenathlete Foods   |New Delhi|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789520     |Primefuel Nutrition|Bengaluru|2026-01-18 20:50:24.017016|customers.csv|1404     |
|789521     |Primefuel Nutrition|NULL     |2026-01

In [0]:
# Manually chosen city for each customer_id whose city is missing.
customer_city_fix = {
    789403: "New Delhi",  # Sprintx Nutrition
    789603: "Hyderabad",  # Recovery Lane
    789420: "Bengaluru",  # Zenathlete Foods
    789521: "Hyderabad",  # Primefuel Nutrition
}

# Turn the mapping into a two-column lookup DataFrame for joining.
df_fix = spark.createDataFrame(
    list(customer_city_fix.items()),
    schema=["customer_id", "fixed_city"],
)

In [0]:
# Review the lookup rows before joining them in.
display(df_fix)

customer_id,fixed_city
789403,New Delhi
789603,Hyderabad
789420,Bengaluru
789521,Hyderabad


In [0]:
# Attach the manual fixes; customers without a fix get a NULL fixed_city.
df_with_fix = df_silver.join(df_fix, on="customer_id", how="left")

# Keep the existing city where present, otherwise fall back to the manual fix.
df_silver = df_with_fix.withColumn(
    "city",
    F.coalesce(F.col("city"), F.col("fixed_city")),
)

In [0]:
# Inspect the joined result; fixed_city is still present at this point.
display(df_silver)

customer_id,customer_name,city,read_timestamp,file_name,file_size,fixed_city
789101,Endurance Foods,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,
789102,Endurance Foods,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404,
789103,Endurance Foods,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404,
789121,Hydroboost Nutrition,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404,
789122,Hydroboost Nutrition,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404,
789201,Fitfuel Market,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,
789202,Fitfuel Market,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404,
789203,Fitfuel Market,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404,
789220,Macrobite Superfoods,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,
789221,Macrobite Superfoods,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404,


In [0]:
# fixed_city was only a helper for the coalesce above; remove it from the silver columns.
df_silver = df_silver.drop("fixed_city")

In [0]:
# Verification: no row should be left with a NULL city.
city_still_missing = F.col("city").isNull()
display(df_silver.where(city_still_missing))

customer_id,customer_name,city,read_timestamp,file_name,file_size


In [0]:
# Store customer_id as a string instead of the inferred integer.
customer_id_as_text = F.col("customer_id").astype("string")
df_silver = df_silver.withColumn("customer_id", customer_id_as_text)
df_silver.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)



In [0]:
# Spot-check a few rows after the customer_id cast.
df_silver.show(5)

+-----------+--------------------+---------+--------------------+-------------+---------+
|customer_id|       customer_name|     city|      read_timestamp|    file_name|file_size|
+-----------+--------------------+---------+--------------------+-------------+---------+
|     789503|Peak Performance ...|New Delhi|2026-01-18 20:50:...|customers.csv|     1404|
|     789420|    Zenathlete Foods|Bengaluru|2026-01-18 20:50:...|customers.csv|     1404|
|     789703|      Staminax Store|New Delhi|2026-01-18 20:50:...|customers.csv|     1404|
|     789621|Eliteathlete Nutr...|Hyderabad|2026-01-18 20:50:...|customers.csv|     1404|
|     789101|     Endurance Foods|Bengaluru|2026-01-18 20:50:...|customers.csv|     1404|
+-----------+--------------------+---------+--------------------+-------------+---------+
only showing top 5 rows


In [0]:
# Display label "<customer_name>-<city>", with "Unknown" standing in for a NULL city.
customer_label = F.concat_ws(
    "-",
    F.col("customer_name"),
    F.coalesce(F.col("city"), F.lit("Unknown")),
)
df_silver = df_silver.withColumn("customer", customer_label)

# Attributes that do not vary: the same literal value on every row.
for attribute, constant in (
    ("market", "India"),
    ("platform", "Sports Bar"),
    ("channel", "Acquisition"),
):
    df_silver = df_silver.withColumn(attribute, F.lit(constant))

In [0]:
# Review the finished silver rows, including the derived and constant columns.
display(df_silver)

customer_id,customer_name,city,read_timestamp,file_name,file_size,customer,market,platform,channel
789503,Peak Performance Store,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404,Peak Performance Store-New Delhi,India,Sports Bar,Acquisition
789420,Zenathlete Foods,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,Zenathlete Foods-Bengaluru,India,Sports Bar,Acquisition
789703,Staminax Store,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404,Staminax Store-New Delhi,India,Sports Bar,Acquisition
789621,Eliteathlete Nutrition,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404,Eliteathlete Nutrition-Hyderabad,India,Sports Bar,Acquisition
789101,Endurance Foods,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,Endurance Foods-Bengaluru,India,Sports Bar,Acquisition
789220,Macrobite Superfoods,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,Macrobite Superfoods-Bengaluru,India,Sports Bar,Acquisition
789720,Gameplan Foods,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,Gameplan Foods-Bengaluru,India,Sports Bar,Acquisition
789601,Recovery Lane,Bengaluru,2026-01-18T20:50:24.017Z,customers.csv,1404,Recovery Lane-Bengaluru,India,Sports Bar,Acquisition
789122,Hydroboost Nutrition,New Delhi,2026-01-18T20:50:24.017Z,customers.csv,1404,Hydroboost Nutrition-New Delhi,India,Sports Bar,Acquisition
789402,Sprintx Nutrition,Hyderabad,2026-01-18T20:50:24.017Z,customers.csv,1404,Sprintx Nutrition-Hyderabad,India,Sports Bar,Acquisition


In [0]:
# Persist the cleaned data as the silver Delta table, replacing previous contents.
silver_table = f"{catalog}.{silver_schema}.{data_source}"
(
    df_silver.write
    .format("delta")
    .option("delta.enableChangeDataFeed", True)
    .option("mergeSchema", True)
    .mode("overwrite")
    .saveAsTable(silver_table)
)

## Gold Processing

In [0]:
# Read the silver table back so the gold step starts from what was actually stored.
df_silver = spark.table(f"{catalog}.{silver_schema}.{data_source}")

In [0]:
# Keep only the required columns; the load-audit columns are left out of gold.
gold_columns = [
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
df_gold = df_silver.select(*gold_columns)

In [0]:
# Review the gold projection before writing it.
display(df_gold)

customer_id,customer_name,city,customer,market,platform,channel
789503,Peak Performance Store,New Delhi,Peak Performance Store-New Delhi,India,Sports Bar,Acquisition
789420,Zenathlete Foods,Bengaluru,Zenathlete Foods-Bengaluru,India,Sports Bar,Acquisition
789703,Staminax Store,New Delhi,Staminax Store-New Delhi,India,Sports Bar,Acquisition
789621,Eliteathlete Nutrition,Hyderabad,Eliteathlete Nutrition-Hyderabad,India,Sports Bar,Acquisition
789101,Endurance Foods,Bengaluru,Endurance Foods-Bengaluru,India,Sports Bar,Acquisition
789220,Macrobite Superfoods,Bengaluru,Macrobite Superfoods-Bengaluru,India,Sports Bar,Acquisition
789720,Gameplan Foods,Bengaluru,Gameplan Foods-Bengaluru,India,Sports Bar,Acquisition
789601,Recovery Lane,Bengaluru,Recovery Lane-Bengaluru,India,Sports Bar,Acquisition
789122,Hydroboost Nutrition,New Delhi,Hydroboost Nutrition-New Delhi,India,Sports Bar,Acquisition
789402,Sprintx Nutrition,Hyderabad,Sprintx Nutrition-Hyderabad,India,Sports Bar,Acquisition


In [0]:
# Persist the projection as the source-specific gold dimension (sb_dim_<data_source>).
gold_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
(
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(gold_table)
)

In [0]:
# Build the gold table names from the notebook parameters so this cell always
# reads the sb_dim table that the gold write cell produced and targets the dim
# table in the same catalog/schema. With the default widget values these resolve
# to fmcg.gold.dim_customers and fmcg.gold.sb_dim_customers.
gold_namespace = f"{catalog}.{gold_schema}"

# Merge target: the existing dim_<data_source> Delta table (it must already exist).
delta_table = DeltaTable.forName(spark, f"{gold_namespace}.dim_{data_source}")

# Merge source: the freshly written gold rows, with customer_id exposed as
# customer_code so it can be matched against the target's key column.
df_child_customers = spark.table(f"{gold_namespace}.sb_dim_{data_source}").select(
    col("customer_id").alias("customer_code"),
    "customer",
    "market",
    "platform",
    "channel"
)

In [0]:
# Upsert into the target table keyed on customer_code: rows that match are
# updated from the source, rows that do not match are inserted.
customer_merge = delta_table.alias("target").merge(
    source=df_child_customers.alias("source"),
    condition="target.customer_code = source.customer_code",
)
customer_merge.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]