# DAY 6 : Medallion Architecture

In [0]:
from pyspark.sql import functions as F

# Location of the raw event export (CSV)
raw_path = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"

# BRONZE: raw ingestion
# Every column is kept as-is, including those containing nulls. The first row
# supplies the column names and Spark infers the column types from the data.
bronze_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(raw_path)
)

# Persist the raw snapshot as the Bronze Delta table, replacing any earlier load
(
    bronze_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.bronze_events")
)

print("Bronze Layer: Raw data ingested.")

Bronze Layer: Raw data ingested.


In [0]:
# Read the raw events back from the Bronze table
bronze_data = spark.read.table("workspace.ecommerce.bronze_events")

# SILVER: cleaning, one named step per rule
# 1. Drop rows that are missing user_id or product_id
rows_with_keys = bronze_data.na.drop(subset=["user_id", "product_id"])

# 2. Convert event_time to an actual Timestamp column
rows_with_ts = rows_with_keys.withColumn(
    "event_time", F.to_timestamp(F.col("event_time"))
)

# 3. Default missing category_code to 'Unknown' and missing brand to 'Generic'
rows_filled = rows_with_ts.na.fill({"category_code": "Unknown", "brand": "Generic"})

# 4. Remove rows that are exact duplicates across all columns
silver_df = rows_filled.dropDuplicates()

# Persist the cleaned events as the Silver Delta table, replacing any earlier run
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.silver_events")
)

print("Silver Layer: Data cleaned and validated.")

Silver Layer: Data cleaned and validated.


In [0]:
# Load from Silver
silver_data = spark.read.table("workspace.ecommerce.silver_events")

# GOLD: Aggregations (Daily Revenue per Brand)
# Only purchase events contribute to revenue. Events are bucketed into
# 1-day tumbling windows on event_time and grouped by brand.
# No orderBy here: row order is not guaranteed when a table is read back,
# so sorting before the write would only add a full global sort for nothing.
gold_agg = (
    silver_data
    .filter(F.col("event_type") == "purchase")
    .groupBy(F.window("event_time", "1 day"), "brand")
    .agg(
        F.sum("price").alias("daily_revenue"),
        F.count("product_id").alias("total_sales"),
    )
)

# Save to Gold Table
(
    gold_agg.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.gold_brand_performance")
)

# Re-bind gold_df to the persisted table (same columns, highest revenue first).
# Displaying from the saved table avoids re-running the whole
# filter/aggregate lineage from Silver a second time just to preview it.
gold_df = (
    spark.read.table("workspace.ecommerce.gold_brand_performance")
    .orderBy(F.desc("daily_revenue"))
)

display(gold_df.limit(10))
print("Gold Layer: Business aggregates built.")

window,brand,daily_revenue,total_sales
"List(2019-10-16T00:00:00.000Z, 2019-10-17T00:00:00.000Z)",apple,4699540.539999997,6268
"List(2019-10-14T00:00:00.000Z, 2019-10-15T00:00:00.000Z)",apple,4693428.49,5940
"List(2019-10-17T00:00:00.000Z, 2019-10-18T00:00:00.000Z)",apple,4499888.089999997,5826
"List(2019-10-15T00:00:00.000Z, 2019-10-16T00:00:00.000Z)",apple,4291395.66,5372
"List(2019-10-04T00:00:00.000Z, 2019-10-05T00:00:00.000Z)",apple,4165188.340000003,5427
"List(2019-10-18T00:00:00.000Z, 2019-10-19T00:00:00.000Z)",apple,3975917.340000001,5039
"List(2019-10-22T00:00:00.000Z, 2019-10-23T00:00:00.000Z)",apple,3967597.370000005,5084
"List(2019-10-21T00:00:00.000Z, 2019-10-22T00:00:00.000Z)",apple,3955075.849999999,5055
"List(2019-10-13T00:00:00.000Z, 2019-10-14T00:00:00.000Z)",apple,3751495.920000002,4779
"List(2019-10-11T00:00:00.000Z, 2019-10-12T00:00:00.000Z)",apple,3679462.900000005,4682


Gold Layer: Business aggregates built.
