Setup Spark & Delta Lake

In [None]:
from pathlib import Path
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, datediff, avg, count
from pyspark.sql.functions import sum as _sum
from pyspark.sql.window import Window

# Build a Spark session with the Delta Lake SQL extension and catalog enabled.
# NOTE(review): the app name says "Bronze Layer Ingestion" although this
# notebook reads the silver layer and writes the gold layer -- confirm whether
# the name is a copy-paste leftover before renaming (monitoring may key on it).
builder = (
    SparkSession.builder
    .appName("Bronze Layer Ingestion")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Let the delta helper finish configuring the builder, then start (or reuse)
# the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

Cumulative Sales per Customer

In [None]:
# Roots of the Delta lake: inputs come from silver (not bronze), outputs go to gold.
silver_path = Path("../delta/silver")
gold_path = Path("../delta/gold")

# Silver tables reused by the later cells of this notebook.
orders = spark.read.format("delta").load(str(silver_path / "orders"))
orders_items = spark.read.format("delta").load(str(silver_path / "orders_items"))

# Attach customer_id to each order item. The left join keeps items whose
# order_id has no match in orders (their customer_id is then null); the
# duplicated order_id coming from the orders side is dropped afterwards.
order_customers = orders.select("order_id", "customer_id").alias("o")
orders_items_with_customer = (
    orders_items.alias("oi")
    .join(
        order_customers,
        on=col("oi.order_id") == col("o.order_id"),
        how="left",
    )
    .drop(col("o.order_id"))
)

# Running total of total_price per customer, from the first row of the
# partition up to the current row.
# NOTE(review): rows are ordered by order_id only; if several items share an
# order_id their relative order inside the window is presumably not
# deterministic -- confirm whether a tie-breaker column is needed.
window_spec = (
    Window.partitionBy("customer_id")
    .orderBy("order_id")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

cumulative_sales = orders_items_with_customer.withColumn(
    "cumulative_total_price",
    _sum("total_price").over(window_spec),
)

# Persist the result in the gold layer, replacing any previous version.
customer_sales_target = str(gold_path / "customer_sales")
cumulative_sales.write.format("delta").mode("overwrite").save(customer_sales_target)

Rolling Average Delivery Time per Product Category

In [None]:
# Bring the per-order `days` column onto every order item (left join keeps
# items without a matching order).
order_days = orders.select("order_id", "days").alias("o")
orders_items_with_days = orders_items.alias("oi").join(
    order_days,
    on="order_id",
    how="left",
)

# Rolling average of `days` per product category over a 3-row window:
# the current row plus the two rows before it.
# NOTE(review): the window is ordered by order_id only, so rows sharing an
# order_id presumably have no guaranteed relative order -- confirm.
window_spec = (
    Window.partitionBy("product_category_name")
    .orderBy("order_id")
    .rowsBetween(-2, Window.currentRow)
)

rolling_avg_delivery = orders_items_with_days.withColumn(
    "rolling_avg_days",
    avg("days").over(window_spec),
)

# Persist the result in the gold layer as a Delta table, replacing any
# previous version.
rolling_avg_target = str(gold_path / "product_category_rolling_avg_delivery")
rolling_avg_delivery.write.format("delta").mode("overwrite").save(rolling_avg_target)

KPI Summary Table

In [None]:
# Total sales per product category.
# Use Spark's aggregate, imported as `_sum` at the top of the notebook.
# Python's builtin `sum` would try to add up the characters of the column
# name and raise TypeError instead of building a Spark aggregation.
total_sales_by_category = (
    orders_items
    .groupBy("product_category_name")
    .agg(_sum("total_price").alias("total_sales"))
)

# Persist the KPI in the gold layer, replacing any previous version.
(
    total_sales_by_category.write.format("delta")
    .mode("overwrite")
    .save(str(gold_path / "kpi_total_sales_per_category"))
)

# Average delivery time per seller.
# `days` lives on the orders table, so aggregate over the order items that
# were already joined with it (orders_items_with_days from the rolling-average
# cell); the bare orders_items frame has no `days` column to average.
# NOTE(review): assumes seller_id is a column of the silver orders_items
# table -- confirm against the silver schema.
avg_delivery_by_seller = (
    orders_items_with_days
    .groupBy("seller_id")
    .agg(avg("days").alias("avg_delivery_days"))
)

# Persist the KPI in the gold layer, replacing any previous version.
(
    avg_delivery_by_seller.write.format("delta")
    .mode("overwrite")
    .save(str(gold_path / "kpi_avg_delivery_per_seller"))
)

# Number of orders per customer state (count of non-null order_id values).
order_counts_by_state = (
    orders
    .groupBy("customer_state")
    .agg(count("order_id").alias("order_count"))
)

# Persist the KPI in the gold layer, replacing any previous version.
order_counts_target = str(gold_path / "kpi_order_counts_per_state")
order_counts_by_state.write.format("delta").mode("overwrite").save(order_counts_target)

Reporting Queries

In [None]:
# Load each KPI table from the gold layer and register it as a temp view.
# Keys are the view names used by the reporting queries; values are the
# directory names the KPI cell wrote under gold_path. The state table is
# written as "kpi_order_counts_per_state" (plural "counts"), so it must be
# read back from that same path.
gold_views = {
    "total_sales_per_category": "kpi_total_sales_per_category",
    "avg_delivery_per_seller": "kpi_avg_delivery_per_seller",
    "order_counts_per_state": "kpi_order_counts_per_state",
}

for view_name, table_dir in gold_views.items():
    spark.read.format("delta") \
        .load(str(gold_path / table_dir)) \
        .createOrReplaceTempView(view_name)

In [None]:
# Display the contents of every registered KPI view, in the same order as
# they were registered.
report_queries = (
    "SELECT * FROM total_sales_per_category",
    "SELECT * FROM avg_delivery_per_seller",
    "SELECT * FROM order_counts_per_state",
)

for query in report_queries:
    spark.sql(query).show()