In [0]:
%run ./00_setup_and_config

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Read the bronze Delta table through its Change Data Feed rather than as a
# snapshot, so each returned row describes one change and carries the
# `_change_type` metadata column used by the cells below.
# `startingVersion` is pinned to 0, so every run re-reads the complete change
# history of the table from its first version.
# NOTE(review): `raw_delta_path` is not defined in this notebook; presumably it
# comes from the `%run ./00_setup_and_config` cell - confirm.
bronze_cdf_reader = (
    spark.read
    .format("delta")
    .option("readChangeData", "true")
    .option("startingVersion", 0)
)
df_bronze_cdf = bronze_cdf_reader.load(raw_delta_path)

In [0]:
# Keep only the change rows that the silver MERGE acts on: new rows, the
# post-update image of changed rows, and deletes. Rows with any other
# `_change_type` value (e.g. the pre-update image) are discarded.
applied_change_types = ["insert", "update_postimage", "delete"]

df_bronze_changes = df_bronze_cdf.where(
    F.col("_change_type").isin(applied_change_types)
)

Clean, Enforce Schema & Filter Invalid Rows

In [0]:
# Row-level validity rules: both the quantity and the unit price must be
# strictly positive. A NULL in either column (including one produced by a
# failed cast) does not satisfy the predicate, so such rows are removed too.
has_positive_quantity = F.col("Quantity") > 0
has_positive_price = F.col("UnitPrice") > 0

# Build the silver staging frame from the selected change rows:
#   1. drop rows missing either key column (InvoiceNo, CustomerID),
#   2. cast Quantity / UnitPrice / CustomerID to their numeric types,
#   3. parse InvoiceDate from the "M/d/yyyy H:mm" text layout,
#   4. keep only rows that pass the validity rules above.
df_silver_stage = (
    df_bronze_changes
    .dropna(subset=["InvoiceNo", "CustomerID"])
    .withColumn("Quantity", F.col("Quantity").cast("int"))
    .withColumn("UnitPrice", F.col("UnitPrice").cast("double"))
    .withColumn("CustomerID", F.col("CustomerID").cast("int"))
    .withColumn("InvoiceDate", F.to_timestamp(F.col("InvoiceDate"), "M/d/yyyy H:mm"))
    .where(has_positive_quantity & has_positive_price)
)

Deduplication (one row per InvoiceNo / StockCode / CustomerID)

In [0]:
# Deduplicate the staged change rows so that exactly one row survives per
# (InvoiceNo, StockCode, CustomerID) - the same key the MERGE below joins on.
#
# The change feed is read from version 0, so a single record can appear many
# times (insert, later updates, a delete). All of those rows normally share
# the same InvoiceDate, so ordering by InvoiceDate alone leaves the winner
# arbitrary and can let a stale change overwrite a newer one. Rank by the
# Delta commit version first (newest commit wins) and use InvoiceDate only as
# a tie-breaker between rows written by the same commit.
window_spec = (
    Window
    .partitionBy("InvoiceNo", "StockCode", "CustomerID")
    .orderBy(
        F.col("_commit_version").desc(),
        F.col("InvoiceDate").desc(),
    )
)

# Keep the top-ranked (most recent) change per key and discard the helper
# ranking column again.
df_silver_stage = (
    df_silver_stage
    .withColumn("rn", F.row_number().over(window_spec))
    .filter(F.col("rn") == 1)
    .drop("rn")
)

Feature Engineering

In [0]:
# Derived columns added to the staging frame:
#   TotalPrice   - line value, Quantity * UnitPrice
#   Invoice_Date - calendar date part of the InvoiceDate timestamp
#   Year, Month  - calendar year / month number of InvoiceDate
invoice_ts = F.col("InvoiceDate")
line_total = F.col("Quantity") * F.col("UnitPrice")

df_silver_stage = (
    df_silver_stage
    .withColumn("TotalPrice", line_total)
    .withColumn("Invoice_Date", F.to_date(invoice_ts))
    .withColumn("Year", F.year(invoice_ts))
    .withColumn("Month", F.month(invoice_ts))
)

MERGE into Silver (Upserts + Deletes)

In [0]:
# Expose the staged changes to Spark SQL under the name `silver_stage`.
df_silver_stage.createOrReplaceTempView("silver_stage")

# Apply the staged changes to the silver Delta table, matching rows on
# (InvoiceNo, StockCode, CustomerID):
#   * matched + delete change      -> remove the silver row
#   * matched + any other change   -> overwrite the silver row
#   * unmatched + non-delete change -> insert a new silver row
# An unmatched delete change is skipped on purpose: the record does not exist
# in silver, and inserting it would resurrect a row that was deleted upstream.
# NOTE(review): `src` still carries the change-feed metadata columns (e.g.
# `_change_type`); if schema auto-merge is enabled, `SET *` / `INSERT *` may
# add them to the silver table - confirm this is intended.
# NOTE(review): `silver_delta_path` is not defined in this notebook;
# presumably it comes from the `%run ./00_setup_and_config` cell - confirm.
spark.sql(f"""
MERGE INTO delta.`{silver_delta_path}` tgt
USING silver_stage src
ON tgt.InvoiceNo = src.InvoiceNo
AND tgt.StockCode = src.StockCode
AND tgt.CustomerID = src.CustomerID

WHEN MATCHED AND src._change_type = 'delete'
  THEN DELETE

WHEN MATCHED
  THEN UPDATE SET *

WHEN NOT MATCHED AND src._change_type != 'delete'
  THEN INSERT *
""")
    