In [0]:
%run ./00_setup_and_config

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, max as max_

Schema Definition

In [0]:
# Explicit schema for the raw retail CSV so Spark does not have to infer types.
# Every column is declared nullable. InvoiceDate is deliberately kept as a
# string at this stage; it is cast to a timestamp later in the notebook.
raw_data_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("InvoiceNo", StringType()),
        ("StockCode", StringType()),
        ("Description", StringType()),
        ("Quantity", IntegerType()),
        ("InvoiceDate", StringType()),
        ("UnitPrice", DoubleType()),
        ("CustomerID", StringType()),
        ("Country", StringType()),
    ]
])

Read Source Data

In [0]:
# Load the raw CSV with the explicit schema defined above. The first line of
# the file is treated as a header row.
try:
    df_raw = (
        spark.read
        .option("header", "True")
        .schema(raw_data_schema)
        .csv("/mnt/raw_data/OnlineRetail.csv")
    )
except Exception as read_error:
    # Log for the notebook output, then propagate so the run stops here.
    print("Error reading raw data: ", read_error)
    raise read_error


Watermark Handling

In [None]:
# Location of the Delta table holding the processing watermark.
# bronze_metadata_path is not defined in this notebook; presumably it comes
# from the setup notebook executed at the top -- TODO confirm.
watermark_path = f"{bronze_metadata_path}/watermark"

# No watermark is assumed until one is found; None means "no lower bound",
# so the incremental filter further down is skipped.
last_watermark = None

# The existence check goes through the JVM session catalog, addressing the
# table by path with the delta.`<path>` identifier form.
if spark._jsparkSession.catalog().tableExists(f"delta.`{watermark_path}`"):
    watermark_table = spark.read.format("delta").load(watermark_path)
    # Highest recorded timestamp; this is None when the table has no
    # non-null last_processed_ts values.
    last_watermark = (
        watermark_table
        .select(max_("last_processed_ts"))
        .collect()[0][0]
    )

Incremental Filter

In [None]:
# Add a timestamp-typed copy of InvoiceDate used only for watermark
# comparison; it is dropped again before the Bronze write.
# NOTE(review): values that Spark cannot cast to timestamp become NULL here
# and would then fail the "> last_watermark" comparison below -- confirm the
# source date format is castable.
df_raw = df_raw.withColumn("InvoiceDate_ts", col("InvoiceDate").cast("timestamp"))

# Only restrict to newer rows when a previous watermark was found.
if last_watermark:
    df_raw = df_raw.filter(
        col("InvoiceDate_ts") > last_watermark
    )

Write to Bronze (Append Only)

In [0]:
# Append the (possibly watermark-filtered) rows to the Bronze Delta table at
# raw_delta_path. The helper column InvoiceDate_ts exists only for the
# watermark comparison and is dropped so it is not persisted.
# raw_delta_path is not defined in this notebook; presumably it comes from the
# setup notebook executed at the top -- TODO confirm.
try:
    (
        df_raw
        .drop("InvoiceDate_ts")
        .write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .option("delta.enableChangeDataFeed", "true")
        .save(raw_delta_path)
    )
except Exception as e:
    print("Error writing raw data: ", e)
    # Propagate the failure. If it were swallowed, the notebook would carry on
    # and advance the watermark for rows that never reached Bronze, and the
    # next incremental run would skip them permanently.
    raise

Update Watermark

In [None]:
# Compute the new watermark as the latest InvoiceDate_ts in this batch.
# The aggregate yields a single row whose value is NULL when the batch is
# empty (or contains no parseable timestamps); that row is filtered out so a
# NULL watermark is never appended to the watermark table.
new_watermark = (
    df_raw
    .select(max_("InvoiceDate_ts").alias("last_processed_ts"))
    .filter(col("last_processed_ts").isNotNull())
)

# Append rather than overwrite: the reader takes max(last_processed_ts), so
# earlier entries remain as history without affecting the effective watermark.
try:
    (
        new_watermark
        .write
        .format("delta")
        .mode("append")
        .save(watermark_path)
    )
except Exception as e:
    print("Error updating watermark: ", e)
    # Propagate the failure so the run is reported as failed instead of
    # silently leaving a stale watermark behind.
    raise