# Bronze Ingestion – Raw E-commerce Events

This notebook ingests raw e-commerce event data into the Bronze layer, one event date per run.
No transformations are applied to the event rows at this stage.


In [0]:
# Table identifiers used by this notebook.
# SOURCE_TABLE: where the raw events are read from.
# BRONZE_TABLE: where the ingested events are written.
# STATE_TABLE:  small bookkeeping table holding the last processed date.
SOURCE_TABLE = "workspace.default.events"
BRONZE_TABLE = "workspace.default.bronze_events"
STATE_TABLE = "workspace.default.bronze_ingestion_state"


In [0]:
# Spark functions + Python date helpers
from pyspark.sql import functions as F
from datetime import timedelta


In [0]:
# Load source events and derive event_date (calendar day of event_time),
# which is the unit used for daily chunking.
df_src = spark.table(SOURCE_TABLE).withColumn("event_date", F.to_date("event_time"))

# Date range of the source, computed in ONE aggregation (a single Spark job
# instead of one job per bound). An ungrouped agg always yields exactly one
# row, so first() is safe here.
date_bounds = df_src.agg(
    F.min("event_date").alias("min_date"),
    F.max("event_date").alias("max_date"),
).first()
min_date = date_bounds["min_date"]
max_date = date_bounds["max_date"]

print("Min event_date:", min_date)
print("Max event_date:", max_date)

# min/max come back as None when there is no non-null event_date (empty
# source, or event_time values that to_date cannot parse). Later date
# arithmetic and comparisons would then fail with an opaque TypeError, so
# fail here with an explicit message instead.
if min_date is None or max_date is None:
    raise ValueError(
        f"{SOURCE_TABLE} has no rows with a valid event_time; "
        "cannot determine the date range to ingest."
    )


In [0]:
# Make sure the state table exists; it holds a single DATE column with the
# last day that was ingested.
state_ddl = f"""
CREATE TABLE IF NOT EXISTS {STATE_TABLE} (
  last_processed_date DATE
)
USING delta
"""
spark.sql(state_ddl)

# An empty state table means nothing has been processed yet: seed it with the
# day before the earliest source date so the first run picks up min_date.
state_count = spark.table(STATE_TABLE).count()

if state_count == 0:
    init_last = min_date - timedelta(days=1)
    seed_df = spark.createDataFrame([(init_last,)], ["last_processed_date"])
    seed_df.write.format("delta").mode("overwrite").saveAsTable(STATE_TABLE)

# Pull the stored value back to the driver as a Python date
state_rows = spark.table(STATE_TABLE).select("last_processed_date").collect()
last_processed = state_rows[0]["last_processed_date"]
print("Last processed:", last_processed)


In [0]:
# Each run handles exactly one day: the day after the last processed one.
one_day = timedelta(days=1)
next_date = last_processed + one_day
print("Next date:", next_date, "| Max date:", max_date)

# Stop early (via notebook exit) once we are past the newest date in the source.
has_unprocessed_day = next_date <= max_date
if not has_unprocessed_day:
    dbutils.notebook.exit("No new dates to process.")


In [0]:
# Restrict the source to the single day being ingested in this run
is_next_date = F.col("event_date") == F.lit(next_date)
df_day = df_src.where(is_next_date)

day_count = df_day.count()
print("Rows for next_date:", day_count)

# A day without any events: record it as processed so the next run moves on,
# then end the notebook.
if day_count == 0:
    empty_day_state = spark.createDataFrame([(next_date,)], ["last_processed_date"])
    empty_day_state.write.format("delta").mode("overwrite").saveAsTable(STATE_TABLE)
    dbutils.notebook.exit(f"No rows for {next_date}, state advanced.")


In [0]:
# Write this day's rows to the Bronze Delta table via saveAsTable.
# The replaceWhere predicate scopes the overwrite to this date, which is what
# keeps a rerun for the same day from duplicating rows.
day_predicate = f"to_date(event_time) = '{next_date}'"

# event_date was only a helper for chunking; drop it so bronze keeps the raw schema
bronze_rows = df_day.drop("event_date")

bronze_writer = bronze_rows.write.format("delta").mode("overwrite")
bronze_writer = bronze_writer.option("replaceWhere", day_predicate)
bronze_writer.saveAsTable(BRONZE_TABLE)

print(f"✅ Bronze written for date: {next_date} into {BRONZE_TABLE}")


In [0]:
# Record next_date as the last processed day by overwriting the state table
# with a single-row DataFrame.
completed_state = spark.createDataFrame([(next_date,)], ["last_processed_date"])
completed_state.write.format("delta").mode("overwrite").saveAsTable(STATE_TABLE)

print("✅ State updated. Last processed is now:", next_date)


In [0]:
# Quick check: confirm bronze has rows for the date that was just processed
bronze_day = spark.table(BRONZE_TABLE).filter(F.to_date("event_time") == F.lit(next_date))
bronze_day_rows = bronze_day.count()
print("Bronze rows for processed date:", bronze_day_rows)

# Preview the first 20 events of the processed date. Sorting the whole bronze
# table would always show the earliest events ever ingested rather than the
# day being verified, and would require a sort over the entire table.
display(bronze_day.orderBy(F.col("event_time")).limit(20))
