# Bronze Ingestion – Raw E-commerce Events

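This notebook ingests raw e-commerce event data from the landing volume into the Bronze layer.
No business transformations are applied at this stage; only ingestion metadata columns (`bronze_ingest_ts`, `bronze_batch_date`) are added.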


In [0]:
import pyspark.sql.functions as F
from datetime import date, timedelta

# Resolve RUN_DATE from the notebook widget; an empty widget triggers a default.
dbutils.widgets.text("RUN_DATE", "")
RUN_DATE = dbutils.widgets.get("RUN_DATE")

if not RUN_DATE:
    # Best default for historical datasets: latest available date in Bronze.
    # NOTE(review): assumes Bronze exposes an `event_date` column — confirm.
    bronze_name = "workspace.default.bronze_events"
    latest_event_date = None

    # The Bronze table is created further down in this notebook, so it may
    # not exist yet on a first run; reading it unconditionally would raise.
    if spark.catalog.tableExists(bronze_name):
        latest_event_date = (
            spark.table(bronze_name)
                 .agg(F.max("event_date").alias("d"))
                 .collect()[0]["d"]
        )

    if latest_event_date is None:
        # max() over an empty table is NULL; str(None) would silently yield
        # the literal string "None", so fall back to yesterday instead.
        RUN_DATE = str(date.today() - timedelta(days=1))
        print("⚠️ No event_date found in Bronze; defaulting RUN_DATE to yesterday.")
    else:
        RUN_DATE = str(latest_event_date)

print("✅ RUN_DATE =", RUN_DATE)


In [0]:
from pyspark.sql import functions as F

# --- Pipeline configuration -------------------------------------------------

# Source: root of the landing volume; one `event_date=<date>` folder is read per run.
LANDING_BASE = "/Volumes/workspace/default/landing/events"

# Target: Delta table that receives the raw daily batches.
BRONZE_TABLE = "workspace.default.bronze_events"

# State tracking: this pipeline's row in the state table supplies `last_released_date`.
PIPELINE_NAME = "ecomm_events_daily_replay"
STATE_TABLE = "monitoring.pipeline_state"


In [0]:
# Fetch this pipeline's row from the state table to learn which day to ingest.
state = (
    spark.table(STATE_TABLE)
    .filter(F.col("pipeline_name") == PIPELINE_NAME)
    .select("last_released_date")
    .first()
)

# first() returns None when no row matches; fail with a clear message rather
# than an opaque "'NoneType' object is not subscriptable".
if state is None:
    raise ValueError(
        f"No row for pipeline_name={PIPELINE_NAME!r} in {STATE_TABLE}; "
        "cannot determine which day to ingest."
    )

released_date = state["last_released_date"]

# A NULL date would otherwise build the bogus path ".../event_date=None".
if released_date is None:
    raise ValueError(
        f"last_released_date is NULL for pipeline_name={PIPELINE_NAME!r} "
        f"in {STATE_TABLE}."
    )

print("Latest released_date:", released_date)

# The landing data is read from a per-day `event_date=<date>` folder.
landing_path = f"{LANDING_BASE}/event_date={released_date}"
print("Landing path:", landing_path)


In [0]:
# Load the released day's landing files, then stamp ingestion metadata:
#   bronze_ingest_ts  - timestamp of this ingestion run
#   bronze_batch_date - the released date, cast to DATE
day_df = spark.read.parquet(landing_path)
day_df = day_df.withColumn("bronze_ingest_ts", F.current_timestamp())
day_df = day_df.withColumn("bronze_batch_date", F.lit(released_date).cast("date"))

# count() is an action, so this line forces a read of the landing data.
landing_rows = day_df.count()
print("Landing rows:", landing_rows)


In [0]:
# Make sure the target schema is present before touching the table.
spark.sql("CREATE DATABASE IF NOT EXISTS workspace.default")

# Bootstrap Bronze on first use: write zero rows so that only day_df's schema
# is materialised; an existing table is left untouched.
bronze_exists = spark.catalog.tableExists(BRONZE_TABLE)
if bronze_exists:
    print("✅ Bronze table already exists:", BRONZE_TABLE)
else:
    schema_only_df = day_df.limit(0)
    schema_only_df.write.format("delta").mode("overwrite").saveAsTable(BRONZE_TABLE)
    print("✅ Created Bronze table:", BRONZE_TABLE)


In [0]:
# Idempotency guard: probe Bronze for at least one row carrying this batch date.
same_day_probe = (
    spark.table(BRONZE_TABLE)
    .where(F.col("bronze_batch_date") == F.lit(released_date))
    .limit(1)
)
existing = same_day_probe.count() > 0

print("Already ingested this day into Bronze?", existing)

if existing:
    # Re-running the notebook for the same released date must not duplicate rows.
    print("⚠️ Skipped append (day already present in Bronze).")
else:
    append_writer = day_df.write.format("delta").mode("append")
    append_writer.saveAsTable(BRONZE_TABLE)
    print("✅ Appended to Bronze.")


In [0]:
# Post-load sanity check on the Bronze table.
bronze = spark.table(BRONZE_TABLE)

# Overall size: total row count and number of distinct batch dates loaded.
totals = bronze.select(
    F.count("*").alias("total_rows"),
    F.countDistinct("bronze_batch_date").alias("days_loaded"),
)
totals.show()

# Row counts for the five most recent batch dates, newest first.
rows_per_day = (
    bronze.groupBy("bronze_batch_date")
    .count()
    .orderBy(F.desc("bronze_batch_date"))
)
rows_per_day.show(5, truncate=False)


In [0]:
from datetime import date, timedelta

# NOTE(review): the first cell of this notebook also defines the RUN_DATE
# widget, but defaults to the latest Bronze event_date; this cell overwrites
# RUN_DATE with a "yesterday" default instead — confirm which one is intended.
dbutils.widgets.text("RUN_DATE", "")

# An empty widget value is falsy, so `or` supplies the default (yesterday).
RUN_DATE = dbutils.widgets.get("RUN_DATE") or str(date.today() - timedelta(days=1))

print("✅ RUN_DATE =", RUN_DATE)
