# Bronze Ingestion – Raw E-commerce Events

This notebook ingests raw e-commerce event data into the Bronze layer.
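No business transformations are applied at this stage; the only additions are an `event_date` column set to the run date and a `bronze_ingest_ts` ingestion timestamp.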


In [0]:
import pyspark.sql.functions as F

from datetime import datetime

# Optional notebook parameter. Left empty, the run date is taken from the
# pipeline state table further down; set, it overrides that date.
dbutils.widgets.text("RUN_DATE", "")
RUN_DATE = dbutils.widgets.get("RUN_DATE").strip()

# RUN_DATE is interpolated verbatim into the landing path and into the
# replaceWhere SQL predicate later in this notebook, so reject anything that
# is not a plain calendar date before it can reach either of them.
if RUN_DATE:
    try:
        datetime.strptime(RUN_DATE, "%Y-%m-%d")
    except ValueError as err:
        raise ValueError(
            f"RUN_DATE must be a date in YYYY-MM-DD format, got {RUN_DATE!r}"
        ) from err

print("RUN_DATE (optional override) =", RUN_DATE if RUN_DATE else "<empty>")


In [0]:
# Value matched against the `pipeline_name` column when looking up this
# pipeline's row in STATE_TABLE.
PIPELINE_NAME  = "ecomm_events_daily_replay"

# Table read to obtain `last_released_date` for the pipeline.
STATE_TABLE    = "monitoring.pipeline_state"

# Root directory of the landing data; one `event_date=<date>` subdirectory
# is read (as Parquet) per run.
LANDING_BASE   = "/Volumes/workspace/default/landing/events"

# Fully qualified name of the Delta table this notebook writes to.
BRONZE_TABLE   = "workspace.default.bronze_events"


In [0]:
# Look up the last released date recorded for this pipeline.
state = (spark.table(STATE_TABLE)
         .filter(F.col("pipeline_name") == PIPELINE_NAME)
         .select("last_released_date")
         .first())

# first() returns None when no row matches, and the column itself may be
# NULL. Treat both as "no released date" instead of crashing on the
# subscript or silently producing the string "None".
last_released = state["last_released_date"] if state is not None else None
released_date = str(last_released) if last_released is not None else None

# Without an explicit override there is nothing to run for; fail with a
# message that names the pipeline and table rather than an opaque TypeError.
if not RUN_DATE and released_date is None:
    raise ValueError(
        f"No last_released_date found in {STATE_TABLE} for pipeline "
        f"{PIPELINE_NAME!r} and no RUN_DATE override was provided"
    )

# The widget override, when set, wins over the state table.
run_date = RUN_DATE if RUN_DATE else released_date

print("released_date =", released_date)
print("✅ run_date used =", run_date)


In [0]:
# Build the directory for the selected run date and load it as Parquet.
landing_path = "{}/event_date={}".format(LANDING_BASE, run_date)
print("landing_path =", landing_path)

landing_df = spark.read.format("parquet").load(landing_path)


In [0]:
# Columns stamped onto every landing row: the run date (as a DATE) and the
# timestamp at which the row was ingested into Bronze.
event_date_col = F.lit(run_date).cast("date")
ingest_ts_col = F.current_timestamp()

bronze_df = landing_df.withColumn("event_date", event_date_col)
bronze_df = bronze_df.withColumn("bronze_ingest_ts", ingest_ts_col)


In [0]:
# Make sure the target schema and the Delta table exist before writing.
# Both statements are no-ops when the objects are already present.
for ddl_statement in (
    "CREATE DATABASE IF NOT EXISTS workspace.default",
    f"CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} USING DELTA",
):
    spark.sql(ddl_statement)


In [0]:
# Re-runnable write: overwrite mode combined with replaceWhere replaces only
# the rows matching the predicate (this run's event_date) and leaves the
# other dates in the table untouched.
(
  bronze_df.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", f"event_date = DATE('{run_date}')")
    .saveAsTable(BRONZE_TABLE)
)

# Report what was actually committed for this date. Counting bronze_df here
# would re-execute the whole landing read a second time and would describe
# the source rather than the table.
written_rows = (
    spark.table(BRONZE_TABLE)
    .filter(F.col("event_date") == F.lit(run_date).cast("date"))
    .count()
)

print("✅ Bronze written for", run_date, "rows =", written_rows)


In [0]:
# Sanity check: row count per event_date currently in the Bronze table,
# showing up to 50 dates without truncating the values.
daily_counts_sql = f"""
SELECT event_date, COUNT(*) AS rows
FROM {BRONZE_TABLE}
GROUP BY event_date
ORDER BY event_date
"""
spark.sql(daily_counts_sql).show(n=50, truncate=False)
