In [0]:
from pyspark.sql import functions as F

# --- Pipeline configuration ---------------------------------------------------

# Name that identifies this pipeline's row in the state table.
PIPELINE_NAME = "ecomm_events_daily_replay"

# Raw Delta table that acts as the replay source.
RAW_TABLE = "workspace.default.raw_events_2019_dec"

# Table in which the pipeline's progress is remembered between runs.
STATE_TABLE = "monitoring.pipeline_state"

# Base folder of the landing zone; every run writes one new daily slice below it.
LANDING_BASE = "/Volumes/workspace/default/landing/events"


In [0]:
# Bootstrap the progress-tracking storage. Every statement below is guarded
# (IF NOT EXISTS / WHEN NOT MATCHED), so re-running this cell leaves existing
# objects and an existing state row untouched.

# Schema that holds the state table.
ensure_schema_sql = "CREATE DATABASE IF NOT EXISTS monitoring"

# Delta table with one row per pipeline: its name, the last released date,
# and when that row was last written.
ensure_state_table_sql = f"""
CREATE TABLE IF NOT EXISTS {STATE_TABLE} (
  pipeline_name STRING,
  last_released_date DATE,
  updated_at TIMESTAMP
)
USING DELTA
"""

# Seed a row for this pipeline only when none exists yet. The seed date is
# 2019-11-30, so the first run releases the following day.
seed_state_row_sql = f"""
MERGE INTO {STATE_TABLE} t
USING (SELECT '{PIPELINE_NAME}' AS pipeline_name) s
ON t.pipeline_name = s.pipeline_name
WHEN NOT MATCHED THEN
  INSERT (pipeline_name, last_released_date, updated_at)
  VALUES (s.pipeline_name, DATE('2019-11-30'), current_timestamp())
"""

spark.sql(ensure_schema_sql)
spark.sql(ensure_state_table_sql)
spark.sql(seed_state_row_sql)


In [0]:
# Look up this pipeline's progress marker and derive the next date to release.
# The next date is computed in the same query (date_add on the stored column),
# so no second Spark job and no string-built SQL are needed.
state = (spark.table(STATE_TABLE)
         .filter(F.col("pipeline_name") == PIPELINE_NAME)
         .select(
             "last_released_date",
             F.date_add(F.col("last_released_date"), 1).alias("next_date"),
         )
         .first())

# .first() returns None when no row matches, and a NULL date would flow into
# later cells as a None next_date. Fail here with an actionable message
# instead of an opaque "'NoneType' object is not subscriptable" or a
# misleading downstream result.
if state is None or state["last_released_date"] is None:
    raise RuntimeError(
        f"No usable state row for pipeline '{PIPELINE_NAME}' in {STATE_TABLE}: "
        "the row is missing or last_released_date is NULL. "
        "Run the state-initialisation cell first."
    )

last_released = state["last_released_date"]
next_date = state["next_date"]  # last_released + 1 day

print("last_released_date:", last_released)
print("next_date_to_release:", next_date)


In [0]:
# Handle on the raw source table.
raw = spark.table(RAW_TABLE)

# Probe for rows on the candidate date. Because of limit(1) the count can
# only be 0 or 1, which is all the yes/no answer needs.
matching_rows = raw.where(F.col("event_date") == F.lit(next_date)).limit(1).count()
exists = matching_rows > 0
print("date exists in raw table?", exists)

# Nothing in the raw table for the next date: report that the replay is done.
if not exists:
    print("✅ No more dates left to release. Pipeline is finished for December.")


In [0]:
%sql
-- Create the `landing` volume in workspace.default if it is not already there.
-- LANDING_BASE (/Volumes/workspace/default/landing/events) points inside this volume.
CREATE VOLUME IF NOT EXISTS workspace.default.landing;


In [0]:
if exists:
    # Destination folder for this day's slice: <LANDING_BASE>/event_date=<date>.
    target_path = f"{LANDING_BASE}/event_date={next_date}"

    # Rows of the day being released, stamped with the time of this replay.
    is_release_day = F.col("event_date") == F.lit(next_date)
    day_df = raw.where(is_release_day).withColumn("replay_ts", F.current_timestamp())

    # Overwrite mode replaces whatever an earlier run left in this day's
    # folder, so re-running for the same date does not append duplicates.
    day_df.write.parquet(target_path, mode="overwrite")

    # Counted on the source-side DataFrame (a separate Spark action from the write).
    released_rows = day_df.count()
    print(f"✅ Released {released_rows} rows to: {target_path}")


In [0]:
if exists:
    # Move this pipeline's progress marker to the date that was just released
    # and refresh the row's timestamp.
    advance_state_sql = f"""
      UPDATE {STATE_TABLE}
      SET last_released_date = DATE('{next_date}'),
          updated_at = current_timestamp()
      WHERE pipeline_name = '{PIPELINE_NAME}'
    """
    spark.sql(advance_state_sql)
    print("✅ State updated. Next run will release the next day.")


In [0]:
if exists:
    # Read the day's landing folder back and count its rows as a sanity check.
    landed_path = f"{LANDING_BASE}/event_date={next_date}"
    landed_df = spark.read.parquet(landed_path)
    sample = landed_df.count()  # note: holds a row count, not sample rows
    print("Landing rows (read back):", sample)
