In [0]:
import pyspark.sql.functions as F
from datetime import date, timedelta

# Expose RUN_DATE as a notebook widget; an empty value means "derive it".
dbutils.widgets.text("RUN_DATE", "")
RUN_DATE = dbutils.widgets.get("RUN_DATE")

if not RUN_DATE:
    # Best default for historical datasets: latest available date in Bronze
    latest_bronze_date = (
        spark.table("workspace.default.bronze_events")
             .agg(F.max("event_date").alias("d"))
             .collect()[0]["d"]
    )
    # max() over zero rows collects as None; str(None) would turn RUN_DATE
    # into the literal text "None" and poison every downstream filter.
    if latest_bronze_date is None:
        raise ValueError(
            "RUN_DATE widget is empty and workspace.default.bronze_events has no "
            "event_date values to derive a default from"
        )
    RUN_DATE = str(latest_bronze_date)

print("✅ RUN_DATE =", RUN_DATE)


In [0]:
import pyspark.sql.functions as F
from datetime import date, timedelta

dbutils.widgets.text("RUN_DATE", "")

# NOTE(review): this re-reads the widget, so any RUN_DATE assigned earlier in
# the session is replaced. An empty widget falls back to the fixed testing
# date (the sample data is dated 2019-12-01).
RUN_DATE = dbutils.widgets.get("RUN_DATE") or "2019-12-01"

print("✅ RUN_DATE =", RUN_DATE)


In [0]:
# Three-part table names referenced by the metric-collection, write and
# read-back cells below.

# Source layers whose per-day row counts and event-time ranges are collected.
BRONZE_TABLE = "workspace.default.bronze_events"
SILVER_TABLE = "workspace.default.silver_events"
# Gold aggregates: collected with row counts only (no event-time column is passed).
GOLD_DAILY_TABLE = "workspace.default.gold_daily_funnel"
GOLD_PRODUCT_TABLE = "workspace.default.gold_product_funnel_daily"
# Destination table that the collected metrics are written to.
MONITOR_TABLE = "workspace.default.pipeline_monitoring"


In [0]:
# Create the monitoring table on first run. The name comes from MONITOR_TABLE
# so the DDL cannot drift from the table that is later written and read back.
# Partitioned by run_date: one partition per monitored day.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {MONITOR_TABLE} (
  run_date DATE,
  layer STRING,
  table_name STRING,
  rows BIGINT,
  min_event_time TIMESTAMP,
  max_event_time TIMESTAMP,
  inserted_ts TIMESTAMP
)
USING DELTA
PARTITIONED BY (run_date)
""")

print("✅ Monitoring table ready")


In [0]:
import pyspark.sql.functions as F

def collect_metrics(table_name, layer_name, date_col, time_col=None):
    """Build a one-row DataFrame of monitoring metrics for the global RUN_DATE.

    Args:
        table_name: Name of the table to read via ``spark.table``.
        layer_name: Label stored in the ``layer`` column.
        date_col: Column compared against ``RUN_DATE`` to select the day's rows.
        time_col: Optional column whose min/max are reported. When falsy,
            ``min_event_time`` and ``max_event_time`` are NULL timestamps.

    Returns:
        DataFrame with columns ``run_date``, ``layer``, ``table_name``, ``rows``,
        ``min_event_time``, ``max_event_time``, ``inserted_ts`` (in that order).
    """
    day_rows = spark.table(table_name).filter(F.col(date_col) == F.lit(RUN_DATE))
    row_count = F.count("*").alias("rows")

    if not time_col:
        # No event-time column for this table: keep the schema, fill with NULLs.
        null_ts = F.lit(None).cast("timestamp")
        metrics = (
            day_rows.agg(row_count)
            .withColumn("min_event_time", null_ts)
            .withColumn("max_event_time", null_ts)
        )
    else:
        metrics = day_rows.agg(
            row_count,
            F.min(F.col(time_col)).alias("min_event_time"),
            F.max(F.col(time_col)).alias("max_event_time"),
        )

    # Attach the descriptive columns and fix the output column order in one pass.
    return metrics.select(
        F.lit(RUN_DATE).cast("date").alias("run_date"),
        F.lit(layer_name).alias("layer"),
        F.lit(table_name).alias("table_name"),
        "rows",
        "min_event_time",
        "max_event_time",
        F.current_timestamp().alias("inserted_ts"),
    )


In [0]:
# Bronze and Silver report an event-time range; the Gold tables are collected
# with row counts only (no event_time column is passed).
bronze_metrics = collect_metrics(BRONZE_TABLE, "bronze", "event_date", "event_time")
silver_metrics = collect_metrics(SILVER_TABLE, "silver", "event_date", "event_time")
gold_daily_metrics = collect_metrics(GOLD_DAILY_TABLE, "gold_daily", "event_date")
gold_product_metrics = collect_metrics(GOLD_PRODUCT_TABLE, "gold_product", "event_date")

# Stack the four one-row results into a single frame, matching columns by name.
monitor_df = (
    bronze_metrics
    .unionByName(silver_metrics)
    .unionByName(gold_daily_metrics)
    .unionByName(gold_product_metrics)
)

display(monitor_df.orderBy("layer"))


In [0]:
# Overwrite only the rows matching this run date (Delta replaceWhere), leaving
# the other run dates in the monitoring table untouched.
run_date_predicate = f"run_date = DATE('{RUN_DATE}')"

monitor_writer = (
    monitor_df.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", run_date_predicate)
)
monitor_writer.saveAsTable(MONITOR_TABLE)

print(f"✅ Monitoring updated for {RUN_DATE}")


In [0]:
# Read back the rows stored for this run date, ordered by layer.
is_current_run = F.col("run_date") == F.lit(RUN_DATE).cast("date")
stored_metrics = spark.table(MONITOR_TABLE).filter(is_current_run)

display(stored_metrics.orderBy("layer"))
