In [0]:
from pyspark.sql import functions as F

# Fully qualified names of the input (Silver) and output (Gold) tables.
SILVER_TABLE = "workspace.default.silver_events"
GOLD_TABLE = "workspace.default.gold_daily_funnel"

# Process the most recent event_date present in Silver. This suits historical
# datasets whose latest day is not "today".
latest_day_row = (
    spark.table(SILVER_TABLE)
    .agg(F.max("event_date").alias("d"))
    .first()
)
RUN_DATE = latest_day_row["d"]

print("Using RUN_DATE =", RUN_DATE)


In [0]:
# Restrict Silver to the single day being processed.
is_run_date = F.col("event_date") == F.lit(RUN_DATE)
silver_day = spark.table(SILVER_TABLE).where(is_run_date)

print("Silver rows for RUN_DATE:", silver_day.count())

# Breakdown of that day's rows by event_type, for a quick sanity check.
events_by_type = silver_day.groupBy("event_type").count()
display(events_by_type)


In [0]:
# Fail fast when the selected day has no Silver rows.
# head(1) returns a list with at most one Row, so emptiness is detected without
# running another full count over the day's data.
if not silver_day.head(1):
    raise ValueError(f"No Silver data found for RUN_DATE={RUN_DATE}. Pick a valid event_date.")


In [0]:
# Count the day's Silver rows once and reuse the number for both the log line
# and the emptiness guard.
row_count = silver_day.count()
print("Silver rows for RUN_DATE:", row_count)

# A zero count means there is nothing to aggregate for RUN_DATE.
if not row_count:
    raise ValueError(f"No Silver data found for RUN_DATE={RUN_DATE}. Pick a valid event_date.")


In [0]:
def _event_count(event_type, column_name):
    """Return an aggregate that counts rows whose event_type equals `event_type`.

    Each matching row contributes 1 and every other row 0; the sum is exposed
    under `column_name`.
    """
    matches = F.col("event_type") == event_type
    return F.sum(F.when(matches, 1).otherwise(0)).alias(column_name)


# (event_type value, output column) pairs, in the order the columns appear.
_FUNNEL_STEPS = [
    ("view", "view_events"),
    ("cart", "cart_events"),
    ("remove_from_cart", "remove_from_cart_events"),
    ("purchase", "purchase_events"),
]

# One row per event_date with a count for each funnel step, stamped with the
# run date and the time this Gold row was produced.
funnel_counts = silver_day.groupBy("event_date").agg(
    *[_event_count(event_type, column_name) for event_type, column_name in _FUNNEL_STEPS]
)
daily_funnel = (
    funnel_counts
    .withColumn("run_date", F.lit(RUN_DATE))
    .withColumn("gold_created_ts", F.current_timestamp())
)

display(daily_funnel)
daily_funnel.printSchema()


In [0]:
# Create the Gold table only when it does not exist yet, taking the schema from
# daily_funnel. Silver's event-level columns do not match the funnel aggregates
# written to this table, so the schema must not be copied from SILVER_TABLE.
#
# mode("ignore") is the DataFrameWriter counterpart of CREATE TABLE IF NOT
# EXISTS: when the table already exists the write is a no-op.
(
    daily_funnel.limit(0)               # zero rows: only the schema is written
        .write
        .format("delta")
        .mode("ignore")
        .partitionBy("event_date")      # keep the table partitioned by event_date
        .saveAsTable(GOLD_TABLE)
)


In [0]:
# NOTE(review): this DROP removes the whole Gold table, so every event_date
# written by earlier runs is discarded before the table is rebuilt. Confirm that
# is intended before running this notebook on a schedule.
spark.sql(f"DROP TABLE IF EXISTS {GOLD_TABLE}")

# Rebuild Gold as an empty Delta table whose schema is taken from daily_funnel
# (limit(0) keeps the columns but writes no rows).
empty_funnel = daily_funnel.limit(0)
empty_funnel.write.saveAsTable(GOLD_TABLE, format="delta")

print("✅ Recreated Gold table with correct schema:", GOLD_TABLE)


In [0]:
# Overwrite only the rows for RUN_DATE: replaceWhere limits the overwrite to
# rows matching the predicate and leaves the rest of the table in place.
run_date_predicate = f"event_date = DATE('{RUN_DATE}')"

funnel_writer = (
    daily_funnel.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", run_date_predicate)
)
funnel_writer.saveAsTable(GOLD_TABLE)

print(f"✅ Gold daily funnel written for {RUN_DATE}")


In [0]:
# Read back what was just written for RUN_DATE to confirm the write landed.
gold_table_df = spark.table(GOLD_TABLE)
gold_check = gold_table_df.where(F.col("event_date") == F.lit(RUN_DATE))

gold_check_rows = gold_check.count()
print("Rows in Gold for RUN_DATE:", gold_check_rows)
display(gold_check)


In [0]:
from pyspark.sql import functions as F

# Gold rows for RUN_DATE; this frame is the input to the conversion-rate step.
on_run_date = F.col("event_date") == F.lit(RUN_DATE)
gold_day = spark.table(GOLD_TABLE).where(on_run_date)

gold_day_rows = gold_day.count()
print("Gold rows for RUN_DATE:", gold_day_rows)
display(gold_day)


In [0]:
def _safe_rate(numerator, denominator):
    """Return `numerator / denominator` as a column expression.

    The result is 0.0 whenever the denominator column is not greater than zero.
    """
    denominator_col = F.col(denominator)
    ratio = F.col(numerator) / denominator_col
    return F.when(denominator_col > 0, ratio).otherwise(F.lit(0.0))


# Step-to-step conversion rates derived from the daily funnel counts.
gold_rates = (
    gold_day
    .withColumn("view_to_cart_rate", _safe_rate("cart_events", "view_events"))
    .withColumn("cart_to_purchase_rate", _safe_rate("purchase_events", "cart_events"))
    .withColumn("view_to_purchase_rate", _safe_rate("purchase_events", "view_events"))
)

# Show the counts next to the rates computed from them.
_rates_preview_columns = [
    "event_date",
    "view_events", "cart_events", "purchase_events",
    "view_to_cart_rate", "cart_to_purchase_rate", "view_to_purchase_rate",
]
display(gold_rates.select(*_rates_preview_columns))


In [0]:
# Add the conversion-rate columns to the Gold table schema.
# ALTER TABLE ... ADD COLUMNS raises if a column already exists, so only the
# columns that are actually missing are added. This makes the cell safe to
# re-run against a table that already carries them.
_rate_columns = ["view_to_cart_rate", "cart_to_purchase_rate", "view_to_purchase_rate"]

# Compare names case-insensitively so a differently cased existing column is
# treated as present (assumes Spark's default case-insensitive resolution).
_existing_columns = {name.lower() for name in spark.table(GOLD_TABLE).columns}
_missing_columns = [name for name in _rate_columns if name.lower() not in _existing_columns]

if _missing_columns:
    _column_defs = ",\n  ".join(f"{name} DOUBLE" for name in _missing_columns)
    spark.sql(f"""
ALTER TABLE {GOLD_TABLE}
ADD COLUMNS (
  {_column_defs}
)
""")
    print("✅ Added conversion rate columns to Gold table schema")
else:
    print("✅ Conversion rate columns already present in Gold table schema")


In [0]:
# Replace the RUN_DATE rows in Gold with the version that includes the
# conversion rates; replaceWhere keeps the overwrite scoped to that date.
rates_predicate = f"event_date = DATE('{RUN_DATE}')"

rates_writer = (
    gold_rates.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", rates_predicate)
)
rates_writer.saveAsTable(GOLD_TABLE)

print(f"✅ Gold conversion rates written for {RUN_DATE}")


In [0]:
# Final verification: re-read the RUN_DATE rows from Gold and show the funnel
# counts, the conversion rates and the run date, followed by the full schema.
final_filter = F.col("event_date") == F.lit(RUN_DATE)
gold_final = spark.table(GOLD_TABLE).where(final_filter)

_final_columns = [
    "event_date",
    "view_events", "cart_events", "remove_from_cart_events", "purchase_events",
    "view_to_cart_rate", "cart_to_purchase_rate", "view_to_purchase_rate",
    "run_date",
]
display(gold_final.select(*_final_columns))
gold_final.printSchema()
