In [0]:
import pyspark.sql.functions as F
from datetime import date, timedelta

# Job parameter: the date to process (e.g. "2019-12-01"). An empty value means
# "derive the date from the data".
dbutils.widgets.text("RUN_DATE", "")
# Strip surrounding whitespace so a blank-looking value still falls through to the default.
RUN_DATE = dbutils.widgets.get("RUN_DATE").strip()

if not RUN_DATE:
    # Best default for historical datasets: latest available date in Bronze
    latest_event_date = (
        spark.table("workspace.default.bronze_events")
             .agg(F.max("event_date").alias("d"))
             .collect()[0]["d"]
    )
    # max() over an empty table (or an all-NULL column) is NULL. Without this
    # guard str(None) would make RUN_DATE the literal "None" and the run would
    # continue with a meaningless date.
    if latest_event_date is None:
        raise ValueError(
            "RUN_DATE was not provided and workspace.default.bronze_events "
            "contains no event_date to default to"
        )
    RUN_DATE = str(latest_event_date)

print("✅ RUN_DATE =", RUN_DATE)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Catalog and schema that hold both the Bronze source and the Silver target.
CATALOG, SCHEMA = "workspace", "default"

# Fully qualified <catalog>.<schema>.<table> names used by the cells below.
BRONZE_TABLE = ".".join((CATALOG, SCHEMA, "bronze_events"))
SILVER_TABLE = ".".join((CATALOG, SCHEMA, "silver_events"))

# Echo the resolved names so the run log shows which tables are touched.
for label, table_name in (("BRONZE_TABLE =", BRONZE_TABLE), ("SILVER_TABLE =", SILVER_TABLE)):
    print(label, table_name)


In [0]:
# Option A (recommended): use the most recent bronze_batch_date present in Bronze.
latest_batch_row = (
    spark.table(BRONZE_TABLE)
         .agg(F.max("bronze_batch_date").alias("max_day"))
         .first()
)
batch_date = latest_batch_row["max_day"]

# Option B (manual override): uncomment and set the date you want
# batch_date = "2019-12-01"

print("✅ Silver batch_date =", batch_date)


In [0]:
# Keep only the Bronze rows that belong to the selected batch date.
is_selected_batch = F.col("bronze_batch_date") == F.lit(batch_date)
bronze_day = spark.table(BRONZE_TABLE).where(is_selected_batch)

# Row count plus a small sample for a quick visual check.
print("Bronze rows for day:", bronze_day.count())
display(bronze_day.limit(10))


In [0]:
# Time span and row volume of the selected day.
day_profile = bronze_day.agg(
    F.min("event_time").alias("min_event_time"),
    F.max("event_time").alias("max_event_time"),
    F.count("*").alias("rows"),
)
display(day_profile)

# Event mix, most frequent event_type first.
event_type_counts = bronze_day.groupBy("event_type").count()
display(event_type_counts.orderBy(F.desc("count")))


In [0]:
# Target type for every column carried from Bronze into the Silver input.
# Dict order is the output column order.
silver_input_types = {
    "event_time": "timestamp",
    "event_type": "string",
    "product_id": "long",
    "category_id": "long",
    "category_code": "string",
    "brand": "string",
    "price": "double",
    "user_id": "long",
    "user_session": "string",
    "event_date": "date",
    "replay_ts": "timestamp",
    "bronze_ingest_ts": "timestamp",
    "bronze_batch_date": "date",
}

# Project exactly those columns, each cast to its target type under its own name.
silver_in = bronze_day.select(
    *[F.col(name).cast(dtype).alias(name) for name, dtype in silver_input_types.items()]
)

print("Silver input rows:", silver_in.count())
silver_in.printSchema()
display(silver_in.limit(10))


In [0]:
from pyspark.sql import functions as F, Window

# Source / Target tables
BRONZE_TABLE = "workspace.default.bronze_events"
SILVER_TABLE = "workspace.default.silver_events"

# Use the same day you released to bronze (from replay mechanism).
# A RUN_DATE already resolved earlier in the session (e.g. from the job widget)
# takes precedence. The fixed date is only a fallback for ad-hoc runs, so it no
# longer silently overrides the job parameter.
RUN_DATE = globals().get("RUN_DATE") or "2019-12-01"


In [0]:
# Bronze rows whose event_date equals the run date.
bronze_source = spark.table(BRONZE_TABLE)
bronze_day = bronze_source.where(F.col("event_date") == F.lit(RUN_DATE))

# Row count plus a small sample for a quick visual check.
print("Bronze rows for day:", bronze_day.count())
display(bronze_day.limit(10))


In [0]:
cleaned = bronze_day

# Standardize strings: trim + lowercase the categorical text columns ...
for text_col in ("event_type", "brand", "category_code"):
    cleaned = cleaned.withColumn(text_col, F.lower(F.trim(F.col(text_col))))
# ... but only trim the session id (its case is left untouched).
cleaned = cleaned.withColumn("user_session", F.trim(F.col("user_session")))

# A NULL or empty brand becomes the placeholder "unknown".
brand_missing = F.col("brand").isNull() | (F.col("brand") == "")
cleaned = cleaned.withColumn(
    "brand",
    F.when(brand_missing, F.lit("unknown")).otherwise(F.col("brand")),
)

# An empty category_code becomes NULL (no placeholder at this stage).
category_blank = F.col("category_code") == ""
cleaned = cleaned.withColumn(
    "category_code",
    F.when(category_blank, F.lit(None)).otherwise(F.col("category_code")),
)

# Basic type safety (usually already correct, but safe)
for col_name, dtype in (
    ("price", "double"),
    ("product_id", "long"),
    ("category_id", "long"),
    ("user_id", "long"),
):
    cleaned = cleaned.withColumn(col_name, F.col(col_name).cast(dtype))


In [0]:
# Only purchase events keep their price; every other event type gets a NULL price.
purchase_only_price = (
    F.when(F.col("event_type") == F.lit("purchase"), F.col("price"))
     .otherwise(F.lit(None).cast("double"))
)
cleaned = cleaned.withColumn("price", purchase_only_price)

# Quick validation: per event type, how many rows carry a price and its range.
price_by_event_type = cleaned.groupBy("event_type").agg(
    F.count("*").alias("rows"),
    F.count("price").alias("non_null_price_rows"),
    F.min("price").alias("min_price"),
    F.max("price").alias("max_price"),
)
display(price_by_event_type.orderBy(F.desc("rows")))


In [0]:
# Columns that together identify one logical event.
dedupe_keys = ["user_session", "event_time", "event_type", "product_id"]

# Within each key group, rank rows by bronze_ingest_ts and then replay_ts,
# both descending with NULLs last.
w = Window.partitionBy(*dedupe_keys).orderBy(
    F.desc_nulls_last("bronze_ingest_ts"),
    F.desc_nulls_last("replay_ts"),
)

# Keep only the top-ranked row per key group; the helper rank column is dropped again.
ranked = cleaned.withColumn("_rn", F.row_number().over(w))
deduped = ranked.where(F.col("_rn") == 1).drop("_rn")

print("Rows before dedupe:", cleaned.count())
print("Rows after  dedupe:", deduped.count())


In [0]:
# Parts of the surrogate key, in hashing order. NULL session / type / product
# are replaced by an empty string before being joined.
# NOTE(review): event_time is not coalesced like the other parts — confirm
# that is intended for rows with a NULL timestamp.
event_key_parts = [
    F.coalesce(F.col("user_session"), F.lit("")),
    F.col("event_time").cast("string"),
    F.coalesce(F.col("event_type"), F.lit("")),
    F.coalesce(F.col("product_id").cast("string"), F.lit("")),
]

# event_id = SHA-256 of the parts joined with "||".
joined_key = F.concat_ws("||", *event_key_parts)
deduped = deduped.withColumn("event_id", F.sha2(joined_key, 256))


In [0]:
# Make sure the target schema exists before any table is created in it.
spark.sql("CREATE DATABASE IF NOT EXISTS workspace.default")

# Bootstrap an empty Delta table partitioned by event_date. The WHERE 1=0
# selects no rows, so only Bronze's column layout is copied.
create_silver_sql = f"""
CREATE TABLE IF NOT EXISTS {SILVER_TABLE}
USING DELTA
PARTITIONED BY (event_date)
AS
SELECT * FROM {BRONZE_TABLE} WHERE 1=0
"""
spark.sql(create_silver_sql)

print("✅ Silver table ready:", SILVER_TABLE)


In [0]:
# Print the column layout of the Silver table as it is currently registered.
silver_table_df = spark.table(SILVER_TABLE)
silver_table_df.printSchema()


In [0]:
# NOTE(review): this DROP removes the whole Silver table, including any
# previously written dates. It is meant as a one-off schema reset, not
# something to run on every daily job — confirm before scheduling.
spark.sql(f"DROP TABLE IF EXISTS {SILVER_TABLE}")

# Create empty silver table with correct schema from your deduped dataframe.
# Partition by event_date, matching the CREATE TABLE statement and the later
# recreate step in this notebook, so the table layout stays consistent with the
# per-date replaceWhere overwrites that follow.
(deduped.limit(0)
 .write
 .format("delta")
 .mode("overwrite")
 .partitionBy("event_date")
 .saveAsTable(SILVER_TABLE))

print("✅ Recreated Silver table with correct schema:", SILVER_TABLE)


In [0]:
# Overwrite only the rows of the run date; other dates in the table are left alone.
run_date_predicate = f"event_date = DATE('{RUN_DATE}')"
deduped_writer = (
    deduped.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", run_date_predicate)
)
deduped_writer.saveAsTable(SILVER_TABLE)

print(f"✅ Silver written for {RUN_DATE}")


In [0]:
# Read back the run date's rows from Silver.
silver_day = spark.table(SILVER_TABLE).where(F.col("event_date") == F.lit(RUN_DATE))

print("Silver rows for day:", silver_day.count())

# Event mix, most frequent event_type first.
silver_event_counts = silver_day.groupBy("event_type").count()
display(silver_event_counts.orderBy(F.desc("count")))

# Data-quality counters: output column name -> condition counted.
quality_checks = {
    "unknown_brand_rows": F.col("brand") == "unknown",
    "null_brand_rows": F.col("brand").isNull(),
    "null_category_code_rows": F.col("category_code").isNull(),
    "null_price_rows": F.col("price").isNull(),
}
display(
    silver_day.select(
        *[F.count(F.when(cond, 1)).alias(name) for name, cond in quality_checks.items()]
    )
)


In [0]:
from pyspark.sql import functions as F

# 1) keep only valid rows: a row without a timestamp or a session is dropped
silver_std = (
    deduped
    .filter(F.col("event_time").isNotNull())
    .filter(F.col("user_session").isNotNull())
)

# 2) + 3) standardize event_type, brand and category_code (trim + lowercase)
for text_col in ("event_type", "brand", "category_code"):
    silver_std = silver_std.withColumn(text_col, F.lower(F.trim(F.col(text_col))))

# 4) fill missing values with the "unknown" placeholder
for text_col in ("brand", "category_code"):
    silver_std = silver_std.withColumn(
        text_col, F.coalesce(F.col(text_col), F.lit("unknown"))
    )

# 5) enforce datatypes
for col_name, dtype in (
    ("product_id", "long"),
    ("category_id", "long"),
    ("user_id", "long"),
    ("price", "double"),
):
    silver_std = silver_std.withColumn(col_name, F.col(col_name).cast(dtype))

# Expressions shared by the price rules and the flags below.
is_purchase_event = F.col("event_type") == "purchase"
no_price = F.lit(None).cast("double")

silver_std = (
    silver_std
    # 6) price business rules:
    # - only purchases keep a price
    .withColumn("price", F.when(is_purchase_event, F.col("price")).otherwise(no_price))
    # - a negative price becomes NULL
    .withColumn(
        "price",
        F.when(is_purchase_event & (F.col("price") >= 0), F.col("price")).otherwise(no_price),
    )
    # 7) derived flags used later in Gold
    .withColumn("is_purchase", is_purchase_event.cast("int"))
    .withColumn("is_cart_event", F.col("event_type").isin("cart", "remove_from_cart").cast("int"))
)

print("✅ Silver standardized rows:", silver_std.count())
silver_std.printSchema()
display(silver_std.limit(10))


In [0]:
# 1) event type counts, most frequent first
std_event_counts = silver_std.groupBy("event_type").count()
display(std_event_counts.orderBy(F.desc("count")))

# 2) NULL counters for columns that should be fully populated after standardization
must_be_populated = {
    "null_brand": "brand",
    "null_category_code": "category_code",
    "null_user_session": "user_session",
    "null_event_time": "event_time",
}
display(
    silver_std.select(
        *[
            F.count(F.when(F.col(col_name).isNull(), 1)).alias(alias)
            for alias, col_name in must_be_populated.items()
        ]
    )
)

# 3) price must exist ONLY for purchases: per-type price coverage and range
std_price_profile = silver_std.groupBy("event_type").agg(
    F.count("*").alias("rows"),
    F.count(F.col("price")).alias("non_null_price_rows"),
    F.min("price").alias("min_price"),
    F.max("price").alias("max_price"),
)
display(std_price_profile.orderBy(F.desc("rows")))


In [0]:
# --- Silver Step 3: recreate the table with the FINAL Silver schema ---

# 1) Remove the existing table so the new column layout can be applied.
# NOTE(review): the drop is unconditional, so rows for every other date are
# removed too — confirm this is meant to be a one-off reset.
drop_silver_sql = f"DROP TABLE IF EXISTS {SILVER_TABLE}"
spark.sql(drop_silver_sql)

# 2) Write a zero-row frame so only the schema of silver_std (including its
#    added columns) is registered, partitioned by event_date.
empty_silver = silver_std.limit(0)
schema_writer = (
    empty_silver.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("event_date")
)
schema_writer.saveAsTable(SILVER_TABLE)

print(f"✅ Recreated Silver table with correct schema: {SILVER_TABLE}")


In [0]:
# 3) Overwrite only the RUN_DATE slice of Silver; other dates stay untouched.
run_date_predicate = f"event_date = DATE('{RUN_DATE}')"
std_writer = (
    silver_std.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", run_date_predicate)
)
std_writer.saveAsTable(SILVER_TABLE)

print(f"✅ Silver written (standardized) for {RUN_DATE} into {SILVER_TABLE}")


In [0]:
# Read back what was just written for the run date.
run_date_filter = f"event_date = DATE('{RUN_DATE}')"
silver_check = spark.table(SILVER_TABLE).filter(run_date_filter)

print("✅ Rows in Silver for RUN_DATE:", silver_check.count())

# Event mix (most frequent first), then the stored schema.
check_counts = silver_check.groupBy("event_type").count()
display(check_counts.orderBy(F.desc("count")))
silver_check.printSchema()


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# ---------- Step 3: Standardize ----------
silver_std = (
    deduped
    # fill missing brand / category_code with the "unknown" placeholder
    .withColumn("brand", F.coalesce(F.col("brand"), F.lit("unknown")))
    .withColumn("category_code", F.coalesce(F.col("category_code"), F.lit("unknown")))
    
    # make sure types are stable
    .withColumn("product_id", F.col("product_id").cast("long"))
    .withColumn("category_id", F.col("category_id").cast("long"))
    .withColumn("user_id", F.col("user_id").cast("long"))
    .withColumn("price", F.col("price").cast("double"))
    .withColumn("event_time", F.col("event_time").cast("timestamp"))
    .withColumn("event_date", F.col("event_date").cast("date"))

    # create a stable unique event_id (based on session + time + type + product).
    # NULL session / type / product are coalesced to "" exactly as in the
    # earlier event_id step of this notebook, so a given event hashes to the
    # same id regardless of which step computed it.
    .withColumn(
        "event_id",
        F.sha2(
            F.concat_ws("||",
                        F.coalesce(F.col("user_session"), F.lit("")),
                        F.col("event_time").cast("string"),
                        F.coalesce(F.col("event_type"), F.lit("")),
                        F.coalesce(F.col("product_id").cast("string"), F.lit(""))
                       ),
            256
        )
    )

    # flags: 1 when the event type matches, 0 otherwise
    .withColumn("is_purchase", F.when(F.col("event_type") == "purchase", F.lit(1)).otherwise(F.lit(0)))
    .withColumn("is_cart_event", F.when(F.col("event_type").isin("cart", "remove_from_cart"), F.lit(1)).otherwise(F.lit(0)))
)

# ---------- Write only this RUN_DATE partition ----------
(
    silver_std.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")          # safe if you ever add columns
    .option("replaceWhere", f"event_date = DATE('{RUN_DATE}')")
    .saveAsTable(SILVER_TABLE)
)

print(f"✅ Silver written (standardized) for {RUN_DATE} into {SILVER_TABLE}")


In [0]:
# Read back the run date's rows from Silver after the final write.
run_date_filter = f"event_date = DATE('{RUN_DATE}')"
silver_check = spark.table(SILVER_TABLE).filter(run_date_filter)

print("✅ Rows in Silver for RUN_DATE:", silver_check.count())

# Event mix (most frequent first), then the stored schema.
final_counts = silver_check.groupBy("event_type").count()
display(final_counts.orderBy(F.desc("count")))
silver_check.printSchema()


In [0]:
from datetime import date, timedelta

# Job parameter for the date to process. When the widget is left empty the
# run date falls back to yesterday (relative to the driver's current date).
dbutils.widgets.text("RUN_DATE", "")
RUN_DATE = dbutils.widgets.get("RUN_DATE") or str(date.today() - timedelta(days=1))

print("✅ RUN_DATE =", RUN_DATE)
