In [0]:
import pyspark.sql.functions as F

# Register the RUN_DATE text widget and read its value back,
# dropping any surrounding whitespace.
dbutils.widgets.text("RUN_DATE", "2019-12-04")
RUN_DATE = dbutils.widgets.get("RUN_DATE").strip()

# Echo the effective value; a blank widget is reported as "<empty>".
run_date_label = RUN_DATE or "<empty>"
print("✅ RUN_DATE (optional override) =", run_date_label)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Namespace shared by the source and target tables.
CATALOG, SCHEMA = "workspace", "default"

# Fully qualified names: <catalog>.<schema>.<table>
BRONZE_TABLE = ".".join((CATALOG, SCHEMA, "bronze_events"))
SILVER_TABLE = ".".join((CATALOG, SCHEMA, "silver_events"))

# Print both names so the run log shows which tables are used.
for label, table_name in (("BRONZE_TABLE", BRONZE_TABLE), ("SILVER_TABLE", SILVER_TABLE)):
    print(f"{label} =", table_name)


In [0]:
from datetime import date

# Resolve the single day this run processes, as a 'YYYY-MM-DD' string.
#   * RUN_DATE set   -> use it, after validating it is a real ISO date.
#   * RUN_DATE empty -> fall back to the latest event_date present in bronze.
if RUN_DATE:
    # batch_date is later interpolated into a SQL predicate (replaceWhere), so
    # reject anything that is not a valid date instead of passing it through.
    # fromisoformat raises ValueError on malformed input; isoformat() then
    # normalises the value to 'YYYY-MM-DD'.
    batch_date = date.fromisoformat(RUN_DATE).isoformat()
else:
    latest_event_date = (
        spark.table(BRONZE_TABLE)
             .agg(F.max("event_date").alias("d"))
             .collect()[0]["d"]
    )
    # max() over an empty table (or an all-null column) is None; str(None)
    # would silently produce the bogus batch date "None".
    if latest_event_date is None:
        raise ValueError(
            f"Cannot derive batch_date: {BRONZE_TABLE} has no non-null event_date values"
        )
    batch_date = str(latest_event_date)

print("✅ Silver batch_date =", batch_date)


In [0]:
# Keep only the bronze rows whose event_date equals the batch day.
batch_day_filter = F.col("event_date") == F.lit(batch_date).cast("date")
bronze_day = spark.table(BRONZE_TABLE).where(batch_day_filter)

# Row count plus a small sample for a quick visual check.
bronze_day_rows = bronze_day.count()
print("✅ Bronze rows for day:", bronze_day_rows)
display(bronze_day.limit(10))


In [0]:
# Earliest / latest event_time and total row count for the selected day.
time_span = bronze_day.agg(
    F.min("event_time").alias("min_event_time"),
    F.max("event_time").alias("max_event_time"),
    F.count("*").alias("rows"),
)
display(time_span)

# Row count per event_type, most frequent first.
events_by_type = bronze_day.groupBy("event_type").count().sort(F.desc("count"))
display(events_by_type)


In [0]:
# Target type for every column carried into the silver input, in output order.
silver_column_types = [
    ("event_time", "timestamp"),
    ("event_type", "string"),
    ("product_id", "long"),
    ("category_id", "long"),
    ("category_code", "string"),
    ("brand", "string"),
    ("price", "double"),
    ("user_id", "long"),
    ("user_session", "string"),
    ("event_date", "date"),
    ("replay_ts", "timestamp"),
    ("bronze_ingest_ts", "timestamp"),
    ("bronze_batch_date", "date"),
]

# Project exactly these columns, casting each one and keeping its name.
silver_in = bronze_day.select(
    *[F.col(name).cast(dtype).alias(name) for name, dtype in silver_column_types]
)

silver_in_rows = silver_in.count()
print("Silver input rows:", silver_in_rows)
silver_in.printSchema()
display(silver_in.limit(10))


In [0]:
from pyspark.sql import functions as F, Window

# Source / target tables, derived from the CATALOG / SCHEMA constants so the
# names cannot drift from the configuration cell.
BRONZE_TABLE = f"{CATALOG}.{SCHEMA}.bronze_events"
SILVER_TABLE = f"{CATALOG}.{SCHEMA}.silver_events"

# Keep RUN_DATE aligned with the batch_date resolved earlier in the notebook.
# A hard-coded date here would replace the widget value, so the rows read
# below would belong to a different day than the partition written to silver.
RUN_DATE = batch_date


In [0]:
# Select the bronze rows for the batch day.
# The filter uses batch_date (cast to DATE): the same value the silver write
# filters on and passes to replaceWhere. That keeps the rows that are cleaned
# and deduplicated below on the same day as the partition being overwritten.
bronze_day = (
    spark.table(BRONZE_TABLE)
         .filter(F.col("event_date") == F.lit(batch_date).cast("date"))
)

print("Bronze rows for day:", bronze_day.count())
display(bronze_day.limit(10))


In [0]:
def _lower_trimmed(column_name):
    """Return the named column with surrounding whitespace removed and lower-cased."""
    return F.lower(F.trim(F.col(column_name)))


# Standardized forms reused by the null / blank handling below.
brand_std = _lower_trimmed("brand")
category_code_std = _lower_trimmed("category_code")

cleaned = (
    bronze_day
      # Standardize strings (user_session is trimmed only; its case is kept)
      .withColumn("event_type", _lower_trimmed("event_type"))
      .withColumn("user_session", F.trim(F.col("user_session")))

      # A null or blank brand becomes "unknown"
      .withColumn(
          "brand",
          F.when(brand_std.isNull() | (brand_std == ""), F.lit("unknown")).otherwise(brand_std),
      )

      # A blank category_code becomes null
      .withColumn(
          "category_code",
          F.when(category_code_std == "", F.lit(None)).otherwise(category_code_std),
      )

      # Basic type safety: price as double
      .withColumn("price", F.col("price").cast("double"))
)

# Basic type safety: identifier columns as long
for id_column in ("product_id", "category_id", "user_id"):
    cleaned = cleaned.withColumn(id_column, F.col(id_column).cast("long"))


In [0]:
# Keep price only on purchase events; every other event type gets a null double.
is_purchase = F.col("event_type") == F.lit("purchase")
null_price = F.lit(None).cast("double")
cleaned = cleaned.withColumn(
    "price",
    F.when(is_purchase, F.col("price")).otherwise(null_price),
)

# Quick validation: per event_type, total rows, rows with a price, and price range.
price_summary = (
    cleaned.groupBy("event_type")
           .agg(
               F.count("*").alias("rows"),
               F.count("price").alias("non_null_price_rows"),
               F.min("price").alias("min_price"),
               F.max("price").alias("max_price"),
           )
           .sort(F.desc("rows"))
)
display(price_summary)


In [0]:
# Rows sharing all of these columns are treated as the same event.
dedupe_keys = ["user_session", "event_time", "event_type", "product_id"]

# Within each duplicate group, rank by latest bronze_ingest_ts, then latest
# replay_ts; nulls sort last in both.
newest_first = [
    F.col("bronze_ingest_ts").desc_nulls_last(),
    F.col("replay_ts").desc_nulls_last(),
]
w = Window.partitionBy(*dedupe_keys).orderBy(*newest_first)

# Keep only the top-ranked row of every group and drop the helper column.
ranked = cleaned.withColumn("_rn", F.row_number().over(w))
deduped = ranked.where(F.col("_rn") == 1).drop("_rn")

for label, frame in (("Rows before dedupe:", cleaned), ("Rows after  dedupe:", deduped)):
    print(label, frame.count())


In [0]:
# Pieces of the event key, in hashing order. Null user_session / event_type /
# product_id are replaced by "" first; event_time is passed through as a
# string without a null replacement (concat_ws skips null inputs).
event_id_parts = [
    F.coalesce(F.col("user_session"), F.lit("")),
    F.col("event_time").cast("string"),
    F.coalesce(F.col("event_type"), F.lit("")),
    F.coalesce(F.col("product_id").cast("string"), F.lit("")),
]

# event_id = SHA-256 hex digest of the parts joined with "||".
event_id_expr = F.sha2(F.concat_ws("||", *event_id_parts), 256)
deduped = deduped.withColumn("event_id", event_id_expr)


In [0]:
# Make sure the target schema exists before creating the table in it.
spark.sql("CREATE DATABASE IF NOT EXISTS workspace.default")

# Create the silver table once, empty (WHERE 1=0) and partitioned by event_date.
# Its columns are the bronze columns plus event_id: the pipeline appends that
# STRING hash column (sha2 output) before writing, so a table created from the
# bronze schema alone would not match the written DataFrame.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {SILVER_TABLE}
USING DELTA
PARTITIONED BY (event_date)
AS
SELECT *, CAST(NULL AS STRING) AS event_id
FROM {BRONZE_TABLE}
WHERE 1=0
""")

print("✅ Silver table ready:", SILVER_TABLE)


In [0]:
# Print the column names and types of the silver table.
silver_table_df = spark.table(SILVER_TABLE)
silver_table_df.printSchema()


In [0]:
# Idempotent DDL: both statements only create the object when it does not exist.
ensure_statements = (
    "CREATE DATABASE IF NOT EXISTS workspace.default",
    f"CREATE TABLE IF NOT EXISTS {SILVER_TABLE} USING DELTA",
)
for ddl in ensure_statements:
    spark.sql(ddl)


In [0]:
from pyspark.sql import functions as F

# Keep only the batch day's rows so the data written matches the replaceWhere
# predicate below.
deduped_day = deduped.filter(F.col("event_date") == F.lit(batch_date).cast("date"))

# Count once and reuse the result for both the log line and the guard.
deduped_day_rows = deduped_day.count()
print("✅ Rows in deduped_day =", deduped_day_rows)

# An overwrite with replaceWhere and zero rows deletes whatever the partition
# currently holds. Fail loudly instead of silently wiping the day.
if deduped_day_rows == 0:
    raise ValueError(
        f"No rows to write for event_date {batch_date}; "
        f"refusing to overwrite that partition of {SILVER_TABLE} with an empty batch"
    )

# Replace only the batch day's slice of the table.
# mergeSchema lets the write add columns the table does not have yet (such as
# the derived event_id) instead of failing Delta's schema check.
(deduped_day.write
  .format("delta")
  .mode("overwrite")
  .option("replaceWhere", f"event_date = DATE('{batch_date}')")
  .option("mergeSchema", "true")
  .saveAsTable(SILVER_TABLE))

print(f"✅ Silver written for {batch_date}")


In [0]:
# Row count per event_date in the deduplicated data, oldest date first
# (up to 50 rows, values not truncated).
rows_per_event_date = deduped.groupBy("event_date").count().orderBy("event_date")
rows_per_event_date.show(50, truncate=False)
