In [0]:
# Fully qualified table names: the Bronze table this notebook reads from
# and the Silver table it writes to.
SOURCE_TABLE, SILVER_TABLE = (
    "workspace.default.bronze_events",
    "workspace.default.silver_events",
)


In [0]:
# Load the Bronze source table and preview its first 10 rows.
df_bronze = spark.read.table(SOURCE_TABLE)
bronze_preview = df_bronze.limit(10)
display(bronze_preview)


In [0]:
from pyspark.sql import functions as F

# Give the Bronze columns explicit types: event_time is parsed to a timestamp,
# price is cast to double, and the three id columns are cast to long.
# withColumn on an existing column replaces it in place, so column order is unchanged.
df = (
    df_bronze
    .withColumn("event_time", F.to_timestamp(F.col("event_time")))
    .withColumn("price", F.col("price").cast("double"))
)
for id_col in ("product_id", "category_id", "user_id"):
    df = df.withColumn(id_col, F.col(id_col).cast("long"))


In [0]:
# A purchase event with a zero or negative price is treated as invalid:
# its price is replaced with NULL. All other rows keep their price as-is
# (including rows where the condition itself evaluates to NULL).
invalid_purchase_price = (F.col("event_type") == "purchase") & (F.col("price") <= 0)
cleaned_price = F.when(invalid_purchase_price, F.lit(None)).otherwise(F.col("price"))
df = df.withColumn("price", cleaned_price)


In [0]:
# Replace missing (NULL) brand values with the placeholder "unknown".
brand_or_placeholder = F.coalesce(F.col("brand"), F.lit("unknown"))
df = df.withColumn("brand", brand_or_placeholder)


In [0]:
# Rows that agree on all of these columns are considered the same event.
dedup_cols = [
    "event_time",
    "event_type",
    "product_id",
    "user_id",
    "user_session",
]
# Keep one row per key combination; no ordering is applied here, so which of
# several duplicate rows is kept is not controlled by this cell.
df_silver = df.drop_duplicates(subset=dedup_cols)


In [0]:
# Persist the cleaned, deduplicated DataFrame as the Silver Delta table.
# mode("overwrite") replaces the table's existing contents on every run.
(
    df_silver
    .write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(SILVER_TABLE)
)

# Status marker restored: the original literal was a mis-decoded (mojibake)
# check-mark emoji.
print("✅ Silver created:", SILVER_TABLE)


In [0]:
# Post-write sanity checks: compare row counts before and after cleaning.
print("Bronze rows:", df_bronze.count())

silver_persisted = spark.table(SILVER_TABLE)
print("Silver rows:", silver_persisted.count())

# Count purchase events in the written table whose price is still zero or
# negative; the price-cleanup step should have nulled all of these.
invalid_purchase = (F.col("event_type") == "purchase") & (F.col("price") <= 0)
bad_prices = silver_persisted.where(invalid_purchase).count()
print("Invalid purchase prices remaining:", bad_prices)


In [0]:
# Same value as the SILVER_TABLE defined in the first cell; presumably
# re-declared so this section can be run on its own.
SILVER_TABLE = "workspace.default.silver_events"

# Re-read the Silver table from the metastore and preview its first 50 rows.
df_silver = spark.read.table(SILVER_TABLE)
silver_preview = df_silver.limit(50)
display(silver_preview)


In [0]:
# Purpose: Name the tables used by the enrichment stage.

# Target: the enriched Silver table written by this stage.
SILVER_ENRICHED_TABLE = "workspace.default.silver_events_enriched"
# Source: the clean Silver table read by this stage.
SILVER_CLEAN_TABLE = "workspace.default.silver_events"


In [0]:
# Purpose: Read the clean Silver table as the input for feature creation
# and report how many rows were loaded.

df = spark.read.table(SILVER_CLEAN_TABLE)
loaded_rows = df.count()
print("Loaded rows:", loaded_rows)


In [0]:
# Purpose: Add per-event-type 0/1 flags, a revenue column, and date/hour
# columns derived from event_time.

from pyspark.sql import functions as F

# Flag column name -> event_type value that sets the flag to 1.
# Insertion order determines the order in which the columns are appended.
event_type_flags = {
    "is_view": "view",
    "is_cart": "cart",
    "is_remove_from_cart": "remove_from_cart",
    "is_purchase": "purchase",
}

df_enriched = df
for flag_col, event_name in event_type_flags.items():
    # Any non-matching event_type (including NULL) yields 0.
    df_enriched = df_enriched.withColumn(
        flag_col,
        F.when(F.col("event_type") == event_name, 1).otherwise(0),
    )

# Revenue is the price for purchase events and 0.0 for everything else.
revenue_expr = F.when(F.col("event_type") == "purchase", F.col("price")).otherwise(0.0)
df_enriched = (
    df_enriched
    .withColumn("revenue", revenue_expr)
    .withColumn("event_date", F.to_date("event_time"))
    .withColumn("event_hour", F.hour("event_time"))
)

enriched_preview = df_enriched.limit(50)
display(enriched_preview)


In [0]:
# Purpose: Persist the enriched DataFrame as its own Delta table,
# overwriting the table's existing contents.

df_enriched.write.saveAsTable(
    SILVER_ENRICHED_TABLE,
    format="delta",
    mode="overwrite",
)

print("Silver enriched table created:", SILVER_ENRICHED_TABLE)


In [0]:
# Purpose: Sanity-check the enriched table against the clean table:
# row counts, purchase flag total, and absence of negative revenue.

df_clean = spark.read.table(SILVER_CLEAN_TABLE)
df_new = spark.read.table(SILVER_ENRICHED_TABLE)

# Enrichment only adds columns, so the two counts are expected to match.
print("Clean Silver rows:", df_clean.count())
print("Enriched Silver rows:", df_new.count())

# The number of purchase events should equal the sum of the is_purchase flag.
is_purchase_event = F.col("event_type") == "purchase"
purchase_rows = df_new.where(is_purchase_event).count()
purchase_flag_sum = df_new.agg(F.sum(F.col("is_purchase"))).collect()[0][0]

print("Purchase rows:", purchase_rows)
print("Sum(is_purchase):", purchase_flag_sum)

# Count rows whose revenue is below zero.
has_negative_revenue = F.col("revenue") < 0
neg_revenue = df_new.where(has_negative_revenue).count()
print("Negative revenue rows:", neg_revenue)


In [0]:
# Purpose: Preview per-day funnel counts and revenue for the 10 earliest
# dates (exploratory only; not the Gold aggregation).

daily_metrics = [
    F.sum("is_view").alias("views"),
    F.sum("is_cart").alias("carts"),
    F.sum("is_purchase").alias("purchases"),
    F.sum("revenue").alias("daily_revenue"),
]

daily_funnel = (
    df_new
    .groupBy("event_date")
    .agg(*daily_metrics)
    .sort("event_date")
    .limit(10)
)
display(daily_funnel)
