In [0]:
# Table names used throughout the notebook. Both tables share the same
# "workspace.default" prefix, so it is spelled once and reused.
_NAMESPACE = "workspace.default"
SOURCE_TABLE = f"{_NAMESPACE}.bronze_events"
SILVER_TABLE = f"{_NAMESPACE}.silver_events"


In [0]:
# Read the source table and show its first ten rows as a quick sanity check.
df_bronze = spark.table(SOURCE_TABLE)
bronze_preview = df_bronze.limit(10)
display(bronze_preview)


In [0]:
from pyspark.sql import functions as F

# Give every column its working type, always starting from df_bronze so the
# cell can be re-run safely:
#   - event_time is parsed into a timestamp
#   - price is cast to double
#   - the three identifier columns are cast to long
df = df_bronze.withColumn("event_time", F.to_timestamp("event_time"))
df = df.withColumn("price", F.col("price").cast("double"))
for id_col in ("product_id", "category_id", "user_id"):
    df = df.withColumn(id_col, F.col(id_col).cast("long"))


In [0]:
# A purchase event whose price is zero or negative has its price replaced by
# NULL; every other row keeps the price it already has.
is_purchase = F.col("event_type") == "purchase"
price_not_positive = F.col("price") <= 0
cleaned_price = F.when(is_purchase & price_not_positive, F.lit(None)).otherwise(
    F.col("price")
)
df = df.withColumn("price", cleaned_price)


In [0]:
# Rows with a NULL brand get the placeholder value "unknown".
brand_or_placeholder = F.coalesce(F.col("brand"), F.lit("unknown"))
df = df.withColumn("brand", brand_or_placeholder)


In [0]:
# Rows that agree on all of these columns are treated as the same event and
# collapsed to a single row.
dedup_cols = [
    "event_time",
    "event_type",
    "product_id",
    "user_id",
    "user_session",
]
df_silver = df.drop_duplicates(subset=dedup_cols)


In [0]:
# Persist the de-duplicated frame as a Delta table under SILVER_TABLE.
# mode("overwrite") replaces whatever the table held before.
(
    df_silver
    .write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(SILVER_TABLE)
)

# The check mark was previously stored as mis-decoded bytes and printed as
# garbage; it is written here as the intended character.
print("✅ Silver created:", SILVER_TABLE)


In [0]:
# Row counts of the in-memory bronze frame and of the silver table as stored.
bronze_row_count = df_bronze.count()
print("Bronze rows:", bronze_row_count)
silver_row_count = spark.table(SILVER_TABLE).count()
print("Silver rows:", silver_row_count)

# Count purchase rows in the stored silver table whose price is still zero or
# negative. Rows with a NULL price do not match this predicate.
invalid_purchase_price = (F.col("event_type") == "purchase") & (F.col("price") <= 0)
bad_prices = spark.table(SILVER_TABLE).where(invalid_purchase_price).count()
print("Invalid purchase prices remaining:", bad_prices)


In [0]:
# Re-read the silver table from storage and show its first fifty rows.
# SILVER_TABLE is deliberately not redefined here: the value assigned in the
# notebook's first cell is reused, so this preview always reads the same table
# the write cell produced.
df_silver = spark.table(SILVER_TABLE)
display(df_silver.limit(50))
