In [0]:
%run "/Workspace/Users/sundarasandeepteja@gmail.com/E-Commerce Analytics Medallion Architecture with GenAI/config/project_config"

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

print("■ SILVER: Transaction Cleaning")
# Read Bronze
# BRONZE_TRANSACTIONS_TABLE is not defined in this cell; presumably it is
# provided by the project_config notebook pulled in via %run — verify there.
bronze = spark.table(BRONZE_TRANSACTIONS_TABLE)
# Preview a handful of rows. head() with no argument returns a single Row
# object (or None for an empty table), not a DataFrame, so it cannot be
# rendered as a table; limit() keeps the preview a DataFrame.
display(bronze.limit(5))

# Step 1: Remove duplicates (keep latest)
# Rank the rows of each transaction_id from newest to oldest _ingested_at and
# keep only rank 1; the helper rank column is dropped afterwards.
window = Window.partitionBy("transaction_id").orderBy(F.col("_ingested_at").desc())
deduped = (
    bronze
    .withColumn("_rn", F.row_number().over(window))
    .where(F.col("_rn") == 1)
    .drop("_rn")
)

# Step 2: Parse timestamps
# withColumns() resolves every expression against the *input* DataFrame, so a
# reference to "transaction_timestamp" inside the dict still means the raw
# Bronze column, not the parsed value assigned under the same key. Deriving
# all date parts from one shared parsed expression keeps them consistent with
# the stored timestamp (implicit casts of the raw value can disagree with it,
# e.g. when the raw string carries a UTC offset).
parsed_ts = F.to_timestamp(F.col("transaction_timestamp"))
clean = deduped.withColumns({
    "transaction_timestamp": parsed_ts,
    "transaction_date": F.to_date(parsed_ts),
    "transaction_year": F.year(parsed_ts),
    "transaction_month": F.month(parsed_ts),
    "day_of_week": F.dayofweek(parsed_ts),
    # dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
    "is_weekend": F.when(F.dayofweek(parsed_ts).isin([1, 7]), True).otherwise(False),
})
# NOTE(review): rows whose timestamp fails to parse get NULL date parts and are
# not rejected by the validation step below — confirm that is intended, since
# transaction_year / transaction_month are later used as partition columns.

# Step 3: Validate
# Successive where() calls are AND-ed together: a row survives only if it has
# both identifiers, a non-negative final_amount and a positive quantity.
# Rows where a comparison evaluates to NULL are dropped as well.
valid = (
    clean
    .where(F.col("transaction_id").isNotNull())
    .where(F.col("customer_id").isNotNull())
    .where(F.col("final_amount") >= 0)
    .where(F.col("quantity") > 0)
)

# Step 4: Standardize values
# Table of canonical status label -> accepted lowercase spellings. The table
# is folded, in order, into a single CASE WHEN expression; anything that
# matches no entry (including NULL) becomes "Unknown".
status_lower = F.lower(F.col("status"))
status_aliases = [
    ("Completed", ["completed", "complete"]),
    ("Pending", ["pending"]),
    ("Failed", ["failed"]),
    ("Refunded", ["refunded"]),
]
status_expr = None
for canonical_status, accepted_spellings in status_aliases:
    is_match = status_lower.isin(accepted_spellings)
    if status_expr is None:
        status_expr = F.when(is_match, canonical_status)
    else:
        status_expr = status_expr.when(is_match, canonical_status)
standardized = valid.withColumn("status", status_expr.otherwise("Unknown"))

# Step 5: Add derived columns
# The three derived columns are independent of each other, so they are added
# one at a time in the same order as before.
quantity = F.col("quantity")
has_discount = F.col("discount_percent") > 0
enriched = (
    standardized
    # final_amount minus shipping_cost (NULL if either input is NULL)
    .withColumn("gross_profit", F.col("final_amount") - F.col("shipping_cost"))
    # True only for a strictly positive discount_percent; NULL counts as False
    .withColumn("discount_flag", F.when(has_discount, True).otherwise(False))
    # quantity 1 -> "Single", up to 3 -> "Small", everything else -> "Large"
    .withColumn(
        "order_size",
        F.when(quantity == 1, "Single")
         .when(quantity <= 3, "Small")
         .otherwise("Large"),
    )
)

# Write to Silver
# Replace the Silver table with the enriched rows, stored as Delta and
# partitioned by transaction year and month.
silver_writer = (
    enriched.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("transaction_year", "transaction_month")
)
silver_writer.saveAsTable(SILVER_TRANSACTIONS_TABLE)
# Run OPTIMIZE on the freshly written table, then report its row count by
# reading the table back.
spark.sql(f"OPTIMIZE {SILVER_TRANSACTIONS_TABLE}")
print(f"■ Silver transactions: {spark.table(SILVER_TRANSACTIONS_TABLE).count():,}")