# 🥈 Silver Layer — Cleansing & Transformation
**Project:** End-to-End Retail Lakehouse | Microsoft Fabric

**Layer:** Silver (Cleansed / Conformed)

**Purpose:** Apply PySpark transformations to clean, validate, deduplicate, and enrich Bronze data.

```
Bronze Delta Tables → PySpark Transformations → Silver Delta Tables
```

**Transformations applied:**
- 🧹 Null handling & type casting
- 🔄 Deduplication
- 📅 Date parsing & calendar enrichment
- 💰 Derived business metrics
- ✅ Data quality scoring

In [None]:
# ============================================================
# CELL 1 — Configuration
# ============================================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, trim, upper, lower, initcap, when, isnull, coalesce,
    to_date, to_timestamp, year, month, dayofweek, quarter,
    datediff, current_date, current_timestamp, lit,
    regexp_replace, round as spark_round, concat, concat_ws,
    count, sum as spark_sum, avg, max as spark_max, min as spark_min,
    dense_rank, row_number
)
from pyspark.sql.types import DoubleType, IntegerType, BooleanType, StringType
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Workspace and Lakehouse identifiers are kept as separate constants so either
# can be changed without editing the OneLake URI template below.
WORKSPACE_NAME = "your_workspace"   # ← Update with your Fabric workspace name
LAKEHOUSE_NAME = "RetailLakehouse"  # ← Update with your Lakehouse name

# Root of the Lakehouse "Tables" area; Bronze is read from and Silver is
# written to sub-folders of this path.
BASE = (
    f"abfss://{WORKSPACE_NAME}@onelake.dfs.fabric.microsoft.com/"
    f"{LAKEHOUSE_NAME}.Lakehouse/Tables"
)
BRONZE = f"{BASE}/bronze"
SILVER = f"{BASE}/silver"

print("✅ Silver layer config ready.")
print(f"   Bronze path: {BRONZE}")
print(f"   Silver path: {SILVER}")

In [None]:
# ============================================================
# CELL 2 — Silver Customers
# ============================================================
# Reads bronze/customers, drops rows without a customer_id, keeps one row per
# customer_id, normalises text columns, derives full_name / tenure /
# email_domain, and overwrites silver/customers.
print("\n🔄 Transforming: customers")

bronze_customers = spark.read.format("delta").load(f"{BRONZE}/customers")
print(f"   Bronze rows: {bronze_customers.count():,}")

silver_customers = (
    bronze_customers
    .filter(col("customer_id").isNotNull())
    # No ordering is applied, so which duplicate survives is not controlled here.
    .dropDuplicates(["customer_id"])
    .withColumn("first_name",  initcap(trim(col("first_name"))))
    .withColumn("last_name",   initcap(trim(col("last_name"))))
    .withColumn("email",       lower(trim(col("email"))))
    .withColumn("city",        initcap(trim(col("city"))))
    .withColumn("segment",     trim(col("segment")))
    .withColumn("signup_date", to_date(col("signup_date"), "yyyy-MM-dd"))
    # NOTE(review): assumes Bronze is_active holds values Spark can cast to
    # boolean (e.g. "true"/"false"); anything else becomes NULL — confirm.
    .withColumn("is_active",   col("is_active").cast(BooleanType()))
    .withColumn("full_name",   concat_ws(" ", col("first_name"), col("last_name")))
    .withColumn("customer_tenure_days", datediff(current_date(), col("signup_date")))
    # Only derive a domain when the address actually contains "@"; without
    # this guard the regex matches nothing and the whole malformed address
    # would be stored as the domain. Such rows now get NULL instead.
    .withColumn(
        "email_domain",
        when(col("email").contains("@"), regexp_replace(col("email"), r".*@", "")),
    )
    .withColumn("_silver_updated_at", current_timestamp())
    .select(
        "customer_id", "full_name", "first_name", "last_name",
        "email", "email_domain", "city", "segment",
        "signup_date", "is_active", "customer_tenure_days",
        "_silver_updated_at",
    )
)

silver_customers.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true").save(f"{SILVER}/customers")

# Report from the persisted table rather than from `silver_customers`: each
# action on the lazy DataFrame would re-run the whole pipeline from Bronze, and
# the re-run can differ from what was written (new current_timestamp(),
# possibly a different surviving duplicate).
written_customers = spark.read.format("delta").load(f"{SILVER}/customers")
print(f"   Silver rows: {written_customers.count():,}")
print("   ✅ silver/customers written")
written_customers.show(5, truncate=False)

In [None]:
# ============================================================
# CELL 3 — Silver Products
# ============================================================
# Reads bronze/products, drops rows without a product_id, keeps one row per
# product_id, normalises names, casts prices, derives gross_margin_pct and
# price_tier, and overwrites silver/products.
print("\n🔄 Transforming: products")

bronze_products = spark.read.format("delta").load(f"{BRONZE}/products")

silver_products = (
    bronze_products
    .filter(col("product_id").isNotNull())
    # No ordering is applied, so which duplicate survives is not controlled here.
    .dropDuplicates(["product_id"])
    .withColumn("product_name", initcap(trim(col("product_name"))))
    .withColumn("category",     initcap(trim(col("category"))))
    .withColumn("unit_price",   col("unit_price").cast(DoubleType()))
    .withColumn("cost_price",   col("cost_price").cast(DoubleType()))
    .withColumn("in_stock",     col("in_stock").cast(BooleanType()))
    # Margin as a percentage of the selling price. The guard leaves the margin
    # NULL for a zero price instead of dividing by zero, which would raise an
    # error when Spark's ANSI mode is enabled.
    .withColumn(
        "gross_margin_pct",
        when(
            col("unit_price") != 0,
            spark_round(
                (col("unit_price") - col("cost_price")) / col("unit_price") * 100, 2
            ),
        ),
    )
    # Every tier has an explicit condition and there is no otherwise(), so a
    # NULL (missing or uncastable) price yields a NULL tier. Previously such
    # rows fell through to "Premium".
    .withColumn(
        "price_tier",
        when(col("unit_price") < 50, lit("Budget"))
        .when(col("unit_price") < 300, lit("Mid-Range"))
        .when(col("unit_price") >= 300, lit("Premium")),
    )
    .withColumn("_silver_updated_at", current_timestamp())
    .select(
        "product_id", "product_name", "category", "sub_category",
        "unit_price", "cost_price", "gross_margin_pct", "price_tier",
        "supplier", "in_stock", "_silver_updated_at",
    )
)

silver_products.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true").save(f"{SILVER}/products")

# Report from the persisted table: actions on the lazy `silver_products`
# DataFrame would re-run the whole pipeline from Bronze and may not match
# what was actually written.
written_products = spark.read.format("delta").load(f"{SILVER}/products")
print(f"   ✅ silver/products written — {written_products.count():,} rows")
written_products.show(truncate=False)

In [None]:
# ============================================================
# CELL 4 — Silver Stores
# ============================================================
# Reads bronze/stores, drops rows without a store_id, keeps one row per
# store_id, normalises text columns, parses open_date, derives store_age_days
# and is_online, and overwrites silver/stores. All Bronze columns are carried
# through (there is no final select).
print("\n🔄 Transforming: stores")

bronze_stores = spark.read.format("delta").load(f"{BRONZE}/stores")

silver_stores = (
    bronze_stores
    .filter(col("store_id").isNotNull())
    # No ordering is applied, so which duplicate survives is not controlled here.
    .dropDuplicates(["store_id"])
    .withColumn("store_name", initcap(trim(col("store_name"))))
    .withColumn("store_type", trim(col("store_type")))
    .withColumn("city",       initcap(trim(col("city"))))
    .withColumn("region",     initcap(trim(col("region"))))
    .withColumn("open_date",  to_date(col("open_date"), "yyyy-MM-dd"))
    .withColumn("store_age_days", datediff(current_date(), col("open_date")))
    # store_type is trimmed above but its casing is left as delivered, so the
    # flag is compared case-insensitively ("online" / "ONLINE" also count).
    # A NULL store_type still yields a NULL flag.
    .withColumn("is_online",  lower(col("store_type")) == lit("online"))
    .withColumn("_silver_updated_at", current_timestamp())
)

silver_stores.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true").save(f"{SILVER}/stores")

# Report from the persisted table: actions on the lazy `silver_stores`
# DataFrame would re-run the whole pipeline from Bronze and may not match
# what was actually written.
written_stores = spark.read.format("delta").load(f"{SILVER}/stores")
print(f"   ✅ silver/stores written — {written_stores.count():,} rows")
written_stores.show(truncate=False)

In [None]:
# ============================================================
# CELL 5 — Silver Transactions (main transformation)
# ============================================================
# Reads bronze/transactions, keeps only rows with the three mandatory keys and
# strictly positive total_amount / quantity, keeps one row per transaction_id,
# parses dates, adds calendar attributes and revenue metrics, and overwrites
# silver/transactions.
print("\n🔄 Transforming: transactions")

bronze_txn = spark.read.format("delta").load(f"{BRONZE}/transactions")
print(f"   Bronze rows: {bronze_txn.count():,}")

silver_txn = (
    bronze_txn
    .filter(
        col("transaction_id").isNotNull() &
        col("customer_id").isNotNull() &
        col("product_id").isNotNull() &
        (col("total_amount") > 0) &
        (col("quantity") > 0)
    )
    # No ordering is applied, so which duplicate survives is not controlled here.
    .dropDuplicates(["transaction_id"])
    .withColumn("transaction_date",  to_date(col("transaction_date"), "yyyy-MM-dd"))
    # Combine the parsed date with the separate time column into one timestamp.
    .withColumn(
        "transaction_ts",
        to_timestamp(
            concat(col("transaction_date").cast(StringType()), lit(" "), col("transaction_time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .withColumn("txn_year",     year(col("transaction_date")))
    .withColumn("txn_month",    month(col("transaction_date")))
    .withColumn("txn_quarter",  quarter(col("transaction_date")))
    .withColumn("txn_day_of_week", dayofweek(col("transaction_date")))
    .withColumn(
        "is_weekend",
        col("txn_day_of_week").isin([1, 7]),  # 1=Sunday, 7=Saturday
    )
    .withColumn("unit_price",   col("unit_price").cast(DoubleType()))
    .withColumn("discount_pct", col("discount_pct").cast(DoubleType()))
    .withColumn("total_amount", col("total_amount").cast(DoubleType()))
    .withColumn("quantity",     col("quantity").cast(IntegerType()))
    # NOTE(review): this multiplies by discount_pct directly, i.e. it assumes
    # the column is a fraction (0.10 = 10%), not a 0-100 percentage — confirm.
    .withColumn(
        "discount_amount",
        spark_round(col("quantity") * col("unit_price") * col("discount_pct"), 2),
    )
    .withColumn(
        "gross_revenue",
        spark_round(col("quantity") * col("unit_price"), 2),
    )
    # status is stored as delivered (not trimmed or re-cased), so the flag is
    # compared on a normalised copy; " returned" / "RETURNED" also count.
    # A NULL status still yields a NULL flag.
    .withColumn("is_returned",  lower(trim(col("status"))) == lit("returned"))
    .withColumn("_silver_updated_at", current_timestamp())
    .select(
        "transaction_id", "customer_id", "product_id", "store_id",
        "transaction_date", "transaction_ts", "txn_year", "txn_month",
        "txn_quarter", "txn_day_of_week", "is_weekend",
        "quantity", "unit_price", "discount_pct", "discount_amount",
        "gross_revenue", "total_amount", "payment_method",
        "status", "is_returned", "source_system", "_silver_updated_at",
    )
)

silver_txn.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true").save(f"{SILVER}/transactions")

# Report from the persisted table: actions on the lazy `silver_txn` DataFrame
# would re-run the whole pipeline (filter, shuffle for de-duplication, all
# derivations) from Bronze and may not match what was actually written.
written_txn = spark.read.format("delta").load(f"{SILVER}/transactions")
print(f"   Silver rows: {written_txn.count():,}")
print("   ✅ silver/transactions written")
written_txn.show(5, truncate=False)

In [None]:
# ============================================================
# CELL 6 — Silver Data Quality Summary
# ============================================================
# Prints one line per Silver table with its row count and column count,
# reading each table back from the Silver path.
print("\nüìä Silver Layer Data Quality Report")
print("=" * 55)

for table in ("customers", "products", "stores", "transactions"):
    df = spark.read.format("delta").load(f"{SILVER}/{table}")
    # count() runs a Spark job; the column count comes from the schema only.
    row_count, col_count = df.count(), len(df.columns)
    print(f"  ü•à silver_{table:<18} {row_count:>10,} rows | {col_count} columns")