In [0]:
%run "/Workspace/Users/sundarasandeepteja@gmail.com/E-Commerce Analytics Medallion Architecture with GenAI/config/project_config"

In [0]:
from pyspark.sql import functions as F

bronze_cust = spark.table(BRONZE_CUSTOMERS_TABLE)

# PII Masking
# Shared building blocks, defined once so the derived columns stay consistent.
# Local part = text before the first "@" (the whole value when there is no "@").
email_local_part = F.substring_index(F.col("email"), "@", 1)
# Everything after the first "@"; empty string when the value has no "@".
email_domain = F.regexp_extract("email", "@(.+)", 1)
# Phone number reduced to its digits only.
phone_digits = F.regexp_replace("phone", "[^0-9]", "")

# Note: F.concat yields null when any input is null, so a missing email,
# phone, or name produces a null masked value rather than a partial mask.
masked = bronze_cust.withColumns({
    # Email: ab***@domain.com
    # The visible prefix comes from the local part only, so a one-character
    # local part ("a@x.com") becomes "a***@x.com" instead of leaking the "@"
    # into the prefix ("a@***@x.com").
    "email_masked": F.concat(
        F.substring(email_local_part, 1, 2),
        F.lit("***@"),
        email_domain,
    ),
    "email_domain": email_domain,

    # Phone: ***-***-1234 (last four digits of the digits-only number)
    "phone_masked": F.concat(
        F.lit("***-***-"),
        F.substring(phone_digits, -4, 4),
    ),

    # Name: J*** S*** (first initial of each name followed by a fixed mask)
    "name_masked": F.concat(
        F.substring("first_name", 1, 1),
        F.lit("*** "),
        F.substring("last_name", 1, 1),
        F.lit("***"),
    ),
})

# Enrichment
# withColumns adds every entry of the map in a single projection, so an
# expression in the map cannot see a sibling column created by the same call.
# The age expression is therefore built once here and reused for both
# customer_age and age_group instead of referencing F.col("customer_age").
age_expr = F.year(F.current_date()) - F.col("birth_year")
state_col = F.col("state")
no_label = F.lit(None).cast("string")

enriched = masked.withColumns({
    "registration_date": F.to_date("registration_date"),
    # Calendar-year difference only; month/day of birth is not considered.
    "customer_age": age_expr,
    # A missing birth_year gives a null age; keep the group null as well
    # rather than letting it fall through to the "55+" default.
    "age_group": F.when(age_expr.isNull(), no_label)
        .when(age_expr < 25, "18-24")
        .when(age_expr < 35, "25-34")
        .when(age_expr < 45, "35-44")
        .when(age_expr < 55, "45-54")
        .otherwise("55+"),
    # A missing state stays unlabelled instead of being counted as "Midwest".
    # NOTE(review): every state not listed below is still labelled "Midwest";
    # confirm that this catch-all is intended.
    "region": F.when(state_col.isNull(), no_label)
        .when(state_col.isin("CA", "WA", "OR"), "West")
        .when(state_col.isin("TX", "OK", "LA"), "Southwest")
        .when(state_col.isin("NY", "NJ", "PA"), "Northeast")
        .when(state_col.isin("FL", "GA", "NC"), "Southeast")
        .otherwise("Midwest")
})

# Columns promoted to Silver: the masked values and derived attributes.
# The raw email, phone, and first/last name columns are left out.
silver_cols = [
    "customer_id",
    "name_masked",
    "email_masked",
    "email_domain",
    "phone_masked",
    "state",
    "region",
    "customer_age",
    "age_group",
    "segment",
    "is_active",
    "registration_date",
]

# Replace the Silver customers table with the freshly masked snapshot.
silver_writer = enriched.select(*silver_cols).write.format("delta").mode("overwrite")
silver_writer.saveAsTable(SILVER_CUSTOMERS_TABLE)

print("■ Silver customers with PII masked")

In [0]:
# CLV CALCULATION
transactions = spark.table(SILVER_TRANSACTIONS_TABLE)
customers = spark.table(SILVER_CUSTOMERS_TABLE)

# Per-customer order metrics, computed over completed transactions only.
completed_orders = transactions.where(F.col("status") == "Completed")
order_aggregates = [
    F.sum("final_amount").alias("total_revenue"),
    F.avg("final_amount").alias("avg_order_value"),
    F.count("transaction_id").alias("total_orders"),
    F.min("transaction_date").alias("first_order"),
    F.max("transaction_date").alias("last_order"),
    F.countDistinct("product_id").alias("unique_products"),
]
metrics = completed_orders.groupBy("customer_id").agg(*order_aggregates)

# Calculate CLV
# Every expression below reads only columns that already exist on `metrics`;
# withColumns cannot reference a sibling column added in the same call.
clv = metrics.withColumns({
    # Orders per month between first and last order; spans shorter than one
    # month are treated as one month to avoid dividing by ~0.
    "purchase_frequency": F.col("total_orders") / F.greatest(
        F.months_between(F.col("last_order"), F.col("first_order")), F.lit(1)
    ),
    # Historical spend scaled to one year; spans shorter than half a year are
    # treated as 0.5 years.
    "predicted_annual": F.col("avg_order_value") * F.col("total_orders") / F.greatest(
        F.datediff(F.col("last_order"), F.col("first_order")) / 365, F.lit(0.5)
    ),
    # NOTE(review): this is avg_order_value * total_orders (effectively the
    # historical spend) times 3 * 0.7, and does not use predicted_annual.
    # The 0.7 is presumably a retention/discount factor - confirm the intended
    # formula before relying on this value.
    "clv_3_year": F.col("avg_order_value") * F.col("total_orders") * 3 * 0.7
})

# CLV segments
# approxQuantile returns an empty list when there are no non-null values
# (e.g. no completed transactions yet), so guard before indexing into it.
percentiles = clv.approxQuantile("clv_3_year", [0.25, 0.5, 0.75, 0.9], 0.01)
if len(percentiles) == 4:
    # The 25th percentile is requested but not used as a threshold.
    _p25, p50_clv, p75_clv, p90_clv = percentiles
    clv_segment = (
        F.when(F.col("clv_3_year") >= p90_clv, "Diamond")
         .when(F.col("clv_3_year") >= p75_clv, "Gold")
         .when(F.col("clv_3_year") >= p50_clv, "Silver")
         .otherwise("Bronze")
    )
else:
    # No thresholds could be derived; leave the segment unset.
    clv_segment = F.lit(None).cast("string")

segmented = clv.withColumn("clv_segment", clv_segment)

# Attach the CLV metrics to the customer dimension. The left join keeps
# customers that have no matching row in `segmented`; their CLV columns are null.
customers_with_clv = customers.join(segmented, on="customer_id", how="left")

clv_table_name = f"{SILVER_DB}.customers_with_clv"
customers_with_clv.write.format("delta").mode("overwrite").saveAsTable(clv_table_name)

print("■ CLV calculated")

In [0]:
from pyspark.sql import functions as F

# Recency in days, measured from today back to the customer's last order.
# Null when the customer has no last_order.
days_since_last_order = F.datediff(F.current_date(), F.col("last_order"))

# Churn risk is derived from recency, so it is recomputed on every run.
# Guarding on "column already exists" would freeze the value written by the
# first run while days_since_last_order keeps moving.
# Customers without any order get a null risk instead of defaulting to "Low".
churn_risk = (
    F.when(days_since_last_order.isNull(), F.lit(None).cast("string"))
     .when(days_since_last_order > 180, "High")
     .when(days_since_last_order > 90, "Medium")
     .otherwise("Low")
)

# Both expressions read only last_order, so they can be added in one
# withColumns call; existing columns with the same names are replaced.
silver_customers = spark.table(f"{SILVER_DB}.customers_with_clv").withColumns({
    "days_since_last_order": days_since_last_order,
    "churn_risk": churn_risk,
})

# Save the updated silver table (mergeSchema lets the new columns be added
# to the existing Delta table on the first run).
silver_customers.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(f"{SILVER_DB}.customers_with_clv")