In [0]:
# 03_feature_engineering
from pyspark.sql.functions import col, when, expr

# Make churn_mlo_mdb the current database for this session.
spark.sql("USE churn_mlo_mdb")

# Source of this notebook: the Silver churn table.
silver = "churn_mlo_mdb.silver_churn"
df = spark.table(silver)

# Quick sanity check on what was loaded: row count, then the column schema.
silver_row_count = df.count()
print("Rows in silver:", silver_row_count)
df.printSchema()

In [0]:
# 1) CREATE NUMERIC FEATURES
# ----------------------------

tenure_months = col("tenure")

# Average spend per month of tenure. Rows with tenure == 0 get NULL instead
# of attempting a division by zero.
spend_per_month = (
    when(tenure_months == 0, None)
    .otherwise(col("TotalCharges") / tenure_months)
)

# Strict 0/1 senior flag: 1 only when SeniorCitizen equals 1, otherwise 0.
senior_flag = when(col("SeniorCitizen") == 1, 1).otherwise(0)

df = (
    df
    .withColumn("tenure_years", tenure_months / 12.0)  # tenure / 12
    .withColumn("avg_monthly_spend", spend_per_month)
    .withColumn("is_senior", senior_flag)
)


In [0]:
# 2) ONE-HOT ENCODE KEY CATEGORICAL FEATURES (minimal)
# ----------------------------

# One entry per indicator column: (output column, source column, level).
# Entries are listed in the order the columns are appended to df.
indicator_specs = [
    # InternetService: DSL / Fiber optic / No
    ("InternetService_DSL", "InternetService", "DSL"),
    ("InternetService_Fiber_optic", "InternetService", "Fiber optic"),
    ("InternetService_No", "InternetService", "No"),
    # Contract: Month-to-month / One year / Two year
    ("Contract_Month_to_month", "Contract", "Month-to-month"),
    ("Contract_One_year", "Contract", "One year"),
    ("Contract_Two_year", "Contract", "Two year"),
    # PaymentMethod: only the "Electronic check" level is encoded
    ("Payment_ElectronicCheck", "PaymentMethod", "Electronic check"),
]

# Each indicator is 1 when the source column equals the level, else 0.
for out_name, src_name, level in indicator_specs:
    df = df.withColumn(out_name, when(col(src_name) == level, 1).otherwise(0))



In [0]:
# 3) SELECT FINAL FEATURE SET
# ----------------------------

# Columns are grouped by role and concatenated in the order they should
# appear in the output table.
id_cols = ["customerID"]

numeric_cols = [
    "tenure",
    "tenure_years",
    "MonthlyCharges",
    "TotalCharges",
    "avg_monthly_spend",
    "is_senior",
]

one_hot_cols = [
    "InternetService_DSL",
    "InternetService_Fiber_optic",
    "InternetService_No",
    "Contract_Month_to_month",
    "Contract_One_year",
    "Contract_Two_year",
    "Payment_ElectronicCheck",
]

label_cols = ["churn_label"]

# Note: despite the name, feature_cols also carries the identifier and label.
feature_cols = id_cols + numeric_cols + one_hot_cols + label_cols

df_features = df.select(feature_cols)


In [0]:
# 4) WRITE FEATURES TABLE
# ----------------------------

features_table = "churn_mlo_mdb.features_churn"

# Overwrite the table in place instead of DROP TABLE + recreate.
# Dropping first throws away the Delta table's history and leaves a window
# in which the table does not exist (a failed write would leave no table).
# mode("overwrite") replaces the contents as a single Delta commit, and
# overwriteSchema=true lets the column set change between runs, which was
# the only thing the explicit DROP was buying.
(
    df_features.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(features_table)
)

print("Feature table created:", features_table)

# Preview a few rows read back from the saved table to confirm the write.
display(spark.table(features_table).limit(10))