In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import avg, count, col, round

# Purpose of this cell: load the customer feature table, standardize the
# numeric features, fit a K-Means model, profile the resulting clusters, and
# persist each customer's cluster id to the `gold_customer_segments` table.

# Set context
spark.sql("USE CATALOG workspace")
spark.sql("USE SCHEMA instacart")

# Clustering hyper-parameters, kept in one place so the log message and the
# estimator can never drift apart.
N_CLUSTERS = 3
RANDOM_SEED = 42

# 1. Load Data
df_customers = spark.read.table("gold_customer_features")

# 2. Prepare Features for ML
# We need to combine all numerical columns into a single Vector column
feature_cols = [
    "total_orders",
    "avg_days_between_orders",
    "total_products_purchased",
    "avg_basket_size",
    "reorder_ratio",
]

print("Preparing features...")
assembler = VectorAssembler(inputCols=feature_cols, outputCol="raw_features")
df_vectorized = assembler.transform(df_customers)

# 3. Scale Features
# K-Means is distance based and therefore sensitive to scale, so every feature
# is standardized to mean=0, std=1 before clustering.
scaler = StandardScaler(
    inputCol="raw_features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(df_vectorized)
df_scaled = scaler_model.transform(df_vectorized)

# 4. Train K-Means Model
print(f"Training K-Means Model (k={N_CLUSTERS})...")
kmeans = KMeans(featuresCol="scaled_features", k=N_CLUSTERS, seed=RANDOM_SEED)
model = kmeans.fit(df_scaled)

# Make predictions (adds an integer `prediction` column holding the cluster index)
predictions = model.transform(df_scaled)

# 5. Interpret the Clusters
# Per-cluster averages are computed on the ORIGINAL (unscaled) feature columns
# so the profile is readable in business units.
print("Profiling Clusters...")
cluster_analysis = predictions.groupBy("prediction").agg(
    count("user_id").alias("num_customers"),
    round(avg("total_orders"), 1).alias("avg_orders"),
    round(avg("avg_days_between_orders"), 1).alias("avg_days_between"),
    round(avg("avg_basket_size"), 1).alias("avg_basket"),
    round(avg("reorder_ratio"), 2).alias("avg_reorder_rate")
).orderBy("prediction")

display(cluster_analysis)

# 6. Save Results
final_segments = predictions.select("user_id", "prediction") \
    .withColumnRenamed("prediction", "cluster_id")

# Build the final Gold table straight from `predictions`, which already carries
# every original feature column. This replaces an inner self-join of
# `df_customers` with `final_segments` on `user_id`: the join forced a second
# scan of the source table plus a shuffle, and would have multiplied rows if
# `user_id` were ever duplicated. The column order below mirrors the join's
# output (`user_id` first, remaining feature columns, then `cluster_id`).
# NOTE(review): equivalent to the old join as long as `user_id` is unique and
# non-null in gold_customer_features — confirm against the table's contract.
df_final = predictions.select(
    "user_id",
    *[c for c in df_customers.columns if c != "user_id"],
    col("prediction").alias("cluster_id"),
)

df_final.write.format("delta").mode("overwrite").saveAsTable("gold_customer_segments")
print("Created gold_customer_segments with Cluster Labels")

Preparing features...
Training K-Means Model (k=3)...
Profiling Clusters...


prediction,num_customers,avg_orders,avg_days_between,avg_basket,avg_reorder_rate
0,99245,7.5,17.1,8.0,0.3
1,21907,54.3,6.7,12.7,0.72
2,85057,17.4,10.9,10.6,0.54


Created gold_customer_segments with Cluster Labels


In [0]:
from pyspark.sql.functions import avg, col, lit, when

# Purpose of this cell: attach a human-readable segment label to every
# customer's cluster id and persist the result, joined to the feature table,
# as `gold_customer_segments_labeled`.
#
# K-Means cluster indices carry no meaning of their own and can be permuted
# when the data, seed or k change. Instead of binding labels to fixed ids,
# clusters are ranked by their average `total_orders` (highest first) and the
# labels are assigned by rank. In the profiled run shown in this notebook
# (avg_orders 54.3 / 17.4 / 7.5 for clusters 1 / 2 / 0) this yields the same
# labels as binding "Loyal Super-Users" to id 1 and "Core Regulars" to id 2.

# Labels for the most engaged clusters, in descending order of engagement.
# Every remaining cluster receives FALLBACK_SEGMENT.
SEGMENT_LABELS_BY_ENGAGEMENT = ["Loyal Super-Users", "Core Regulars"]
FALLBACK_SEGMENT = "Occasional Shoppers"

# 1. Rank clusters by engagement (tiny result: one row per cluster).
# The secondary sort on `prediction` keeps the ranking deterministic on ties.
cluster_rank_rows = (
    predictions.groupBy("prediction")
    .agg(avg("total_orders").alias("avg_total_orders"))
    .orderBy(col("avg_total_orders").desc(), col("prediction").asc())
    .collect()
)
ranked_cluster_ids = [row["prediction"] for row in cluster_rank_rows]

# 2. Build the CASE WHEN expression mapping each ranked cluster id to its label.
segment_expr = None
for ranked_id, segment_label in zip(ranked_cluster_ids, SEGMENT_LABELS_BY_ENGAGEMENT):
    is_cluster = col("prediction") == ranked_id
    if segment_expr is None:
        segment_expr = when(is_cluster, segment_label)
    else:
        segment_expr = segment_expr.when(is_cluster, segment_label)

if segment_expr is None:
    # No clusters at all (empty `predictions`): fall back to a constant label
    # so the cell still produces a well-formed, empty result.
    segment_expr = lit(FALLBACK_SEGMENT)
else:
    segment_expr = segment_expr.otherwise(FALLBACK_SEGMENT)

# 3. Apply Business Labels & Rename 'prediction' to 'cluster_id'
# We use .alias("cluster_id") to rename it on the fly
df_final_segments = predictions.withColumn("customer_segment", segment_expr).select(
    col("user_id"),
    col("prediction").alias("cluster_id"),
    col("customer_segment")
)

# 4. Join back to the main Gold Customer Features table
# We join on 'user_id' to attach the segment label to the feature stats.
# A left join keeps every customer row; customers without a prediction would
# get NULL cluster_id / customer_segment.
df_gold_complete = spark.read.table("gold_customer_features") \
    .join(df_final_segments, "user_id", "left")

# 5. Save the Final Analytics Table
df_gold_complete.write.format("delta").mode("overwrite").saveAsTable("gold_customer_segments_labeled")

print("Success: Created gold_customer_segments_labeled")
print("--- Sample Data ---")
display(df_gold_complete.limit(5))

Success: Created gold_customer_segments_labeled
--- Sample Data ---


user_id,total_orders,avg_days_between_orders,total_products_purchased,unique_products_bought,avg_basket_size,reorder_ratio,cluster_id,customer_segment
152590,23,13.7,77,16,3.35,0.79,2,Core Regulars
8018,4,16.0,11,8,2.75,0.27,0,Occasional Shoppers
83433,17,13.12,34,18,2.0,0.47,0,Occasional Shoppers
24667,37,7.65,401,143,10.84,0.64,1,Loyal Super-Users
172083,9,9.67,70,46,7.78,0.34,2,Core Regulars
