# 🛍️ Mall Customer Segmentation with PySpark
Generated 2025-05-06 02:16 UTC

## 🔍 1️⃣ Data Inspection

In [None]:

from pyspark.sql import SparkSession

# Get the active Spark session, or create one named "MallSegmentation".
spark = (
    SparkSession.builder
    .appName("MallSegmentation")
    .getOrCreate()
)

# Column names, in the same order as the fields of each tuple in `data`.
cols = ["CustomerID", "Gender", "Age", "AnnualIncomeK", "SpendingScore"]

# Ten hand-written sample customers, one tuple per row.
data = [
    (1, "Male", 19, 15, 39),
    (2, "Female", 21, 16, 81),
    (3, "Female", 20, 17, 6),
    (4, "Male", 23, 18, 77),
    (5, "Female", 31, 19, 40),
    (6, "Male", 22, 20, 76),
    (7, "Female", 35, 21, 6),
    (8, "Male", 23, 22, 94),
    (9, "Female", 64, 23, 3),
    (10, "Male", 30, 24, 72),
]

# Column types are inferred from the Python values; only names are supplied.
df = spark.createDataFrame(data, schema=cols)

# Print the rows and the inferred schema for a quick sanity check.
df.show()
df.printSchema()


## 🛠️ 2️⃣ Feature Engineering

In [None]:

from pyspark.ml.feature import StringIndexer
from pyspark.sql import functions as F

# Map the string Gender column to a numeric index column, GenderIdx.
indexer = StringIndexer(inputCol="Gender", outputCol="GenderIdx")
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

# Ratio of income to age; the small epsilon keeps the denominator non-zero.
income_per_age = F.col("AnnualIncomeK") / (F.col("Age") + 1e-5)
df = df.withColumn("IncomePerAge", income_per_age)

# Preview the first five rows of the original and engineered columns.
df.select("Gender", "GenderIdx", "IncomePerAge").show(5)


## 🧹 3️⃣ Preprocessing

In [None]:

from pyspark.ml.feature import VectorAssembler, StandardScaler

# Columns that feed the clustering model.
feature_cols = [
    "AnnualIncomeK",
    "SpendingScore",
    "GenderIdx",
    "IncomePerAge",
]

# Pack the selected columns into a single vector column, "features_raw".
vec_assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_raw",
)
df_vec = vec_assembler.transform(df)

# Standardize the assembled vector (centered, unit std) into "features".
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(df_vec)
df_scaled = scaler_model.transform(df_vec)


## 🤖 4️⃣ Model Development – KMeans

In [None]:

from pyspark.ml.clustering import KMeans
# Baseline clustering: three clusters on the scaled features, fixed seed.
km = KMeans(
    k=3,
    seed=42,
    featuresCol="features",
    predictionCol="kmeans_label",
)
model = km.fit(df_scaled)

# Attach each customer's cluster id in the "kmeans_label" column.
df_km = model.transform(df_scaled)
df_km.select("CustomerID", "kmeans_label").show()


## 🔧 5️⃣ Hyperparameter Tuning (Silhouette)

In [None]:

from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette evaluator for the sweep; it reads cluster labels from "pred".
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="pred",
    metricName="silhouette",
    distanceMeasure="squaredEuclidean",
)

# Start below every attainable silhouette value (the metric lies in [-1, 1]).
# A sentinel of -1 equals the lower bound, so a score of exactly -1 would
# never win the strict ">" test and best_k would stay None.
best_k, best_score = None, float("-inf")

# Fit one model per candidate k (2..6) and keep the highest-scoring k.
# Ties keep the smaller k because the comparison is strict.
for k in range(2, 7):
    m = KMeans(k=k, seed=42, featuresCol="features", predictionCol="pred").fit(df_scaled)
    score = evaluator.evaluate(m.transform(df_scaled))
    print(f"k={k}, silhouette={score:.3f}")
    if score > best_score:
        best_k, best_score = k, score
print("Best k:", best_k)


## ✅ 6️⃣ Final Model Evaluation

In [None]:

# Refit KMeans with the selected k; labels are written to the "segment" column.
final = KMeans(
    k=best_k,
    seed=42,
    featuresCol="features",
    predictionCol="segment",
).fit(df_scaled)
df_final = final.transform(df_scaled)

# `evaluator` was created with predictionCol="pred", but df_final has no
# "pred" column (its labels are in "segment"). Override the prediction column
# for this call only, so the shared evaluator object is left unchanged.
final_silhouette = evaluator.evaluate(df_final, {evaluator.predictionCol: "segment"})
print("Silhouette:", final_silhouette)

df_final.select("CustomerID", "AnnualIncomeK", "SpendingScore", "segment").show()


## 📊 7️⃣ Visualization

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Bring only the plotted columns to the driver as a pandas DataFrame.
plot_columns = ["AnnualIncomeK", "SpendingScore", "segment"]
pdf = df_final.select(*plot_columns).toPandas()

# Income vs. spending score, one colour per segment.
sns.scatterplot(
    data=pdf,
    x="AnnualIncomeK",
    y="SpendingScore",
    hue="segment",
    palette="Set2",
)
plt.title(f"Mall Segments (k={best_k})")
plt.show()


## 🧠 8️⃣ Insights & Next Steps
- Profile each segment (e.g., mean annual income and spending score per `segment`) to identify the high- and low-spender clusters.
- Try DBSCAN or BisectingKMeans for alternative clustering.