In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [2]:
# Location of the input CSV.
# NOTE(review): this is an absolute path and looks environment-specific —
# adjust it when running the notebook somewhere else.
DATA_PATH = Path("/content/Mall_Customers.csv")

# Directory (relative to the working directory) that receives every figure
# and CSV written by the cells below.
OUTPUTS = Path("outputs")
# parents=True keeps this working if OUTPUTS is later changed to a nested
# path such as "results/run1"; exist_ok=True makes re-running the cell safe.
OUTPUTS.mkdir(parents=True, exist_ok=True)

In [4]:
# Columns of the customer table that are used as clustering inputs.
FEATURES = [
    "Annual Income (k$)",
    "Spending Score (1-100)",
]
# Candidate cluster counts to evaluate: k = 2, 3, ..., 10 (range end is exclusive).
RANGE_K = range(2, 11)

In [5]:
# Read the customer table from DATA_PATH and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [6]:
# Discard rows that are missing a value in any clustering feature, then take
# a copy so `df` is an independent frame from here on.
complete_rows = df.dropna(subset=FEATURES)
df = complete_rows.copy()

In [7]:
# Feature matrix as a NumPy array (one row per customer, one column per feature).
X = df[FEATURES].values

# Fit the scaler on the features, then apply it to the same matrix.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [8]:
# Elbow method: fit K-Means once per candidate k on the scaled features and
# record the inertia (within-cluster sum of squares) of each fit.
k_values = list(RANGE_K)
inertias = [
    KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_scaled).inertia_
    for n_clusters in k_values
]

# Plot inertia against k, write the figure to OUTPUTS, then close it.
fig, ax = plt.subplots()
ax.plot(k_values, inertias, marker='o')
ax.set_xlabel("k")
ax.set_ylabel("Inertia (within-cluster SSE)")
ax.set_title("Elbow Method")
fig.tight_layout()
fig.savefig(OUTPUTS / "elbow.png")
plt.close(fig)

In [9]:
# Silhouette analysis: for every candidate k, cluster the scaled features and
# score the resulting labelling with the silhouette coefficient.
k_values = list(RANGE_K)
sil_scores = [
    silhouette_score(
        X_scaled,
        KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(X_scaled),
    )
    for n_clusters in k_values
]

# Plot the score against k, write the figure to OUTPUTS, then close it.
fig, ax = plt.subplots()
ax.plot(k_values, sil_scores, marker='o')
ax.set_xlabel("k")
ax.set_ylabel("Silhouette Score")
ax.set_title("Silhouette Scores vs k")
fig.tight_layout()
fig.savefig(OUTPUTS / "silhouette.png")
plt.close(fig)

# Select the k with the highest silhouette score (first one wins on ties).
best_index = int(np.argmax(sil_scores))
best_k = k_values[best_index]
print(f"Best k by silhouette: {best_k}")

Best k by silhouette: 5


In [10]:
# Fit the final model with the selected k and attach each row's cluster
# label to the frame as a new "cluster" column.
final_kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42).fit(X_scaled)
df["cluster"] = final_kmeans.labels_

In [11]:
# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_scaled)
pc1, pc2 = X_2d[:, 0], X_2d[:, 1]

# Scatter the projected points coloured by cluster label, write the figure
# to OUTPUTS, then close it.
fig, ax = plt.subplots()
ax.scatter(pc1, pc2, c=df["cluster"], s=40)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title(f"K-Means Clusters (k={best_k}) - PCA 2D")
fig.tight_layout()
fig.savefig(OUTPUTS / "clusters_pca.png")
plt.close(fig)

In [12]:
# Per-cluster mean of every clustering feature (in the original, unscaled
# units of `df`), rounded to two decimals for reporting.
feature_means = df.groupby("cluster")[FEATURES].mean()
profile = feature_means.round(2)
print("\nCluster profile (feature means):\n", profile)


Cluster profile (feature means):
          Annual Income (k$)  Spending Score (1-100)
cluster                                            
0                     55.30                   49.52
1                     86.54                   82.13
2                     25.73                   79.36
3                     88.20                   17.11
4                     26.30                   20.91


In [13]:
# Write the labelled customer table (without the index) and the cluster
# profile (with its "cluster" index) as CSV files under OUTPUTS.
clustered_csv = OUTPUTS / "data_with_clusters.csv"
profile_csv = OUTPUTS / "cluster_profile.csv"
df.to_csv(clustered_csv, index=False)
profile.to_csv(profile_csv)