# In [None]:
# K-Means Clustering on Retail Customers Dataset

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Step 1: read the customer table from disk and show its first rows.
df = pd.read_csv("retail_customers_dataset.csv")
print("Dataset Preview:", df.head(), sep="\n")

# Step 2: preprocessing.
# Replace the Gender labels with integer codes. The intended mapping is
# Female=0, Male=1 -- this assumes the column holds exactly those two labels
# (TODO confirm against the CSV). Note that Gender is encoded here but is not
# part of the clustering features selected below.
le = LabelEncoder()
gender_codes = le.fit_transform(df["Gender"])
df["Gender"] = gender_codes

# Columns the clustering is run on.
feature_columns = ["Age", "Annual Income (k$)", "Spending Score (1-100)"]
X = df[feature_columns]

# Step 3: elbow method -- fit K-Means for k = 1..10 on the feature frame X and
# record each model's inertia (within-cluster sum of squares, WCSS).
k_values = range(1, 11)
wcss = []  # one WCSS value per entry of k_values, in the same order

for k in k_values:
    # n_init is pinned explicitly: scikit-learn 1.4 changed the default from
    # 10 to "auto" (a single run for init="k-means++"), and 1.2-1.3 emit a
    # FutureWarning when it is omitted. Fixing it keeps the curve the same
    # across library versions and less sensitive to one unlucky start.
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Step 4: plot WCSS against k; the "elbow" where the curve flattens is the
# candidate number of clusters.
plt.figure(figsize=(8, 5))
plt.plot(k_values, wcss, marker="o")
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()

# Step 5: train the final K-Means model. K=5 is an assumed reading of the
# elbow curve, not a value computed by this script -- adjust it if the plot
# suggests otherwise.
# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10 to
# "auto" (a single run for init="k-means++"), so omitting it makes the cluster
# assignments depend on the installed version.
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10, random_state=42)
df["Cluster"] = kmeans.fit_predict(X)

# Step 6: visualize the clusters on two of the three clustering features
# (annual income vs. spending score), colored by assigned cluster label.
plt.figure(figsize=(8, 6))
plt.scatter(
    df["Annual Income (k$)"],
    df["Spending Score (1-100)"],
    c=df["Cluster"],
    cmap="viridis",
    s=70,
)
plt.title("Customer Segmentation (K-Means Clustering)")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1–100)")
plt.show()

# Step 7: summarize each cluster by the mean of every numeric column in df
# (non-numeric columns are skipped via numeric_only=True).
cluster_means = df.groupby("Cluster").mean(numeric_only=True)
print("\nCluster Summary:", cluster_means, sep="\n")
