# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ==========================================
# Step 1: Load Data
# ==========================================
# OPTION A: If you have the 'Mall_Customers.csv' (uncomment below)
# df = pd.read_csv('Mall_Customers.csv')
# X = df.iloc[:, [3, 4]].values  # Select 'Annual Income' and 'Spending Score'

# OPTION B: Dummy data (run this if you don't have the CSV file).
# Each tuple is one customer as (income, score); the rows are grouped so the
# five customer segments are easy to see.
customers = [
    # low income, high score
    (15, 80), (16, 85), (17, 82), (18, 88), (19, 90), (20, 85),
    # high income, low score
    (80, 15), (85, 20), (88, 10), (90, 12), (95, 18),
    # low income, low score
    (25, 20), (30, 15), (35, 10), (20, 25),
    # high income, high score
    (100, 80), (105, 82), (110, 85), (120, 90), (130, 95),
    # mid income, mid score
    (40, 50), (45, 55), (50, 50), (55, 45), (60, 50),
]
# Split the pairs back into one column per feature.
incomes, scores = zip(*customers)
data = {'Income': list(incomes), 'Score': list(scores)}
df = pd.DataFrame(data)
X = df.to_numpy()  # 2-D numpy array, one row per customer

# ==========================================
# Step 2: Elbow Method (Find Optimal K)
# ==========================================
# Fit K-Means for every candidate k and record the within-cluster sum of
# squares (WCSS); the "elbow" where the curve flattens suggests a good k.
# K-Means cannot form more clusters than there are samples, so cap the range.
k_values = range(1, min(10, len(X)) + 1)
wcss = []  # WCSS for each k, in the same order as k_values
for k in k_values:
    # n_init is pinned because scikit-learn's default differs between
    # releases; fixing it keeps the curve comparable across versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # inertia_ is the WCSS of the fitted model

# Plot Elbow Graph
plt.figure(figsize=(8, 4))
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# ==========================================
# Step 3: Silhouette Method (Validation)
# ==========================================
# Check how well separated the clusters are for k=5 (a common choice for
# this dataset).
# n_init is pinned because scikit-learn's default differs between releases.
kmeans_5 = KMeans(n_clusters=5, n_init=10, random_state=42)
labels = kmeans_5.fit_predict(X)  # cluster index (0-4) for every row of X
score = silhouette_score(X, labels)

print(f"Silhouette Score for k=5: {score:.2f}")
# Note: the score ranges from -1 (worst) to 1 (best); values near 0 indicate
# overlapping clusters.

# ==========================================
# Step 4: Final Clustering & Visualization
# ==========================================
# Reuse the labels and the fitted model from Step 3.
plt.figure(figsize=(8, 6))

# One scatter series per cluster. The position in this tuple is the cluster
# label, so the first colour is drawn for label 0 ('Cluster 1'), and so on.
cluster_colors = ('red', 'blue', 'green', 'cyan', 'magenta')
for cluster_id, color in enumerate(cluster_colors):
    members = X[labels == cluster_id]  # rows assigned to this cluster
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')

# Plot centroids (the centers of the clusters)
centers = kmeans_5.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=300, c='yellow', label='Centroids')

plt.title('Customer Clusters')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()
plt.show()