# In [None]:
# %% [markdown]
# # Project 3: Customer Segmentation (Clustering)
# **Objective:** Group customers based on Annual Income and Spending Score.
#
# **Tech Stack:**
# * Scikit-Learn (K-Means)
# * Pandas & NumPy
# * Matplotlib & Seaborn
# * Joblib

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import joblib
import os

# %%
# 1. Load Data
# Read the 'Mall Customers' CSV straight from its raw GitHub URL
# (this cell needs network access when it runs).
url = "https://raw.githubusercontent.com/stevezhao123/Mall-Customer-Segmentation/master/Mall_Customers.csv"
df = pd.read_csv(url)

# Shorten the two verbose column headers that the later cells rely on.
# errors='raise' makes a header change in the CSV fail right here with a clear
# KeyError; by default rename() silently skips missing headers, and the problem
# would only show up later when 'Income' / 'Score' are selected.
df = df.rename(
    columns={
        'Annual Income (k$)': 'Income',
        'Spending Score (1-100)': 'Score',
    },
    errors='raise',
)

print("Dataset Shape:", df.shape)
df.head()  # last expression of the cell: shown as a preview in a notebook

# %%
# 2. Select Features
# Cluster on Income vs Score only, so the segments can be drawn on a 2D chart
# in the App. (Age is available as well, but a two-feature model is simpler to
# visualize for this portfolio demo.)
X = df.loc[:, ['Income', 'Score']]

# %%
# 3. Determine Optimal Clusters (Elbow Method)
# Fit K-Means for K = 1..10 and record each fit's inertia, i.e. the
# Within-Cluster Sum of Squares (WCSS).
wcss = []
for i in range(1, 11):
    # n_init is pinned explicitly: scikit-learn's default differs between
    # releases (10 in older versions, 'auto' in newer ones), so leaving it
    # unset makes the curve depend on the installed version.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot Elbow: WCSS against the number of clusters
plt.figure(figsize=(8,5))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# Interpretation: the "elbow" (where WCSS stops dropping sharply) is usually at K=5.
print("Based on the Elbow Method, we select K=5 clusters.")

# %%
# 4. Train K-Means with K=5
# n_init is set explicitly so the saved model does not depend on the installed
# scikit-learn version (the library default differs between releases);
# random_state keeps the result reproducible from run to run.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)

# fit_predict fits the model and returns one cluster label per row of X
y_kmeans = kmeans.fit_predict(X)

# Add cluster to dataframe for visualization
df['Cluster'] = y_kmeans

# %%
# 5. Visualize Clusters
# Scatter of Income vs Score, colored by the cluster label assigned above.
plt.figure(figsize=(10,6))

# legend='full' gives every cluster id its own legend entry. seaborn draws the
# legend itself, titled after the hue column, so no separate plt.legend() call
# is made: a bare plt.legend() would rebuild the legend and drop that title.
sns.scatterplot(
    data=df,
    x='Income',
    y='Score',
    hue='Cluster',
    palette='viridis',
    s=100,
    legend='full',
)

# Use the original, descriptive column names on the axes
plt.title('Customer Segments')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()

# %%
# 6. Save Model
# exist_ok=True creates the output folder when it is missing and does nothing
# when it is already there, which avoids the race between a separate
# "exists?" check and the create call.
os.makedirs('models', exist_ok=True)

# Persist the fitted K=5 model so it can be loaded later without retraining
joblib.dump(kmeans, 'models/kmeans_model.pkl')
print("Model saved to 'models/kmeans_model.pkl'")

# Note: K-Means is distance-based, so features are normally standardized before clustering.
# In this dataset Income (15-137 k$) and Score (1-100) already span similar ranges,
# so we skip StandardScaler; the centroids then stay in the original units (k$ and Score),
# which keeps them intuitive to interpret in the App.