# Project 5: Customer Segmentation with Clustering

Project Objective: To use unsupervised machine learning to identify distinct groups of customers within a mall's dataset. By segmenting customers based on multiple dimensions (income, age, spending habits), we can provide rich, actionable insights to the marketing team for highly targeted campaigns.

Step 1: Setup - Importing Libraries and Loading Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.cluster.hierarchy as sch

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Apply seaborn's 'whitegrid' style once here so it is in effect for every plot drawn below.
sns.set_style('whitegrid')

In [None]:
!git clone "https://github.com/HarshvardhanSingh-13/Datasets"


In [None]:
# Location of the mall customers CSV inside the cloned Datasets repository.
# NOTE(review): the absolute /content prefix presumably assumes a Colab runtime — confirm.
csv_path = '/content/Datasets/Mall Dataset/Mall_Customers.csv'
df = pd.read_csv(csv_path)

print("Dataset loaded successfully.")
print(f"Data shape: {df.shape}")

# Preview the first rows (rendered as the cell's output).
df.head()

Step 2: In-Depth Exploratory Data Analysis (EDA)

In [None]:
# Column names, dtypes and non-null counts.
print("Dataset Info:")
df.info()

# CustomerID is dropped before analysis so it cannot be mistaken for a feature.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df = df.drop(columns='CustomerID', errors='ignore')

print("\nDescriptive Statistics:")
print(df.describe())

2.1 Univariate Analysis (Feature Distributions)

In [None]:
# One histogram (with KDE overlay) per numeric feature, split by gender.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Distributions of Customer Features', fontsize=16)

# (column to plot, panel title) for each of the three axes, left to right.
panels = [
    ('Age', 'Age Distribution'),
    ('Annual Income (k$)', 'Annual Income Distribution'),
    ('Spending Score (1-100)', 'Spending Score Distribution'),
]
for ax, (column, title) in zip(axes, panels):
    sns.histplot(data=df, x=column, hue='Gender', bins=20, kde=True, ax=ax)
    ax.set_title(title)

plt.show()

2.2 Bivariate Analysis (Looking for Relationships)

In [None]:
# Pairwise scatter plots of the three numeric features, coloured by gender,
# with KDE curves on the diagonal.
numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
sns.pairplot(df, vars=numeric_features, hue='Gender', diag_kind='kde')

# y slightly above 1 lifts the title clear of the top row of panels.
plt.suptitle('Pair Plot of Customer Features', y=1.02)
plt.show()

2.3 3D Visualization

In [None]:
# Interactive 3D scatter: income and spending on the horizontal axes, age on z.
scatter_3d_options = dict(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    z='Age',
    color='Gender',
    title='3D View of Customer Data',
)
fig = px.scatter_3d(df, **scatter_3d_options)
fig.show()

Step 3: Segmentation Model 1 - Income & Spending Score

In [None]:
# Model 1 features: annual income and spending score, standardised so both
# contribute on the same scale to the distance computation.
income_spending_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
X1 = df[income_spending_cols]
scaler1 = StandardScaler()
X1_scaled = scaler1.fit_transform(X1)

# Within-cluster sum of squares (inertia) for k = 1..10.
candidate_ks = range(1, 11)
wcss1 = [
    KMeans(n_clusters=n, init='k-means++', random_state=42, n_init=10)
    .fit(X1_scaled)
    .inertia_
    for n in candidate_ks
]

# Elbow plot: look for the k after which inertia stops dropping sharply.
plt.figure(figsize=(10, 6))
plt.plot(candidate_ks, wcss1, marker='o', linestyle='--')
plt.title('Elbow Method for Income-Spending Segmentation')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.xticks(candidate_ks)
plt.show()

In [None]:
# Final income/spending model with five clusters.
# NOTE(review): k=5 is presumably read off the elbow plot above — confirm.
kmeans1 = KMeans(n_clusters=5, init='k-means++', random_state=42, n_init=10).fit(X1_scaled)

# Store each customer's cluster label alongside the original features.
df['Income_Cluster'] = kmeans1.labels_


3.1 Visualizing and Interpreting the Income-Based Segments

In [None]:
# Scatter of the income/spending plane, one colour per assigned cluster.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Income_Cluster',
    palette='viridis',
    s=100,
    alpha=0.8,
    edgecolor='black',
    ax=ax,
)
ax.set_title('Customer Segments by Income and Spending')
ax.legend(title='Cluster')
plt.show()

In [None]:
# Per-cluster mean of each numeric feature (rounded to 2 d.p.) plus the
# number of customers in the cluster; value_counts aligns on the cluster index.
profile_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
cluster_profiles1 = (
    df.groupby('Income_Cluster')[profile_features]
    .mean()
    .round(2)
    .assign(Size=df['Income_Cluster'].value_counts())
)
print("--- Income-Based Cluster Profiles ---")
cluster_profiles1

Step 4: Segmentation Model 2 - Age & Spending Score

In [None]:
# Model 2 features: age and spending score, standardised before clustering.
age_spending_cols = ['Age', 'Spending Score (1-100)']
X2 = df[age_spending_cols]
scaler2 = StandardScaler()
X2_scaled = scaler2.fit_transform(X2)

# Inertia of a fitted KMeans model for each candidate k in 1..10.
wcss2 = [
    KMeans(n_clusters=n, init='k-means++', random_state=42, n_init=10)
    .fit(X2_scaled)
    .inertia_
    for n in range(1, 11)
]

# Elbow plot for the age/spending feature pair.
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss2, linestyle='--', marker='o')
plt.xticks(range(1, 11))
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.title('Elbow Method for Age-Spending Segmentation')
plt.show()

In [None]:
# Final age/spending model with four clusters.
# NOTE(review): k=4 is presumably taken from the elbow plot above — confirm.
kmeans2 = KMeans(n_clusters=4, init='k-means++', random_state=42, n_init=10).fit(X2_scaled)
df['Age_Cluster'] = kmeans2.labels_

# Scatter of the age/spending plane, one colour per assigned cluster.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='Age',
    y='Spending Score (1-100)',
    hue='Age_Cluster',
    palette='magma',
    s=100,
    alpha=0.8,
    edgecolor='black',
    ax=ax,
)
ax.set_title('Customer Segments by Age and Spending')
ax.legend(title='Cluster')
plt.show()

Step 5: An Alternative Method - Hierarchical Clustering

In [None]:
# Agglomerative clustering of the scaled income/spending data using Ward
# linkage, drawn as a dendrogram.
plt.figure(figsize=(20, 10))
ward_linkage = sch.linkage(X1_scaled, method='ward')
dendrogram = sch.dendrogram(ward_linkage)

plt.title('Dendrogram for Income-Spending Data')
plt.xlabel('Customers')
plt.ylabel('Euclidean Distances')

# Horizontal reference line at height 6.
# NOTE(review): presumably the intended cut height for reading off the
# number of clusters — confirm how 6 was chosen.
plt.axhline(y=6, color='r', linestyle='--')
plt.show()

In [None]:
# Mean, median, standard deviation and count of spending score per gender,
# rounded to two decimals.
summary_functions = ['mean', 'median', 'std', 'count']
gender_spending_stats = (
    df.groupby('Gender')['Spending Score (1-100)']
    .agg(summary_functions)
    .round(2)
)
print("Summary Statistics: Spending Score by Gender")
display(gender_spending_stats)

In [None]:
# Box plot and violin plot of spending score by gender, side by side.
# The figure and both panels must be built in ONE cell: with the inline
# backend each cell renders and closes its own figure, so creating the
# figure in one cell and the subplots in later cells yields an empty 14x6
# canvas followed by two separate half-filled default-size figures.
fig, (ax_box, ax_violin) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: quartiles and outliers per gender.
sns.boxplot(x='Gender', y='Spending Score (1-100)', data=df, palette='Set2', ax=ax_box)
ax_box.set_title('Spending Score Distribution by Gender (Box Plot)')

# Right panel: the same comparison shown as a density shape.
sns.violinplot(x='Gender', y='Spending Score (1-100)', data=df, palette='Set2', ax=ax_violin)
ax_violin.set_title('Spending Score Density by Gender (Violin Plot)')

plt.tight_layout()
plt.show()

In [None]:
# Engineered feature: spending score divided by annual income (k$), times 100.
# NOTE(review): a zero income would produce inf here — confirm the data never contains one.
spending_score = df['Spending Score (1-100)']
annual_income = df['Annual Income (k$)']
df['Spending_to_Income_Ratio'] = spending_score.div(annual_income).mul(100)

print("New Feature Created: Spending_to_Income_Ratio")
preview_cols = ['Annual Income (k$)', 'Spending Score (1-100)', 'Spending_to_Income_Ratio']
display(df[preview_cols].head())

In [None]:
# Feature matrix for the engineered model: age plus the spending/income ratio.
ratio_model_cols = ['Age', 'Spending_to_Income_Ratio']
X_new = df[ratio_model_cols]

In [None]:
# Standardise the engineered feature matrix; the fitted scaler is kept in
# scaler_new and the scaled values in X_new_scaled.
scaler_new = StandardScaler().fit(X_new)
X_new_scaled = scaler_new.transform(X_new)

In [None]:
# Inertia of a fitted KMeans model on the engineered features for k = 1..10.
ks_to_try = range(1, 11)
wcss_new = [
    KMeans(n_clusters=n, init='k-means++', random_state=42, n_init=10)
    .fit(X_new_scaled)
    .inertia_
    for n in ks_to_try
]

# Elbow plot for the age / spending-to-income-ratio feature pair.
plt.figure(figsize=(8, 5))
plt.plot(ks_to_try, wcss_new, color='purple', marker='o')
plt.title('Elbow Method: Age vs. Spending/Income Ratio')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Number of clusters for the engineered-feature model.
# NOTE(review): presumably chosen from the elbow plot above — confirm.
k_opt = 4

# Fit the final model and record each customer's cluster label.
kmeans_new = KMeans(n_clusters=k_opt, init='k-means++', random_state=42, n_init=10).fit(X_new_scaled)
df['New_Feature_Cluster'] = kmeans_new.labels_

In [None]:
# Scatter of age against the engineered ratio, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=df,
    x='Age',
    y='Spending_to_Income_Ratio',
    hue='New_Feature_Cluster',
    palette='tab10',
    s=100,
    ax=ax,
)
ax.set_title(f'Customer Clusters: Age vs. Spending/Income Ratio (k={k_opt})')
plt.show()

In [None]:
# Per-cluster means of age, the engineered ratio and annual income.
print("Cluster Profiles for New Feature Analysis:")
ratio_profile_cols = ['Age', 'Spending_to_Income_Ratio', 'Annual Income (k$)']
ratio_cluster_means = df.groupby('New_Feature_Cluster')[ratio_profile_cols].mean()
display(ratio_cluster_means)

In [None]:
# Side-by-side comparison of the original income/spending segmentation and
# the engineered age/ratio segmentation.
fig, (ax_original, ax_engineered) = plt.subplots(1, 2, figsize=(16, 6))

# Left: clusters from the income vs. spending model.
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Income_Cluster',
    palette='viridis',
    s=80,
    ax=ax_original,
)
ax_original.set_title('Original Model: Income vs. Spending')

# Right: clusters from the age vs. spending-to-income-ratio model.
sns.scatterplot(
    data=df,
    x='Age',
    y='Spending_to_Income_Ratio',
    hue='New_Feature_Cluster',
    palette='tab10',
    s=80,
    ax=ax_engineered,
)
ax_engineered.set_title('Engineered Model: Age vs. Spending Ratio')

plt.tight_layout()
plt.show()

print("Project 5 Analysis Complete.")