In [None]:
import numpy as np
import pandas as pd

# Customer Clustering Analysis

This notebook contains a comprehensive analysis of customer segmentation using multiple clustering algorithms.

In [None]:
# Load the raw customer table; the relative path is resolved against the
# current working directory.
csv_path = "../data/customer_data.csv"
df = pd.read_csv(csv_path)

## 1. Exploratory Data Analysis (EDA)

First, inspect the DataFrame: its first rows, summary statistics, missing values, and column types.

In [None]:
# Preview the first rows to check column names and sample values.
df.head()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Count missing values per column and print the resulting Series.
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

## 2. Data Visualization

Examine each feature individually, then the relationships between features.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram of customer ages using matplotlib's default binning.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df["Age"])
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
ax.set_title("Age Distribution")
plt.show()

In [None]:
# Kernel density estimate of annual income, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
df["Annual Income (k$)"].plot.kde(ax=ax, linewidth=2)
ax.set_title("Density Plot of Annual Income")
ax.set_xlabel("Annual Income (k$)")
ax.grid(True, alpha=0.1)
plt.show()

In [None]:
# Kernel density estimate of the spending score, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
df["Spending Score (1-100)"].plot.kde(ax=ax, linewidth=2)
ax.set_title("Density Plot of Spending Score")
ax.set_xlabel("Spending Score (1-100)")
ax.grid(True, alpha=0.1)
plt.show()

In [None]:
# Bar chart of the number of customers per gender.
# seaborn >= 0.12 accepts only `data` positionally, so a bare Series is no
# longer read as the x variable; name the frame and the column explicitly.
sns.countplot(data=df, x="Gender")
plt.title("Gender Distribution")
plt.show()

### Relationship Analysis

In [None]:
# Scatter of spending score against annual income, one point per customer.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'])
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.set_title('Income vs Spending Score')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Scatter of spending score against age, one point per customer.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Age'], df['Spending Score (1-100)'])
ax.set_xlabel('Age')
ax.set_ylabel('Spending Score (1-100)')
ax.set_title('Age vs Spending Score')
ax.grid(True, alpha=0.3)
plt.show()

## 3. Feature Preparation

Prepare and scale features for clustering

In [None]:
from sklearn.preprocessing import StandardScaler

# Select the two features used by every clustering step below.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = df[feature_columns]

# Fit the scaler once and keep it: it is reused later to map K-Means
# centroids back to the original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## 4. K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..10 and record each fit's inertia
# (within-cluster sum of squares).
k_values = range(1, 11)

# n_init is pinned explicitly: scikit-learn's default changed from 10 to
# 'auto' in 1.4, so relying on it gives version-dependent curves (and a
# FutureWarning on 1.2/1.3).
wcss = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled).inertia_
    for k in k_values
]

plt.figure(figsize=(10, 6))
plt.plot(k_values, wcss, marker='o', linewidth=2)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.title('Elbow Method for Optimal k')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Fit the final 5-cluster K-Means model on the scaled features and store the
# label of each customer on the frame.
# n_init is pinned so results do not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X_scaled)
df['Cluster'] = kmeans.labels_

# Get centroids in original scale
centroids_original = scaler.inverse_transform(kmeans.cluster_centers_)

plt.figure(figsize=(10, 6))
cluster_points = plt.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'],
                             c=df['Cluster'], cmap='viridis', s=50, alpha=0.7)
plt.scatter(centroids_original[:, 0], centroids_original[:, 1],
            c='red', marker='X', s=300, edgecolors='black', linewidth=2, label='Centroids')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Customer Segments (K-Means with k=5)')
# Pass the cluster-coloured scatter explicitly. A bare plt.colorbar() binds to
# the most recently drawn mappable, which here is the solid-red centroid
# overlay, so the bar would not reflect the cluster labels.
plt.colorbar(cluster_points, label='Cluster')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 5. Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Build the Ward linkage tree on the scaled features.
linkage_matrix = linkage(X_scaled, method='ward')

# Height at which the tree splits into exactly n_hier_clusters groups: midway
# between the k-th and (k-1)-th largest merge distances (column 2 of the
# linkage matrix). A fixed y=5 only matches k=5 by coincidence of the data's scale.
n_hier_clusters = 5
merge_heights = np.sort(linkage_matrix[:, 2])
cut_height = (merge_heights[-n_hier_clusters] + merge_heights[-(n_hier_clusters - 1)]) / 2

plt.figure(figsize=(14, 7))
dendrogram(linkage_matrix)
plt.axhline(y=cut_height, color='red', linestyle='--', linewidth=2,
            label=f'Cut for k={n_hier_clusters}')
plt.title('Dendrogram - Hierarchical Clustering')
plt.xlabel('Customer Index')
plt.ylabel('Distance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Cut the tree into flat clusters. criterion='maxclust' interprets t as the
# number of clusters (at most 5 are formed), whereas criterion='distance'
# treats t as a height threshold and yields 5 clusters only if the data's
# scale happens to line up.
df['Hierarchical_Cluster'] = fcluster(linkage_matrix, t=5, criterion='maxclust')

plt.figure(figsize=(10, 6))
plt.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'],
            c=df['Hierarchical_Cluster'], cmap='viridis', s=50, alpha=0.7)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Customer Segments (Hierarchical Clustering)')
plt.colorbar(label='Cluster')
plt.grid(True, alpha=0.3)
plt.show()

## 6. DBSCAN Clustering

In [None]:
from sklearn.neighbors import NearestNeighbors

# k-distance graph: for every point, take the distance to the farthest of its
# 4 returned neighbours, then sort those distances to look for a knee.
neighbors = NearestNeighbors(n_neighbors=4)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)
# NOTE(review): the query set is the fitted set, so each point is presumably
# returned as its own first neighbour -- confirm the "4-th" label matches the
# intended counting convention.
distances = np.sort(distances[:, -1])

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(distances, linewidth=2)
ax.set_xlabel('Data Points (sorted by distance)')
ax.set_ylabel('4-th Nearest Neighbor Distance')
ax.set_title('K-distance Graph (for finding optimal eps)')
# Reference line at the eps value drawn for comparison against the curve.
ax.axhline(y=0.2, color='red', linestyle='--', linewidth=2, label='eps = 0.20')
ax.grid(True, alpha=0.3)
ax.legend()
plt.show()

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the scaled features; the label -1 is plotted
# below as the 'Outliers' group.
dbscan = DBSCAN(eps=0.20, min_samples=4)
df['DBSCAN_Cluster'] = dbscan.fit_predict(X_scaled)

income_col = 'Annual Income (k$)'
score_col = 'Spending Score (1-100)'
noise_mask = df['DBSCAN_Cluster'] == -1

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(df[income_col], df[score_col],
                     c=df['DBSCAN_Cluster'], cmap='viridis', s=50, alpha=0.7)
# Overlay the points labelled -1 with a distinct marker.
ax.scatter(df.loc[noise_mask, income_col], df.loc[noise_mask, score_col],
           c='red', marker='X', s=200, edgecolors='black', linewidth=2, label='Outliers')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.set_title('Customer Segments (DBSCAN)')
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 7. Algorithm Comparison

In [None]:
# Side-by-side view of the three segmentations on the same income/spending axes.
income = df['Annual Income (k$)']
spending = df['Spending Score (1-100)']
point_style = dict(cmap='viridis', s=50, alpha=0.7)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_kmeans, ax_hier, ax_dbscan = axes

# K-Means: cluster colours plus the centroids in original units.
scatter1 = ax_kmeans.scatter(income, spending, c=df['Cluster'], **point_style)
ax_kmeans.scatter(centroids_original[:, 0], centroids_original[:, 1],
                  c='red', marker='X', s=300, edgecolors='black', linewidth=2)
ax_kmeans.set_title('K-Means (k=5)', fontsize=12, fontweight='bold')
fig.colorbar(scatter1, ax=ax_kmeans, label='Cluster')

# Hierarchical
scatter2 = ax_hier.scatter(income, spending, c=df['Hierarchical_Cluster'], **point_style)
ax_hier.set_title('Hierarchical (k=5)', fontsize=12, fontweight='bold')
fig.colorbar(scatter2, ax=ax_hier, label='Cluster')

# DBSCAN: points labelled -1 are overlaid as outliers.
noise_mask = df['DBSCAN_Cluster'] == -1
scatter3 = ax_dbscan.scatter(income, spending, c=df['DBSCAN_Cluster'], **point_style)
ax_dbscan.scatter(income[noise_mask], spending[noise_mask],
                  c='red', marker='X', s=100, edgecolors='black', linewidth=1, label='Outliers')
ax_dbscan.set_title('DBSCAN (eps=0.20)', fontsize=12, fontweight='bold')
ax_dbscan.legend()
fig.colorbar(scatter3, ax=ax_dbscan, label='Cluster')

# Axis labels and grid are identical for all three panels.
for panel in axes:
    panel.set_xlabel('Annual Income (k$)')
    panel.set_ylabel('Spending Score (1-100)')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Summary & Insights

### Key Findings:
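- **K-Means**: 5 distinct clusters; every customer is assigned to a cluster (the algorithm has no notion of outliers, hence 0 outliers)
- **Hierarchical**: 5 clusters with natural boundaries; likewise every customer is assigned (0 outliers)
- **DBSCAN**: 4 clusters + 73 points labelled as noise/outliers (36.5% of data)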

### Business Recommendations:
1. K-Means and Hierarchical produce similar results
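2. DBSCAN reveals important outliers for special treatment; note that with 36.5% of customers flagged as noise, `eps` and `min_samples` may be too strict and should be re-tuned before acting on this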
3. Use Income + Spending Score as primary segmentation features
4. Implement targeted marketing strategies by cluster