# Clustering Analysis - Mall Customer Segmentation

This notebook explores and compares multiple clustering algorithms on the Mall Customer Segmentation dataset.

**Clustering Models:**
- K-Means (baseline)
- DBSCAN
- K-Medoids
- Agglomerative Clustering
- Gaussian Mixture Models

---

## Section 1: Exploratory Data Analysis and Data Preprocessing

This section performs comprehensive exploratory data analysis and prepares the data for modeling.

### Section 1.1: Exploratory Data Analysis

In this section, we'll:
- Load and inspect the dataset
- Perform statistical summaries
- Analyze correlations between features
- Identify outliers and missing values
- Visualize distributions and relationships

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures below
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Folder that receives every saved figure; an existing folder is reused
viz_dir = Path('visualizations')
viz_dir.mkdir(exist_ok=True)

print("Libraries imported successfully!")

In [None]:
# Download the dataset through kagglehub and read it into `df`
import kagglehub

# The returned location is used below as the folder that holds the CSV
path = kagglehub.dataset_download("vjchoudhary7/customer-segmentation-tutorial-in-python")
print(f"Dataset path: {path}")

# Locate and parse the CSV file
data_path = Path(path).joinpath('Mall_Customers.csv')
df = pd.read_csv(data_path)

print(
    f"\nDataset loaded successfully!",
    f"Shape: {df.shape}",
    f"\nFirst few rows:",
    sep="\n",
)
df.head()

#### Basic Dataset Information

In [None]:
# Structural overview of the frame: info() report, column names, then row/column counts
print("Dataset Information:", "=" * 50, sep="\n")
df.info()  # prints its own report

print("\n" + "=" * 50)
print("\nColumn Names:", df.columns.tolist(), sep="\n")

print(
    "\nDataset Shape:",
    f"Rows: {df.shape[0]}",
    f"Columns: {df.shape[1]}",
    sep="\n",
)

#### Statistical Summary

In [None]:
# Descriptive statistics with describe()'s defaults; the table is shown as the cell output
print("Statistical Summary of Numerical Features:", "=" * 50, sep="\n")
df.describe()

In [None]:
# Descriptive statistics restricted to object-dtype columns; shown as the cell output
print("\nStatistical Summary of Categorical Features:", "=" * 50, sep="\n")
df.describe(include=['object'])

#### Missing Value Analysis

In [None]:
# Per-column missing-value counts and their share of all rows
print("Missing Value Analysis:", "=" * 50, sep="\n")

missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# Combine both measures into one table indexed by column name
missing_df = pd.DataFrame(
    {
        'Missing Count': missing_values,
        'Missing Percentage': missing_percentage,
    }
)

print(missing_df)
print(f"\nTotal missing values: {df.isnull().sum().sum()}")

In [None]:
# Draw and save a heatmap of missing cells only when at least one value is missing
if df.isnull().sum().sum() <= 0:
    print("No missing values detected in the dataset.")
else:
    plt.figure(figsize=(10, 6))
    sns.heatmap(
        df.isnull(),
        cbar=True,
        cmap='viridis',
        yticklabels=False,
    )
    plt.title('Missing Values Heatmap', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(viz_dir / 'missing_values_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

#### Categorical Feature Analysis

In [None]:
# Collect the object-dtype columns and report their cardinality and value frequencies
categorical_cols = list(df.select_dtypes(include=['object']).columns)

print("Categorical Features Analysis:", "=" * 50, sep="\n")

for col in categorical_cols:
    print(
        f"\n{col}:",
        f"  Unique values: {df[col].nunique()}",
        f"  Value counts:",
        sep="\n",
    )
    print(df[col].value_counts())
    # Same counts expressed as percentages of the non-null rows
    print(f"\n  Percentage distribution:")
    print(df[col].value_counts(normalize=True) * 100)

In [None]:
# One bar chart per categorical column, laid out side by side in a single row
if categorical_cols:
    n_cols = len(categorical_cols)
    fig, axes = plt.subplots(1, n_cols, figsize=(6*n_cols, 5))

    # With a single column subplots() returns a bare Axes; wrap it so it can be iterated
    axes = [axes] if n_cols == 1 else axes

    for ax, col in zip(axes, categorical_cols):
        df[col].value_counts().plot(kind='bar', ax=ax, color='steelblue')
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col, fontsize=10)
        ax.set_ylabel('Count', fontsize=10)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig(viz_dir / 'categorical_features_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

#### Distribution Analysis of Numerical Features

In [None]:
# Names of all numeric-dtype columns, in frame order
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)

print("Numerical Features:", numerical_cols, sep="\n")

In [None]:
# Histogram grid (two per row) for every numerical column, with mean and median markers
n_cols = len(numerical_cols)
n_rows = -(-n_cols // 2)  # ceiling division: two plots per row

fig, axes = plt.subplots(n_rows, 2, figsize=(14, 5*n_rows))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    ax.hist(df[col], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.axvline(df[col].mean(), color='red', linestyle='--', linewidth=2,
               label=f'Mean: {df[col].mean():.2f}')
    ax.axvline(df[col].median(), color='green', linestyle='--', linewidth=2,
               label=f'Median: {df[col].median():.2f}')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

# Blank out the unused trailing panel when the column count is odd
for ax in axes[n_cols:]:
    ax.axis('off')

plt.tight_layout()
plt.savefig(viz_dir / 'numerical_features_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Box-plot grid for the numerical columns; reuses the n_rows layout computed for the histograms
fig, axes = plt.subplots(n_rows, 2, figsize=(14, 5*n_rows))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    # Missing values are dropped before the column is handed to boxplot
    ax.boxplot(
        df[col].dropna(),
        vert=True,
        patch_artist=True,
        boxprops={'facecolor': 'lightblue', 'color': 'blue'},
        whiskerprops={'color': 'blue'},
        capprops={'color': 'blue'},
        medianprops={'color': 'red', 'linewidth': 2},
    )
    ax.set_title(f'Box Plot of {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel(col, fontsize=10)
    ax.grid(axis='y', alpha=0.3)

# Blank out any panel that has no column assigned
for ax in axes[len(numerical_cols):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig(viz_dir / 'numerical_features_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()

#### Outlier Identification

In [None]:
# Identify outliers using IQR method
def identify_outliers_iqr(dataframe, column, multiplier=1.5):
    """Return the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the numeric column to inspect.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule; a larger value flags fewer rows.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``dataframe`` rows falling strictly outside the fences.
    """
    values = dataframe[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Strict comparisons: a value equal to a fence is kept, and NaN compares
    # False on both sides so missing values are never reported as outliers.
    outliers = dataframe[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

print("Outlier Analysis (IQR Method):", "=" * 70, sep="\n")

# Report the IQR fences and the outlier share for every numerical column
for col in numerical_cols:
    outliers, lower, upper = identify_outliers_iqr(df, col)
    print(
        f"\n{col}:",
        f"  Lower Bound: {lower:.2f}",
        f"  Upper Bound: {upper:.2f}",
        f"  Number of outliers: {len(outliers)} ({(len(outliers)/len(df)*100):.2f}%)",
        sep="\n",
    )
    # The value range is only meaningful when something was flagged
    if not outliers.empty:
        print(f"  Outlier values range: [{outliers[col].min():.2f}, {outliers[col].max():.2f}]")

#### Correlation Analysis

In [None]:
# Pairwise correlations between the numerical columns (DataFrame.corr defaults)
correlation_matrix = df.loc[:, numerical_cols].corr()

print("Correlation Matrix:", "=" * 50, sep="\n")
print(correlation_matrix)

In [None]:
# Visualize correlation heatmap
# Draws the correlation matrix as an annotated heatmap and saves it to viz_dir.
plt.figure(figsize=(10, 8))
# annot/fmt print each coefficient with two decimals inside its cell;
# cbar_kws shrinks the colour bar to 80% of the axes height.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
# The title is applied through pyplot's current axes.
plt.title('Correlation Matrix Heatmap', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(viz_dir / 'correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Find highly correlated feature pairs
def get_high_correlations(corr_matrix, threshold=0.7):
    """List the column pairs whose absolute correlation exceeds ``threshold``.

    Only the upper triangle of ``corr_matrix`` is scanned, so each unordered
    pair appears at most once and the diagonal is ignored. The comparison is
    strict: a coefficient exactly equal to ``threshold`` is not reported.

    Returns a list of ``(first_column, second_column, coefficient)`` tuples in
    row-major order of the upper triangle.
    """
    labels = corr_matrix.columns
    size = len(labels)
    return [
        (labels[row], labels[col], corr_matrix.iloc[row, col])
        for row in range(size)
        for col in range(row + 1, size)
        if abs(corr_matrix.iloc[row, col]) > threshold
    ]

high_corr = get_high_correlations(correlation_matrix, threshold=0.7)

print("\nHighly Correlated Feature Pairs (|r| > 0.7):", "=" * 50, sep="\n")

# Either list every strong pair or state that none was found
if not high_corr:
    print("No highly correlated feature pairs found.")
else:
    for feat1, feat2, corr_val in high_corr:
        print(f"{feat1} <-> {feat2}: {corr_val:.3f}")

#### Pairwise Scatter Plots

In [None]:
# Pairwise scatter matrix of the numerical features.
# Columns whose lower-cased name contains "id" are left out of the grid.
# NOTE(review): this is a substring test, so a real feature named e.g. "Width"
# or "Paid" would be excluded as well — confirm before reusing on other data.
feature_cols = [name for name in numerical_cols if 'id' not in name.lower()]

if len(feature_cols) <= 1:
    print("Not enough numerical features for pairwise scatter plots.")
else:
    pairplot = sns.pairplot(
        df[feature_cols],
        diag_kind='hist',
        plot_kws={'alpha': 0.6, 's': 30, 'edgecolor': 'k'},
        diag_kws={'edgecolor': 'k', 'alpha': 0.7},
    )
    # y slightly above 1 places the title just above the top of the figure
    pairplot.fig.suptitle(
        'Pairwise Scatter Plots of Numerical Features',
        fontsize=16,
        fontweight='bold',
        y=1.01,
    )
    plt.tight_layout()
    plt.savefig(viz_dir / 'pairwise_scatter_plots.png', dpi=300, bbox_inches='tight')
    plt.show()

#### Summary Statistics by Categorical Features

In [None]:
# describe() of the feature columns, split by each categorical column
if categorical_cols and feature_cols:
    for cat_col in categorical_cols:
        print(f"\nSummary Statistics by {cat_col}:", "=" * 70, sep="\n")
        # Rounded to two decimals to keep the wide table readable
        print(df.groupby(cat_col)[feature_cols].describe().round(2), "\n", sep="\n")

In [None]:
# Visualize numerical features by categorical features
# For every categorical column, draw one row of box plots (one panel per
# feature column) grouped by that category, then save the figure to viz_dir.
if len(categorical_cols) > 0 and len(feature_cols) > 0:
    for cat_col in categorical_cols:
        n_features = len(feature_cols)
        fig, axes = plt.subplots(1, n_features, figsize=(6*n_features, 5))
        
        # With a single feature subplots() returns a bare Axes; wrap it so axes[idx] works
        if n_features == 1:
            axes = [axes]
        
        for idx, num_col in enumerate(feature_cols):
            df.boxplot(column=num_col, by=cat_col, ax=axes[idx], patch_artist=True)
            # Set the panel title and axis labels explicitly after the grouped boxplot call
            axes[idx].set_title(f'{num_col} by {cat_col}', fontsize=12, fontweight='bold')
            axes[idx].set_xlabel(cat_col, fontsize=10)
            axes[idx].set_ylabel(num_col, fontsize=10)
        
        plt.suptitle('')  # Remove the automatic title
        plt.tight_layout()
        # The category name becomes part of the output file name
        plt.savefig(viz_dir / f'numerical_by_{cat_col}.png', dpi=300, bbox_inches='tight')
        plt.show()

#### Key Insights from EDA

In [None]:
# Recap of the EDA: dataset size, data quality, IQR outliers and strong correlations
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS", "=" * 70, sep="\n")
print(
    f"\n1. Dataset Overview:",
    f"   - Total samples: {df.shape[0]}",
    f"   - Total features: {df.shape[1]}",
    f"   - Numerical features: {len(numerical_cols)}",
    f"   - Categorical features: {len(categorical_cols)}",
    sep="\n",
)

print(
    f"\n2. Data Quality:",
    f"   - Missing values: {df.isnull().sum().sum()}",
    f"   - Duplicate rows: {df.duplicated().sum()}",
    sep="\n",
)

# Only columns with at least one flagged row are listed
print(f"\n3. Outliers Detected:")
for col in numerical_cols:
    outliers, _, _ = identify_outliers_iqr(df, col)
    if not outliers.empty:
        print(f"   - {col}: {len(outliers)} outliers ({(len(outliers)/len(df)*100):.2f}%)")

print(f"\n4. Feature Correlations:")
high_corr = get_high_correlations(correlation_matrix, threshold=0.7)
if not high_corr:
    print("   - No strong correlations (|r| > 0.7) detected")
else:
    for feat1, feat2, corr_val in high_corr:
        print(f"   - {feat1} and {feat2}: {corr_val:.3f}")

print("\n" + "=" * 70)

### Section 1.2: Data Preprocessing Pipeline

In this section, we'll:
- Select the features for clustering (Annual Income and Spending Score)
- Handle any missing values
- Scale/normalize the features
- Create two datasets: one with preprocessing and one without
- Visualize the selected features and their relationship

In [None]:
# Import preprocessing libraries
# StandardScaler is used below to standardise the selected features.
# NOTE(review): train_test_split is not referenced in the visible part of this
# notebook — confirm whether a later section needs it.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

print("Preprocessing libraries imported successfully!")

#### Feature Selection

We'll focus on two key features for clustering:
- **Annual Income (k$)**: Customer's annual income
- **Spending Score (1-100)**: Score assigned by the mall based on customer behavior and spending nature

In [None]:
# Select the features for clustering
selected_features = ['Annual Income (k$)', 'Spending Score (1-100)']

# Check that every selected column exists before it is used for indexing
print("Checking for selected features in dataset...")
print("=" * 50)
missing_features = []
for feature in selected_features:
    if feature in df.columns:
        print(f"✓ {feature} found")
    else:
        print(f"✗ {feature} NOT found")
        missing_features.append(feature)

# Fail here with an explicit message instead of continuing into a generic
# pandas KeyError when df[selected_features] is evaluated below.
if missing_features:
    print(f"Available columns: {df.columns.tolist()}")
    raise KeyError(f"Selected features missing from dataset: {missing_features}")

print(f"\nSelected features: {selected_features}")
print(f"\nShape after feature selection: {df[selected_features].shape}")

In [None]:
# Independent copy of the selected columns; later cleaning must not touch `df`
X = df.loc[:, selected_features].copy()

print("Clustering Dataset Created:", "=" * 50, sep="\n")
print(f"Shape: {X.shape}", f"\nFirst few rows:", sep="\n")
print(X.head(10))
print(f"\nStatistical Summary:")
print(X.describe())

#### Check for Missing Values in Selected Features

In [None]:
# Count missing values in the clustering features and drop incomplete rows if any exist
missing_in_X = X.isnull().sum()

print("Missing Values in Selected Features:", "=" * 50, sep="\n")
print(missing_in_X)
print(f"\nTotal missing values: {X.isnull().sum().sum()}")

if X.isnull().sum().sum() <= 0:
    print("\nNo missing values found. Proceeding with full dataset.")
else:
    print("\nHandling missing values...")
    # For this dataset, rows with any missing value are removed
    X_clean = X.dropna()
    print(
        f"Rows before: {len(X)}",
        f"Rows after: {len(X_clean)}",
        f"Rows removed: {len(X) - len(X_clean)}",
        sep="\n",
    )
    X = X_clean

#### Visualize Selected Features Before Preprocessing

In [None]:
# Raw relationship between the two clustering features, drawn on an explicit Axes
ax = plt.figure(figsize=(10, 6)).add_subplot()
ax.scatter(
    X[selected_features[0]],
    X[selected_features[1]],
    alpha=0.6,
    s=50,
    edgecolor='k',
    linewidth=0.5,
)
ax.set_xlabel(selected_features[0], fontsize=12, fontweight='bold')
ax.set_ylabel(selected_features[1], fontsize=12, fontweight='bold')
ax.set_title(
    'Scatter Plot: Annual Income vs Spending Score (Before Preprocessing)',
    fontsize=14,
    fontweight='bold',
)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(viz_dir / 'selected_features_scatter_before_preprocessing.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Side-by-side histograms of the two selected features with mean/median markers
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, feature in zip(axes, selected_features):
    ax.hist(X[feature], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.axvline(
        X[feature].mean(),
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Mean: {X[feature].mean():.2f}',
    )
    ax.axvline(
        X[feature].median(),
        color='green',
        linestyle='--',
        linewidth=2,
        label=f'Median: {X[feature].median():.2f}',
    )
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(viz_dir / 'selected_features_distribution_before_preprocessing.png', dpi=300, bbox_inches='tight')
plt.show()

#### Create Two Datasets: With and Without Preprocessing

In [None]:
# Dataset 1: the cleaned features in their original units (no scaling applied)
X_original = X.copy()

print("Dataset 1: Without Preprocessing", "=" * 50, sep="\n")
print(f"Shape: {X_original.shape}", f"\nStatistics:", sep="\n")
print(X_original.describe())

# Kept in memory under this name for the later clustering sections
print("\nDataset saved as 'X_original' for clustering without preprocessing.")

In [None]:
# Dataset 2: the same features standardised with StandardScaler
scaler = StandardScaler()

# Fit on X, transform it, and wrap the array so column names and the row index are kept
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=selected_features,
    index=X.index,
)

print("Dataset 2: With Preprocessing (StandardScaler)", "=" * 50, sep="\n")
print(
    f"Shape: {X_scaled.shape}",
    f"\nScaler parameters:",
    f"  Mean: {scaler.mean_}",
    f"  Standard Deviation: {np.sqrt(scaler.var_)}",
    f"\nStatistics after scaling:",
    sep="\n",
)
print(X_scaled.describe())

# Kept in memory under this name for the later clustering sections
print("\nDataset saved as 'X_scaled' for clustering with preprocessing.")

#### Visualize Scaled Features

In [None]:
# Two-panel scatter: original units on the left, standardised values on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
original_ax, scaled_ax = axes

# Marker styling shared by both panels; only the colour differs
point_style = dict(alpha=0.6, s=50, edgecolor='k', linewidth=0.5)

# Before scaling
original_ax.scatter(
    X_original[selected_features[0]],
    X_original[selected_features[1]],
    color='steelblue',
    **point_style,
)
original_ax.set_xlabel(selected_features[0], fontsize=11, fontweight='bold')
original_ax.set_ylabel(selected_features[1], fontsize=11, fontweight='bold')
original_ax.set_title('Before Preprocessing (Original Scale)', fontsize=12, fontweight='bold')
original_ax.grid(True, alpha=0.3)

# After scaling
scaled_ax.scatter(
    X_scaled[selected_features[0]],
    X_scaled[selected_features[1]],
    color='coral',
    **point_style,
)
scaled_ax.set_xlabel(f'{selected_features[0]} (Scaled)', fontsize=11, fontweight='bold')
scaled_ax.set_ylabel(f'{selected_features[1]} (Scaled)', fontsize=11, fontweight='bold')
scaled_ax.set_title('After Preprocessing (StandardScaler)', fontsize=12, fontweight='bold')
scaled_ax.grid(True, alpha=0.3)

plt.suptitle('Comparison: Original vs Scaled Features', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(viz_dir / 'preprocessing_comparison_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# 2x2 histogram grid: one row per feature, original on the left and scaled on the right
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for (original_ax, scaled_ax), feature in zip(axes, selected_features):
    # Before scaling
    original_ax.hist(X_original[feature], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    original_ax.set_title(f'{feature} - Original', fontsize=11, fontweight='bold')
    original_ax.set_xlabel(feature, fontsize=10)
    original_ax.set_ylabel('Frequency', fontsize=10)
    original_ax.axvline(
        X_original[feature].mean(),
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Mean: {X_original[feature].mean():.2f}',
    )
    original_ax.legend()
    original_ax.grid(axis='y', alpha=0.3)

    # After scaling
    scaled_ax.hist(X_scaled[feature], bins=30, color='coral', edgecolor='black', alpha=0.7)
    scaled_ax.set_title(f'{feature} - Scaled', fontsize=11, fontweight='bold')
    scaled_ax.set_xlabel(f'{feature} (Scaled)', fontsize=10)
    scaled_ax.set_ylabel('Frequency', fontsize=10)
    scaled_ax.axvline(
        X_scaled[feature].mean(),
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Mean: {X_scaled[feature].mean():.2f}',
    )
    scaled_ax.legend()
    scaled_ax.grid(axis='y', alpha=0.3)

plt.suptitle('Distribution Comparison: Original vs Scaled Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(viz_dir / 'preprocessing_comparison_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

#### Data Preprocessing Summary

In [None]:
# Summary of preprocessing steps
# X_original is copied only after rows with missing values have been dropped,
# so its own null count is always zero. Missing-value facts are therefore taken
# from the source frame and from the number of rows that were removed.
missing_in_source = df[selected_features].isnull().sum().sum()
rows_removed = len(df) - len(X_original)

print("DATA PREPROCESSING SUMMARY")
print("=" * 70)
print(f"\n1. Feature Selection:")
print(f"   - Selected features: {selected_features}")
print(f"   - Original dataset shape: {df.shape}")
print(f"   - Selected features shape: {X_original.shape}")

print(f"\n2. Missing Values:")
print(f"   - Missing values in selected features: {missing_in_source}")
print(f"   - Action taken: {'Dropped rows' if rows_removed > 0 else 'No action needed'}")

print(f"\n3. Datasets Created:")
print(f"   - X_original (without preprocessing): {X_original.shape}")
print(f"     * Scale: Original units")
print(f"     * {selected_features[0]} range: [{X_original[selected_features[0]].min():.2f}, {X_original[selected_features[0]].max():.2f}]")
print(f"     * {selected_features[1]} range: [{X_original[selected_features[1]].min():.2f}, {X_original[selected_features[1]].max():.2f}]")

print(f"\n   - X_scaled (with StandardScaler): {X_scaled.shape}")
print(f"     * Scale: Standardized (mean=0, std=1)")
print(f"     * {selected_features[0]} range: [{X_scaled[selected_features[0]].min():.2f}, {X_scaled[selected_features[0]].max():.2f}]")
print(f"     * {selected_features[1]} range: [{X_scaled[selected_features[1]].min():.2f}, {X_scaled[selected_features[1]].max():.2f}]")

print(f"\n4. Ready for Clustering:")
print(f"   ✓ Two datasets prepared")
print(f"   ✓ No missing values")
print(f"   ✓ Features selected and scaled")
print(f"   ✓ Visualizations saved to '{viz_dir}' directory")

print("\n" + "=" * 70)
print("Section 1.2 Complete! Ready for clustering analysis.")
print("=" * 70)