# 📞 Customer Segmentation 💁‍♀️

In [21]:
import pandas as pd
from unsupervised.styler import style_dataframe

In [None]:
# Load the raw customer table from the CSV file under data/.
cs_ds = pd.read_csv('data/customer_data.csv')
# Trailing expression: shows the column names as the cell output.
cs_ds.columns

In [None]:
# Preview the first rows of the raw data, rendered through the project's styling helper.
style_dataframe(cs_ds.head())

In [24]:
from unsupervised.missing_values import missing_values_summarizer

In [None]:
# Summarise missing values; only the first returned object is kept, the second is discarded.
# NOTE(review): presumably prop_miss_df holds per-column missing proportions — confirm in unsupervised.missing_values.
prop_miss_df, _ = missing_values_summarizer(cs_ds)
style_dataframe(prop_miss_df)

## Drop unnecessary fields

In [26]:
# CustomerID is removed before modelling; the original frame `cs_ds` is left untouched.
cs_dataset = cs_ds.drop(columns=['CustomerID'])

## Split the data

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
# There is no target column, so only the feature frame is split.
X_train, X_test = train_test_split(cs_dataset, 
                                   test_size=0.3, 
                                   random_state=42)

In [None]:
# Display (rows, columns) of the training split.
X_train.shape

In [None]:
# Display (rows, columns) of the test split.
X_test.shape

## One Hot encode

In [31]:
from sklearn.preprocessing import OneHotEncoder
# drop='first' omits the first category of each encoded feature to avoid a redundant column.
# handle_unknown='ignore' keeps transform() from raising on categories not seen during fit.
# NOTE(review): combining `drop` with handle_unknown='ignore' requires a recent scikit-learn — confirm the pinned version.
one_hot_enc = OneHotEncoder(drop='first', 
                            handle_unknown='ignore')

In [None]:
# Learn the Gender categories from the training split only, so nothing from the test split leaks in.
one_hot_enc.fit(X_train[['Gender']])

In [33]:
# Encode Gender in both splits with the encoder fitted on the training data;
# .toarray() converts the encoder output to a dense array.
X_train_encoded = one_hot_enc.transform(X_train[['Gender']]).toarray()
X_test_encoded = one_hot_enc.transform(X_test[['Gender']]).toarray()

In [34]:
# Wrap the encoded arrays in DataFrames labelled with the encoder's output column names.
# Both frames get a fresh default integer index.
gender_feature_names = one_hot_enc.get_feature_names_out(['Gender'])
X_train_enc_df = pd.DataFrame(X_train_encoded, columns=gender_feature_names)
X_test_enc_df = pd.DataFrame(X_test_encoded, columns=gender_feature_names)

In [35]:
# Replace the shuffled row labels left by the split with a 0..n-1 index so the
# splits line up row-for-row with the encoded frames when concatenated.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [36]:
# Swap the raw Gender column for its one-hot encoded counterpart (training split).
train_without_gender = X_train.drop(columns=['Gender'])
X_train = pd.concat([train_without_gender, X_train_enc_df], axis='columns')

In [37]:
# Swap the raw Gender column for its one-hot encoded counterpart (test split).
test_without_gender = X_test.drop(columns=['Gender'])
X_test = pd.concat([test_without_gender, X_test_enc_df], axis='columns')

## Scale

In [38]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the training split only; the same fitted scaler is reused for the test split.
scaler = StandardScaler()
scaler.fit(X_train)

In [40]:
# Apply the training-fitted scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modeling

In [41]:
import matplotlib.pyplot as plt

def plot_clustering_metrics(k_vals, 
                            sil_scores, 
                            davies_scores, 
                            cal_hab_scores,
                            x_axis_lbl='Number of clusters (k)',
                            figsize=(10,15)):
    """Plot three clustering-quality metrics against the number of clusters.

    Draws one line chart per metric, stacked vertically in a single figure,
    then shows the figure with ``plt.show()``.

    Args:
        k_vals: Cluster counts used as the x-values of every panel.
        sil_scores: Silhouette score for each entry of ``k_vals``.
        davies_scores: Davies-Bouldin index for each entry of ``k_vals``.
        cal_hab_scores: Calinski-Harabasz score for each entry of ``k_vals``.
        x_axis_lbl: X-axis label applied to all three panels.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # (scores, line colour, panel title, y-axis label) for each panel, top to bottom.
    panels = (
        (sil_scores, 'blue',
         'Silhouette Score vs. Number of Clusters (K-Means)',
         'Silhouette Score'),
        (davies_scores, 'red',
         'Davies-Bouldin Index vs. Number of Clusters',
         'Davies-Bouldin Index'),
        (cal_hab_scores, 'green',
         'Calinski-Harabasz Score vs. Number of Clusters',
         'Calinski-Harabasz Score'),
    )

    _, axes = plt.subplots(3, 1, figsize=figsize)
    for ax, (scores, color, title, y_axis_lbl) in zip(axes, panels):
        ax.plot(k_vals, scores, marker='o', color=color)
        ax.set_title(title)
        # Every panel uses the caller-supplied label, so a custom x_axis_lbl
        # is honoured consistently across the figure.
        ax.set_xlabel(x_axis_lbl)
        ax.set_ylabel(y_axis_lbl)
        ax.grid(True)

    plt.tight_layout()
    plt.show()

In [42]:
from sklearn.cluster import KMeans
import numpy as np

In [43]:
# Base estimator for the cluster-count search; the seed makes its fits repeatable.
model = KMeans(random_state=42)

In [44]:
from sklearn.metrics import (silhouette_score, 
                             davies_bouldin_score, 
                             calinski_harabasz_score)

def find_optimal_clusters(X, model, 
                          n_clusters=10,
                          majority_vote=True,
                          visualize=True):
    """Search for the best number of clusters using three internal metrics.

    For every ``k`` in ``range(2, n_clusters)`` a copy of ``model`` is fitted
    on ``X`` and scored with the silhouette score (higher is better), the
    Davies-Bouldin index (lower is better) and the Calinski-Harabasz score
    (higher is better). The best ``k`` per metric is printed.

    The estimator passed in is never modified: the search runs on a clone, so
    a model the caller has already fitted keeps its parameters and fit.

    Args:
        X: Feature matrix to cluster.
        model: A scikit-learn clustering estimator. If it exposes an
            ``n_clusters`` parameter it is set to each candidate ``k``;
            otherwise the estimator is fitted unchanged on every iteration.
        n_clusters: Exclusive upper bound of the candidate range, i.e. the
            largest ``k`` tried is ``n_clusters - 1``. Must be greater than 2.
        majority_vote: When true, also compute a combined choice: the ``k``
            picked by at least two metrics, or the silhouette choice when all
            three metrics disagree.
        visualize: When true, plot the three metric curves with
            ``plot_clustering_metrics``.

    Returns:
        A tuple ``(combined, silhouette, calinski_harabasz, davies_bouldin)``
        of chosen cluster counts. ``combined`` is ``None`` when
        ``majority_vote`` is false. Note that the Calinski-Harabasz choice
        comes before the Davies-Bouldin one.

    Raises:
        ValueError: If ``model`` has no ``fit`` method or cannot be cloned as
            a scikit-learn estimator, or if ``n_clusters`` is not greater
            than 2.
    """
    if not hasattr(model, "fit"):
        raise ValueError("Not a Sci-Kit Learn Model")
    if n_clusters <= 2:
        # range(2, n_clusters) would be empty and argmax/argmin would fail
        # with an unrelated-looking numpy error.
        raise ValueError(
            "n_clusters must be greater than 2 so that at least k=2 is "
            f"evaluated; got {n_clusters}"
        )

    # Imported here so the function does not depend on other notebook cells.
    from collections import Counter
    from sklearn.base import clone

    # Work on an unfitted copy: set_params()/fit() below must not overwrite
    # the caller's (possibly already trained) estimator.
    try:
        estimator = clone(model)
    except TypeError as err:
        raise ValueError("Not a Sci-Kit Learn Model") from err

    k_vals = range(2, n_clusters)
    sil_scores = []
    davies_scores = []
    cal_hab_scores = []

    for k in k_vals:
        if hasattr(estimator, 'n_clusters'):
            estimator.set_params(n_clusters=k)

        estimator.fit(X)
        # Estimators without a labels_ attribute are asked for labels directly.
        labels = (estimator.labels_ if hasattr(estimator, "labels_")
                  else estimator.fit_predict(X))
        sil_scores.append(silhouette_score(X, labels))
        davies_scores.append(davies_bouldin_score(X, labels))
        cal_hab_scores.append(calinski_harabasz_score(X, labels))

    optim_k_silhouette = k_vals[np.argmax(sil_scores)]
    # Davies-Bouldin is the only one of the three metrics where lower is better.
    optim_k_davies_bouldin = k_vals[np.argmin(davies_scores)]
    optim_k_cal_hab = k_vals[np.argmax(cal_hab_scores)]

    print(f'Optimal cluster based on Silhouette score is: {optim_k_silhouette}')
    print(f'Optimal cluster based on Davies-Bouldin score is: {optim_k_davies_bouldin}')
    print(f'Optimal cluster based on Calinski-Harabasz score is: {optim_k_cal_hab}')

    optim_k_comb = None
    if majority_vote:
        votes = Counter((optim_k_silhouette,
                         optim_k_davies_bouldin,
                         optim_k_cal_hab))
        top_k, top_count = votes.most_common(1)[0]
        # With three voters a majority needs two votes. When all metrics
        # disagree, fall back to the silhouette choice so the result does not
        # depend on set iteration order.
        optim_k_comb = top_k if top_count > 1 else optim_k_silhouette

    if visualize:
        plot_clustering_metrics(k_vals, sil_scores=sil_scores, 
                                davies_scores=davies_scores, 
                                cal_hab_scores=cal_hab_scores)

    return (optim_k_comb, optim_k_silhouette, 
            optim_k_cal_hab, optim_k_davies_bouldin)


## Find the optimal number of clusters on the training set

In [None]:
# Search candidate cluster counts on the scaled training data. Only the first
# returned value (the combined, majority-vote choice) is kept; the per-metric
# choices are discarded here but are still printed by the function.
clusters, _, _, _ = find_optimal_clusters(X=X_train_scaled, 
                                          n_clusters=10, 
                                          model=model)

## Use optimal clusters

In [46]:
# Final model with the selected cluster count. It is seeded like the search
# estimator so cluster assignments (and every plot derived from them) are
# reproducible across notebook runs.
model = KMeans(n_clusters=clusters, random_state=42)

In [47]:
# Fit the final model on the scaled training data and keep each training row's cluster label.
kmeans_labels = model.fit_predict(X=X_train_scaled)

## Use Principal Components Analysis for Visualisation

In [48]:
from sklearn.decomposition import PCA
# Project the scaled training data onto its first two principal components.
# This is used for 2-D plotting only; clustering was done on the full feature set.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X_train_scaled)

In [None]:
from unsupervised.scatter import plot_scatter
# Plot the 2-D PCA projection together with the KMeans labels via the project helper.
# NOTE(review): presumably points are coloured by label — confirm in unsupervised.scatter.
plot_scatter(X=reduced_data, labels=kmeans_labels, title='PCA on KMeans')

## Evaluate clusters using Test set

In [50]:
# Assign each test row to a cluster of the model fitted on the training data (no refitting).
test_cluster_labels = model.predict(X_test_scaled)

In [None]:
# Repeat the cluster-count search on the test split with a fresh, unfitted
# estimator. find_optimal_clusters calls set_params()/fit() on the object it
# receives, so passing the trained `model` would overwrite the fit obtained
# on the training data.
test_clusters, _, _, _ = find_optimal_clusters(X=X_test_scaled,
                                               n_clusters=10,
                                               model=KMeans(random_state=42))
test_clusters

### Compare Silhouettes for Training and Test

In [None]:
from unsupervised.silhouette_plot import silhouette_plot
# Silhouette plot for the training split, using the training labels and the selected cluster count.
# NOTE(review): the meaning of silh_colors is defined by the project helper — confirm in unsupervised.silhouette_plot.
silhouette_plot(X_train_scaled, kmeans_labels, clusters, 
                silh_colors=['red', 'black', 'navy'],
                title='Training Silhouette Plot')

In [None]:
# Silhouette plot for the test split. The test labels were predicted by the
# model fitted with `clusters` clusters, so that same count is passed here;
# the k found separately on the test split may differ from the number of
# clusters actually present in `test_cluster_labels`.
silhouette_plot(X_test_scaled, labels=test_cluster_labels, 
                n_clusters=clusters,
                silh_colors=['red', 'black', 'navy'],
                title='Testing Silhouette Plot')