# 0. Setting Up The Env.

## 0.1 Import Packages

In [1]:
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd

import itertools

In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")

## 0.2 Utilities

## 0.3 Constants

In [5]:
project_root = Path.cwd().parent
project_root

WindowsPath('d:/01 Work/06-Segmentations')

In [21]:
raw_file_path = f"{project_root}/data/raw/bank-additional.csv"
raw_file_path

'd:\\01 Work\\06-Segmentations/data/raw/bank-additional.csv'

# 1. The Data

In [22]:
df  = pd.read_csv(raw_file_path, sep=";")
print(f"The data has {df.shape[0]} rows and {df.shape[1]} columns")
print("Data Preview:")
df.head()

The data has 4119 rows and 21 columns
Data Preview:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [30]:
# Encode categorical variables
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 
                       'loan', 'contact', 'month', 'poutcome', 'y']
numerical_features = ['age', 'duration', 'campaign', 'pdays', 
                     'previous', 'emp.var.rate', 'cons.price.idx', 
                     'cons.conf.idx', 'euribor3m', 'nr.employed']

le = LabelEncoder()
df_encoded = df.copy()
for col in categorical_features:
    df_encoded[col] = le.fit_transform(df[col])

X = df_encoded.drop('y', axis=1)
y = df_encoded['y']

# Standardize features
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numerical_features] = scaler.fit_transform(X[numerical_features])
X_scaled.drop('day_of_week', axis=1, inplace=True)

## K-Means

In [34]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for i in range(2,20):
    kmeans_16 = KMeans(n_clusters=i, random_state=42, n_init=10)
    clusters_kmeans = kmeans_16.fit_predict(X_scaled)
    silhouette_kmeans = silhouette_score(X_scaled, clusters_kmeans)
    print(f"K-means Silhouette Score for {i} clusters: {silhouette_kmeans:.4f}")

K-means Silhouette Score for 2 clusters: 0.2939
K-means Silhouette Score for 3 clusters: 0.2146
K-means Silhouette Score for 4 clusters: 0.2158
K-means Silhouette Score for 5 clusters: 0.1857
K-means Silhouette Score for 6 clusters: 0.1814
K-means Silhouette Score for 7 clusters: 0.1852
K-means Silhouette Score for 8 clusters: 0.1951
K-means Silhouette Score for 9 clusters: 0.1857
K-means Silhouette Score for 10 clusters: 0.1716
K-means Silhouette Score for 11 clusters: 0.1782
K-means Silhouette Score for 12 clusters: 0.1746
K-means Silhouette Score for 13 clusters: 0.1856
K-means Silhouette Score for 14 clusters: 0.1802
K-means Silhouette Score for 15 clusters: 0.1792
K-means Silhouette Score for 16 clusters: 0.1785
K-means Silhouette Score for 17 clusters: 0.1720
K-means Silhouette Score for 18 clusters: 0.1848
K-means Silhouette Score for 19 clusters: 0.1820


## Hierarchical Clustering

In [32]:
from sklearn.cluster import AgglomerativeClustering

# Test different linkage methods
linkage_methods = ['ward', 'complete', 'average', 'single']

for method in linkage_methods:
    for n_clusters in [2, 3, 4, 5, 8, 10]:
        agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage=method)
        clusters_agg = agg_clustering.fit_predict(X_scaled)
        silhouette_agg = silhouette_score(X_scaled, clusters_agg)
        print(f"{method} linkage, {n_clusters} clusters: {silhouette_agg:.4f}")


ward linkage, 2 clusters: 0.2664
ward linkage, 3 clusters: 0.1854
ward linkage, 4 clusters: 0.1200
ward linkage, 5 clusters: 0.1312
ward linkage, 8 clusters: 0.1362
ward linkage, 10 clusters: 0.1400
complete linkage, 2 clusters: 0.1577
complete linkage, 3 clusters: 0.1515
complete linkage, 4 clusters: 0.0827
complete linkage, 5 clusters: 0.1721
complete linkage, 8 clusters: 0.1325
complete linkage, 10 clusters: 0.1356
average linkage, 2 clusters: 0.4817
average linkage, 3 clusters: 0.2757
average linkage, 4 clusters: 0.2355
average linkage, 5 clusters: 0.1856
average linkage, 8 clusters: 0.2481
average linkage, 10 clusters: 0.2036
single linkage, 2 clusters: 0.4817
single linkage, 3 clusters: 0.4367
single linkage, 4 clusters: 0.4178
single linkage, 5 clusters: 0.3154
single linkage, 8 clusters: 0.1836
single linkage, 10 clusters: 0.1734


## DBSCAN

In [51]:
from sklearn.cluster import DBSCAN

# Test different parameter combinations
eps_values = [0.5, 0.7, 1.0, 1.2, 1.5]
min_samples_values = [10, 50, 100, 200]

best_result = {'silhouette': -1, 'params': None}

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        clusters_dbscan = dbscan.fit_predict(X_scaled)
        
        n_clusters = len(set(clusters_dbscan)) - (1 if -1 in clusters_dbscan else 0)
        if n_clusters > 1:
            mask = clusters_dbscan != -1
            silhouette_dbscan = silhouette_score(X_scaled[mask], clusters_dbscan[mask])
            
            if silhouette_dbscan > best_result['silhouette']:
                best_result = {'silhouette': silhouette_dbscan, 'params': {'eps': eps, 'min_samples': min_samples}}


In [52]:
best_result

{'silhouette': 0.7555688761752094, 'params': {'eps': 0.7, 'min_samples': 10}}

In [37]:
from sklearn.mixture import GaussianMixture

# Test different numbers of components
for n_components in range(2, 11):
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)
    clusters_gmm = gmm.fit_predict(X_scaled)
    silhouette_gmm = silhouette_score(X_scaled, clusters_gmm)
    aic = gmm.aic(X_scaled)
    bic = gmm.bic(X_scaled)
    print(f"Components: {n_components}, Silhouette: {silhouette_gmm:.4f}, AIC: {aic:.2f}")


Components: 2, Silhouette: 0.1925, AIC: 62560.67
Components: 3, Silhouette: 0.1081, AIC: -71112.90
Components: 4, Silhouette: 0.1176, AIC: -43499.92
Components: 5, Silhouette: 0.0957, AIC: -115225.77
Components: 6, Silhouette: 0.0606, AIC: -116633.68
Components: 7, Silhouette: 0.0899, AIC: -141240.32
Components: 8, Silhouette: 0.0383, AIC: -162389.24
Components: 9, Silhouette: 0.0343, AIC: -159103.66
Components: 10, Silhouette: 0.0497, AIC: -189059.99


In [38]:
from sklearn.cluster import Birch

threshold_values = [0.01, 0.05, 0.1, 0.3, 0.5, 1.0]
n_clusters_values = [None, 5, 8, 10, 15, 20]

for threshold in threshold_values:
    for n_clusters in n_clusters_values:
        birch = Birch(threshold=threshold, n_clusters=n_clusters)
        clusters_birch = birch.fit_predict(X_scaled)
        
        if len(set(clusters_birch)) > 1:
            silhouette_birch = silhouette_score(X_scaled, clusters_birch)
            print(f"Threshold: {threshold}, Clusters: {n_clusters}, Silhouette: {silhouette_birch:.4f}")


Threshold: 0.01, Clusters: None, Silhouette: 0.0010
Threshold: 0.01, Clusters: 5, Silhouette: 0.1312
Threshold: 0.01, Clusters: 8, Silhouette: 0.1362
Threshold: 0.01, Clusters: 10, Silhouette: 0.1400
Threshold: 0.01, Clusters: 15, Silhouette: 0.1510
Threshold: 0.01, Clusters: 20, Silhouette: 0.1589
Threshold: 0.05, Clusters: None, Silhouette: 0.0037
Threshold: 0.05, Clusters: 5, Silhouette: 0.1311
Threshold: 0.05, Clusters: 8, Silhouette: 0.1325
Threshold: 0.05, Clusters: 10, Silhouette: 0.1366
Threshold: 0.05, Clusters: 15, Silhouette: 0.1536
Threshold: 0.05, Clusters: 20, Silhouette: 0.1586
Threshold: 0.1, Clusters: None, Silhouette: 0.0130
Threshold: 0.1, Clusters: 5, Silhouette: 0.1126
Threshold: 0.1, Clusters: 8, Silhouette: 0.1449
Threshold: 0.1, Clusters: 10, Silhouette: 0.1318
Threshold: 0.1, Clusters: 15, Silhouette: 0.1505
Threshold: 0.1, Clusters: 20, Silhouette: 0.1575
Threshold: 0.3, Clusters: None, Silhouette: 0.0549
Threshold: 0.3, Clusters: 5, Silhouette: 0.0963
Thresho

# END