# 0. Setting Up The Env

## 0.1 Packages

In [32]:
from pathlib import Path

In [33]:
import numpy as np
import pandas as pd

import itertools


In [34]:
from sklearn.preprocessing import StandardScaler

In [35]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (adjusted_rand_score, adjusted_mutual_info_score,
                             homogeneity_score, completeness_score, v_measure_score,
                             fowlkes_mallows_score)
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

## 0.2 Utilities

In [36]:
class ClusterEvaluator:
    def __init__(self, X):
        self.X = X
        self.models = {
            "kmeans": KMeans,
            "gmm": GaussianMixture,
            "dbscan": DBSCAN,
            "hierarchical": AgglomerativeClustering
        }
    
    def fit_and_evaluate(self, algo_name, **kwargs):
        algo_name = algo_name.lower()
        if algo_name not in self.models:
            raise ValueError(f"Unsupported algorithm: {algo_name}")
        
        # Handle parameter filtering
        if algo_name == "kmeans":
            allowed = {"n_clusters", "init", "max_iter", "random_state"}
        elif algo_name == "gmm":
            allowed = {"n_components", "covariance_type", "max_iter", "random_state"}
        elif algo_name == "dbscan":
            allowed = {"eps", "min_samples", "metric"}
        elif algo_name == "hierarchical":
            allowed = {"n_clusters", "linkage", "affinity"}
        else:
            allowed = {}
        
        params = {k: v for k, v in kwargs.items() if k in allowed}
        model = self.models[algo_name](**params)
        
        if algo_name == "gmm":
            labels = model.fit_predict(self.X)
        else:
            labels = model.fit(self.X).labels_
        
        # Handle single-cluster/noise cases
        if len(set(labels)) <= 1:
            return {"silhouette": None, "db": None, "ch": None}
        
        results = {
            "silhouette": silhouette_score(self.X, labels),
            "db": davies_bouldin_score(self.X, labels),
            "ch": calinski_harabasz_score(self.X, labels)
        }
        return results


## 0.3 Constants

In [37]:
project_root = Path.cwd().parent
project_root

WindowsPath('d:/01 Work/06-Segmentations')

In [38]:
processed_file_path = f"{project_root}/data/processed/preprocessed_bank.csv"
processed_file_path

'd:\\01 Work\\06-Segmentations/data/processed/preprocessed_bank.csv'

# 1. The Data

In [39]:
data_df  = pd.read_csv(processed_file_path)
print(f"The data has {data_df.shape[0]} rows and {data_df.shape[1]} columns")
print("Data Preview:")
data_df.head()

The data has 4521 rows and 34 columns
Data Preview:


Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,marital_married,marital_single,education_primary,education_secondary,education_tertiary,contact_cellular,contact_unknown,poutcome_failure,poutcome_other,poutcome_unknown
0,30,0,1787,0,0,19,79,1,-1,0,...,1,0,1,0,0,1,0,0,0,1
1,33,0,4789,1,1,11,220,1,339,4,...,1,0,0,1,0,1,0,1,0,0
2,35,0,1350,1,0,16,185,1,330,1,...,0,1,0,0,1,1,0,1,0,0
3,30,0,1476,1,1,3,199,4,-1,0,...,1,0,0,0,1,0,1,0,0,1
4,59,0,0,1,0,5,226,1,-1,0,...,1,0,0,1,0,0,1,0,0,1


In [40]:
data_df.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'y', 'sin_month', 'cos_month',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'marital_married',
       'marital_single', 'education_primary', 'education_secondary',
       'education_tertiary', 'contact_cellular', 'contact_unknown',
       'poutcome_failure', 'poutcome_other', 'poutcome_unknown'],
      dtype='object')

# 2. Standard Scaling

In [41]:
X = data_df.drop("y", axis=1)
y = data_df.loc[:,"y"]

In [42]:
scaler = StandardScaler()
scaled_array = scaler.fit_transform(X)

In [43]:
evaluator = ClusterEvaluator(X)

# KMeans
print(evaluator.fit_and_evaluate("kmeans", n_clusters=3, random_state=42))

# GMM
print(evaluator.fit_and_evaluate("gmm", n_components=3, random_state=42))

# DBSCAN
print(evaluator.fit_and_evaluate("dbscan", eps=0.5, min_samples=5))

# Hierarchical
print(evaluator.fit_and_evaluate("hierarchical", n_clusters=3, linkage="ward"))


{'silhouette': 0.744809185409758, 'db': 0.5220085682633675, 'ch': 7199.0098247403}
{'silhouette': -0.1839429250684907, 'db': 7.721332670582633, 'ch': 25.53933611662522}
{'silhouette': None, 'db': None, 'ch': None}
{'silhouette': 0.6788621506388782, 'db': 0.6134003560345574, 'ch': 5408.867575425097}


# END