# 0. Setting Up The Env

## 0.1 Packages

In [1]:
from pathlib import Path

In [6]:
import numpy as np
import pandas as pd

import itertools


In [4]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (adjusted_rand_score, adjusted_mutual_info_score,
                             homogeneity_score, completeness_score, v_measure_score,
                             fowlkes_mallows_score)
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

## 0.2 Utilities

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (adjusted_rand_score, adjusted_mutual_info_score,
                             homogeneity_score, completeness_score, v_measure_score,
                             fowlkes_mallows_score)
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

class ClusterEvaluator:
    """Fit clustering models on ``X`` and score them against known labels.

    For every (method, k) combination the evaluator records external
    agreement metrics against ``y_true``, a per-cluster profile (size,
    positive count, positive rate, lift), a downstream ROC-AUC obtained by
    predicting ``y_true`` from the one-hot cluster label alone, and a
    seed-stability score (pairwise ARI between repeated fits).

    NOTE(review): the profile sums ``y_true`` per cluster and the downstream
    check uses ``scoring='roc_auc'``, so ``y_true`` is presumably a binary
    0/1 target -- confirm against the caller.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.asarray``.
    y_true : array-like
        Reference labels; converted with ``np.asarray``.
    random_state : int or None, default 42
        Base seed for the seeded clustering models.
    repeats_stability : int, default 5
        Number of re-fits used for the stability score. Fewer than two
        repeats leaves no pair to compare, so stability is reported as NaN.
    """

    # Methods parameterised by a number of clusters / components ``k``.
    _K_METHODS = ('kmeans', 'gmm', 'agglo')
    # Every method name accepted by ``_cluster``.
    _KNOWN_METHODS = _K_METHODS + ('dbscan',)

    def __init__(self, X, y_true, random_state=42, repeats_stability=5):
        self.X = np.asarray(X)
        self.y_true = np.asarray(y_true)
        self.random_state = random_state
        self.repeats_stability = repeats_stability
        self.results = []   # replaced by a DataFrame after ``run_all``
        self.profiles = {}  # (method, k) -> per-cluster profile DataFrame

    # -----------------------------
    def _cluster(self, method, k=None, **kwargs):
        """Fit one clustering model on ``self.X``.

        Parameters
        ----------
        method : {'kmeans', 'gmm', 'dbscan', 'agglo'}
        k : int, optional
            Number of clusters / mixture components (unused by DBSCAN).
        **kwargs
            Extra estimator keyword arguments. They take precedence over the
            defaults chosen here (e.g. ``n_init``). ``random_state`` overrides
            the evaluator's seed for seeded models and is dropped for models
            that do not accept one, so callers can pass it uniformly.

        Returns
        -------
        (labels, model)

        Raises
        ------
        ValueError
            If ``method`` is not recognised.
        """
        # Pull the seed out of kwargs first: forwarding it next to the default
        # ``random_state=`` used to raise "got multiple values for keyword
        # argument", which broke every stability re-fit.
        seed = kwargs.pop('random_state', self.random_state)

        if method == 'kmeans':
            params = {'n_clusters': k, 'init': 'k-means++', 'n_init': 50,
                      'random_state': seed, **kwargs}
            model = KMeans(**params)
        elif method == 'gmm':
            params = {'n_components': k, 'covariance_type': 'full',
                      'n_init': 5, 'random_state': seed, **kwargs}
            model = GaussianMixture(**params)
        elif method == 'dbscan':
            # DBSCAN takes no seed and no k.
            model = DBSCAN(**kwargs)
        elif method == 'agglo':
            # AgglomerativeClustering takes no seed.
            model = AgglomerativeClustering(**{'n_clusters': k, **kwargs})
        else:
            raise ValueError(f"Unknown method: {method}")

        return model.fit_predict(self.X), model

    # -----------------------------
    @staticmethod
    def _one_hot(labels):
        """Return a dense float one-hot matrix with one column per distinct label.

        Columns follow the sorted order of the distinct labels. Implemented
        with NumPy so it does not depend on the scikit-learn version (the
        ``sparse=`` keyword of ``OneHotEncoder`` was renamed in newer releases).
        """
        flat = np.asarray(labels).ravel()
        categories, codes = np.unique(flat, return_inverse=True)
        return np.eye(len(categories))[codes]

    def _profile(self, labels):
        """Per-cluster size, positive count, positive rate and lift, best first."""
        dfc = pd.DataFrame({'cluster': labels, 'y': self.y_true})
        grp = dfc.groupby('cluster').agg(n=('y', 'size'), positives=('y', 'sum'))
        grp['pos_rate'] = grp['positives'] / grp['n']
        # The epsilon guards against division by zero when there are no positives.
        grp['lift'] = grp['pos_rate'] / (np.mean(self.y_true) + 1e-12)
        return grp.sort_values('pos_rate', ascending=False)

    def _downstream_auc(self, labels):
        """Mean 5-fold ROC-AUC of a random forest predicting y from the cluster id."""
        Z = self._one_hot(labels)
        clf = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        return cross_val_score(clf, Z, self.y_true, cv=cv,
                               scoring='roc_auc', n_jobs=-1).mean()

    def _stability(self, method, k, **cluster_kwargs):
        """Mean and std of pairwise ARI between re-fits with different seeds.

        Returns ``(nan, nan)`` for DBSCAN (its labels depend heavily on
        eps/min_samples rather than a seed) and when fewer than two repeats
        are configured.
        """
        if method not in self._K_METHODS or self.repeats_stability < 2:
            return np.nan, np.nan

        runs = []
        for i in range(self.repeats_stability):
            # With no base seed every re-fit is left unseeded.
            seed = None if self.random_state is None else self.random_state + i
            run_labels, _ = self._cluster(
                method, k, **{**cluster_kwargs, 'random_state': seed})
            runs.append(run_labels)

        aris = [adjusted_rand_score(a, b)
                for a, b in itertools.combinations(runs, 2)]
        return float(np.mean(aris)), float(np.std(aris))

    # -----------------------------
    def _evaluate(self, labels, method, k=None, **cluster_kwargs):
        """Compute external metrics, downstream test, stability, profiles.

        Parameters
        ----------
        labels : array-like
            Cluster assignment for every row of ``self.X``.
        method, k
            Identify the run; also used as the key into ``self.profiles``.
        **cluster_kwargs
            Estimator keyword arguments used for the original fit, re-used
            for the stability re-fits so they evaluate the same configuration.

        Returns
        -------
        dict
            One result row (metrics, downstream AUC, stability, top-cluster
            positive rate and lift).
        """
        labels = np.asarray(labels)
        y_true = self.y_true

        grp = self._profile(labels)
        self.profiles[(method, k)] = grp

        stability_mean, stability_std = self._stability(method, k, **cluster_kwargs)

        return {
            'method': method,
            'k': k,
            'ARI': adjusted_rand_score(y_true, labels),
            'AMI': adjusted_mutual_info_score(y_true, labels),
            'homogeneity': homogeneity_score(y_true, labels),
            'completeness': completeness_score(y_true, labels),
            'v_measure': v_measure_score(y_true, labels),
            'fowlkes_mallows': fowlkes_mallows_score(y_true, labels),
            'downstream_auc': self._downstream_auc(labels),
            'stability_mean_ari': stability_mean,
            'stability_std_ari': stability_std,
            'global_pos_rate': np.mean(y_true),
            'top_cluster_pos_rate': grp['pos_rate'].iloc[0],
            'top_cluster_lift': grp['lift'].iloc[0]
        }

    # -----------------------------
    def run_all(self, methods=('kmeans', 'gmm'), ks=range(2, 9), **kwargs):
        """Fit and evaluate every requested (method, k) combination.

        Parameters
        ----------
        methods : str or iterable of str
            Method names understood by ``_cluster``; a single name may be
            given as a plain string.
        ks : iterable of int
            Cluster counts tried for the k-based methods (ignored by DBSCAN).
        **kwargs
            Forwarded to every estimator and to the stability re-fits.

        Returns
        -------
        pandas.DataFrame
            One row per evaluated run; also stored in ``self.results``.

        Raises
        ------
        ValueError
            If any requested method is unknown (checked before fitting
            anything, so a typo cannot silently yield an empty result).
        """
        if isinstance(methods, str):
            methods = [methods]
        methods = list(methods)
        for method in methods:
            if method not in self._KNOWN_METHODS:
                raise ValueError(f"Unknown method: {method}")

        # Materialise so a generator of k values survives several methods.
        ks = list(ks)

        results = []
        for method in methods:
            # DBSCAN has no k: evaluate it once with k=None.
            candidates = ks if method in self._K_METHODS else [None]
            for k in candidates:
                labels, _ = self._cluster(method, k=k, **kwargs)
                results.append(self._evaluate(labels, method, k, **kwargs))

        self.results = pd.DataFrame(results)
        return self.results

    def get_profiles(self, method, k=None):
        """Return the stored cluster profile for ``(method, k)``, or ``None``."""
        return self.profiles.get((method, k))


## 0.3 Constants

In [2]:
# Project root: the parent of the current working directory (this assumes the
# notebook is launched from a sub-folder of the project -- TODO confirm).
project_root = Path.cwd().parent
# Bare expression so the notebook echoes the resolved path.
project_root

WindowsPath('d:/01 Work/06-Segmentations')

In [3]:
# Join with pathlib so the separators are consistent on every OS (the previous
# f-string mixed "\\" and "/" on Windows). Kept as ``str`` so code expecting a
# string path keeps working.
processed_file_path = str(project_root / "data" / "processed" / "preprocessed_bank.csv")
# Bare expression so the notebook echoes the resulting path.
processed_file_path

'd:\\01 Work\\06-Segmentations/data/processed/preprocessed_bank.csv'

# 1. The Data

# 2. Data Cleaning

# END