<a href="https://colab.research.google.com/github/Tejaswi37/2303A51944-Batch-27-/blob/main/ML_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Ensemble clustering pipeline for Debt Burden & Risk Segmentation

This script performs:
- Data loading and preprocessing
- Feature engineering and outlier flagging (IsolationForest marks outliers; no rows are removed)
- Multiple base clusterings (KMeans, Agglomerative, GMM, DBSCAN)
- Build a co-association (consensus) matrix from base clusterings
- Apply Spectral Clustering on the consensus matrix to obtain final ensemble clusters
- Evaluate clusters with Silhouette, Davies-Bouldin, Calinski-Harabasz indices
- Profile clusters and save results

Usage: edit `DATA_PATH` and run.
"""

import os
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# -----------------------------
# CONFIG
# -----------------------------
# Input CSV read by main(); the value looks like a Colab upload path — edit for other environments.
DATA_PATH = "/content/synthetic_personal_finance_dataset (1) (1).csv"
# Directory that receives every artifact the pipeline writes (matrix, CSVs, PNG).
OUTPUT_DIR = "/mnt/data/ensemble_clustering_output"
# NOTE: runs at import time, so merely importing this module creates the directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Seed shared by the estimators in this file that accept `random_state`.
RANDOM_STATE = 42
BASE_K_RANGE = range(2, 7)  # cluster counts 2..6 tried by the base KMeans/GMM/Agglomerative runs
FINAL_K_CANDIDATES = range(2, 8)  # cluster counts 2..7 tried for the final consensus clustering

# -----------------------------
# HELPERS
# -----------------------------

def load_data(path):
    """Read the CSV file at ``path`` into a DataFrame.

    Prints the shape of the loaded frame before returning it.

    Raises:
        FileNotFoundError: if nothing exists at ``path``.
    """
    if os.path.exists(path):
        frame = pd.read_csv(path)
        print(f"Loaded data with shape: {frame.shape}")
        return frame
    raise FileNotFoundError(f"Data file not found at {path}")


def select_features(df):
    """Return a copy of the five finance columns used for clustering.

    The columns are returned in a fixed order (income, expenses, savings,
    debt-to-income ratio, savings-to-income ratio).

    Raises:
        ValueError: listing every required column that ``df`` lacks.
    """
    required = (
        "monthly_income_usd",
        "monthly_expenses_usd",
        "savings_usd",
        "debt_to_income_ratio",
        "savings_to_income_ratio",
    )

    absent = []
    for column in required:
        if column not in df.columns:
            absent.append(column)
    if len(absent) > 0:
        raise ValueError(f"Missing required columns: {absent}")

    # Copy so later preprocessing never writes through to the caller's frame.
    subset = df[list(required)]
    return subset.copy()


def preprocess(df_feat):
    """Impute, sanitize, flag outliers in, and robust-scale the feature frame.

    Steps, in order:
      1. Median-impute missing values.
      2. Floor income at 1.0 and expenses/savings at 0.0.
      3. Add ``expenses_to_income_ratio`` and cap the three ratio columns.
      4. Fit an IsolationForest and record its verdict in ``is_outlier``
         (1 = flagged). Rows are only flagged, never dropped.
      5. Scale every column except ``is_outlier`` with a RobustScaler.

    Args:
        df_feat: DataFrame holding the columns produced by ``select_features``.

    Returns:
        ``(X_scaled, scaler)``: the scaled DataFrame (with a fresh 0..n-1
        index and the unscaled ``is_outlier`` column appended) and the fitted
        RobustScaler.
    """
    # Median imputation; the result gets a fresh RangeIndex.
    imputer = SimpleImputer(strategy="median")
    X = pd.DataFrame(imputer.fit_transform(df_feat), columns=df_feat.columns)

    # Floor nonsensical values with vectorized clips (income must stay
    # strictly positive because it is used as a divisor below).
    X["monthly_income_usd"] = X["monthly_income_usd"].clip(lower=1.0)
    X["monthly_expenses_usd"] = X["monthly_expenses_usd"].clip(lower=0.0)
    X["savings_usd"] = X["savings_usd"].clip(lower=0.0)

    # Additional feature: expenses_to_income_ratio
    X["expenses_to_income_ratio"] = X["monthly_expenses_usd"] / X["monthly_income_usd"]

    # Cap extreme ratios so a few huge values cannot dominate the scaling.
    X["expenses_to_income_ratio"] = X["expenses_to_income_ratio"].clip(0, 10)
    X["debt_to_income_ratio"] = X["debt_to_income_ratio"].clip(0, 5)
    X["savings_to_income_ratio"] = X["savings_to_income_ratio"].clip(0, 20)

    # Outlier flagging via IsolationForest: fit_predict returns -1 for
    # outliers; the rows are kept and only marked in `is_outlier`.
    iso = IsolationForest(n_estimators=200, contamination=0.02, random_state=RANDOM_STATE)
    outlier_flag = iso.fit_predict(X)
    X["is_outlier"] = (outlier_flag == -1).astype(int)

    # Scale features (RobustScaler to limit outlier influence); the flag
    # column is excluded from scaling and re-attached unchanged.
    scaler = RobustScaler()
    features_to_scale = [c for c in X.columns if c != "is_outlier"]
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X[features_to_scale]),
        columns=features_to_scale,
    )
    X_scaled["is_outlier"] = X["is_outlier"].values

    return X_scaled, scaler


# -----------------------------
# BASE CLUSTERINGS
# -----------------------------

def generate_base_clusterings(X, random_state=RANDOM_STATE):
    """
    Fit a varied set of clusterers on ``X`` and collect their label arrays.

    The ensemble is produced in this order: KMeans, Gaussian mixture and
    Ward agglomerative clustering for every k in ``BASE_K_RANGE``, followed by
    DBSCAN at three ``eps`` values.

    Returns a list of ``(name, labels)`` tuples, one per base clustering.
    """

    def _fit(tag, estimator):
        # Every estimator used here exposes fit_predict.
        return (tag, estimator.fit_predict(X))

    # 1) KMeans over the candidate cluster counts
    ensemble = [
        _fit(f"kmeans_k{k}", KMeans(n_clusters=k, random_state=random_state, n_init=20))
        for k in BASE_K_RANGE
    ]

    # 2) Gaussian mixtures over the same cluster counts
    ensemble.extend(
        _fit(f"gmm_k{k}", GaussianMixture(n_components=k, random_state=random_state))
        for k in BASE_K_RANGE
    )

    # 3) Ward-linkage agglomerative clustering over the same cluster counts
    ensemble.extend(
        _fit(f"agg_k{k}", AgglomerativeClustering(n_clusters=k, linkage="ward"))
        for k in BASE_K_RANGE
    )

    # 4) Density-based DBSCAN at a few neighbourhood radii
    ensemble.extend(
        _fit(f"dbscan_eps{eps}", DBSCAN(eps=eps, min_samples=10))
        for eps in (0.3, 0.5, 0.7)
    )

    print(f"Generated {len(ensemble)} base clusterings")
    return ensemble


# -----------------------------
# CONSENSUS (CO-ASSOCIATION) MATRIX
# -----------------------------

def build_coassociation_matrix(clusterings, n_samples):
    """
    Build co-association matrix A where A[i,j] is the fraction of clusterings
    that placed samples i and j in the same cluster.

    Noise points (label -1, as emitted by DBSCAN) are never considered
    co-clustered with anything but themselves: each one is given its own
    unique label before the pairwise comparison.

    Args:
        clusterings: non-empty sequence of ``(name, labels)`` pairs, where
            ``labels`` is a 1-D array-like of length ``n_samples``.
        n_samples: number of samples (side length of the returned matrix).

    Returns:
        ``(n_samples, n_samples)`` float ndarray with values in [0, 1].

    Raises:
        ValueError: if ``clusterings`` is empty or a label array does not
            have shape ``(n_samples,)``.
    """
    m = len(clusterings)
    if m == 0:
        # Dividing by zero below would silently yield an all-NaN matrix.
        raise ValueError("At least one base clustering is required")

    co_assoc = np.zeros((n_samples, n_samples), dtype=float)
    for _name, labels in clusterings:
        # np.array copies, so the caller's label array is never modified.
        lbl = np.array(labels)
        if lbl.shape != (n_samples,):
            raise ValueError(
                f"Expected labels of shape ({n_samples},), got {lbl.shape}"
            )

        noise_indices = np.flatnonzero(lbl == -1)
        if noise_indices.size:
            # Widen first so the fresh ids cannot overflow a narrow int dtype,
            # then hand every noise point an id above all real labels.
            lbl = lbl.astype(np.int64)
            lbl[noise_indices] = lbl.max() + 1 + np.arange(noise_indices.size)

        # Adding the boolean equality matrix in place avoids allocating a
        # second full-size float matrix per clustering.
        co_assoc += lbl[:, None] == lbl[None, :]

    co_assoc /= float(m)
    return co_assoc


# -----------------------------
# ENSEMBLE (SPECTRAL ON CO-ASSOC)
# -----------------------------

def spectral_consensus_clustering(co_assoc, k):
    """
    Partition the samples into ``k`` groups by running spectral clustering
    with the co-association matrix used directly as a precomputed affinity.

    Returns the label array produced by ``SpectralClustering.fit_predict``.
    """
    # Average the matrix with its transpose so the affinity is exactly symmetric.
    symmetric = np.add(co_assoc, co_assoc.T) / 2.0
    # Replace NaN/inf entries with finite numbers before handing off to sklearn.
    affinity = np.nan_to_num(symmetric)

    model = SpectralClustering(
        n_clusters=k,
        affinity="precomputed",
        random_state=RANDOM_STATE,
    )
    return model.fit_predict(affinity)


# -----------------------------
# EVALUATION
# -----------------------------

def evaluate_clustering(X, labels):
    """Score a clustering with silhouette, Davies-Bouldin and Calinski-Harabasz.

    Args:
        X: feature matrix the labels refer to (one row per sample).
        labels: cluster label per sample.

    Returns:
        Dict with keys ``silhouette``, ``davies_bouldin`` and
        ``calinski_harabasz``. For degenerate clusterings the worst-case
        sentinel ``{-1, inf, -1}`` is returned instead of calling the metrics.
    """
    n_clusters = len(np.unique(labels))
    n_samples = np.size(labels)

    # The sklearn metrics need between 2 and n_samples - 1 distinct labels and
    # raise otherwise, so both degenerate extremes (a single cluster, or every
    # sample in its own cluster) get the sentinel scores.
    if n_clusters <= 1 or n_clusters >= n_samples:
        return {
            "silhouette": -1,
            "davies_bouldin": np.inf,
            "calinski_harabasz": -1,
        }

    return {
        "silhouette": silhouette_score(X, labels),
        "davies_bouldin": davies_bouldin_score(X, labels),
        "calinski_harabasz": calinski_harabasz_score(X, labels),
    }


# -----------------------------
# CLUSTER PROFILING
# -----------------------------

def profile_clusters(original_df, X_scaled_df, labels, output_prefix):
    """Summarize each cluster on the raw columns and save the table as CSV.

    ``labels`` is attached to a copy of ``original_df`` as
    ``ensemble_cluster``; per-cluster size and income/expense/savings/ratio
    statistics are computed, ordered by cluster size (largest first), written
    to ``<OUTPUT_DIR>/<output_prefix>_cluster_profiles.csv`` and returned.

    ``X_scaled_df`` is accepted but not used by this function.
    """
    labelled = original_df.copy()
    labelled["ensemble_cluster"] = labels

    # Output column name -> (source column, aggregation)
    aggregations = {
        "count": ("ensemble_cluster", "count"),
        "mean_income": ("monthly_income_usd", "mean"),
        "median_income": ("monthly_income_usd", "median"),
        "mean_expenses": ("monthly_expenses_usd", "mean"),
        "mean_savings": ("savings_usd", "mean"),
        "mean_dti": ("debt_to_income_ratio", "mean"),
        "mean_savings_to_income": ("savings_to_income_ratio", "mean"),
    }
    per_cluster = labelled.groupby("ensemble_cluster").agg(**aggregations)
    summary = per_cluster.reset_index().sort_values("count", ascending=False)

    destination = os.path.join(OUTPUT_DIR, f"{output_prefix}_cluster_profiles.csv")
    summary.to_csv(destination, index=False)
    print("Saved cluster profiles to CSV")
    return summary


# -----------------------------
# VISUALIZATION
# -----------------------------

def visualize_embedding(X, labels, title, filename):
    """Save a 2-D PCA scatter plot of ``X`` coloured by ``labels``.

    Args:
        X: feature matrix to project onto its first two principal components.
        labels: per-sample values used as the scatter colours.
        title: plot title.
        filename: path the PNG is written to (150 dpi).
    """
    pca = PCA(n_components=2, random_state=RANDOM_STATE)
    emb = pca.fit_transform(X)

    fig = plt.figure(figsize=(8, 6))
    try:
        plt.scatter(emb[:, 0], emb[:, 1], c=labels, s=10)
        plt.title(title)
        plt.xlabel("PC1")
        plt.ylabel("PC2")
        plt.tight_layout()
        plt.savefig(filename, dpi=150)
    finally:
        # Always release the figure, even when plotting or saving fails;
        # otherwise pyplot keeps it alive for the rest of the session.
        plt.close(fig)


# -----------------------------
# MAIN PIPELINE
# -----------------------------

def main():
    """Run the full ensemble-clustering pipeline and write its artifacts.

    Loads ``DATA_PATH``, preprocesses the selected features, builds the base
    clusterings and their co-association matrix, picks the final number of
    clusters from ``FINAL_K_CANDIDATES`` and saves to ``OUTPUT_DIR``: the
    co-association matrix, the labelled dataset, the cluster profiles, a PCA
    scatter plot and the per-k evaluation table.

    Raises:
        RuntimeError: if no candidate k yields a comparable score (e.g. every
            silhouette is NaN), so there are no labels to save.
    """
    df = load_data(DATA_PATH)

    # Keep original for profiling (profiles are reported on raw, unscaled values)
    df_original = df.copy()

    # select features and preprocess; the fitted scaler is not needed here
    df_feat = select_features(df)
    X_scaled_df, _ = preprocess(df_feat)

    # Prepare data matrix for clustering: the is_outlier flag is a diagnostic
    # column, not a feature, so it is dropped here.
    X_for_clustering = X_scaled_df.drop(columns=["is_outlier"]).values

    # generate base clusterings
    base_clusterings = generate_base_clusterings(X_for_clustering)

    # Build co-association matrix
    co_assoc = build_coassociation_matrix(base_clusterings, n_samples=X_for_clustering.shape[0])
    np.save(os.path.join(OUTPUT_DIR, "co_association_matrix.npy"), co_assoc)
    print("Saved co-association matrix")

    # Find best final k: prefer higher silhouette, then higher CH, then lower
    # DB. Tuples compare lexicographically, so the later entries only break
    # exact ties on the earlier ones. DB is negated because lower is better.
    best_k = None
    best_labels = None
    best_rank = (-999, float("-inf"), float("-inf"))
    evaluations = []
    for k in FINAL_K_CANDIDATES:
        labels_k = spectral_consensus_clustering(co_assoc, k)
        metrics = evaluate_clustering(X_for_clustering, labels_k)
        evaluations.append((k, metrics))
        print(f"k={k}: silhouette={metrics['silhouette']:.4f}, db={metrics['davies_bouldin']:.4f}, ch={metrics['calinski_harabasz']:.2f}")
        rank = (
            metrics["silhouette"],
            metrics["calinski_harabasz"],
            -metrics["davies_bouldin"],
        )
        if rank > best_rank:
            best_rank = rank
            best_k = k
            best_labels = labels_k

    if best_labels is None:
        # Without this guard a column of None would be saved as "labels".
        raise RuntimeError("No candidate k produced a usable clustering")

    print(f"Selected best k={best_k} with silhouette={best_rank[0]:.4f}")

    # Save final labels
    df_original["ensemble_cluster"] = best_labels
    df_original.to_csv(os.path.join(OUTPUT_DIR, "data_with_ensemble_clusters.csv"), index=False)
    print("Saved dataset with ensemble cluster labels")

    # Profile clusters
    profile = profile_clusters(df_original, X_scaled_df, best_labels, "ensemble")
    print(profile)

    # Visualize final clusters
    visualize_embedding(X_for_clustering, best_labels, f"Ensemble Clusters (k={best_k})", os.path.join(OUTPUT_DIR, "ensemble_clusters_pca.png"))
    print("Saved PCA visualization")

    # Also save evaluation table (one row per candidate k)
    eval_df = pd.DataFrame([
        {
            "k": k,
            "silhouette": m["silhouette"],
            "davies_bouldin": m["davies_bouldin"],
            "calinski_harabasz": m["calinski_harabasz"],
        }
        for k, m in evaluations
    ])
    eval_df.to_csv(os.path.join(OUTPUT_DIR, "ensemble_evaluation.csv"), index=False)
    print("Saved evaluation metrics for candidate ks")


# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Loaded data with shape: (16207, 20)
Generated 18 base clusterings
Saved co-association matrix
k=2: silhouette=0.5223, db=0.7676, ch=12159.63
k=3: silhouette=0.3002, db=1.1809, ch=10561.92
k=4: silhouette=0.2588, db=1.3063, ch=7953.49
k=5: silhouette=0.2408, db=1.2113, ch=7807.46
k=6: silhouette=0.1939, db=1.6056, ch=6155.98
k=7: silhouette=0.1520, db=1.6666, ch=5279.50
Selected best k=2 with silhouette=0.5223
Saved dataset with ensemble cluster labels
Saved cluster profiles to CSV
   ensemble_cluster  count  mean_income  median_income  mean_expenses  \
0                 0  13994  4217.302203        4194.74    2538.574602   
1                 1   2213  2828.627081        2712.87    1691.556783   

    mean_savings  mean_dti  mean_savings_to_income  
0  254478.874197  0.278577                5.034786  
1  173428.378179  7.292951                5.091414  
Saved PCA visualization
Saved evaluation metrics for candidate ks
