In [1]:
# Relative paths of every training-set variant to cluster, grouped by how the
# categorical features were encoded (one-hot vs. integer classes). Within each
# group the PCA variants differ only by the component count in the file name.
_one_hot_files = [
    "One-Hot/Train_Orig_OH.csv",
    "One-Hot/Scaled/Train_Scaled_All_OH.csv",
    "One-Hot/Scaled/Train_Scaled_Cont_OH.csv",
    "One-Hot/MinMax/Train_MM_OH.csv",
] + [
    f"One-Hot/MinMax/train_OH_MM_PCA{n_components}.csv"
    for n_components in (15, 20, 25, 30, 35)
]

_int_class_files = [
    "IntClasses/Train_Orig_Int.csv",
    "IntClasses/Scaled/Train_Scaled_All_Int.csv",
    "IntClasses/Scaled/Train_Scaled_Cont_Int.csv",
    "IntClasses/MinMax/Train_MM_Int.csv",
] + [
    f"IntClasses/MinMax/train_Int_MM_PCA{n_components}.csv"
    for n_components in (10, 15, 20, 25)
]

# Order matters for the downstream summary table: one-hot variants first.
train_files = _one_hot_files + _int_class_files

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn_extra.cluster import KMedoids
from sklearn.decomposition import PCA



# --- Configuration ----------------------------------------------------------
# Target columns are removed before clustering whenever they are present.
TARGET_COLUMNS = ['LoanApproved', 'RiskScore']
# Seed passed to every KMedoids instance.
RANDOM_STATE = 42
# Candidate cluster counts (2..9 inclusive); identical for every dataset, so it
# is defined once instead of being rebuilt on each loop iteration.
k_range = range(2, 10)


def _plot_metric_curve(k_values, values, title, ylabel, **line_kwargs):
    """Show a single "metric vs. number of clusters" line chart.

    Parameters
    ----------
    k_values : iterable of int
        Cluster counts plotted on the x axis.
    values : sequence of float
        Metric value obtained for each entry of ``k_values``.
    title : str
        Figure title.
    ylabel : str
        Label of the y axis.
    **line_kwargs
        Forwarded to ``Axes.plot`` (e.g. ``marker``, ``color``).
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(k_values, values, **line_kwargs)
    ax.set_title(title)
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    # Release the figure explicitly; many figures are created in the loop below.
    plt.close(fig)


# One summary record per dataset, turned into a DataFrame at the end.
results = []

for path in train_files:
    print("\n" + "="*80)
    print(f"Processing dataset: {path}")

    dataset_name = os.path.basename(path)

    # Load dataset and drop whichever target columns it contains.
    # errors="ignore" skips missing columns, replacing the two conditional
    # in-place drops without mutating the frame returned by read_csv.
    df = pd.read_csv(path).drop(columns=TARGET_COLUMNS, errors="ignore")

    # Scale data.
    # NOTE(review): this standardises every file, including those whose names
    # suggest they were already scaled / min-max normalised / PCA-projected.
    # Confirm the second scaling is intended, since it may mask the differences
    # between the preprocessing variants being compared.
    X = StandardScaler().fit_transform(df)

    # Elbow Method & Silhouette Scores: fit one model per candidate k and keep
    # each fitted model so the winner does not have to be fitted a second time.
    inertias = []
    silhouettes = []
    fitted_models = []

    for k in k_range:
        kmedoids = KMedoids(n_clusters=k, random_state=RANDOM_STATE, method='pam')
        kmedoids.fit(X)
        fitted_models.append(kmedoids)
        inertias.append(kmedoids.inertia_)
        silhouettes.append(silhouette_score(X, kmedoids.labels_))

    # Plot Elbow
    _plot_metric_curve(
        k_range, inertias,
        title=f"Elbow Method for {dataset_name}",
        ylabel="Inertia",
        marker='o',
    )

    # Plot Silhouette
    _plot_metric_curve(
        k_range, silhouettes,
        title=f"Silhouette Score for {dataset_name}",
        ylabel="Silhouette Score",
        marker='s', color='orange',
    )

    # Choose best k based on silhouette score (first maximum wins on ties,
    # which is np.argmax's behaviour).
    best_index = int(np.argmax(silhouettes))
    best_k = k_range[best_index]
    print(f"Best K (Silhouette): {best_k}")

    # Final K-Medoids clustering.
    # The original code refit an identically configured model (same k, same
    # random_state, same method) on the same X; the model already fitted during
    # the sweep is reused instead, saving one PAM fit per dataset.
    final_model = fitted_models[best_index]
    labels = final_model.labels_

    # Optional: Visualize in 2D (PCA projection)
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X)
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='Set1', s=30)
    ax.set_title(f"K-Medoids Clustering (k={best_k}) - {dataset_name}")
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    plt.close(fig)

    # Save clustering result summary
    results.append({
        "Dataset": dataset_name,
        "Best_K": best_k,
        "Silhouette": silhouettes[best_index],
        "Inertia": final_model.inertia_
    })

# Show summary (last expression of the cell, so it is rendered as a table)
pd.DataFrame(results)



Processing dataset: One-Hot/Train_Orig_OH.csv
