<a href="https://colab.research.google.com/github/Nuthan10/Loan-To-Cap-Ratio/blob/main/Loan_to_cap_ratio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install xgboost catboost lightgbm shap kagglehub

In [None]:
import os, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, precision_recall_curve, auc
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import shap

# Seed reused for NumPy's global RNG, the train/test split and every model below.
RANDOM_STATE = 42
# Seeds the legacy global NumPy RNG that the np.random.* calls in this notebook draw from.
np.random.seed(RANDOM_STATE)

# 1) DATA LOADING
#    Prefer Kaggle; fall back to manual upload, then to synthetic data

In [None]:
def try_load_kaggle():
    """Download the Kaggle loan-prediction dataset and return its training CSV.

    Returns:
        pandas.DataFrame read from the training CSV, or None when ``kagglehub``
        cannot be imported, the download raises, or no training CSV is found
        in the downloaded directory. The reason is printed in every None case.
    """
    try:
        import kagglehub
        path = kagglehub.dataset_download("altruistdelhite04/loan-prediction-problem-dataset")
        # Do not insist on the exact name "train.csv": accept any top-level
        # "train*.csv" (NOTE(review): this dataset appears to ship the file with
        # a suffix after "train" — confirm), preferring the exact name if present.
        candidates = sorted(
            name for name in os.listdir(path)
            if name.lower().startswith("train") and name.lower().endswith(".csv")
        )
        if "train.csv" in candidates:
            candidates = ["train.csv"]
        if candidates:
            csv_path = os.path.join(path, candidates[0])
            print("✅ Loaded Kaggle dataset:", csv_path)
            return pd.read_csv(csv_path)
        # Previously this case fell through silently, hiding why the fallback ran.
        print("ℹ️ No training CSV found in Kaggle download:", path)
    except Exception as e:
        print("ℹ️ Kaggle download unavailable:", e)
    return None

def try_manual_upload():
    """Ask for a CSV through Colab's upload widget and load the first file.

    Returns:
        pandas.DataFrame parsed from the first uploaded file name, or None
        (after printing the reason) when ``google.colab`` cannot be imported,
        nothing was uploaded, or reading the file raises.
    """
    loaded = None
    try:
        from google.colab import files
        print("📥 Please upload your Kaggle train.csv")
        uploaded = files.upload()
        # Only the first uploaded entry is used; an empty upload raises
        # IndexError here and is reported by the handler below.
        first_name = list(uploaded)[0]
        print("✅ Loaded manual upload:", first_name)
        loaded = pd.read_csv(first_name)
    except Exception as err:
        print("ℹ️ Manual upload skipped/unavailable:", err)
    return loaded
def make_synthetic_kaggle_like(n=1000):
    """Build a random frame with the columns the rest of the notebook requires.

    Every column is drawn independently from NumPy's global RNG, so the
    result is reproducible only if ``np.random.seed`` was called beforehand.

    Args:
        n: number of rows to generate.

    Returns:
        pandas.DataFrame with ``n`` rows and the columns LoanAmount,
        ApplicantIncome, Gender, Married, Education, Self_Employed,
        Credit_History and Loan_Status.
    """
    print("⚙️ Using synthetic Kaggle-like dataset.")
    draw = np.random.choice
    # The draws below consume the global RNG stream in column order; keep the
    # order fixed so a given seed keeps producing the same frame.
    columns = {}
    columns['LoanAmount'] = np.random.randint(100, 500, n)  # in thousands
    columns['ApplicantIncome'] = np.random.randint(20000, 150000, n)
    columns['Gender'] = draw(['Male', 'Female'], n)
    columns['Married'] = draw(['Yes', 'No'], n)
    columns['Education'] = draw(['Graduate', 'Not Graduate'], n)
    columns['Self_Employed'] = draw(['Yes', 'No'], n)
    columns['Credit_History'] = draw([1.0, 0.0], n)
    # Target is skewed 70/30 towards 'Y'.
    columns['Loan_Status'] = draw(['Y', 'N'], n, p=[0.7, 0.3])
    return pd.DataFrame(columns)

# Data sources in priority order: Kaggle download, manual Colab upload, then
# a synthetic frame. Each later source is only tried if the previous gave None.
df = try_load_kaggle()
df = try_manual_upload() if df is None else df
df = make_synthetic_kaggle_like(n=1200) if df is None else df


# 2) CLEANING + LOAN-TO-CAP (LTC) RATIO FEATURE

In [None]:
# Columns the remainder of the notebook relies on; abort on the first one missing.
needed = ['LoanAmount','ApplicantIncome','Credit_History','Gender','Married',
          'Education','Self_Employed','Loan_Status']
missing = [name for name in needed if name not in df.columns]
if missing:
    raise ValueError(f"Missing required column: {missing[0]}")

# Coerce the numeric inputs; anything unparsable becomes NaN and is filled below.
numeric_inputs = ['LoanAmount','ApplicantIncome','Credit_History']
for name in numeric_inputs:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Amount and income gaps get the column median; a missing credit history is set to 1.0.
for name in ('LoanAmount', 'ApplicantIncome'):
    df[name] = df[name].fillna(df[name].median())
df['Credit_History'] = df['Credit_History'].fillna(1.0)

# The insurance columns are not part of the loaded data: they are simulated here.
# Re-seeding first makes them reproducible no matter which loader produced `df`.
np.random.seed(RANDOM_STATE)
n_rows = len(df)
simulated_cap = np.random.normal(loc=250000, scale=60000, size=n_rows)
df['cap_of_insurance'] = np.clip(simulated_cap, 50000, None)  # floor at 50,000
df['type_of_insurance'] = np.random.choice(['Health', 'Life', 'Property'], size=n_rows)

# LoanAmount is multiplied by 1000 before dividing by the simulated cap.
df['loan_to_cap_ratio'] = ((df['LoanAmount'] * 1000) / df['cap_of_insurance']).astype(float)

# Encoding

In [None]:
# Drop identifier columns so they are neither one-hot encoded nor fed to the
# models. The lower-cased match also covers the literal "Loan_ID" spelling.
id_cols = [col for col in df.columns if 'loan_id' in col.lower()]
df = df.drop(columns=id_cols, errors='ignore')

# Encode the target: 'Y' -> 0 and 'N' -> 1, so 'N' is the positive class for
# the metrics computed later.
# The dtype guard makes this cell safe to re-run: Series.map turns any value
# that is not a key of the mapping into NaN, so mapping an already-encoded
# 0/1 column a second time would wipe out the whole target.
if 'Loan_Status' in df.columns and not pd.api.types.is_numeric_dtype(df['Loan_Status']):
    df['Loan_Status'] = df['Loan_Status'].map({'Y': 0, 'N': 1})

# One-hot encode the remaining object columns, dropping the first level of each.
object_cols = df.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    df = pd.get_dummies(df, columns=object_cols, drop_first=True)

# Train/test split

In [None]:
# Separate features from the target; without a target, everything is a feature.
if 'Loan_Status' not in df.columns:
    X, y = df.copy(), None
else:
    X, y = df.drop(columns=['Loan_Status']), df['Loan_Status']

if y is None:
    print("No target variable found. Skipping split.")
else:
    # 80/20 split, stratified on the target so both parts keep its class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    # Standardise numeric columns only; scaling statistics come from the
    # training part alone and are then applied to the test part.
    num_cols = X_train.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    scaler.fit(X_train[num_cols])
    X_train[num_cols] = scaler.transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Median-impute whatever is still missing (after scaling), again fitting on
    # the training part only. The frames are rebuilt to keep labels and index.
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='median')
    imputer.fit(X_train)
    X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


# 3) MODEL CONFIGURATIONS

In [None]:
# Maps a model-family name to the list of estimator configurations ("runs")
# to try for it. Each family currently has a single configuration; families
# are trained in insertion order.
model_configs = {}

model_configs['LogisticRegression'] = [
    LogisticRegression(class_weight='balanced', max_iter=500, n_jobs=-1, random_state=RANDOM_STATE),
]

model_configs['RandomForest'] = [
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE),
]

model_configs['XGBoost'] = [
    XGBClassifier(
        n_estimators=200, max_depth=5, learning_rate=0.08,
        subsample=0.9, colsample_bytree=0.9,
        eval_metric='logloss', n_jobs=-1, random_state=RANDOM_STATE,
    ),
]

model_configs['CatBoost'] = [
    CatBoostClassifier(iterations=400, depth=6, learning_rate=0.08, verbose=0, random_state=RANDOM_STATE),
]

# NOTE(review): LightGBM reportedly applies `subsample` only when
# `subsample_freq` > 0 — confirm whether row bagging was intended here.
model_configs['LightGBM'] = [
    LGBMClassifier(
        n_estimators=400, max_depth=-1, learning_rate=0.08,
        subsample=0.9, colsample_bytree=0.9,
        n_jobs=-1, random_state=RANDOM_STATE,
    ),
]

model_configs['GradientBoosting'] = [
    GradientBoostingClassifier(n_estimators=400, max_depth=5, learning_rate=0.08, random_state=RANDOM_STATE),
]

model_configs['HistGradientBoosting'] = [
    HistGradientBoostingClassifier(max_iter=400, max_depth=5, learning_rate=0.08, random_state=RANDOM_STATE),
]

model_configs['ExtraTrees'] = [
    ExtraTreesClassifier(n_estimators=400, n_jobs=-1, random_state=RANDOM_STATE),
]

# 4) TRAINING & LIVE METRICS

In [None]:
def eval_metrics(y_true, y_pred, y_proba):
    """Compute the four headline metrics for one fitted classifier.

    Args:
        y_true: ground-truth labels.
        y_pred: hard class predictions, used for accuracy and F1.
        y_proba: continuous scores for the positive class, used for the
            ROC and precision-recall areas.

    Returns:
        Tuple ``(accuracy_percent, f1, roc_auc, pr_auc)``. Accuracy is scaled
        to 0-100; ``pr_auc`` is ``auc`` applied to the precision-recall curve.
    """
    accuracy_pct = 100.0 * accuracy_score(y_true, y_pred)
    f1_value = f1_score(y_true, y_pred)
    roc_value = roc_auc_score(y_true, y_proba)
    precision, recall, _thresholds = precision_recall_curve(y_true, y_proba)
    return accuracy_pct, f1_value, roc_value, auc(recall, precision)

# Fit every configured estimator, score it on the held-out test set, keep the
# best run per family, then explain the overall best model with SHAP.
if y is not None:
    all_runs = []        # one metrics row per (family, run)
    best_by_model = {}   # family name -> metrics row of its best run
    for family, configs in model_configs.items():
        family_best = None
        # Unrounded ROC-AUC of the best run so far. Rows only store the value
        # rounded to 4 decimals; comparing a raw score against that rounded
        # value could let a slightly worse later run replace the best one.
        family_best_roc = None
        for i, model in enumerate(configs, start=1):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Prefer positive-class probabilities; fall back to the decision
            # function for estimators without predict_proba.
            if hasattr(model, "predict_proba"):
                y_proba = model.predict_proba(X_test)[:, 1]
            else:
                y_proba = model.decision_function(X_test)
            acc, f1, roc, pr_auc = eval_metrics(y_test, y_pred, y_proba)
            row = {
                'Model': family,
                'Run': i,
                'Accuracy (%)': round(acc, 2),
                'F1': round(f1, 4),
                'ROC_AUC': round(roc, 4),
                'PR_AUC': round(pr_auc, 4),
                'Estimator': model
            }
            all_runs.append(row)
            if (family_best_roc is None) or (roc > family_best_roc):
                family_best_roc = roc
                family_best = row
                best_by_model[family] = family_best
    print("\nBest model per family:")
    for fam, info in best_by_model.items():
        print(f"{fam}: Acc={info['Accuracy (%)']} F1={info['F1']} ROC_AUC={info['ROC_AUC']} PR_AUC={info['PR_AUC']}")

    # SHAP EXPLAINER for model with max ROC_AUC across all families
    # (ties on the stored, rounded value resolve to the earliest run).
    top_row = max(all_runs, key=lambda x: x['ROC_AUC'])
    best_model = top_row['Estimator']
    print(f"\nUsing SHAP to explain: {top_row['Model']} (ROC_AUC={top_row['ROC_AUC']})")

    # Pick small subset for SHAP to avoid memory issues
    X_shap = X_train.sample(200, random_state=RANDOM_STATE) if X_train.shape[0] > 200 else X_train

    # Best-effort: a SHAP failure is reported but does not stop the notebook.
    try:
        explainer = shap.Explainer(best_model, X_shap)
        shap_values = explainer(X_shap)
        shap.summary_plot(shap_values, X_shap, show=True)
    except Exception as e:
        print(f"SHAP could not explain {top_row['Model']} automatically: {e}")
        print("Try using TreeExplainer or KernelExplainer for this model manually.")
else:
    print("Skipping model training, no target variable.")

# 5) PLOTS (computed from live results)

In [None]:
import matplotlib.pyplot as plt

## 5a) Accuracy across runs (line plot per family)

In [None]:
# Tabulate every run (one row per family/run) for plotting.
results_df = pd.DataFrame(all_runs)

# One accuracy line per model family, in order of first appearance.
fig, ax = plt.subplots(figsize=(10,6))
for fam, sub in results_df.groupby('Model', sort=False):
    ax.plot(sub['Run'], sub['Accuracy (%)'], marker='o', label=fam)
ax.set_title("Accuracy (%) Across Runs per Model Family")
ax.set_xlabel("Run")
ax.set_ylabel("Accuracy (%)")
ax.grid(True)
ax.legend()
plt.show()

## 5b) ROC & PR curves for the best run of each family

In [None]:
def plot_roc_pr_for_best():
    """Plot ROC and precision-recall curves for the best run of each family.

    Reads the notebook globals ``best_by_model``, ``X_test`` and ``y_test``
    and shows two figures, ROC first and precision-recall second, each with
    one labelled curve per model family.
    """
    from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc

    # Score the test set once per model and reuse the scores for both figures,
    # instead of repeating the same inference and branching for each figure.
    scores_by_family = {}
    for fam, row in best_by_model.items():
        model = row['Estimator']
        if hasattr(model, "predict_proba"):
            scores_by_family[fam] = model.predict_proba(X_test)[:, 1]
        else:
            scores_by_family[fam] = model.decision_function(X_test)

    # Figure 1: ROC curves with the chance diagonal as reference.
    plt.figure(figsize=(9,6))
    for fam, y_proba in scores_by_family.items():
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        roc_val = roc_auc_score(y_test, y_proba)
        plt.plot(fpr, tpr, label=f"{fam} (AUC={roc_val:.2f})")
    plt.plot([0,1],[0,1],'k--')
    plt.title("ROC Curves (Best Run per Model)")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.grid(True)
    plt.legend()
    plt.show()

    # Figure 2: precision-recall curves.
    plt.figure(figsize=(9,6))
    for fam, y_proba in scores_by_family.items():
        prec, rec, _ = precision_recall_curve(y_test, y_proba)
        pr_val = auc(rec, prec)
        plt.plot(rec, prec, label=f"{fam} (PR AUC={pr_val:.2f})")
    plt.title("Precision-Recall Curves (Best Run per Model)")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    plt.legend()
    plt.show()

# Render both figures; requires best_by_model, X_test and y_test from the training cells.
plot_roc_pr_for_best()

## 5c) Confusion matrices for the best run of each family

In [None]:
from sklearn.metrics import confusion_matrix
from itertools import product
def plot_confusions_for_best():
    """Show a grid of confusion matrices, one per model family's best run.

    Reads the notebook globals ``best_by_model``, ``X_test`` and ``y_test``.
    Panels are laid out three per row; grid cells beyond the last family are
    switched off.
    """
    families = list(best_by_model.keys())
    n_cols = 3
    n_rows = int(np.ceil(len(families) / n_cols))
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(12, 4*n_rows))
    panels = grid.flatten()

    for idx, panel in enumerate(panels):
        if idx >= len(families):
            # Spare cell in the last row: hide it.
            panel.axis('off')
            continue

        fam = families[idx]
        estimator = best_by_model[fam]['Estimator']
        cm = confusion_matrix(y_test, estimator.predict(X_test))

        panel.imshow(cm, interpolation='nearest', cmap='Blues')
        panel.set_title(f"{fam} - Best Run CM")
        panel.set_xlabel("Predicted")
        panel.set_ylabel("Actual")
        ticks = np.arange(cm.shape[0])
        panel.set_xticks(ticks)
        panel.set_yticks(ticks)

        # Annotate each cell with its count: white text for cells above half
        # of the largest count, black otherwise.
        half_max = cm.max() / 2
        for r, c in product(range(cm.shape[0]), range(cm.shape[1])):
            text_color = 'white' if cm[r, c] > half_max else 'black'
            panel.text(c, r, cm[r, c], ha='center', va='center', color=text_color)

    plt.tight_layout()
    plt.show()
# Render the grid; requires best_by_model, X_test and y_test from the training cells.
plot_confusions_for_best()