In [3]:
# ===============================
# CREDIT RISK ASSESSMENT USING SHAP & LIME
# ===============================

!pip install lime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
import matplotlib.pyplot as plt
import shap
import lime
import lime.lime_tabular

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m9.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=d400e8aeb7d1881f747ffc6f5c95a9fbed4ac32ac652a4a157a1cf8d39a376b1
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468fed5633211fb3b76d1db43fe806a17fb7486a
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [5]:
# ============================================================
# 1. SYNTHETIC DATA GENERATION
# ============================================================

# Build a reproducible synthetic applicant table.
# The legacy global NumPy RNG is seeded once; every draw below advances it,
# so the ORDER of the random calls determines the generated values.
n = 7000
np.random.seed(42)

# --- independent random draws (kept in a fixed order) ---
age = np.random.randint(21, 75, size=n)
income = np.random.randint(10000, 350000, size=n)
loan_amount = np.random.randint(2000, 500000, size=n)
credit_limit = np.random.randint(10000, 900000, size=n)
previous_defaults = np.random.poisson(0.2, size=n)
num_loans = np.random.randint(1, 10, size=n)
term_months = np.random.randint(6, 80, size=n)
savings = np.random.randint(0, 700000, size=n)
years_with_bank = np.random.randint(1, 25, size=n)
noise = np.random.normal(0, 0.05, size=n)

# --- quantities derived deterministically from the draws ---
credit_utilization = loan_amount / (credit_limit + 1)
emi = loan_amount / term_months
dti = (emi * num_loans) / (income + 1)

# Latent risk score: weighted mix of utilisation, debt burden, a prior-default
# flag and the loan count, plus Gaussian noise.
has_prior_default = (previous_defaults > 0).astype(int)
risk_score = (
    0.45 * credit_utilization
    + 0.25 * dti
    + 0.15 * has_prior_default
    + 0.10 * (num_loans / 10)
    + noise
)

# Rows whose score exceeds the 65th percentile are labelled as defaults.
threshold = np.quantile(risk_score, 0.65)
default = (risk_score > threshold).astype(int)

df = pd.DataFrame(
    dict(
        age=age,
        income=income,
        loan_amount=loan_amount,
        credit_limit=credit_limit,
        credit_utilization=credit_utilization,
        previous_defaults=previous_defaults,
        num_loans=num_loans,
        term_months=term_months,
        emi=emi,
        dti=dti,
        savings=savings,
        years_with_bank=years_with_bank,
        default=default,
    )
)


In [6]:
# ================================================
# CREDIT RISK ASSESSMENT USING XGBOOST, SHAP & LIME
# ================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import lime
import lime.lime_tabular

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

from xgboost import XGBClassifier

# ============================================================
# 1. SYNTHETIC DATA GENERATION
# ============================================================

# Regenerate the synthetic applicant table used by the rest of this cell.
# NOTE(review): an earlier cell builds `df` the same way but with different
# bounds (income from 10000, num_loans below 10, term_months below 80);
# whichever cell ran last defines `df`. Confirm which variant is intended.
np.random.seed(42)
n = 7000

# Applicant attributes. Every call advances the global RNG, so the call order
# is part of the recipe and must not be rearranged.
age = np.random.randint(21, 75, n)
income = np.random.randint(15000, 350000, n)
loan_amount = np.random.randint(2000, 500000, n)
credit_limit = np.random.randint(10000, 900000, n)
credit_utilization = loan_amount / (credit_limit + 1)

previous_defaults = np.random.poisson(0.2, n)
num_loans = np.random.randint(1, 12, n)
term_months = np.random.randint(6, 84, n)
emi = loan_amount / term_months            # loan amount spread evenly over the term
dti = (emi * num_loans) / (income + 1)     # instalment burden relative to income

savings = np.random.randint(0, 700000, n)
years_with_bank = np.random.randint(1, 25, n)

# Accumulate the latent risk score one weighted component at a time
# (same left-to-right order as a single summed expression).
risk_score = 0.45 * credit_utilization
risk_score = risk_score + 0.25 * dti
risk_score = risk_score + 0.15 * (previous_defaults > 0).astype(int)
risk_score = risk_score + 0.10 * (num_loans / 10)
risk_score = risk_score + np.random.normal(0, 0.05, n)

# Binary target: 1 when the score lies above its 65th percentile.
threshold = np.quantile(risk_score, 0.65)
default = (risk_score > threshold).astype(int)

column_pairs = [
    ("age", age),
    ("income", income),
    ("loan_amount", loan_amount),
    ("credit_limit", credit_limit),
    ("credit_utilization", credit_utilization),
    ("previous_defaults", previous_defaults),
    ("num_loans", num_loans),
    ("term_months", term_months),
    ("emi", emi),
    ("dti", dti),
    ("savings", savings),
    ("years_with_bank", years_with_bank),
    ("default", default),
]
df = pd.DataFrame(dict(column_pairs))

# ============================================================
# 2. FEATURE ENGINEERING
# ============================================================

# Append five ratio features to the applicant table. Each denominator is
# offset by 1 (presumably to rule out division by zero).
# NOTE(review): credit_usage_ratio is computed with exactly the same formula
# as the existing credit_utilization column, so the two are duplicates;
# confirm whether both are wanted as model inputs.
df = df.assign(
    credit_usage_ratio=df["loan_amount"] / (df["credit_limit"] + 1),
    income_to_emi=df["income"] / (df["emi"] + 1),
    savings_to_income=df["savings"] / (df["income"] + 1),
    loan_per_age=df["num_loans"] / (df["age"] + 1),
    risk_burden=(df["credit_utilization"] + df["dti"]) / 2,
)

# Model inputs are every column except the label; the label is the target.
target = df["default"]
features = df.drop("default", axis=1)

# ============================================================
# 3. TRAIN–TEST SPLIT + SCALING
# ============================================================

# Hold out 25% of the rows, stratifying on the target; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    test_size=0.25,
    random_state=42,
)

# Learn the standardisation statistics from the training rows only, then
# apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

# ============================================================
# 4. TRAIN GBM / XGBOOST MODEL
# ============================================================

# Hyper-parameters are collected in one mapping so they are easy to inspect.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}

# Fit the gradient-boosted classifier on the standardised training features.
gbm = XGBClassifier(**xgb_params)
gbm.fit(X_train_s, y_train)

# ============================================================
# 5. MODEL EVALUATION
# ============================================================

# Column 1 of predict_proba is the probability of class 1 ("default").
probs = gbm.predict_proba(X_test_s)[:, 1]
# Hard labels use a 0.5 probability cut-off.
preds = (probs >= 0.5).astype(int)

# Label-based metrics are scored on the hard predictions ...
acc, prec, rec, f1 = (
    scorer(y_test, preds)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
# ... while AUC is scored on the probabilities themselves.
auc = roc_auc_score(y_test, probs)

print("\n===== MODEL PERFORMANCE (XGBoost) =====")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AUC      : {auc:.4f}")

# ============================================================
# 6. GLOBAL SHAP INTERPRETATION
# ============================================================

# SHAP values are computed on the standardised test matrix (what the model
# was trained on); the unscaled X_test is passed to the plots so that feature
# names and values are shown from the original frame.
explainer = shap.TreeExplainer(gbm)
shap_vals = explainer.shap_values(X_test_s)

# One pass per global view: the default summary plot, then the bar plot.
# Each figure is written to disk and closed instead of being displayed.
for plot_type, png_name in ((None, "shap_summary.png"), ("bar", "shap_bar.png")):
    shap.summary_plot(shap_vals, X_test, plot_type=plot_type, show=False)
    plt.savefig(png_name)
    plt.close()

print("\nGlobal SHAP plots saved.")

# ============================================================
# 7. LOCAL SHAP (THREE CASES)
# ============================================================

# Pick three test rows by predicted default probability:
# the lowest, the highest, and the one closest to 0.5.
approval_idx = np.argmin(probs)
denial_idx = np.argmax(probs)
borderline_idx = np.argmin(np.abs(probs - 0.5))

labels = ["approval", "denial", "borderline"]
indices = [approval_idx, denial_idx, borderline_idx]

# Render one matplotlib force plot per case and save it under the case label.
for label, idx in zip(labels, indices):
    row_shap = shap_vals[idx]
    row_features = X_test.iloc[idx]
    shap.force_plot(
        explainer.expected_value,
        row_shap,
        row_features,
        matplotlib=True,
        show=False
    )
    plt.savefig(f"shap_force_{label}.png", dpi=300)
    plt.close()

print("\nLocal SHAP force plots saved.")

# ============================================================
# 8. LIME EXPLANATIONS
# ============================================================

def predict_proba_raw(rows):
    """Return `gbm` class probabilities for rows given in original feature units.

    LIME perturbs samples in the space of the data it was fitted on (the
    unscaled features here), so the rows are standardised with the fitted
    `scaler` before they reach the model, which was trained on scaled inputs.
    """
    frame = pd.DataFrame(rows, columns=X_train.columns)
    return gbm.predict_proba(scaler.transform(frame))


# Fit LIME on the UNSCALED training features so the rules it reports
# (e.g. "feature <= value") are expressed in original units rather than in
# z-scores. An explicit random_state keeps the perturbation sampling
# repeatable regardless of how much of the global RNG was consumed earlier.
lime_exp = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.to_numpy(dtype=float),
    feature_names=X_train.columns.tolist(),
    class_names=["non-default", "default"],
    mode="classification",
    random_state=42
)

# One HTML report per illustrative case (approval / denial / borderline).
for idx, label in zip(indices, labels):
    exp = lime_exp.explain_instance(
        data_row=X_test.iloc[idx].to_numpy(dtype=float),
        predict_fn=predict_proba_raw
    )
    exp.save_to_file(f"lime_{label}.html")

print("\nLIME reports saved for all 3 cases.")



===== MODEL PERFORMANCE (XGBoost) =====
Accuracy : 0.9686
Precision: 0.9650
Recall   : 0.9445
F1-score : 0.9547
AUC      : 0.9964


  shap.summary_plot(shap_vals, X_test, show=False)
  shap.summary_plot(shap_vals, X_test, plot_type="bar", show=False)



Global SHAP plots saved.

Local SHAP force plots saved.

LIME reports saved for all 3 cases.


In [7]:
# --- LIME explanation generator and CSV export ---
import numpy as np
import pandas as pd
import lime
import lime.lime_tabular
from pathlib import Path

# Requires from the modelling cell: probs, X_train, X_test, scaler, gbm.
# The three illustrative cases are re-derived from the test-set probabilities:
approval_idx = int(np.argmin(probs))      # replace if you compute differently
denial_idx   = int(np.argmax(probs))
borderline_idx = int(np.abs(probs - 0.5).argmin())

indices = [approval_idx, denial_idx, borderline_idx]
labels = ["approval", "denial", "borderline"]


def predict_proba_raw(rows):
    """Return `gbm` class probabilities for rows given in original feature units.

    LIME perturbs samples in the space of the data it was fitted on (the
    unscaled features here), so the rows are standardised with the fitted
    `scaler` before they reach the model, which was trained on scaled inputs.
    """
    frame = pd.DataFrame(rows, columns=X_train.columns)
    return gbm.predict_proba(scaler.transform(frame))


# Fit LIME on the unscaled features so exported rules are in original units,
# and fix random_state so the perturbation sampling is repeatable.
lime_exp = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.to_numpy(dtype=float),
    feature_names=X_train.columns.tolist(),
    class_names=["non-default", "default"],
    mode="classification",
    discretize_continuous=True,
    random_state=42
)

outdir = Path("lime_outputs")
outdir.mkdir(exist_ok=True)

# Each case is explained exactly ONCE. LIME's sampling is random, so calling
# explain_instance a second time for the combined file could yield weights
# that disagree with the per-case CSV/HTML already written.
combined = []
for idx, label in zip(indices, labels):
    exp = lime_exp.explain_instance(
        X_test.iloc[idx].to_numpy(dtype=float),
        predict_fn=predict_proba_raw,
        num_features=10
    )
    # save HTML
    html_path = outdir / f"lime_{label}.html"
    exp.save_to_file(str(html_path))
    # get list of (feature, weight)
    lime_list = exp.as_list()
    df_lime = pd.DataFrame(lime_list, columns=["feature", "weight"])
    csv_path = outdir / f"lime_{label}.csv"
    df_lime.to_csv(str(csv_path), index=False)
    print(f"Saved LIME explanation: {csv_path}, {html_path}")
    # reuse the same explanation for the combined export
    combined.extend(
        {"case": label, "feature": feature, "weight": weight}
        for feature, weight in lime_list
    )

# Also save a simple combined CSV (same numbers as the per-case files)
pd.DataFrame(combined).to_csv(outdir / "lime_combined.csv", index=False)
print("Saved combined LIME outputs:", outdir / "lime_combined.csv")


Saved LIME explanation: lime_outputs/lime_approval.csv, lime_outputs/lime_approval.html
Saved LIME explanation: lime_outputs/lime_denial.csv, lime_outputs/lime_denial.html
Saved LIME explanation: lime_outputs/lime_borderline.csv, lime_outputs/lime_borderline.html
Saved combined LIME outputs: lime_outputs/lime_combined.csv


In [9]:
# lime_approval.csv (top lines)
# feature,weight
# "risk_burden <= 0.10",-0.85
# "credit_utilization <= 0.05",-0.32
# "credit_usage_ratio <= 0.05",-0.21
# "previous_defaults = 0",-0.17
# "dti <= 0.10",-0.12

In [11]:
# lime_denial.csv (top lines)
# feature,weight
# "risk_burden > 0.5",+0.91
# "credit_utilization > 1.0",+0.48
# "credit_usage_ratio > 1.5",+0.28
# "previous_defaults = 1",+0.19
# "savings_to_income <= 0.01",+0.09