In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

import xgboost as xgb
import shap

In [12]:
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

RANDOM_STATE = 42

In [13]:
BASE_DIR = "/content/drive/MyDrive/credit_risk_project"

# DATA_PATH was pointing to a directory, it should point to the actual CSV file.
# Assuming 'cs-training.csv' is the correct data file for your project.
DATA_PATH = f"/content/drive/MyDrive/cs-training.csv"
OUTPUT_DIR = f"/content/drive/MyDrive/credit_risk_project/outputs"

In [15]:
# =========================
# 1. Load data
# =========================
df = pd.read_csv(DATA_PATH)

# Optional: Drop obvious IDs if present
id_cols = [c for c in df.columns if "id" in c.lower()]  # e.g., loan_id, customer_id
df = df.drop(columns=id_cols, errors="ignore")

# Set target
TARGET = "SeriousDlqin2yrs"  # <- change if your target is named differently

assert TARGET in df.columns, f"Target column '{TARGET}' not found. Available columns: {list(df.columns)}"

In [16]:
# =========================
# 2. Basic cleaning & feature engineering
# =========================
# Split features/target
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Separate types
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Handle missing values
# - Numeric: median
# - Categorical: most frequent
if num_cols:
    X[num_cols] = X[num_cols].fillna(X[num_cols].median())
if cat_cols:
    for c in cat_cols:
      X[c] = X[c].fillna(X[c].mode().iloc[0])

# Encode categorical for tree-based model (LabelEncoder per column)
# This preserves orderless categories without inflating dimensions (XGBoost-friendly).
encoders = {}
for c in cat_cols:
    le = LabelEncoder()
    X[c] = le.fit_transform(X[c].astype(str))
    encoders[c] = le

# Optional: basic ratio feature (example)
# If both exist in your data, uncomment and adjust names:
# if set(["loan_amount", "income"]).issubset(X.columns):
#     X["loan_to_income_ratio"] = (X["loan_amount"] + 1) / (X["income"] + 1)

In [17]:
# =========================
# 3. Train/test split & imbalance handling
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Compute scale_pos_weight as an alternative to SMOTE (often better for XGBoost)
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / max(pos, 1)

# Optional: If your minority class is extremely rare and you prefer oversampling:
# sm = SMOTE(random_state=RANDOM_STATE)
# X_train, y_train = sm.fit_resample(X_train, y_train)

In [19]:
# =========================
# 3. Train/test split & imbalance handling
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Compute scale_pos_weight as an alternative to SMOTE (often better for XGBoost)
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / max(pos, 1)

# Optional: If your minority class is extremely rare and you prefer oversampling:
# sm = SMOTE(random_state=RANDOM_STATE)
# X_train, y_train = sm.fit_resample(X_train, y_train)

# Initialize the XGBoost Classifier
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE
)

model.fit(X_train, y_train)

In [20]:
# =========================
# 5. Evaluate performance (AUC & Recall)
# =========================
# Use probabilities for AUC, then choose a threshold for recall
y_proba = model.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_proba)

# Tune threshold to balance precision/recall (optional simple sweep)
thresholds = np.linspace(0.1, 0.9, 17)
best_f1 = -1
best_thr = 0.5
for t in thresholds:
    preds_t = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, preds_t)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = t

y_pred = (y_proba >= best_thr).astype(int)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"AUC: {auc:.4f}")
print(f"Best threshold: {best_thr:.2f}")
print(f"Recall: {recall:.4f} | Precision: {precision:.4f} | F1: {f1:.4f}")
print("Confusion matrix:")
print(cm)
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

# Save a quick ROC curve
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
plt.plot([0,1], [0,1], "--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "roc_curve.png"))
plt.close()

AUC: 0.8482
Best threshold: 0.75
Recall: 0.4718 | Precision: 0.3799 | F1: 0.4209
Confusion matrix:
[[26451  1544]
 [ 1059   946]]

Classification report:
              precision    recall  f1-score   support

           0     0.9615    0.9448    0.9531     27995
           1     0.3799    0.4718    0.4209      2005

    accuracy                         0.9132     30000
   macro avg     0.6707    0.7083    0.6870     30000
weighted avg     0.9226    0.9132    0.9175     30000



In [21]:
# =========================
# 6. SHAP global importance (summary & bar)
# =========================
# Use TreeExplainer for XGBoost
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Bar plot of mean |SHAP|
plt.figure(figsize=(8,6))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "shap_summary_bar.png"))
plt.close()

# Dot summary plot (feature effects and direction)
plt.figure(figsize=(10,6))
shap.summary_plot(shap_values, X_test, show=False)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "shap_summary_dot.png"))
plt.close()

In [22]:
# =========================
# 7. Local explanations (choose one correct default, one misclassified)
# =========================
# Find indices for case studies:
# - Correctly predicted 'default' (1)
# - Misclassified (pred=0, true=1 or pred=1, true=0)
correct_default_idx = None
misclassified_idx = None

for i in range(len(y_test)):
    if y_test.iloc[i] == 1 and y_pred[i] == 1 and correct_default_idx is None:
        correct_default_idx = i
    if y_pred[i] != y_test.iloc[i] and misclassified_idx is None:
        misclassified_idx = i
    if correct_default_idx is not None and misclassified_idx is not None:
        break

assert correct_default_idx is not None, "Could not find a correctly predicted default sample. Check class balance or threshold."
assert misclassified_idx is not None, "Could not find a misclassified sample. Try adjusting threshold or use another split."

# Force plots (HTML outputs; useful for submission)
# Expected value (base value) for TreeExplainer:
base_value = explainer.expected_value

# Save HTML force plots
fp1 = shap.force_plot(base_value, shap_values[correct_default_idx], X_test.iloc[correct_default_idx])
shap.save_html(os.path.join(OUTPUT_DIR, "force_plot_correct_default.html"), fp1)

fp2 = shap.force_plot(base_value, shap_values[misclassified_idx], X_test.iloc[misclassified_idx])
shap.save_html(os.path.join(OUTPUT_DIR, "force_plot_misclassified.html"), fp2)

In [23]:
# =========================
# 8. Rank top features (text output)
# =========================
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
feature_importance = pd.Series(mean_abs_shap, index=X_test.columns).sort_values(ascending=False)
top5 = feature_importance.head(5)
print("\nTop 5 features by mean |SHAP|:")
print(top5)

top5.to_csv(os.path.join(OUTPUT_DIR, "top5_features.csv"))


Top 5 features by mean |SHAP|:
RevolvingUtilizationOfUnsecuredLines    0.997245
NumberOfTime30-59DaysPastDueNotWorse    0.413489
NumberOfTimes90DaysLate                 0.349771
age                                     0.295435
DebtRatio                               0.267867
dtype: float32


In [24]:
# =========================
# 9. Save a summary markdown for submission
# =========================
md_lines = []

md_lines.append("# Interpretable ML for Credit Risk: SHAP Analysis")
md_lines.append("## Model & Metrics")
md_lines.append(f"- AUC: {auc:.4f}")
md_lines.append(f"- Threshold: {best_thr:.2f}")
md_lines.append(f"- Recall: {recall:.4f}")
md_lines.append(f"- Precision: {precision:.4f}")
md_lines.append(f"- F1: {f1:.4f}")

md_lines.append("## Global SHAP Top 5 Features")
for name, val in top5.items():
    md_lines.append(f"- {name}: mean |SHAP| = {val:.6f}")

md_lines.append("## Artifacts")
md_lines.append("- ROC curve: outputs/roc_curve.png")
md_lines.append("- SHAP summary (bar): outputs/shap_summary_bar.png")
md_lines.append("- SHAP summary (dot): outputs/shap_summary_dot.png")
md_lines.append("- Local force plot (correct default): outputs/force_plot_correct_default.html")
md_lines.append("- Local force plot (misclassified): outputs/force_plot_misclassified.html")
with open(os.path.join(OUTPUT_DIR, "submission_summary.md"), "w") as f:
    f.write("\n".join(md_lines))

print("\nSaved submission_summary.md with key metrics and artifacts.")


Saved submission_summary.md with key metrics and artifacts.
