In [1]:
import shap
import matplotlib.pyplot as plt
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = joblib.load(r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\models\best_xgb_model.pkl")  
X_train = pd.read_csv(r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\data\processed/X_train.csv")
y_train = pd.read_csv(r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\data\processed\y_train.csv").values.ravel()


In [4]:
# Ensure X_train is a DataFrame (not numpy)
if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train)

# Select only categorical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# One-hot encode categorical features if any
if categorical_cols:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoded = encoder.fit_transform(X_train[categorical_cols])
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols), index=X_train.index)

    # Drop original categorical and add encoded
    X_train = X_train.drop(columns=categorical_cols)
    X_train = pd.concat([X_train, encoded_df], axis=1)

In [5]:
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_train)


TypeError: Cannot cast array data from dtype('O') to dtype('float64') according to the rule 'safe'

In [None]:
shap.plots.beeswarm(shap_values, max_display=15)
plt.title("SHAP Summary Plot (Global Importance)")


In [None]:
# SHAP bar plot
shap.plots.bar(shap_values, max_display=15)
plt.title("Feature Importance - SHAP Bar Plot")


In [None]:
Cohort Analysis Using SHAP Values

In [None]:
# Create a cohort column for example (you can adjust based on relevant features)
X_train['income_bracket'] = pd.cut(X_train['person_income'], bins=[0, 30000, 70000, 150000, np.inf], 
                                   labels=['Low', 'Medium', 'High', 'Very High'])

# Select cohort
cohort = 'income_bracket'

# Generate SHAP values for each cohort
for bracket in X_train[cohort].unique():
    cohort_data = X_train[X_train[cohort] == bracket]
    cohort_shap_values = explainer(cohort_data)
    
    # Plot SHAP summary plot for each cohort
    shap.plots.beeswarm(cohort_shap_values, max_display=15)
    plt.title(f"SHAP Summary Plot for {bracket} Income Bracket")
    plt.show()


In [None]:
SHAP Dependence Plots

In [None]:
# SHAP dependence plot for 'person_income'
shap.plots.scatter(shap_values[:, "person_income"], color=shap_values)
plt.title("SHAP Dependence Plot for 'person_income' Feature")
plt.show()

In [None]:
Local Explainability for Individual Prediction (Waterfall Plot)

In [None]:
# Pick a sample to explain (for example, the first sample in the training set)
sample = X_train.iloc[0]

# Get SHAP values for the sample
shap_values_single = explainer(sample)

# Plot waterfall
shap.plots.waterfall(shap_values_single)
