# Model Explainability: SHAP Analysis for XGBoost Fraud Detector
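Interpret the decisions of the best model (XGBoost). Global view: SHAP summary plot. Local view: force plots for one true positive (TP), one false positive (FP), and one false negative (FN). From these, derive the top fraud drivers and actionable recommendations (e.g., flag rushed signups).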

In [1]:
import pandas as pd
import numpy as np
import shap
import joblib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('default')

# Load the engineered dataset and the trained model saved by the modeling step.
# NOTE(review): the "test" subset below is simply the first 1,000 rows of the
# parquet file — confirm these rows were held out from training.
df = pd.read_parquet('../data/processed/fraud_engineered.parquet')

# Slice once, then split into features / target (kept small so SHAP runs quickly).
subset = df.iloc[:1000]
X_test = subset.drop(columns='class')
y_test = subset['class']

best_model = joblib.load('../models/xgb_ensemble.joblib')

print(f"Test subset: {X_test.shape}, fraud rate: {y_test.mean():.2%}")
print("Model loaded: XGBoost")

ModuleNotFoundError: No module named 'shap'

In [None]:
# Baseline for comparison with SHAP: the model's own feature_importances_,
# labelled with the feature column names and ranked from highest to lowest.
importances = pd.Series(
    best_model.feature_importances_,
    index=X_test.columns,
).sort_values(ascending=False)
top_10 = importances.head(10)

# Horizontal bar chart of the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 6))
top_10.plot(kind='barh', color='skyblue', ax=ax)
ax.set_title('Top 10 Feature Importances (XGBoost Built-in)')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

print("Top 10:\n", top_10)

In [2]:
# SHAP explainer (TreeExplainer for XGB)
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Resolve a 2-D (n_samples, n_features) matrix of fraud-class SHAP values.
# Depending on the model type and shap version, the explainer output can be
#   * a list with one array per class,
#   * a 3-D array with a trailing class axis, or
#   * a single 2-D array (typical for a binary XGBoost classifier).
# Indexing `shap_values[1]` unconditionally would, in the last case, select the
# row for sample 1 rather than the class-1 matrix, so handle each layout.
if isinstance(shap_values, list):
    fraud_shap = np.asarray(shap_values[min(1, len(shap_values) - 1)])
else:
    fraud_shap = np.asarray(shap_values)
    if fraud_shap.ndim == 3:
        fraud_shap = fraud_shap[:, :, min(1, fraud_shap.shape[2] - 1)]

# Summary plot (beeswarm: global importance + direction of each feature's effect)
plt.figure(figsize=(12, 8))
shap.summary_plot(fraud_shap, X_test, show=False)  # Class 1 (fraud) values
plt.title('SHAP Summary Plot: Impact on Fraud Predictions')
plt.tight_layout()
plt.show()

# Mean absolute SHAP value per feature = global importance ranking
shap_importance = pd.Series(
    np.abs(fraud_shap).mean(axis=0),
    index=X_test.columns,
).sort_values(ascending=False)
print("Top SHAP Importances:\n", shap_importance.head(10))

NameError: name 'shap' is not defined

In [None]:
# Find one example of each outcome to explain locally:
#   TP (true fraud, pred fraud), FP (legit pred fraud), FN (fraud pred legit).
y_pred = best_model.predict(X_test)
probs = best_model.predict_proba(X_test)[:, 1]  # second column, used here as the fraud score

# Compare on plain arrays so every mask is positional and lines up with
# `probs`, `y_pred` and `X_test.iloc[...]` regardless of y_test's index labels.
y_true = np.asarray(y_test)
y_hat = np.asarray(y_pred)


def _first_index(mask, label):
    """Return the first position at which *mask* is True.

    Raises:
        IndexError: if no row matches. The small subset may simply not
            contain this outcome, so the message names the missing case
            instead of surfacing a bare out-of-bounds error.
    """
    hits = np.where(mask)[0]
    if hits.size == 0:
        raise IndexError(f"No {label} example found among {len(mask)} test rows")
    return hits[0]


# TP: Actual 1, pred 1, high prob
tp_idx = _first_index((y_true == 1) & (y_hat == 1) & (probs > 0.7), "true positive (prob > 0.7)")

# FP: Actual 0, pred 1
fp_idx = _first_index((y_true == 0) & (y_hat == 1), "false positive")

# FN: Actual 1, pred 0
fn_idx = _first_index((y_true == 1) & (y_hat == 0), "false negative")

print(f"TP idx {tp_idx}: Prob {probs[tp_idx]:.2f}, Actual/Pred: 1/1")
print(f"FP idx {fp_idx}: Prob {probs[fp_idx]:.2f}, Actual/Pred: 0/1")
print(f"FN idx {fn_idx}: Prob {probs[fn_idx]:.2f}, Actual/Pred: 1/0")

In [None]:
# Force plot for TP
# The explainer output may be per-class (a list, or a trailing class axis) or a
# single array with a scalar expected value (typical for binary XGBoost), so
# resolve the fraud-class entries instead of indexing [1] unconditionally.
if isinstance(shap_values, list):
    tp_shap = np.asarray(shap_values[min(1, len(shap_values) - 1)])[tp_idx]
else:
    tp_shap = np.asarray(shap_values)[tp_idx]
    if tp_shap.ndim == 2:  # per-sample slice still carries a class axis
        tp_shap = tp_shap[:, min(1, tp_shap.shape[1] - 1)]
tp_base_values = np.ravel(explainer.expected_value)  # scalar or per-class -> 1-D
tp_base = tp_base_values[min(1, tp_base_values.size - 1)]

shap.force_plot(tp_base, tp_shap, X_test.iloc[tp_idx],
                matplotlib=True, show=False, figsize=(12, 4))
plt.title('SHAP Force Plot: True Positive (Caught Fraud)')
plt.tight_layout()
plt.show()

In [None]:
# Force plot for FP
# The explainer output may be per-class (a list, or a trailing class axis) or a
# single array with a scalar expected value (typical for binary XGBoost), so
# resolve the fraud-class entries instead of indexing [1] unconditionally.
if isinstance(shap_values, list):
    fp_shap = np.asarray(shap_values[min(1, len(shap_values) - 1)])[fp_idx]
else:
    fp_shap = np.asarray(shap_values)[fp_idx]
    if fp_shap.ndim == 2:  # per-sample slice still carries a class axis
        fp_shap = fp_shap[:, min(1, fp_shap.shape[1] - 1)]
fp_base_values = np.ravel(explainer.expected_value)  # scalar or per-class -> 1-D
fp_base = fp_base_values[min(1, fp_base_values.size - 1)]

shap.force_plot(fp_base, fp_shap, X_test.iloc[fp_idx],
                matplotlib=True, show=False, figsize=(12, 4))
plt.title('SHAP Force Plot: False Positive (Legit Flagged as Fraud)')
plt.tight_layout()
plt.show()

In [None]:
# Force plot for FN
# The explainer output may be per-class (a list, or a trailing class axis) or a
# single array with a scalar expected value (typical for binary XGBoost), so
# resolve the fraud-class entries instead of indexing [1] unconditionally.
if isinstance(shap_values, list):
    fn_shap = np.asarray(shap_values[min(1, len(shap_values) - 1)])[fn_idx]
else:
    fn_shap = np.asarray(shap_values)[fn_idx]
    if fn_shap.ndim == 2:  # per-sample slice still carries a class axis
        fn_shap = fn_shap[:, min(1, fn_shap.shape[1] - 1)]
fn_base_values = np.ravel(explainer.expected_value)  # scalar or per-class -> 1-D
fn_base = fn_base_values[min(1, fn_base_values.size - 1)]

shap.force_plot(fn_base, fn_shap, X_test.iloc[fn_idx],
                matplotlib=True, show=False, figsize=(12, 4))
plt.title('SHAP Force Plot: False Negative (Missed Fraud)')
plt.tight_layout()
plt.show()