# SHAP Explainability Layer
Implementation of feature-level explanations for the Hybrid Credit Scoring pipeline.

In [6]:
import pandas as pd
import numpy as np
import shap
import joblib
import json
import warnings

# Globally silence all Python warnings for the rest of the session.
# NOTE(review): this hides every warning category, including any raised by
# the libraries imported above — narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Confirm the imports succeeded and report the installed SHAP version.
print("✅ Setup complete. SHAP version:", shap.__version__)

✅ Setup complete. SHAP version: 0.49.1


In [7]:
# Load Models
# PD model artifacts: the classifier, the scaler applied to its inputs, and
# the list of input column names (used to select and order columns).
# NOTE(review): joblib.load unpickles the file — only load trusted artifacts.
pd_model = joblib.load('pd_model.joblib')
pd_scaler = joblib.load('pd_scaler.joblib')
pd_features = joblib.load('pd_features.joblib')

# Isolation Forest anomaly model and the scaler applied to its inputs.
iso_model = joblib.load('isolation_forest.joblib')
if_scaler = joblib.load('if_scaler.joblib')
# Unlike the other feature lists, the Isolation Forest columns are hard-coded
# here instead of being loaded from an artifact.
# NOTE(review): presumably this must match the columns (and order) used when
# the model was trained — confirm against the training code.
if_features = ["avgMonthlyIncome", "incomeCV", "expenseRatio", "emiRatio", "avgMonthlyBalance", "bounceCount"]

# Risk-label model and the list of columns it is fed, in order.
risk_model = joblib.load('risk_random_forest.joblib')
risk_features = joblib.load('risk_features.joblib')

# Load background data for SHAP initialization
# We use the synthetic feature files as background reference
# (path is relative to the current working directory).
bg_data = pd.read_csv('../data/synthetic/features_only.csv')

print("✅ Models and background data loaded.")

✅ Models and background data loaded.


In [8]:
# 1. PD Model Explainer (Linear)
# The full background frame (not a sample) is scaled with the PD scaler and
# handed to LinearExplainer as its background/reference data.
X_pd_bg = pd_scaler.transform(bg_data[pd_features])
pd_explainer = shap.LinearExplainer(pd_model, X_pd_bg)

# 2. Anomaly Model Explainer (Tree)
# Isolation Forest is supported by TreeExplainer; no background data is passed.
if_explainer = shap.TreeExplainer(iso_model)

# 3. Risk Label Model Explainer (Tree)
# Background data for Risk Model includes PD and anomalyFlag, so it is read
# from the predictions file rather than taken from bg_data.
# NOTE(review): X_risk_bg is built here but is NOT passed to TreeExplainer
# below, and nothing else in the visible code reads it — confirm whether it
# should be supplied as background data or whether this load can be dropped.
risk_df = pd.read_csv('../data/synthetic/features_with_risk_predictions.csv')
X_risk_bg = risk_df[risk_features]
risk_explainer = shap.TreeExplainer(risk_model)

print("✅ SHAP Explainers initialized.")

✅ SHAP Explainers initialized.


In [9]:
def _top_features(shap_row, feature_names, k):
    """Return the names of the ``k`` features with the largest absolute SHAP value."""
    # Ascending argsort reversed keeps the same ordering as the original inline code.
    order = np.argsort(np.abs(shap_row))[::-1][:k]
    return [feature_names[i] for i in order]


def _class_position(model, predicted_class):
    """Return the position of ``predicted_class`` inside ``model.classes_``.

    SHAP's per-class output is ordered like ``classes_``, so the label value
    itself is only a valid index when the classes happen to be 0..n-1.
    Falls back to ``int(predicted_class)`` when the model exposes no
    ``classes_`` or the label is not found in it.
    """
    classes = getattr(model, "classes_", None)
    if classes is None:
        return int(predicted_class)
    hits = np.flatnonzero(np.asarray(classes) == predicted_class)
    return int(hits[0]) if hits.size else int(predicted_class)


def get_explanation_aggregator(input_row, *, anomaly_threshold=-0.05):
    """
    Generates human-readable explanations from SHAP values for a single input row (dict).

    Relies on the module-level models, scalers, feature lists and SHAP
    explainers (``pd_*``, ``iso_model``/``if_*``, ``risk_*``) being loaded.

    Args:
        input_row: Mapping of feature name -> value for one customer. It must
            contain every column named in ``pd_features``, ``if_features`` and
            ``risk_features`` except ``PD`` and ``anomalyFlag``, which are
            computed here.
        anomaly_threshold: Isolation Forest ``decision_function`` scores
            strictly below this value set ``anomalyFlag`` to 1. The default
            keeps the previously hard-coded cut-off of -0.05.

    Returns:
        dict with three sections:
            ``PD``        -> ``value`` (rounded to 4 dp) and the 3 ``top_factors``;
            ``Anomaly``   -> ``score`` (rounded to 4 dp) and the 2 ``reasons``;
            ``RiskLabel`` -> ``label`` (LOW/MEDIUM/HIGH) and the 3 ``drivers``.

    Raises:
        KeyError: if ``input_row`` lacks a required feature column.
    """
    # Label values predicted by the risk model are used as indices into this list.
    risk_labels = ["LOW", "MEDIUM", "HIGH"]

    # 1. Prepare Data
    df_input = pd.DataFrame([input_row])

    # 2. PD Explanation
    X_pd = pd_scaler.transform(df_input[pd_features])
    pd_val = pd_model.predict_proba(X_pd)[0, 1]  # probability of class index 1
    pd_shap_values = pd_explainer.shap_values(X_pd)
    pd_top_factors = _top_features(pd_shap_values[0], pd_features, 3)

    # 3. Anomaly Explanation
    X_if = if_scaler.transform(df_input[if_features])
    if_score = iso_model.decision_function(X_if)[0]
    if_shap_values = if_explainer.shap_values(X_if)
    # TreeExplainer normally returns a single array here; tolerate a list too.
    if_shap = if_shap_values[0] if isinstance(if_shap_values, list) else if_shap_values
    if_reasons = _top_features(if_shap[0], if_features, 2)

    # 4. Risk Label Explanation
    # The risk model consumes the PD and anomaly results as extra features.
    X_risk_prep = df_input.copy()
    X_risk_prep['PD'] = pd_val
    X_risk_prep['anomalyFlag'] = 1 if if_score < anomaly_threshold else 0

    # Select features in the correct order for the Risk model
    X_risk = X_risk_prep[risk_features]

    predicted_class = risk_model.predict(X_risk)[0]
    risk_label = risk_labels[int(predicted_class)]
    # Index SHAP's per-class output by position in classes_, not by label value,
    # so a model trained without one of the classes still resolves correctly.
    class_pos = _class_position(risk_model, predicted_class)

    risk_shap_values = risk_explainer.shap_values(X_risk)
    if isinstance(risk_shap_values, list):
        # One array per class.
        class_shap = risk_shap_values[class_pos]
    elif risk_shap_values.ndim == 3:
        # Single 3-D array with the class on the last axis.
        class_shap = risk_shap_values[:, :, class_pos]
    else:
        class_shap = risk_shap_values
    risk_drivers = _top_features(class_shap[0], risk_features, 3)

    # 5. Aggregate
    return {
        "PD": {
            "value": round(float(pd_val), 4),
            "top_factors": pd_top_factors,
        },
        "Anomaly": {
            "score": round(float(if_score), 4),
            "reasons": if_reasons,
        },
        "RiskLabel": {
            "label": risk_label,
            "drivers": risk_drivers,
        },
    }

print("✅ Explanation Aggregator defined.")

✅ Explanation Aggregator defined.


In [10]:
# Test Case: Sample Customer
# Single input row expressed as a feature-name -> value dict.
# NOTE(review): assumes these seven keys cover every column named in
# pd_features, if_features and risk_features (other than PD / anomalyFlag,
# which the aggregator computes itself) — confirm against the saved lists.
test_customer = {
    'avgMonthlyIncome': 75000,
    'incomeCV': 0.1,
    'expenseRatio': 0.3,
    'emiRatio': 0.2,
    'avgMonthlyBalance': 50000,
    'bounceCount': 0,
    'accountAgeMonths': 24
}

# Run the full explanation pipeline and pretty-print the response as JSON.
explanation = get_explanation_aggregator(test_customer)
print("\n--- Unified Explanation Response ---")
print(json.dumps(explanation, indent=2))


--- Unified Explanation Response ---
{
  "PD": {
    "value": 0.0003,
    "top_factors": [
      "expenseRatio",
      "accountAgeMonths",
      "emiRatio"
    ]
  },
  "Anomaly": {
    "score": 0.1592,
    "reasons": [
      "expenseRatio",
      "bounceCount"
    ]
  },
  "RiskLabel": {
    "label": "LOW",
    "drivers": [
      "PD",
      "expenseRatio",
      "emiRatio"
    ]
  }
}
