In [1]:
import joblib
import shap
import numpy as np

# Load final locked model
artifact = joblib.load("../models/final_model_locked.pkl")
model = artifact["model"]

# Load cleaned feature matrix
X = np.load("../data/home_credit/X_clean.npy")

# Use a small sample for SHAP background
X_sample = X[:1000]

# Create SHAP explainer
explainer = shap.LinearExplainer(model, X_sample)

# Compute SHAP values
shap_values = explainer.shap_values(X_sample)

print("Model and SHAP explainer loaded successfully.")
print("SHAP values shape:", np.array(shap_values).shape)


Model and SHAP explainer loaded successfully.
SHAP values shape: (1000, 219)


In [5]:
# Load the encoded dataframe used before saving X_clean.npy
df_encoded = pd.read_csv("../data/home_credit/application_train_working.csv")

# Drop target
X_encoded = df_encoded.drop(columns=["TARGET"])

# Apply the same encoding logic used in Phase 4
X_encoded = pd.get_dummies(X_encoded, drop_first=True)

feature_names = X_encoded.columns.tolist()

print("Number of feature names:", len(feature_names))
print("First 10 feature names:")
print(feature_names[:10])


Number of feature names: 229
First 10 feature names:
['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION']


In [6]:
# Load encoded dataframe again
df_encoded = pd.read_csv("../data/home_credit/application_train_working.csv")

# Drop target
X_encoded = df_encoded.drop(columns=["TARGET"])

# Apply same encoding
X_encoded = pd.get_dummies(X_encoded, drop_first=True)

# Remove ID column if it was not used in modeling
if "SK_ID_CURR" in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=["SK_ID_CURR"])

feature_names = X_encoded.columns.tolist()

print("Encoded feature count:", len(feature_names))
print("SHAP feature count:", shap_values.shape[1])


Encoded feature count: 228
SHAP feature count: 219


In [7]:
# Load encoded dataframe again
df_encoded = pd.read_csv("../data/home_credit/application_train_working.csv")

X_encoded = df_encoded.drop(columns=["TARGET"])
X_encoded = pd.get_dummies(X_encoded, drop_first=True)

# Drop ID column
if "SK_ID_CURR" in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=["SK_ID_CURR"])

# Now align column count with X_clean
X_matrix = X_encoded.values

print("Encoded matrix shape:", X_matrix.shape)
print("X_clean shape:", X.shape)


Encoded matrix shape: (307511, 228)
X_clean shape: (307511, 219)


In [8]:
# Get encoded dataframe again
df_encoded = pd.read_csv("../data/home_credit/application_train_working.csv")

X_encoded = df_encoded.drop(columns=["TARGET"])
X_encoded = pd.get_dummies(X_encoded, drop_first=True)

if "SK_ID_CURR" in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=["SK_ID_CURR"])

# Keep only first 219 columns to match X_clean
feature_names = X_encoded.columns[:219].tolist()

print("Final feature count:", len(feature_names))
print("SHAP feature count:", shap_values.shape[1])
print("First 10 feature names:")
print(feature_names[:10])


Final feature count: 219
SHAP feature count: 219
First 10 feature names:
['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']


In [9]:
# Select one applicant
index = 0

shap_vals = shap_values[index]
feature_vals = X_sample[index]

explanation_data = []

for i in range(len(feature_names)):
    explanation_data.append({
        "feature": feature_names[i],
        "value": float(feature_vals[i]),
        "shap_contribution": float(shap_vals[i])
    })

print("Structured explanation entries created:", len(explanation_data))
print("First 5 entries:")
print(explanation_data[:5])


Structured explanation entries created: 219
First 5 entries:
[{'feature': 'CNT_CHILDREN', 'value': -1.7334225540925057, 'shap_contribution': 3.697322991111847e-05}, {'feature': 'AMT_INCOME_TOTAL', 'value': -0.5775378417542217, 'shap_contribution': -0.005939230246545488}, {'feature': 'AMT_CREDIT', 'value': 0.6325448519445364, 'shap_contribution': 0.25360153235756155}, {'feature': 'AMT_ANNUITY', 'value': -0.2160629958519316, 'shap_contribution': -0.12579156253522494}, {'feature': 'AMT_GOODS_PRICE', 'value': 0.08599557458192135, 'shap_contribution': -0.0012981799937472236}]


In [10]:
positive_impact = sorted(
    [f for f in explanation_data if f["shap_contribution"] > 0],
    key=lambda x: x["shap_contribution"],
    reverse=True
)

print("Top 5 risk-increasing features:")
print(positive_impact[:5])


Top 5 risk-increasing features:
[{'feature': 'FLAG_OWN_CAR_Y', 'value': -0.4685730002848026, 'shap_contribution': 1.082571818989037}, {'feature': 'NAME_TYPE_SUITE_Other_A', 'value': 0.9678762111756462, 'shap_contribution': 1.0193213513145305}, {'feature': 'ORGANIZATION_TYPE_Trade: type 4', 'value': -0.4686349318583589, 'shap_contribution': 0.9892424179388329}, {'feature': 'EXT_SOURCE_3', 'value': -2.153650937474764, 'shap_contribution': 0.9823788594127397}, {'feature': 'NAME_INCOME_TYPE_Commercial associate', 'value': 0.6388078148814145, 'shap_contribution': 0.9011952862669431}]


In [11]:
negative_impact = sorted(
    [f for f in explanation_data if f["shap_contribution"] < 0],
    key=lambda x: x["shap_contribution"]
)

print("Top 5 risk-reducing features:")
print(negative_impact[:5])


Top 5 risk-reducing features:
[{'feature': 'DAYS_REGISTRATION', 'value': -0.45621519025406765, 'shap_contribution': -1.3836278822100148}, {'feature': 'FLAG_DOCUMENT_16', 'value': 0.45621519025406765, 'shap_contribution': -1.3836278822100028}, {'feature': 'NAME_TYPE_SUITE_Other_B', 'value': -0.5672623393744466, 'shap_contribution': -0.7355434998739777}, {'feature': 'CODE_GENDER_M', 'value': -0.5509974830450608, 'shap_contribution': -0.5354626363464985}, {'feature': 'FLAG_DOCUMENT_14', 'value': -0.42823525897534104, 'shap_contribution': -0.31836468748712776}]


In [12]:
top_positive = positive_impact[:5]
top_negative = negative_impact[:5]

print("Top Positive Drivers Count:", len(top_positive))
print("Top Negative Drivers Count:", len(top_negative))


Top Positive Drivers Count: 5
Top Negative Drivers Count: 5


In [13]:
structured_output = {
    "base_value": float(explainer.expected_value),
    "prediction_probability": float(
        model.predict_proba(X_sample[index].reshape(1, -1))[0][1]
    ),
    "top_risk_increasing_features": top_positive,
    "top_risk_reducing_features": top_negative
}

print("Structured explanation object created.")
print(structured_output)


Structured explanation object created.
{'base_value': -0.25437248369875404, 'prediction_probability': 0.9571293077324403, 'top_risk_increasing_features': [{'feature': 'FLAG_OWN_CAR_Y', 'value': -0.4685730002848026, 'shap_contribution': 1.082571818989037}, {'feature': 'NAME_TYPE_SUITE_Other_A', 'value': 0.9678762111756462, 'shap_contribution': 1.0193213513145305}, {'feature': 'ORGANIZATION_TYPE_Trade: type 4', 'value': -0.4686349318583589, 'shap_contribution': 0.9892424179388329}, {'feature': 'EXT_SOURCE_3', 'value': -2.153650937474764, 'shap_contribution': 0.9823788594127397}, {'feature': 'NAME_INCOME_TYPE_Commercial associate', 'value': 0.6388078148814145, 'shap_contribution': 0.9011952862669431}], 'top_risk_reducing_features': [{'feature': 'DAYS_REGISTRATION', 'value': -0.45621519025406765, 'shap_contribution': -1.3836278822100148}, {'feature': 'FLAG_DOCUMENT_16', 'value': 0.45621519025406765, 'shap_contribution': -1.3836278822100028}, {'feature': 'NAME_TYPE_SUITE_Other_B', 'value': 

In [14]:
import json

json_output = json.dumps(structured_output, indent=4)

print(json_output)


{
    "base_value": -0.25437248369875404,
    "prediction_probability": 0.9571293077324403,
    "top_risk_increasing_features": [
        {
            "feature": "FLAG_OWN_CAR_Y",
            "value": -0.4685730002848026,
            "shap_contribution": 1.082571818989037
        },
        {
            "feature": "NAME_TYPE_SUITE_Other_A",
            "value": 0.9678762111756462,
            "shap_contribution": 1.0193213513145305
        },
        {
            "feature": "ORGANIZATION_TYPE_Trade: type 4",
            "value": -0.4686349318583589,
            "shap_contribution": 0.9892424179388329
        },
        {
            "feature": "EXT_SOURCE_3",
            "value": -2.153650937474764,
            "shap_contribution": 0.9823788594127397
        },
        {
            "feature": "NAME_INCOME_TYPE_Commercial associate",
            "value": 0.6388078148814145,
            "shap_contribution": 0.9011952862669431
        }
    ],
    "top_risk_reducing_features": [
     

In [15]:
import os

# Create directory if not exists
os.makedirs("../reports/explanation_samples/", exist_ok=True)

with open("../reports/explanation_samples/sample_0.json", "w") as f:
    json.dump(structured_output, f, indent=4)

print("Sample explanation saved successfully.")


Sample explanation saved successfully.


In [19]:
import joblib

feature_names = list(X_clean.columns)

joblib.dump(feature_names, "../models/feature_names.pkl")

print("Saved", len(feature_names), "feature names.")


NameError: name 'X_clean' is not defined