# Model Explainability using SHAP

This notebook uses SHAP (SHapley Additive exPlanations) to interpret the Random Forest model 
for e-commerce fraud detection. The goal is to understand which features drive predictions 
and provide actionable business recommendations.


In [1]:
# install missing package in the notebook environment
%pip install shap -q

import pandas as pd
import numpy as np
import shap
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("default")



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import joblib

X_test_path = r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\data\processed\X_test.csv"
X_test = pd.read_csv(X_test_path)

X_test.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,lower_bound_ip_address,upper_bound_ip_address,country,time_since_signup,hour_of_day,day_of_week,transactions_per_user,ip_int
0,360572,2015-02-22 00:08:34,2015-03-20 00:52:07,34,FPDCKGEGCNQOS,Ads,Safari,F,31,2487467045,2487419000.0,2487484000.0,United States,2249013.0,0,4,1,2487467045
1,244596,2015-04-16 00:37:20,2015-05-23 20:45:36,18,QOSGRNENSTKDE,Ads,Chrome,F,24,1177415539,1177354000.0,1177420000.0,Puerto Rico,3269296.0,20,5,1,1177415539
2,113421,2015-06-22 02:43:53,2015-07-17 22:29:58,39,YCESMLPKLRGDL,Ads,Chrome,F,32,1192301392,1192296000.0,1192362000.0,Canada,2231165.0,22,4,1,1192301392
3,35236,2015-01-02 23:40:01,2015-01-02 23:40:02,19,VOZBUVAHONAOL,SEO,IE,M,38,3629014908,3628859000.0,3629122000.0,United States,1.0,23,4,1,3629014908
4,163254,2015-01-11 04:52:17,2015-01-11 04:52:18,10,QXBQQIAHDCNDK,Ads,Chrome,M,27,3621382730,3607101000.0,3623879000.0,United States,1.0,4,6,1,3621382730


In [3]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])
X_test_num = X_test_num.fillna(0)


In [4]:
pipeline_path = r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\models\rf_fraud_pipeline.pkl"
rf_pipeline = joblib.load(pipeline_path)


In [5]:
rf_pipeline


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
rf_model = rf_pipeline.named_steps['classifier']


In [7]:
import joblib
import pandas as pd

X_test = pd.read_csv(
    r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\data\processed\X_test.csv"
)

rf_pipeline = joblib.load(
    r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\models\rf_fraud_pipeline.pkl"
)

X_test_transformed = rf_pipeline.named_steps['preprocessor'].transform(X_test)


In [8]:
X_test_sample = X_test.sample(n=500, random_state=42)


In [9]:
X_test_transformed_sample = rf_pipeline.named_steps['preprocessor'].transform(X_test_sample)


In [10]:
X_test_transformed_sample = X_test_transformed_sample.toarray()


In [11]:
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_transformed_sample)


In [12]:
import pandas as pd

X_train = pd.read_csv(
    r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\data\processed\X_train.csv"
)


In [13]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns


In [14]:
onehot_cols = rf_pipeline.named_steps['preprocessor'] \
    .named_transformers_['cat'] \
    .get_feature_names_out(cat_cols)
all_cols = list(num_cols) + list(onehot_cols)


In [15]:
import shap
import pandas as pd

# Sample 500 rows
X_test_sample = X_test.sample(n=500, random_state=42)

# Transform using pipeline preprocessor
X_test_transformed_sample = rf_pipeline.named_steps['preprocessor'].transform(X_test_sample)

# Convert sparse to dense if necessary
if hasattr(X_test_transformed_sample, "toarray"):
    X_test_transformed_sample = X_test_transformed_sample.toarray()

# Create TreeExplainer for the trained Random Forest
explainer = shap.TreeExplainer(rf_pipeline.named_steps['classifier'])

# Compute SHAP values
shap_values = explainer.shap_values(X_test_transformed_sample)

# Summary plot (global importance)
shap.summary_plot(shap_values[1], X_test_transformed_sample)


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.