## Model Explainability with SHAP

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM
import mlflow
import mlflow.sklearn
import mlflow.xgboost


In [3]:
import shap
import tqdm
import lime
import lime.lime_tabular
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load datasets
# The data directory is configurable through the FRAUD_DATA_DIR environment
# variable so the notebook can run on other machines; the default is the
# original location, so existing setups behave exactly as before.
DATA_DIR = os.environ.get('FRAUD_DATA_DIR', 'E:/Git_repo/real-time-fraud-detection/data')

# Credit-card transactions dataset (target column: 'Class').
creditcard_df = pd.read_csv(os.path.join(DATA_DIR, 'creditcard_preprocessed.csv'))
# E-commerce transactions dataset (target column: 'class').
fraud_df = pd.read_csv(os.path.join(DATA_DIR, 'Processed_Fraud_Data.csv'))

In [5]:
# Check for missing values: print the per-column null count of each dataset.
print("Missing values in creditcard_preprocessed.csv:")
print(creditcard_df.isnull().sum())

print("\nMissing values in Processed_Fraud_Data.csv:")
print(fraud_df.isnull().sum())

# Drop 'ip_address' from the e-commerce data only; it is not dropped from
# creditcard_df. In the recorded run the column is null in every row of
# fraud_df, so it carries no usable information.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the column is gone no longer raises
# KeyError.
fraud_df = fraud_df.drop(columns=['ip_address'], errors='ignore')

Missing values in creditcard_preprocessed.csv:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

Missing values in Processed_Fraud_Data.csv:
user_id                    0
signup_time                0
purchase_time              0
purchase_value             0
device_id                  0
source                     0
browser                    0
sex                        0
age                        0
ip_address            151112
class                      0
signup_hour                0
signup_day                 0
purchase_hour              0
purchase_day               0
country                    0
log_purchase_value         0
region                     0
dtype: int64

In [6]:
# Display the (rows, columns) shape of each dataset, e-commerce data first.
tuple(frame.shape for frame in (fraud_df, creditcard_df))

((151112, 17), (284807, 31))

In [7]:
# Display the column labels of each dataset, e-commerce data first.
tuple(frame.columns for frame in (fraud_df, creditcard_df))

(Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
        'device_id', 'source', 'browser', 'sex', 'age', 'class', 'signup_hour',
        'signup_day', 'purchase_hour', 'purchase_day', 'country',
        'log_purchase_value', 'region'],
       dtype='object'),
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
        'Class'],
       dtype='object'))

In [10]:
# Model Building

# Separate each dataset into its target column and the remaining feature
# columns. Suffix 1 refers to fraud_df, suffix 2 to creditcard_df.
y1 = fraud_df['class']
X1 = fraud_df.drop(columns='class')

y2 = creditcard_df['Class']
X2 = creditcard_df.drop(columns='Class')

# Print the distinct target values and their occurrence counts per dataset.
print(np.unique(y1, return_counts=True))
print(np.unique(y2, return_counts=True))

(array([0, 1]), array([136961,  14151]))
(array([0, 1]), array([284315,    492]))


In [11]:
# Train-test split: hold out 20% of each dataset, with a fixed seed so the
# partition is reproducible.
# Both targets are heavily imbalanced (in the recorded run the credit-card
# data has only 492 positives among 284807 rows), so the split is stratified
# on the label to keep the fraud rate equal in the train and test partitions.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.2, random_state=42, stratify=y1
)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.2, random_state=42, stratify=y2
)