## Model Explainability with SHAP

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM
import mlflow
import mlflow.sklearn
import mlflow.xgboost


In [3]:
import shap
import tqdm
import lime
import lime.lime_tabular
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load datasets
creditcard_df = pd.read_csv('E:/Git_repo/real-time-fraud-detection/data/creditcard_preprocessed.csv')
fraud_df = pd.read_csv('E:/Git_repo/real-time-fraud-detection/data/Processed_Fraud_Data.csv')

In [5]:
#### Check for missing values
print("Missing values in creditcard_preprocessed.csv:")
print(creditcard_df.isnull().sum())

print("\nMissing values in Processed_Fraud_Data.csv:")
print(fraud_df.isnull().sum())

#####creditcard_df.drop(columns=['ip_address'], inplace=True)
fraud_df.drop(columns=['ip_address'], inplace=True)

Missing values in creditcard_preprocessed.csv:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

Missing values in Processed_Fraud_Data.csv:
user_id                    0
signup_time                0
purchase_time              0
purchase_value             0
device_id                  0
source                     0
browser                    0
sex                        0
age                        0
ip_address            151112
class                      0
signup_hour                0
signup_day                 0
purchase_hour              0
purchase_day               0
country                    0
log_purchase_value         0
region                     0
d

In [7]:
# Feature and target separation
def prepare_data(df, target_col):
    # Handle datetime columns (e.g., signup_time, purchase_time)
    date_columns = ['signup_time', 'purchase_time']  # Replace with the actual datetime columns
    for col in date_columns:
        if col in df.columns:
            # Convert to datetime format
            df[col] = pd.to_datetime(df[col], errors='coerce')
            # Extract useful time-related features (year, month, day, hour, etc.)
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_day'] = df[col].dt.day
            df[f'{col}_hour'] = df[col].dt.hour
            df[f'{col}_minute'] = df[col].dt.minute
            df[f'{col}_second'] = df[col].dt.second
            # Drop the original datetime column
            df.drop(columns=[col], inplace=True)

    # Handle categorical columns (e.g., sex, browser, country, source, device_id)
    categorical_columns = ['sex', 'browser', 'country', 'source', 'device_id', 'region']  # Add more if needed
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        if col in df.columns:
            df[col] = label_encoder.fit_transform(df[col].astype(str))

    # Separate features (X) and target (y)
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y

In [8]:
fraud_df.shape, creditcard_df.shape

((151112, 17), (284807, 31))

In [9]:
fraud_df.columns, creditcard_df.columns

(Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
        'device_id', 'source', 'browser', 'sex', 'age', 'class', 'signup_hour',
        'signup_day', 'purchase_hour', 'purchase_day', 'country',
        'log_purchase_value', 'region'],
       dtype='object'),
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
        'Class'],
       dtype='object'))

# Model Building

In [10]:
# Model Building

X_credit, y_credit = prepare_data(creditcard_df, 'Class')
X_fraud, y_fraud = prepare_data(fraud_df, 'class')
print(np.unique(y_credit, return_counts=True))
print(np.unique(y_fraud, return_counts=True))

(array([0, 1]), array([284315,    492]))
(array([0, 1]), array([136961,  14151]))


In [19]:
# Model Building

#X1 = fraud_df.drop(columns=['class'])
#X2 = creditcard_df.drop(columns=['Class'])

#y1 = fraud_df['class']
#y2 = creditcard_df['Class']

#print(np.unique(y1, return_counts=True))
#print(np.unique(y2, return_counts=True))

In [11]:
# Train-test split
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

### MLP

In [12]:
# Instantiate the MultiLayer Perceptron Model

mlp1 = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=200, random_state=42)

# Train the model
mlp1.fit(X_train_fraud, y_train_fraud)

# Predictions
y_pred_mlp1 = mlp1.predict(X_train_fraud)

# Evaluation
print(classification_report(y_train_fraud, y_pred_mlp1))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95    109588
           1       1.00      0.00      0.00     11301

    accuracy                           0.91    120889
   macro avg       0.95      0.50      0.48    120889
weighted avg       0.92      0.91      0.86    120889



In [14]:
print("fraud_df shape:", fraud_df.shape)  # Should be (284,807, X)
print("X_fraud shape before dropping:", fraud_df.drop(columns=['class']).shape)
print("y_fraud shape:", fraud_df["class"].shape)


fraud_df shape: (151112, 27)
X_fraud shape before dropping: (151112, 26)
y_fraud shape: (151112,)


In [17]:
# Ensure X_fraud and y_fraud are derived from the same dataset
if "class" in fraud_df.columns:
    # Drop rows with missing values in features
    X_fraud = fraud_df.drop(columns=["class"]).copy()
    y_fraud = fraud_df["class"].copy()

    # Check again
    print("Fixed X_fraud shape:", X_fraud.shape)
    print("Fixed y_fraud shape:", y_fraud.shape)
else:
    raise ValueError("The dataset does not contain a 'Class' column.")



Fixed X_fraud shape: (151112, 26)
Fixed y_fraud shape: (151112,)


In [18]:
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

### LSTM

In [19]:
X_train_lstm = X_train_fraud.values.reshape(-1, X_train_fraud.shape[1], 1)
X_test_lstm = X_test_fraud.values.reshape(-1, X_test_fraud.shape[1], 1)

lstm_model = Sequential([
    LSTM(64, input_shape=(X_train_fraud.shape[1], 1)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_lstm, y_train_fraud, epochs=5, batch_size=32, validation_split=0.2) #validation_data=(X_test_fraud, y_test_fraud))


# Predictions
y_pred_lstm = (lstm_model.predict(X_test_lstm) > 0.5).astype("int32")
print(classification_report(y_test_fraud, y_pred_lstm))

Epoch 1/5


  super().__init__(**kwargs)


[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 7ms/step - accuracy: 0.9293 - loss: 0.2338 - val_accuracy: 0.9554 - val_loss: 0.1819
Epoch 2/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.9561 - loss: 0.1795 - val_accuracy: 0.9554 - val_loss: 0.1813
Epoch 3/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8ms/step - accuracy: 0.9559 - loss: 0.1793 - val_accuracy: 0.9559 - val_loss: 0.1822
Epoch 4/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 8ms/step - accuracy: 0.9566 - loss: 0.1779 - val_accuracy: 0.9562 - val_loss: 0.1787
Epoch 5/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8ms/step - accuracy: 0.9563 - loss: 0.1780 - val_accuracy: 0.9539 - val_loss: 0.1838
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     27393
      

### RNN

In [20]:
X_train_rnn = X_train_fraud.values.reshape(-1, X_train_fraud.shape[1], 1)
X_test_rnn = X_test_fraud.values.reshape(-1, X_test_fraud.shape[1], 1)

rnn_model = Sequential([
    SimpleRNN(64, input_shape=(X_train_fraud.shape[1], 1)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rnn_model.fit(X_train_fraud, y_train_fraud, epochs=5, batch_size=32, validation_split=0.2) #validation_data=(X_test_fraud, y_test_fraud))


# Predictions
y_pred_rnn = (rnn_model.predict(X_test_rnn) > 0.5).astype("int32")
print(classification_report(y_test_fraud, y_pred_rnn))

Epoch 1/5


  super().__init__(**kwargs)


[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.9339 - loss: 0.2373 - val_accuracy: 0.9532 - val_loss: 0.1961
Epoch 2/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.9521 - loss: 0.1922 - val_accuracy: 0.9539 - val_loss: 0.1865
Epoch 3/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9538 - loss: 0.1871 - val_accuracy: 0.9538 - val_loss: 0.1875
Epoch 4/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.9537 - loss: 0.1863 - val_accuracy: 0.9545 - val_loss: 0.1876
Epoch 5/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.9525 - loss: 0.1908 - val_accuracy: 0.9537 - val_loss: 0.1868
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     27393
      

### CNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense 

X_train_cnn = X_train_fraud.values.reshape(-1, X_train_fraud.shape[1], 1)
X_test_cnn = X_test_fraud.values.reshape(-1, X_test_fraud.shape[1], 1)

cnn_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_fraud.shape[1], 1)),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_fraud, y_train_fraud, epochs=5, batch_size=32, validation_split=0.2) #validation_data=(X_test_fraud, y_test_fraud))


# Predictions
y_pred_cnn = (cnn_model.predict(X_test_cnn) > 0.5).astype("int32")
print(classification_report(y_test_fraud, y_pred_cnn))

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8266 - loss: 57.1778 - val_accuracy: 0.9076 - val_loss: 16.8879
Epoch 2/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8422 - loss: 10.2398 - val_accuracy: 0.9082 - val_loss: 6.1111
Epoch 3/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8560 - loss: 3.5147 - val_accuracy: 0.7638 - val_loss: 1.2308
Epoch 4/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.8709 - loss: 1.6958 - val_accuracy: 0.9243 - val_loss: 0.7335
Epoch 5/5
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.8998 - loss: 0.6348 - val_accuracy: 0.9432 - val_loss: 0.2322
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     27393
        

# Explainability with SHAP

In [None]:
# SHAP explanation for fraud dataset using KernelExplainer
explainer1 = shap.Explainer(mlp1, X_train_fraud)  # Ensure X_train1 is used as background
shap_values1 = explainer1(X_test_fraud)  # Ensure X_test1 has the same shape as X_train1

# Plot summary plot for fraud data
shap.summary_plot(shap_values1, X_test_fraud, feature_names=X_fraud.columns)

In [None]:
## individual predictions

# Choose a sample instance
sample_idx = 5
shap.force_plot(explainer_lstm.expected_value[0], shap_values_lstm[0][sample_idx], X_test_fraud[sample_idx].reshape(-1), feature_names=creditcard_df.drop(columns=["class"]).columns)


In [None]:
# Reshape data back for SHAP explanation

X_test_lstm_flat = X_test_lstm.reshape(X_test_fraud.shape)

explainer = shap.KernelExplainer(lstm_model.predict, X_test_lstm_flat)
shap_values = explainer.shap_values(X_test_lstm_flat)

# Plot SHAP summary
shap.summary_plot(shap_values[0], X_test_lstm_flat, feature_names=X_train_fraud.columns)

# Local interpretability with LIME

In [None]:
# Create LIME explainer
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_fraud.reshape(X_train_fraud.shape[0], -1), 
    feature_names=creditcard_df.drop(columns=["class"]).columns, 
    class_names=["Not Fraud", "Fraud"], 
    mode="classification"
)

# Explain a single prediction for LSTM
sample_idx = 5
exp = lime_explainer.explain_instance(X_test_fraud[sample_idx].reshape(-1), lstm_model.predict, num_features=5)
exp.show_in_notebook()

In [None]:
# SHAP dependence plot for a key feature (purchase_value)
shap.dependence_plot("purchase_value", shap_values_lstm[0], X_test_fraud.reshape(X_test_fraud.shape[0], -1), feature_names=creditcard_df.drop(columns=["class"]).columns)