In [1]:
#!pip install fasttreeshap
#!pip install  numpy==1.20.3
# Importing the necessary packages and setting the enviroment
import warnings
warnings.filterwarnings("ignore")


import time, psutil, os, gc

# Mathematical functions
import math

# Data manipulation
import numpy as np
np.bool = np.bool_
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Progress bar for loop
from tqdm.contrib import itertools

# Data

Source: https://www.kaggle.com/mlg-ulb/creditcardfraud

The dataset contains information on the transactions made using credit cards by European cardholders, in two particular days of September 2013
. It presents a total of 284807 transactions, of which 492 were fraudulent. Clearly, the dataset is highly imbalanced, the positive class (fraudulent transactions) accounting for only 0.173%

of all transactions. The columns in the dataset are as follows:

Time: The time (in seconds) elapsed between the transaction and the very first transaction 

V1 to V28: Obtained from principle component analysis (PCA) transformation on original features that are not available due to confidentiality

Amount: The amount of the transaction

Class: The status of the transaction with respect to authenticity. The class of an authentic (resp. fraudulent) transaction is taken to be 0

In [2]:
df = pd.read_csv("creditcard.csv")
# Check the shape of the dataset
print("Shape of the dataset:", df.shape)
# Check the first few rows of the dataset
data=df
df.head()

Shape of the dataset: (284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Sub-sample with all minority class retained.

To improve on computational efficiecy of shap calculations, we are doing a sub-sample of n=50000, where all the minority class (fraudulent transactions) are retained.

In [3]:
df=data
# Calculate the class distribution in the original DataFrame
class_counts = df['Class'].value_counts()

# Identify minority classes
minority_classes = class_counts[class_counts < df.shape[0] // len(class_counts)].index.tolist()

# Create a sub-sample DataFrame
sub_sample = pd.DataFrame()

# Desired size of the sub-sample
desired_size = 50000

# Add all instances of minority classes
for cls in minority_classes:
    class_subset = df[df['Class'] == cls]
    sub_sample = pd.concat([sub_sample, class_subset], axis=0)

# For majority classes, sample with replacement to reach desired size
for cls in df['Class'].unique():
    if cls not in minority_classes:
        class_subset = df[df['Class'] == cls]
        num_samples = min(len(class_subset), desired_size - len(sub_sample))
        class_subset_sampled = class_subset.sample(n=num_samples, replace=True, random_state=1)
        sub_sample = pd.concat([sub_sample, class_subset_sampled], axis=0)
        if len(sub_sample) >= desired_size:
            break

# Shuffle the sub-sample to mix the classes (optional)
sub_sample = sub_sample.sample(frac=1, random_state=1).reset_index(drop=True)
df=sub_sample
df = df.fillna(df.mean())
df.shape

(50000, 31)

In [4]:
df['Day'], temp = df['Time'] // (24*60*60), df['Time'] % (24*60*60)
df['Hour'], temp = temp // (60*60), temp % (60*60)
df['Minute'], df['Second'] = temp // 60, temp % 60
df[['Time', 'Day', 'Hour', 'Minute', 'Second']].tail()
df['Amount'] = np.log10(df['Amount'] + 0.001)
df.shape

(50000, 35)

In [5]:
df=df.fillna(df.mean())
X=df.drop('Class',axis=1)
y=df['Class']

# Feature Selection

In [6]:
ind=[3,6,8,9,10,11,13,15,16,17]
cols=[]
for i in ind:
    cols.append(df.columns[i])
cols

['V3', 'V6', 'V8', 'V9', 'V10', 'V11', 'V13', 'V15', 'V16', 'V17']

In [7]:
X=X[cols]

In [8]:
X.head()

Unnamed: 0,V3,V6,V8,V9,V10,V11,V13,V15,V16,V17
0,-0.289189,0.882864,0.299912,-0.400947,1.477983,-0.127198,-1.003145,-2.527314,0.624745,-0.604082
1,-0.122061,0.861997,0.704993,0.00525,0.712317,-0.564145,-1.800434,1.051819,0.28286,-0.544861
2,0.21888,-0.6741,-0.258622,-1.531176,1.169594,-0.818095,0.405422,-0.609016,-0.89168,0.942159
3,0.617281,0.251779,0.060925,1.106917,-0.712704,-0.51862,0.313031,-0.025205,-0.36685,0.420451
4,-7.288046,8.179091,-2.737518,-1.508233,-2.038887,-0.452617,-0.288546,-0.387976,0.379539,-0.268109


# Scaling and Splitting the data

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit and transform the data
X =(X-X.mean())/X.std()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
import os
import time
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
#import umap
import xgboost
from lime.lime_tabular import LimeTabularExplainer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import IsolationForest
from lightgbm import LGBMClassifier
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
import fasttreeshap
import time

In [11]:
# train an XGBoost model (but any other model type would also work)
model=RandomForestClassifier(n_estimators = 50, max_depth = 8, random_state = 0)
model.fit(X, y)

In [12]:
# build a Permutation explainer and explain the model predictions on the given dataset
explainer = shap.explainers.GPUTree(model, X)
shap_values = explainer(X)

# get just the explanations for the positive class
shap_values = shap_values

cuda extension was not built during install!


ImportError: cannot import name '_cext_gpu' from partially initialized module 'shap' (most likely due to a circular import) (/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/shap/__init__.py)

# Varying number of RF Estimators

In [81]:
N_Est=[50*i for i in range(1,11) ]
T_V0=[]
T_V1=[]
T_V2=[]
for i in N_Est:
    model=RandomForestClassifier(criterion='entropy', max_depth=8, max_features=0.31096155614096943,
                       max_leaf_nodes=161, n_estimators=i, n_jobs=-1)
    model.fit(X_train, y_train)
    start_time = time.time()
    shap_explainer=fasttreeshap.TreeExplainer(model, algorithm='v0',n_jobs=-1)
    shap_values=shap_explainer(X_test).values
    shap_values.shape
    Elapsed_time = time.time()-start_time 
    T_V0.append(Elapsed_time)
for i in N_Est:
    model=RandomForestClassifier(criterion='entropy', max_depth=8, max_features=0.31096155614096943,
                       max_leaf_nodes=161, n_estimators=i, n_jobs=-1)
    model.fit(X_train, y_train)
    start_time = time.time()
    shap_explainer=fasttreeshap.TreeExplainer(model, algorithm='v1',n_jobs=-1)
    shap_values=shap_explainer(X_test).values
    shap_values.shape
    Elapsed_time = time.time()-start_time 
    T_V1.append(Elapsed_time) 
    
for i in N_Est:
    model=RandomForestClassifier(criterion='entropy', max_depth=8, max_features=0.31096155614096943,
                       max_leaf_nodes=161, n_estimators=i, n_jobs=-1)
    model.fit(X_train, y_train)
    start_time = time.time()
    shap_explainer=fasttreeshap.TreeExplainer(model, algorithm='v2',n_jobs=-1)
    shap_values=shap_explainer(X_test).values
    shap_values.shape
    Elapsed_time = time.time()-start_time 
    T_V2.append(Elapsed_time) 

KeyboardInterrupt: 

In [None]:
T_V0

In [None]:
T_V1

In [None]:
T_V2

In [None]:
plt.plot(N_Est,T_V0, color='darkgoldenrod')
plt.plot(N_Est,T_V1, color='rebeccapurple')
plt.plot(N_Est,T_V2, color='teal')
plt.xlabel('Number of estimators')
plt.ylabel('time')