In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset
df = pd.read_csv(r'C:/Users/matebook x/Desktop/MTN/Fraud Detection/creditcard.csv')

df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

We can notice, that the `Class == 1` is present significantly less. For speed purposes we'll limit the number of rows to 10K for `Class == 0` and will take all rows for `Class == 1`

In [5]:
df = df.loc[list(df[df['Class'] == 1].index) + list(df[df['Class'] == 0].iloc[:10000].index)]

In [6]:
X = df[list(df.columns)[:-1]]
Y = df['Class']

X.columns, X.shape, Y.shape, Y.value_counts()

(Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
       dtype='object'),
 (10492, 30),
 (10492,),
 0    10000
 1      492
 Name: Class, dtype: int64)

Applying SMOTE technique to enlarge the minority class

In [9]:
# transform the dataset
oversample = SMOTE()
X, Y = oversample.fit_resample(X, Y)

In [10]:
X.shape, Y.shape, Y.value_counts()

((20000, 30),
 (20000,),
 1    10000
 0    10000
 Name: Class, dtype: int64)

In [12]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.20,random_state = 1)

# Fraud Prediction using Random Forest Classifier
We will use a Random Forest Classifier as our model to optimize. Random Forest models are formed by many uncorrelated decision trees, which joint together constitute an ensemble. In Random Forest, each decision tree makes its prediction, and the overall model output is selected to be the prediction that appears most frequently.

We can now start by calculating our base model accuracy.

In [15]:
model = RandomForestClassifier(random_state= 1).fit(X_Train,Y_Train)
predictionforest = model.predict(X_Test)
print(confusion_matrix(Y_Test,predictionforest))
print('='*20)
print('tn', 'fp', 'fn', 'tp')
print(confusion_matrix(Y_Test,predictionforest).ravel())
print('='*20)
print(classification_report(Y_Test,predictionforest))
acc1 = accuracy_score(Y_Test,predictionforest)
print('-'*20)
print(acc1)

[[2030    1]
 [   1 1968]]
tn fp fn tp
[2030    1    1 1968]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2031
           1       1.00      1.00      1.00      1969

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000

--------------------
0.9995


# Hyper parameter tuning using GridSearchCV

In [16]:
from sklearn.model_selection import GridSearchCV

grid_search = {'criterion': ['entropy', 'gini'],
               'max_features': ['auto', 'sqrt'],
               'n_estimators': [10, 20, 50, 100]}

clf = RandomForestClassifier()
model = GridSearchCV(estimator = clf, param_grid = grid_search, 
                               cv = 2, verbose= 5, n_jobs = -1)
model.fit(X_Train,Y_Train)

predictionforest = model.best_estimator_.predict(X_Test)
print(confusion_matrix(Y_Test,predictionforest))
print('='*20)
print('tn', 'fp', 'fn', 'tp')
print(confusion_matrix(Y_Test,predictionforest).ravel())
print('='*20)
print(classification_report(Y_Test,predictionforest))
acc4 = accuracy_score(Y_Test,predictionforest)
print('-'*20)
print(acc4)

Fitting 2 folds for each of 16 candidates, totalling 32 fits
[[2029    2]
 [   0 1969]]
tn fp fn tp
[2029    2    0 1969]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2031
           1       1.00      1.00      1.00      1969

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000

--------------------
0.9995
