# Credit Card Fraud Detection (Kaggle)

# Importing libraries


In [61]:
import numpy as np #for linear algebra
import pandas as pd #for data processing
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
#for evaluating the models
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
#for DNN
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Loading the data

In [62]:
data = pd.read_csv('creditcard.csv')
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


Saving a copy of the data

In [63]:
data_copy = data.copy()
#we drop the time column
data.drop('Time', axis = 1, inplace=True)

# Data Processing

In [64]:
#Checking for null values
data.isnull().sum().sum()

0

In [65]:
scaler = RobustScaler()
data['NormalizedAmount'] = scaler.fit_transform(data['Amount'].values.reshape(-1,1))

In [66]:
data = data.drop(['Amount'], axis = 1)

Splitting the data into train and test datasets

In [67]:
y = data['Class']
X = data.drop(['Class'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Deep Neural Network Model

Creating the model

In [85]:
#input_dim -> number of variables
#units -> number of nodes per layer
model = Sequential([
    #add input layer
    Dense(input_dim = 29, units = 16, activation = 'relu'),
    #add a second hidden layer
    Dense(units = 24, activation ='relu'),
    #add a dropout layer
    Dropout(0.5),
    #add third hidden layer
    Dense(units = 20, activation = 'relu'),
    #add fourth hidden layer
    Dense(units = 24, activation = 'relu'),
    #add output layer
    Dense(units = 1, activation = 'sigmoid')
])

In [69]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_20 (Dense)             (None, 16)                480       
_________________________________________________________________
dense_21 (Dense)             (None, 24)                408       
_________________________________________________________________
dropout_4 (Dropout)          (None, 24)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 20)                500       
_________________________________________________________________
dense_23 (Dense)             (None, 24)                504       
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 25        
Total params: 1,917
Trainable params: 1,917
Non-trainable params: 0
____________________________________________________

Fitting the model

In [70]:
model.compile(optimizer ='adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
model.fit(X_train, y_train, batch_size = 15, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2aaa19a0c40>

Evaluating the model

In [71]:
score = model.evaluate(X_test, y_test)
print("Loss : ", score[0])
print("Accuracy : ", score[1])

Loss :  0.00422314228489995
Accuracy :  0.9993445873260498


In [72]:
y_pred = model.predict(X_test)
print("Precision: ", precision_score(y_test, y_pred.round()))
print("Recall: ", recall_score(y_test, y_pred.round()))
print("f1 score: ", f1_score(y_test, y_pred.round()))
print("Accuracy: ", accuracy_score(y_test, y_pred.round()))

Precision:  0.8473282442748091
Recall:  0.7551020408163265
f1 score:  0.7985611510791366
Accuracy:  0.9993445923013002


# Decision tree building

Decision tree -> each internal node represents a test on a feature
              
            -> each leaf node is a class label made after all tests
           

the aim of decision trees : to split datasets based on conditions.

In [73]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
y_pred2 = decision_tree_model.predict(X_test)

Evaluating the decision tree

In [74]:
print("Precision: ", precision_score(y_test, y_pred2.round()))
print("Recall: ", recall_score(y_test, y_pred2.round()))
print("f1 score: ", f1_score(y_test, y_pred2.round()))
print("Accuracy: ", accuracy_score(y_test, y_pred2.round()))

Precision:  0.7816901408450704
Recall:  0.7551020408163265
f1 score:  0.7681660899653979
Accuracy:  0.9992158515033414


# Random forest building

RF is a colection of decision trees. 

In [75]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(X_train, y_train)
y_pred3 = rf_model.predict(X_test)

Evaluating the random forest

In [76]:
print("Precision: ", precision_score(y_test, y_pred3.round()))
print("Recall: ", recall_score(y_test, y_pred3.round()))
print("f1 score: ", f1_score(y_test, y_pred3.round()))
print("Accuracy: ", accuracy_score(y_test, y_pred3.round()))

Precision:  0.9495798319327731
Recall:  0.7687074829931972
f1 score:  0.849624060150376
Accuracy:  0.9995318516437859


# DNN with Smote

SMOTE = Synthetic Minority Over-Sampling

-> works by selecting data that are close or similar in the feature space and drawing a line between data and making new data at a point on the line.

Sampling the data before splitting

In [86]:
from imblearn.over_sampling import SMOTE
from collections import Counter
X_resample, y_resample = SMOTE().fit_sample(X, y)

Splitting the data

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size = 0.3)

In [79]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [80]:
print('Non-Fraud & Fraud %s' % Counter(y_resample))

Non-Fraud & Fraud Counter({0: 284315, 1: 284315})


Testing the model

In [87]:
model.compile(optimizer ='adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
model.fit(X_train, y_train, batch_size = 15, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2aaa42a5c70>

Evaluating the model

In [88]:
score4 = model.evaluate(X_test, y_test)
print("Loss : ", score[0])
print("Accuracy : ", score[1])

Loss :  0.00422314228489995
Accuracy :  0.9993445873260498


In [89]:
y_pred4 = model.predict(X_test)
print("Precision: ", precision_score(y_test, y_pred4.round()))
print("Recall: ", recall_score(y_test, y_pred4.round()))
print("f1 score: ", f1_score(y_test, y_pred4.round()))
print("Accuracy: ", accuracy_score(y_test, y_pred4.round()))

Precision:  0.9956269751495574
Recall:  1.0
f1 score:  0.997808696261928
Accuracy:  0.9978017339922269


# A brief comparison, showing which model is better

|     | Precision | Recall | f1 | Accuracy | 
| --- | --- | --- | --- | --- |
| DNN | 0.84 | 0.75  | 0.79  | 0.9993
| --- | --- | --- | --- | --- |
| Decision Tree  | 0.78 | 0.75 | 0.76 | 0.9992 
| --- | --- | --- | --- | --- |
|Random Forest|0.94|0.76|0.85|0.9995
| --- | --- | --- | --- | --- |
|DNN with SMOTE| 0.99 | 1.0 | 0.99 | 0.998
