# Credit Card Fraud Detection 
by Yu Chi Chen, Chong Zhao, Zihan Chen, Qiuchen Lu

In [8]:
#In this chunk, we import necessary packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from keras.layers import Dropout
import keras.backend as K
import pickle

In [34]:
#Custom loss that penalizes a missed fraud transaction more heavily than an error on a legitimate one
def lossfunction(y_true, y_pred, fraud_weight=25.0):
    """Squared error with a heavier penalty on samples labelled as fraud.

    The per-sample weight is built from element-wise tensor operations
    instead of a Python ``if``: ``y_true`` is a whole batch tensor, so a
    Python-level ``y_true == 0`` test cannot choose a branch per sample.

    Args:
        y_true: Tensor of ground-truth labels. 0 means legitimate; any
            non-zero value is treated as fraud.
        y_pred: Tensor of predicted scores, broadcast-compatible with
            ``y_true``.
        fraud_weight: Multiplier applied to the squared error of fraud
            samples. Defaults to 25, the factor this notebook has always used.

    Returns:
        Tensor holding the weighted squared error averaged over the last
        axis.
    """
    # Labels may arrive as integers; align them with the prediction dtype first.
    y_true = K.cast(y_true, K.dtype(y_pred))
    squared_error = K.square(y_pred - y_true)
    # 1.0 where the label is fraud, 0.0 otherwise (evaluated element-wise).
    is_fraud = K.cast(K.not_equal(y_true, 0), K.dtype(y_pred))
    # Weight is 1 for legitimate samples and `fraud_weight` for fraud samples.
    weights = 1.0 + (fraud_weight - 1.0) * is_fraud
    return K.mean(weights * squared_error, axis=-1)

In [40]:
#Build the feed-forward classifier with keras
#Layout: 30 input features -> 100 -> 70 -> 20 -> 1, with dropout after every hidden layer
hidden_stack = [
    Dense(100, input_shape=(30,), activation="relu"),
    Dropout(0.3),
    Dense(70, activation="relu"),
    Dropout(0.2),
    Dense(20, activation="relu"),
    Dropout(0.1),
]
#Single sigmoid unit so the output can be read as a score between 0 and 1
output_layer = Dense(1, activation="sigmoid")
model = Sequential(hidden_stack + [output_layer])
#Compile with the custom weighted loss (plain 'mse' with the adam optimizer was the earlier alternative)
model.compile(loss=[lossfunction], optimizer="Nadam", metrics=['accuracy'])


In [36]:
#Import and preprocess the data
RANDOM_STATE = 42  # fixed seed so the split and the synthetic samples are reproducible
df = pd.read_csv("creditcard.csv")
X = df.drop(['Class'], axis=1)
y = df.Class
# Stratify by the label so both splits keep the same share of fraud cases
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
# The data is unbalanced, so SMOTE oversamples the training split only;
# the test split is left with its original class distribution.
# fit_resample is the current name of the method; the older fit_sample alias
# was deprecated and later removed from imbalanced-learn.
sm = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = sm.fit_resample(X_train, y_train)
# Fit the robust scaler on the training data only, then apply it to both splits
transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

In [37]:
#Fit the network for 30 epochs; the held-out split is passed as validation data
#so keras evaluates on it at the end of every epoch
held_out = (X_test, y_test)
model.fit(
    X_train,
    y_train,
    epochs=30,
    validation_data=held_out,
)

Train on 454902 samples, validate on 56962 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [38]:
#Make predictions and turn the scores into hard 0/1 labels
THRESHOLD = 0.5  # scores below this are labelled 0, everything else is labelled 1
# Keep the raw scores under their own name so they are not lost when labelling
y_score = model.predict(X_test)
# Vectorized thresholding replaces the per-element Python loop; the cast keeps
# the labels in the same dtype that predict() returned
y_pred = np.where(y_score < THRESHOLD, 0, 1).astype(y_score.dtype)

In [39]:
#Model Evaluation

from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
#Each report below is printed as a heading followed by the metric computed on the test set.
# - Accuracy score: total accuracy on the test set.
# - Confusion matrix: shows the alpha error and the beta error; the main focus is the
#   beta error, which is entry (2,1) of the matrix.
# - Classification report: recall on class 1 is the final aim, because the goal is to detect
#   as many fraud transactions as possible. The alpha error can be a little higher, since in
#   reality banks can send alerts to customers; terminating a suspicious transaction costs
#   much less than processing a fraudulent one, from both a legal and a financial perspective.
reports = (
    ("Accuracy Score:", accuracy_score),
    ("Confusion Matrix:", confusion_matrix),
    ("Classificaiton Report:", classification_report),
)
for heading, compute_metric in reports:
    print(heading)
    print(compute_metric(y_test, y_pred))

Accuracy Score:
0.9987711105649381
Confusion Matrix:
[[56800    64]
 [    6    92]]
Classificaiton Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.59      0.94      0.72        98

    accuracy                           1.00     56962
   macro avg       0.79      0.97      0.86     56962
weighted avg       1.00      1.00      1.00     56962



In [41]:
#save our model for future use
# NOTE(review): the recorded execution counts show the model-construction cell (In [40])
# was run after the training cell (In [37]), so the file written by that recorded run may
# contain an untrained model - confirm by re-running the notebook top to bottom before
# relying on the saved file.
# NOTE(review): only unpickle this file from a trusted location; pickle can execute
# arbitrary code on load.
filename = 'finalized_model.sav'
# Context manager guarantees the file handle is flushed and closed, even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)