In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve

In [2]:
df = pd.read_csv('creditcard.csv')
df.shape

(284807, 31)

In [3]:
from sklearn.preprocessing import StandardScaler
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint

In [4]:
# Preparing data for modelling
def prep_data(df):
    '''
    INPUT:
    df - input DataFrame
    
    OUTPUT:
    X_train - training input
    X_test - testing input
    y_test - testing ouput
    '''
    
    # Dropping Time as it does not matter to fraud detection
    df = df.drop(['Time'], axis=1)

    X_train, X_test = train_test_split(df, test_size=0.2, random_state=66)
    X_train = X_train[X_train.Class == 0]
    y_train = X_train.Class
    X_train = X_train.drop(['Class'], axis=1)

    y_test = X_test.Class
    X_test = X_test.drop(['Class'], axis=1)

    X_train = X_train.values
    X_test = X_test.values
    
    return X_train, X_test, y_test

In [5]:
df = pd.read_csv('creditcard.csv')
df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
X_train, X_test, y_test = prep_data(df)

In [6]:
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim, ))

encoder = Dense(32, activation="tanh")(input_layer)
encoder = Dense(16, activation="relu")(encoder)

decoder = Dense(16, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

filepath = "autoencoder_model.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)

In [7]:
history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, shuffle=True, 
                          validation_data=(X_test, X_test), callbacks=[checkpoint], verbose=1)
pd.DataFrame(history.history).to_csv('history.csv')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100


Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [8]:
log = pd.read_csv('history.csv')
log.head()

Unnamed: 0.1,Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0,0.679793,0.768655,0.694448,0.825743
1,1,0.633874,0.835866,0.683696,0.850198
2,2,0.624335,0.848524,0.673942,0.858695
3,3,0.61941,0.856187,0.67227,0.855412
4,4,0.61731,0.859928,0.669498,0.862505


In [11]:
autoencoder = load_model('autoencoder_model.h5')
pred = autoencoder.predict(X_test)



In [12]:
mse = np.mean(np.power(X_test - pred, 2), axis=1)
error_df = pd.DataFrame({'Reconstruction_Error': mse, 'Class': y_test})
display(error_df)

Unnamed: 0,Reconstruction_Error,Class
223781,0.214220,0
10994,0.245218,0
241857,0.196075,0
190489,0.215130,0
149862,0.322802,0
...,...,...
68036,0.072847,0
194646,0.265254,0
131453,0.114304,0
233543,0.295940,0


In [13]:
error_df_fraud = error_df[error_df.Class == 1]['Reconstruction_Error']
error_df_normal = error_df[error_df.Class == 0]['Reconstruction_Error']

In [14]:
threshold = 3
error_df['Prediction'] = error_df.Reconstruction_Error > threshold
error_df.Prediction = error_df.Prediction.apply(lambda x: 1 if x is True else 0)

In [16]:
def print_metrics(actual, prediction):
    '''
    INPUT:
    actual - expected output
    prediction - predictted output
    '''
    accuracy = accuracy_score(actual, prediction)
    recall = recall_score(actual, prediction)
    precision = precision_score(actual, prediction)
    f1 = f1_score(actual, prediction)
    print('The accuracy score of the model = {}'.format(accuracy))
    print('The recall score of the model = {}'.format(recall))
    print('The precision score of the model = {}'.format(precision))
    print('The f1 score of the model = {}'.format(f1))
    print('\nConfusion Matrix:')
    print(confusion_matrix(actual, prediction))

In [17]:
print_metrics(error_df.Class, error_df.Prediction)

The accuracy score of the model = 0.9804255468557986
The recall score of the model = 0.8210526315789474
The precision score of the model = 0.0663265306122449
The f1 score of the model = 0.12273800157356413

Confusion Matrix:
[[55769  1098]
 [   17    78]]
