In [None]:
# Standard library
from collections import Counter

# Data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Class-imbalance handling; the import name of imbalanced-learn is `imblearn`.
import imblearn
from imblearn.over_sampling import SMOTE

# scikit-learn utilities
from sklearn.metrics import roc_curve, auc, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Deep learning
import tensorflow as tf
from tensorflow import keras

: 

In [None]:
import warnings
warnings.filterwarnings('ignore')

: 

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("CSV/creditcard.csv")

: 

In [None]:
# (rows, columns) of the raw frame.
df.shape

: 

In [None]:
# Preview the first rows of the raw frame.
df.head()

: 

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

: 

In [None]:
# Number of missing values in each column.
df.isna().sum()

: 

In [None]:
# Class == 1 --> Fraud
# Class == 0 --> Not Fraud

# Row count per label: index 0 holds the Not Fraud count, index 1 the Fraud count.
np.bincount(df['Class'])

: 

In [None]:
# Overlay the Not Fraud (green) and Fraud (red) distribution of every feature
# except Time, Amount and the Class label, one subplot per feature.
cols = df.columns.drop(['Time', 'Amount', 'Class'])
not_fraud_rows = df['Class'] == 0
fraud_rows = df['Class'] == 1

f, ax = plt.subplots(7, 4, figsize = (15, 28))

# zip() stops at the shorter sequence, so any surplus axes simply stay empty.
for axis, c in zip(ax.flatten(), cols):
    # histplot(kde=True, stat='density') is the replacement seaborn documents
    # for the deprecated distplot; alpha keeps both classes visible when they overlap.
    sns.histplot(df.loc[not_fraud_rows, c], kde = True, stat = 'density',
                 kde_kws = {'cut': 3}, alpha = 0.4, color = '#87bd75', ax = axis)
    sns.histplot(df.loc[fraud_rows, c], kde = True, stat = 'density',
                 kde_kws = {'cut': 3}, alpha = 0.4, color = '#b94646', ax = axis)
f.tight_layout()

: 

In [None]:
# Transaction counts over Time, one panel per class
# (top: Not Fraud, bottom: Fraud).
f, (ax1, ax2) = plt.subplots(2, 1, figsize = (12, 6))
f.subplots_adjust(hspace = 0.5)

time_not_fraud = df.loc[df['Class'] == 0, 'Time']
ax1.hist(time_not_fraud, bins = 50, color = 'crimson', alpha = 0.85)
ax1.set(xlabel = 'Time in secs', title = 'Not Fraud', ylabel = 'Number of Transactions')

time_fraud = df.loc[df['Class'] == 1, 'Time']
ax2.hist(time_fraud, bins = 50, color = 'darkblue', alpha = 0.8)
ax2.set(xlabel = 'Time in secs', ylabel = "Number of Transactions", title = 'Fraud')
plt.show()

: 

In [None]:
# Transaction counts by Amount, one panel per class
# (top: Not Fraud, bottom: Fraud).
f, (ax1, ax2) = plt.subplots(2, 1, figsize = (12, 6))
f.subplots_adjust(hspace = 0.5)

amount_not_fraud = df.loc[df['Class'] == 0, 'Amount']
ax1.hist(amount_not_fraud, bins = 50, color = 'crimson', alpha = 0.85)
ax1.set(xlabel = 'Amount', title = 'Not Fraud', ylabel = 'Number of Transactions')

amount_fraud = df.loc[df['Class'] == 1, 'Amount']
ax2.hist(amount_fraud, bins = 50, color = 'darkblue', alpha = 0.8)
ax2.set(xlabel = 'Amount', ylabel = "Number of Transactions", title = 'Fraud')
plt.show()

: 

In [None]:
# Target vector: the Class label.
y = df['Class']
# Feature matrix: every column except Time and the Class label.
X = df.drop(columns = ['Time', 'Class'])

: 

In [None]:
# Standardise the Amount column in place; all other feature columns are left as they are.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so held-out rows influence the scaling statistics — consider fitting on the
# training split only.
scaler = StandardScaler()
amount_scaled = scaler.fit_transform(X[['Amount']])
X['Amount'] = amount_scaled.ravel()

: 

In [None]:
# Preview the feature matrix after Amount has been standardised.
X.head()

: 

In [None]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# Both splits are stratified on the label so each part keeps the overall fraud
# ratio, and both are seeded so Restart & Run All reproduces the same partitions.
# The intermediate *_full names keep this cell idempotent: re-running it does not
# split an already-split training set again.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size = 0.2, stratify = y, random_state = 42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size = 0.2, stratify = y_train_full, random_state = 42)

: 

In [None]:
def is_balanced_splitting(y):
    """Return the fraction of entries in ``y`` that carry label 1.

    Parameters
    ----------
    y : array-like of non-negative ints
        Class labels; 1 marks the positive (Fraud) class.

    Returns
    -------
    float
        ``count(y == 1) / len(y)``. Returns 0.0 when ``y`` is empty or
        contains no 1s instead of raising ``IndexError``.
    """
    # minlength=2 guarantees counts[1] exists even if label 1 never occurs.
    counts = np.bincount(y, minlength = 2)
    total = counts.sum()
    if total == 0:
        return 0.0
    return counts[1] / total

: 

In [None]:
# Print the share of Fraud rows in each split; with stratified splitting the
# three percentages should be nearly identical.
for _template, _split_labels in (
    ('Percentage of FRAUD instances in Training Set : {}', y_train),
    ('Percentage of FRAUD instances in Testing Set : {}', y_test),
    ('Percentage of FRAUD instances in Validation set : {}', y_valid),
):
    print(_template.format(is_balanced_splitting(_split_labels) * 100))

: 

In [None]:
# (rows, columns) of the training features before oversampling.
X_train.shape

: 

### Oversampling

In [None]:
def oversampling(X_train, y_train, sampling_strategy = 0.4, random_state = 42):
    """Oversample the minority class of a training set with SMOTE.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    sampling_strategy : float, default 0.4
        Forwarded to ``SMOTE``. As a float, imbalanced-learn treats it as the
        desired minority/majority ratio after resampling.
    random_state : int, default 42
        Seed forwarded to ``SMOTE`` so the synthetic samples are reproducible.

    Returns
    -------
    tuple
        ``(X, y)`` — the resampled features and labels. The new class
        distribution is also printed as a ``Counter``.
    """
    oversample = SMOTE(random_state = random_state, sampling_strategy = sampling_strategy)
    X, y = oversample.fit_resample(X_train, y_train)
    # Summarize the new class distribution.
    print(Counter(y))
    return X, y


: 

In [None]:
# Oversample the training split only; the validation and test splits are not passed in.
# NOTE: this rebinds X_train / y_train, so re-running the cell feeds the
# already-resampled data back into SMOTE.
X_train, y_train = oversampling(X_train, y_train)

: 

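After SMOTE with `sampling_strategy = 0.4`, the fraud class is 40% the size of the non-fraud class, so the training data is now far less imbalanced than before.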

In [None]:
sns.set(style="whitegrid")
labels = ['Not Fraud', 'Fraud']
# value_counts() orders by frequency, so reindex by class id (0, 1) to make
# sure each count is paired with the right label whichever class is larger.
sizes = y_train.value_counts().reindex([0, 1], fill_value = 0)

colors = ["lightblue","red"]

# Share of each class in the training labels.
plt.figure(figsize=(7,7))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)

plt.title('Frauds in the dataset')
plt.legend()
plt.show()

: 

### Adding Gaussian Noise

In [None]:
#Adding Gaussian Noise

def add_noise(X_train, mean = 0, sigma = 0.1, random_state = None):
    """Return a copy of ``X_train`` with element-wise Gaussian noise added.

    Parameters
    ----------
    X_train : array-like with a ``.shape`` attribute
        Data to perturb; it is not modified.
    mean : float, default 0
        Mean of the noise distribution.
    sigma : float, default 0.1
        Standard deviation of the noise distribution.
    random_state : int or None, default None
        Seed for a private random generator. With ``None`` the global NumPy
        random state is used, exactly as before this parameter existed.

    Returns
    -------
    Same type and shape as ``X_train + ndarray``
        ``X_train`` plus one independent noise draw per element.
    """
    if random_state is None:
        noise = np.random.normal(mean, sigma, X_train.shape)
    else:
        # A dedicated generator makes the draw repeatable without touching global state.
        noise = np.random.RandomState(random_state).normal(mean, sigma, X_train.shape)
    return X_train + noise

: 

In [None]:
# Noisy version of the training features; X_train itself is left unchanged
# because add_noise returns a new object.
X_train_noised = add_noise(X_train)

: 

In [None]:
# Turn each label container into an (n, 1) NumPy column vector.
y_train, y_valid, y_test = (
    np.array(split_labels).reshape(-1, 1)
    for split_labels in (y_train, y_valid, y_test)
)

: 

In [None]:
# One-hot encode the labels: column 0 = Not Fraud, column 1 = Fraud.
# to_categorical is taken from the tensorflow.keras module imported at the top
# of the notebook; the standalone keras.utils.np_utils module is not shipped
# with newer Keras releases. num_classes=2 fixes the width at two columns
# even if a split happened to contain no Fraud rows.
# NOTE: not idempotent — re-running this cell would encode the one-hot matrix again.
y_train = keras.utils.to_categorical(y_train, num_classes = 2)
y_test = keras.utils.to_categorical(y_test, num_classes = 2)
y_valid = keras.utils.to_categorical(y_valid, num_classes = 2)

: 

### Autoencoder Model

In [None]:
epochs = 25
batch_size = 128
input_shape = X_train.shape[1] #num of columns, 29
# Despite its name, `lr` is used below as the L1 activity-regularisation
# strength of the first encoder layer, not as a learning rate.
lr = 1e-7

#Autoencoder Model
# Symmetric dense autoencoder: input -> 128 -> 64 -> 32 -> 32 -> 64 -> 128 -> input.
input_layer = keras.layers.Input(shape=(input_shape, ))
encoder = keras.layers.Dense(128, activation = "relu", activity_regularizer = keras.regularizers.l1(lr),
                             kernel_initializer = 'lecun_normal')(input_layer)
encoder = keras.layers.Dense(64, activation = "relu")(encoder)
encoder = keras.layers.Dense(32, activation = 'relu')(encoder)
decoder = keras.layers.Dense(32, activation = 'relu')(encoder)
decoder = keras.layers.Dense(64, activation = 'relu')(decoder)
decoder = keras.layers.Dense(128, activation = 'relu')(decoder)
# Output layer: one unit per input feature (instead of a hard-coded 29) and a
# linear activation. The inputs contain negative values (Amount is standardised
# and zero-mean noise is added), which a ReLU output could never reproduce.
decoder = keras.layers.Dense(input_shape, activation = 'linear')(decoder)
autoencoder = keras.Model(inputs=input_layer, outputs=decoder)

: 

In [None]:
# The loss (mean squared error) is the meaningful number for reconstruction;
# the accuracy metric is kept only so the keys of `history` stay the same.
autoencoder.compile(metrics = ['accuracy'],
                    loss = 'mean_squared_error',
                    optimizer = 'adam')


# Denoising setup: the network sees the noisy features and is asked to
# reproduce the clean ones. Using the noisy data as the target as well would
# only teach it to copy the noise.
history = autoencoder.fit(X_train_noised, X_train,
                    epochs = epochs,
                    batch_size = batch_size,
                    shuffle = True,
                    validation_data = (X_valid, X_valid),
                    verbose=1).history

: 

In [None]:
# Pass the noisy training features through the trained autoencoder; the
# reconstruction is what the fraud detector is trained on later.
denoised_data = autoencoder.predict(X_train_noised)

: 

In [None]:
### Fraud Detector Model

: 

In [None]:
epochs = 25
batch_size = 256
input_shape = X_train.shape[1] #num of columns, 29


# Dense classifier: input -> 128 -> 64 -> 32 -> 16 -> 2 class scores.
input_layer = keras.layers.Input(shape = (input_shape, ))
layer_1 = keras.layers.Dense(128, activation = "relu",
                            kernel_initializer = 'lecun_normal')(input_layer)
layer_2 = keras.layers.Dense(64, activation = "relu", kernel_initializer = 'lecun_normal')(layer_1)
layer_3 = keras.layers.Dense(32, activation = 'relu', kernel_initializer = 'lecun_normal')(layer_2)
layer_4 = keras.layers.Dense(16, activation = 'relu', kernel_initializer = 'lecun_normal')(layer_3)
# The labels are one-hot and the two classes are mutually exclusive, so use
# softmax: the two outputs then form a probability distribution (they sum to 1)
# rather than two independent sigmoid scores.
output_layer = keras.layers.Dense(2, activation = 'softmax')(layer_4)
fraud_dtr = keras.Model(inputs = input_layer, outputs = output_layer)

: 

In [None]:
fraud_dtr.compile(metrics=['accuracy'],
                    loss='binary_crossentropy',
                    optimizer='sgd')

# Keep only the checkpoint with the lowest validation loss
# (val_loss is ModelCheckpoint's default monitor, spelled out here).
callback = keras.callbacks.ModelCheckpoint('Fraud_Detector_model.h5',
                                          monitor = 'val_loss',
                                          save_best_only = True)

# NOTE(review): training uses autoencoder reconstructions while validation
# uses the raw features — confirm this mismatch is intended.
# batch_size comes from the configuration variable instead of a repeated literal.
history = fraud_dtr.fit(denoised_data, y_train,
                    epochs = epochs,
                    batch_size = batch_size,
                    validation_data = (X_valid, y_valid),
                    callbacks = [callback]).history

: 

In [None]:
# Load the best saved model: the file written by the ModelCheckpoint callback
# (save_best_only = True) during training.

model = keras.models.load_model('Fraud_Detector_model.h5')

: 

In [None]:
# Loss and accuracy of the reloaded model on the held-out test split.
model.evaluate(X_test, y_test)

: 

In [None]:
# Model scores for every test row: one column per class (0 = Not Fraud, 1 = Fraud).
preds = model.predict(X_test)

: 

### ROC-AUC Curve

In [None]:
#Plotting the ROC-AUC Curve

# roc_curve sweeps a decision threshold over continuous scores. Hard argmax
# labels would leave a single operating point and a meaningless AUC, so pass
# the model's score for the Fraud class (column 1) instead.
y_true = y_test.argmax(axis = 1)
fraud_scores = preds[:, 1]
false_pos_rate, true_pos_rate, thresholds = roc_curve(y_true, fraud_scores)
roc_auc = auc(false_pos_rate, true_pos_rate)

plt.plot(false_pos_rate, true_pos_rate, linewidth = 5, label = 'AUC = %0.3f'% roc_auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0,1],[0,1], linewidth = 5)

plt.xlim([-0.01, 1])
plt.ylim([0, 1.01])
plt.legend(loc='lower right')
plt.title('Receiver operating characteristic curve (ROC)')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

: 

Accuracy is not a meaningful metric for imbalanced classification, so we report recall on the fraud class instead.

In [None]:
# Recall on the Fraud class: the share of actual Fraud rows in the test split
# that the model also labels as Fraud.
y_true_labels = y_test.argmax(axis = 1)
y_pred_labels = preds.argmax(axis = 1)
fraud_recall = recall_score(y_true_labels, y_pred_labels)
print('Percentage of correctly predicting the Fraud Transactions : ', fraud_recall * 100, '%')

: 