In [2]:
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns


In [4]:
data = pd.read_csv("E:/THESIS DATASET OF MS-AI/creditcard.csv")

data["Time"] = data["Time"].apply(lambda x : x / 3600 % 24)
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,0.000278,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,0.000278,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,0.000556,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
vc = data['Class'].value_counts().to_frame().reset_index()
vc['percent'] = vc["Class"].apply(lambda x : round(100*float(x) / len(data), 2))
vc = vc.rename(columns = {"index" : "Target", "Class" : "Count"})
vc

Unnamed: 0,Target,Count,percent
0,0,284315,99.83
1,1,492,0.17


# Consider only 1000 rows of non fraud cases

In [6]:
non_fraud = data[data['Class'] == 0].sample(1000)
fraud = data[data['Class'] == 1]

df = non_fraud.append(fraud).sample(frac=1).reset_index(drop=True)
X = df.drop(['Class'], axis = 1).values
Y = df["Class"].values

In [12]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,13.587500,1.968523,-0.264230,0.284626,1.830718,-0.687861,0.363683,-1.025075,0.222440,0.169661,...,-0.051748,-0.152794,0.475746,0.793701,-0.990416,2.037192,-0.143507,-0.049421,0.76,0
1,18.740833,-2.587558,-1.737357,0.474063,1.035177,1.888686,-2.050631,-0.835608,0.638748,-0.411578,...,0.433861,0.182164,-0.135446,0.329566,-0.354880,-0.573215,0.085574,-0.315351,1.00,0
2,14.118333,-0.324671,0.967724,0.376932,-1.026928,1.353869,-0.812213,1.395472,-0.354494,-0.217737,...,-0.337275,-0.838198,-0.130705,0.560673,-0.243644,0.031674,-0.110878,-0.061668,1.99,0
3,17.359167,1.891808,-0.525436,-0.646190,0.159354,-0.217358,0.344135,-0.596184,0.211739,1.073068,...,-0.135563,-0.414068,0.311872,0.169388,-0.418696,-0.646098,0.019115,-0.032090,50.00,0
4,11.718056,1.211908,0.219639,0.253054,0.655248,-0.290764,-0.631076,-0.071136,0.002018,0.208088,...,-0.273488,-0.812646,0.172901,0.001505,0.103140,0.128068,-0.014369,0.024901,1.98,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1487,20.971111,1.205355,-0.278382,0.386683,0.286166,-0.387567,0.149156,-0.466991,0.075097,0.666586,...,-0.066326,-0.324958,-0.156116,-0.991745,0.298929,0.473626,-0.023036,0.021489,64.89,0
1488,8.579167,1.882346,0.199049,-0.462438,3.745980,0.399988,1.118776,-0.394506,0.327238,-0.370303,...,0.095320,0.453191,0.066346,0.334592,0.115553,0.107010,-0.017416,-0.055646,1.51,0
1489,0.222778,-0.419820,-1.155978,-2.092516,2.786750,0.736297,-0.167292,1.600027,-0.117427,-0.796954,...,0.480640,0.533517,1.284645,0.516131,-0.602941,-0.305024,-0.021363,0.129096,451.27,1
1490,1.650556,-0.415027,1.178278,1.707446,0.170490,0.013370,-0.828534,0.664572,-0.171746,0.827746,...,-0.370913,-0.748920,-0.001713,0.315189,-0.202003,0.031065,0.239234,0.115735,8.90,0


In [17]:
## input layer 
input_layer = Input(shape=(X.shape[1],))

## encoding part
encoded = Dense(100, activation='tanh', activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoded = Dense(50, activation='relu')(encoded)

## decoding part
decoded = Dense(50, activation='tanh')(encoded)
decoded = Dense(100, activation='tanh')(decoded)

## output layer
output_layer = Dense(X.shape[1], activation='relu')(decoded)

In [18]:
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adadelta", loss="mse")
                                                        # "adam",    "rmsprop"

In [19]:
# Before training, let's perform min max scaling.

x = data.drop(["Class"], axis=1)
y = data["Class"].values

x_scale = preprocessing.MinMaxScaler().fit_transform(x.values)
x_norm, x_fraud = x_scale[y == 0], x_scale[y == 1]

In [20]:
autoencoder.fit(x_norm[0:2000], x_norm[0:2000], 
                batch_size = 256, epochs = 10, 
                shuffle = True, validation_split = 0.20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ac29791d60>

In [21]:
hidden_representation = Sequential()
hidden_representation.add(autoencoder.layers[0])
hidden_representation.add(autoencoder.layers[1])
hidden_representation.add(autoencoder.layers[2])

In [22]:
norm_hid_rep = hidden_representation.predict(x_norm[:3000])
fraud_hid_rep = hidden_representation.predict(x_fraud)

In [24]:
rep_x = np.append(norm_hid_rep, fraud_hid_rep, axis = 0)
y_n = np.zeros(norm_hid_rep.shape[0])
y_f = np.ones(fraud_hid_rep.shape[0])
rep_y = np.append(y_n, y_f)
tsne_plot(rep_x, rep_y)

NameError: name 'name' is not defined

TypeError: must be real number, not str

<Figure size 864x576 with 1 Axes>

In [25]:
train_x, val_x, train_y, val_y = train_test_split(rep_x, rep_y, test_size=0.25)
clf = LogisticRegression(solver="lbfgs").fit(train_x, train_y)
pred_y = clf.predict(val_x)

print ("")
print ("Classification Report: ")
print (classification_report(val_y, pred_y))

print ("")
print ("Accuracy Score: ", accuracy_score(val_y, pred_y))


Classification Report: 
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       754
         1.0       1.00      0.94      0.97       119

    accuracy                           0.99       873
   macro avg       1.00      0.97      0.98       873
weighted avg       0.99      0.99      0.99       873


Accuracy Score:  0.9919816723940436
