# Get Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
import numpy as np
from sklearn import metrics

# Get Data

This data is from Kaggle for the use case of fraud detection in credit card transactions. Columns are pretty generic, but all values are type float. Can imagine other manipulations were made to this dataset such as converting categorical data to numeric for the purpose of feeding into the auto encoder.

In [2]:
# Credit Card Fraud Data
# https://www.kaggle.com/code/shivamb/semi-supervised-classification-using-autoencoders/data
df = pd.read_csv(r'C:\Users\Aaron\.kaggle\creditcard.csv\creditcard.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
# 0 = Non Fraud, 1 = Fraud 
pd.DataFrame(df.groupby('Class').size()).rename(columns={0:'count'})

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In [5]:
# Table Stats
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


# Prep Data for Training

This is where the semi-supervised aspect takes place. The "non-fraud" rows are separated from the fraud rows and each are respectively put into their own datasets. We will train the model on the non-fraud rows.  

In [6]:
# Separate Data and drop class labels
nonFraud = df[df['Class'] == 0].drop('Class',axis=1)
fraud = df[df['Class'] == 1].drop('Class',axis=1)

# Train Test Split for nonFraud Auto Encoder training
x_train, x_test = train_test_split(nonFraud,test_size=.35, random_state = 5)

# Build & Fit Model

Creating and fitting the auto encoder model. Both encoding and decoding layers consists of 25 neurons and the hidden layer contains 3. Essentially the model will compress the input data down into this 3 neuron latent representation of the data and then rebuild the data using the learned latent representation. Then in an iterative process we will train the model to reduce the difference from the input data and the output data built from the hidden layer. Our loss is based on the mean squared error and we trained the model for 50 iterations/epochs.

In [8]:
# Build Auto Encoder

modelAE = models.Sequential([
    layers.InputLayer(input_shape= x_train.shape[1]),
    layers.Dense(25,activation='relu'), # encoding layer
    layers.Dense(3,activation='relu'), # latent hidden layer
    layers.Dense(25,activation='relu'), # decoding layer
    layers.Dense(x_train.shape[1])
])

modelAE.compile(loss='mean_squared_error',optimizer='adam')
modelAE.fit(x_train,x_train,verbose=1,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x26a260a3ac0>

# Evaluate

In [9]:
# How does the model perform on new non-fraud data
pred = modelAE.predict(x_test)
score1 = np.sqrt(metrics.mean_squared_error(pred,x_test))

# How does the model perform on all non-fraud data
pred = modelAE.predict(nonFraud)
score2 = np.sqrt(metrics.mean_squared_error(pred,nonFraud))

# How does the model perform on fraud data
pred = modelAE.predict(fraud)
score3 = np.sqrt(metrics.mean_squared_error(pred,fraud))
print(f"Out of Sample Non-Fraud (RMSE): {score1}")
print(f"Insample Non-Fraud Score (RMSE): {score2}")
print(f"Fraud Score (RMSE): {score3}")

Out of Sample Non-Fraud (RMSE): 2.025225702585351
Insample Non-Fraud Score (RMSE): 2.0763803448145226
Fraud Score (RMSE): 5.405681401914101


We can see from the results that the errors associated with the insample and out of sample non-fraud data are about the same. This means, using the hidden latent representation determined by the training data, the non-fraud data fed through is about the same producing about the same errors. However, the fraud data produces a greater error meaning that our hidden representation could not predict the fraud data as accuractely eluding to the fact that there is something different about this underlying data in comparison to the data that was used for training.