In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.linalg import eigh
import random

In [2]:
# Load the dataset for credit card frauds
raw_data = pd.read_csv('dataset/creditcard.csv')


<p style='font-size:20px'><b> Common data preprocessing steps </b></p>
<p>

1. PCA transformation: As seen during EDA, all features are already PCA transformed & contribute equally for explaining the cariance in data. Thereby, there is no scope of feature selection/shortlisting through PCA.
2. Null values imputation: No null values were seen in the dataset.     
3. Data balancing: This may be required since we have a highly imbalanced dataset. However, this will be implemented by assigning class weights while modelling, rather than changing the data itself.
4. Column Standardisation: This will be done separately for each level of modelling as will be mentioned later.
5. Feature modification: Instead of using the time in sec as a feature, we will simply use the hour number as feature considering practical intuition that minutes & seconds wouldn't add value beyond the hour number.
6. Train test split - We will split the data in a 75:25 split in a way that the ratio between fraud:not-fraud datapoints remains similar in the train & test data
    
    
</p>


In [3]:
def sec_to_hms(time_in_sec):
    
    h = int(time_in_sec//3600)
    m = int((time_in_sec%3600)//60)
    s = int((time_in_sec%3600)%60)
    return (h,m,s)

raw_data.sort_values('Time', inplace = True)
temp = raw_data.Time.apply(sec_to_hms)
raw_data['Hour_num'] = [x[0] for x in temp]


In [4]:
# Split the data in training and test - 75:25 split

train, test = train_test_split(raw_data, train_size = 0.75)

In [5]:
# Check class distribution accross train & test data

train_pos = train[(train.Class == 1)].shape[0]
train_neg = train[(train.Class == 0)].shape[0]
train_total = train_pos + train_neg
test_pos = test[(test.Class == 1)].shape[0]
test_neg = test[(test.Class == 0)].shape[0]
test_total = test_neg + test_pos

to_print = []

to_print.append(['Dataset', '# Total pts', '# pos pts', '# neg pts', '% pos pts', '% neg pts'])
to_print.append(['Train', train_total, train_pos, train_neg, train_pos/train_total, train_neg/train_total])
to_print.append(['Test', test_total, test_pos, test_neg, test_pos/test_total, test_neg/test_total])
to_print.append(['Total', train_total + test_total, train_pos + test_pos, train_neg + test_neg, (train_pos + test_pos)/(train_total + test_total), (train_neg + test_neg)/(train_total + test_total)])

col_width = 15

for item in to_print[0]:
    print(item.center(col_width), end = "")

print("\n")

for row in to_print[1:]:

    col = []
    col.append(row[0])
    col.append(str(row[1]))
    col.append(str(row[2]))
    col.append(str(row[3]))
    col.append(str(round(100*row[4],4)) + '%')
    col.append(str(round(100*row[5],4)) + '%')
    
    for i in range(len(col)):
        print(col[i].center(col_width), end = "")
    print("\n")

    Dataset      # Total pts     # pos pts      # neg pts      % pos pts      % neg pts   

     Train          213605          373           213232        0.1746%        99.8254%   

      Test          71202           119           71083         0.1671%        99.8329%   

     Total          284807          492           284315        0.1727%        99.8273%   



<p>

The data has been split in train, test with each dataset containin ~0.17% of positive class points.
We will proceed to save this as train, test data.

</p>

In [6]:
try:
    train.to_csv('dataset/creditcard_train.csv', mode = 'x', index = False)
except:
    print("File already saved?")

try:
    test.to_csv('dataset/creditcard_test.csv', mode = 'x', index = False)
except:
    print("File already saved?")
