In [1]:
import tensorflow

In [2]:
import keras
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers

Using TensorFlow backend.


In [3]:
import pandas as pd
import matplotlib.pyplot as plt 
import datetime
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 

In [31]:
train_df = pd.read_csv("train.csv")

In [32]:
len(train_df.columns)

18

In [33]:
train_df.head()

Unnamed: 0,amount_value,id,amount_currency,channel,deviceDetails_browser,deviceDetails_device,deviceDetails_deviceIp,merchantRefTransactionId,paymentMethod_apmType,paymentMethod_cardNumber,paymentMethod_cardType,paymentMethod_cardSubType,paymentMethod_cvv,paymentMethod_encodedPaymentToken,paymentMethod_expiryMonth,paymentMethod_expiryYear,transaction_time,class
0,18253.3,0,USD,virtual,edge,pos,134.93.42.168,544011954,magstripe,2174-3628-6995-3987,JCB,Student,61,szQSTkeL,6,2027,2020-08-30 05:21:24,0
1,15870.89,1,USD,virtual,chrome,pos,78.92.229.26,9718272187,nfcc,4498-2553-8930-9552,JCB,Student,200,fdptXXrc,1,2026,2020-09-03 23:21:01,0
2,41109.8,2,USD,online,chromio,pos,197.84.118.231,1659938058,magstripe,8246-8688-8907-4387,JCB,Prepaid,259,UTXzvode,0,2021,2020-08-03 11:43:53,1
3,70604.95,3,USD,virtual,chrome,mobile,192.113.64.253,986924301,magstripe,3563-5044-6927-1494,JCB,Business,685,gVRZqrdu,7,2023,2020-07-22 04:28:32,0
4,51216.66,4,USD,virtual,chrome,mobile,48.81.200.252,9399699174,nfcc,4253-0351-2318-7737,JCB,Business,131,FGZLLHSX,6,2025,2020-02-29 13:35:21,0


In [34]:
#transform transaction time in seconds
def get_day(x):
    day = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return day.timestamp()

In [35]:
train_df['transaction_time'] = train_df['transaction_time'].apply(get_day)
train_df['transaction_time']

0        1.598754e+09
1        1.599164e+09
2        1.596444e+09
3        1.595381e+09
4        1.582976e+09
             ...     
99995    1.587667e+09
99996    1.588359e+09
99997    1.577932e+09
99998    1.582716e+09
99999    1.598561e+09
Name: transaction_time, Length: 100000, dtype: float64

In [36]:
class_df = train_df['class'].value_counts().to_frame().reset_index()
class_df['percentages'] = class_df['class'].apply(lambda val: round(100*float(val)/len(train_df), 2))
class_df

Unnamed: 0,index,class,percentages
0,1,50025,50.02
1,0,49975,49.98


In [37]:
non_fraud = train_df[train_df['class'] == 0].sample(1000)
fraud = train_df[train_df['class'] == 1].sample(1000)

df = non_fraud.append(fraud).reset_index(drop=True)
X = df.drop(['class'], axis=1)
#we select only the numerical columns
X_small = X[['amount_value', 'merchantRefTransactionId', 'paymentMethod_cvv', 'paymentMethod_expiryMonth', 'paymentMethod_expiryYear', 'transaction_time']]
X_small = X_small.values

Y = df['class'].values
X

Unnamed: 0,amount_value,id,amount_currency,channel,deviceDetails_browser,deviceDetails_device,deviceDetails_deviceIp,merchantRefTransactionId,paymentMethod_apmType,paymentMethod_cardNumber,paymentMethod_cardType,paymentMethod_cardSubType,paymentMethod_cvv,paymentMethod_encodedPaymentToken,paymentMethod_expiryMonth,paymentMethod_expiryYear,transaction_time
0,43559.88,62358,USD,online,chrome,pos,32.123.19.119,7552748640,nfcc,3675-9166-4366-6757,JCB,Student,578,xsATHQla,1,2020,1.580324e+09
1,2869.89,92244,USD,online,edge,pos,233.246.172.163,7697722608,magstripe,6761-6446-6278-6928,JCB,Student,718,wIQXGyqz,2,2027,1.593116e+09
2,47616.01,48355,USD,pos,chrome,pos,23.91.75.208,6138096167,nfcc,2025-7060-2153-7222,JCB,Student,716,uPusUrFy,4,2020,1.590842e+09
3,60446.28,99389,USD,virtual,mozilla,pc,99.119.227.135,6006616124,magstripe,5384-3406-7680-2916,JCB,Student,532,lktUbiug,6,2025,1.597092e+09
4,68275.87,60463,USD,pos,edge,pos,243.162.186.204,507305986,nfcc,3030-9233-8116-7719,JCB,Secured,926,PzekhgZk,11,2027,1.594085e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,47640.47,45282,USD,online,edge,pos,20.73.231.115,439607761,chip,8408-7319-9859-9150,American Express,Student,403,FYPTdtSr,10,2020,1.578601e+09
1996,90514.50,96031,USD,online,edge,pc,191.46.202.90,7116356623,magstripe,2410-7590-9744-5186,Discover,Student,558,FJsvtrvx,4,2023,1.597173e+09
1997,11443.74,3630,USD,online,edge,pos,74.162.94.185,4364578582,nfcc,2037-5559-8954-9270,Visa,Business,438,nFaZKdOQ,0,2025,1.579283e+09
1998,84742.82,86349,USD,online,mozilla,pc,32.48.115.18,9078231082,nfcc,1752-3117-7641-7188,MasterCard,Secured,449,dhCteOmu,3,2020,1.583281e+09


In [39]:
def tsne_plot(x1, y1, name="graph.png"):
    tsne = TSNE(n_components=2, random_state=0)
    X_t = tsne.fit_transform(x1) # fits the data into a 2D embedded space

    plt.figure(figsize=(12, 8))
    plt.scatter(X_t[np.where(y1 == 0), 0], X_t[np.where(y1 == 0), 1], marker='o', color='g', linewidth='1', alpha=0.8, label='Non Fraud')
    plt.scatter(X_t[np.where(y1 == 1), 0], X_t[np.where(y1 == 1), 1], marker='o', color='r', linewidth='1', alpha=0.8, label='Fraud')

    plt.legend(loc='best');
    plt.savefig(name)
    plt.show();
    

In [42]:
Y.dtype

dtype('int64')

In [40]:
tsne_plot(X_small, Y, "original.png")

TypeError: must be real number, not str

TypeError: must be real number, not str

<Figure size 864x576 with 1 Axes>

In [52]:
len(X.columns)

17

In [56]:
# first NN
#TODO: try other activation functions, see how it performs
input_layer = Input(shape=(X.shape[1], ))

#encoding 
encoded = Dense(16, activation='tanh', activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoded = Dense(8, activation='relu')(encoded)

#decoding
decoded = Dense(8, activation='tanh')(encoded)
decoded = Dense(16, activation='tanh')(decoded)

#output
output_layer = Dense(X.shape[1], activation='relu')(decoded)

In [57]:
#TODO: try other optimizers/loss measurements
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adadelta", loss="mse")

In [80]:
#scale the data
#TODO: try standardization also, check the results
#get list of categorical features
categorical_features = [col for col in list(train_df.columns) if col not in ['amount_value', 'merchantRefTransactionId', 'paymentMethod_cvv', 'paymentMethod_expiryMonth', 'paymentMethod_expiryYear', 'transaction_time']]
categorical_features

#one-hot encoding the categorical features
train_df[['channel', 'deviceDetails_browser', 'deviceDetails_device',
                                           'paymentMethod_apmType', 'paymentMethod_cardType', 'paymentMethod_cardSubType']] = train_df[['channel', 'deviceDetails_browser', 'deviceDetails_device',
                                           'paymentMethod_apmType', 'paymentMethod_cardType', 'paymentMethod_cardSubType']].astype('category')
encoded_feats = pd.get_dummies(train_df[['channel', 'deviceDetails_browser', 'deviceDetails_device',
                                           'paymentMethod_apmType', 'paymentMethod_cardType', 'paymentMethod_cardSubType']])

encoded_feats
# *TODO: drop the older categorical feats

# X = train_df.drop(['class'], axis=1)
# Y = train_df['class'].values

# X_scale = preprocessing.MinMaxScaler().fit_transform(X.values)
# X_norm, X_fraud = X_scale[Y == 0], x_scale[Y == 1]

Unnamed: 0,channel_mobile,channel_online,channel_pos,channel_virtual,deviceDetails_browser_chrome,deviceDetails_browser_chromio,deviceDetails_browser_edge,deviceDetails_browser_mozilla,deviceDetails_device_mobile,deviceDetails_device_pc,...,paymentMethod_cardType_American Express,paymentMethod_cardType_Discover,paymentMethod_cardType_JCB,paymentMethod_cardType_MasterCard,paymentMethod_cardType_Visa,paymentMethod_cardSubType_Business,paymentMethod_cardSubType_Generic,paymentMethod_cardSubType_Prepaid,paymentMethod_cardSubType_Secured,paymentMethod_cardSubType_Student
0,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
2,0,0,0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
99996,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
99997,0,1,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
99998,0,0,0,1,1,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0


In [None]:
# train the network
autoencoder.fit(X_norm[0:2000], Y[0:2000], batch_size = 256,
                epochs = 10, validation_split = 0.20) # 20% of the data will be used for validation

In [None]:
# obtain the latent representation