In [75]:
import tensorflow

In [76]:
import keras
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers

In [77]:
import pandas as pd
import matplotlib.pyplot as plt 
import datetime
import numpy as np

In [78]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 

In [79]:
train_df = pd.read_csv("train.csv")

In [80]:
len(train_df.columns)

18

In [81]:
train_df.head()

Unnamed: 0,amount_value,id,amount_currency,channel,deviceDetails_browser,deviceDetails_device,deviceDetails_deviceIp,merchantRefTransactionId,paymentMethod_apmType,paymentMethod_cardNumber,paymentMethod_cardType,paymentMethod_cardSubType,paymentMethod_cvv,paymentMethod_encodedPaymentToken,paymentMethod_expiryMonth,paymentMethod_expiryYear,transaction_time,class
0,18253.3,0,USD,virtual,edge,pos,134.93.42.168,544011954,magstripe,2174-3628-6995-3987,JCB,Student,61,szQSTkeL,6,2027,2020-08-30 05:21:24,0
1,15870.89,1,USD,virtual,chrome,pos,78.92.229.26,9718272187,nfcc,4498-2553-8930-9552,JCB,Student,200,fdptXXrc,1,2026,2020-09-03 23:21:01,0
2,41109.8,2,USD,online,chromio,pos,197.84.118.231,1659938058,magstripe,8246-8688-8907-4387,JCB,Prepaid,259,UTXzvode,0,2021,2020-08-03 11:43:53,1
3,70604.95,3,USD,virtual,chrome,mobile,192.113.64.253,986924301,magstripe,3563-5044-6927-1494,JCB,Business,685,gVRZqrdu,7,2023,2020-07-22 04:28:32,0
4,51216.66,4,USD,virtual,chrome,mobile,48.81.200.252,9399699174,nfcc,4253-0351-2318-7737,JCB,Business,131,FGZLLHSX,6,2025,2020-02-29 13:35:21,0


In [82]:
#transform transaction time in seconds
def get_day(x):
    day = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return day.timestamp()

In [83]:
train_df['transaction_time'] = train_df['transaction_time'].apply(get_day)
train_df['transaction_time']

0        1.598754e+09
1        1.599164e+09
2        1.596444e+09
3        1.595381e+09
4        1.582976e+09
             ...     
99995    1.587667e+09
99996    1.588359e+09
99997    1.577932e+09
99998    1.582716e+09
99999    1.598561e+09
Name: transaction_time, Length: 100000, dtype: float64

In [84]:
class_df = train_df['class'].value_counts().to_frame().reset_index()
class_df['percentages'] = class_df['class'].apply(lambda val: round(100*float(val)/len(train_df), 2))
class_df

Unnamed: 0,index,class,percentages
0,1,50025,50.02
1,0,49975,49.98


In [85]:
non_fraud = train_df[train_df['class'] == 0].sample(1000)
fraud = train_df[train_df['class'] == 1].sample(1000)

df = non_fraud.append(fraud).reset_index(drop=True)
X = df.drop(['class'], axis=1)
#we select only the numerical columns
X_small = X[['amount_value', 'merchantRefTransactionId', 'paymentMethod_cvv', 'paymentMethod_expiryMonth', 'paymentMethod_expiryYear', 'transaction_time']]
X_small = X_small.values

Y = df['class'].values
X

Unnamed: 0,amount_value,id,amount_currency,channel,deviceDetails_browser,deviceDetails_device,deviceDetails_deviceIp,merchantRefTransactionId,paymentMethod_apmType,paymentMethod_cardNumber,paymentMethod_cardType,paymentMethod_cardSubType,paymentMethod_cvv,paymentMethod_encodedPaymentToken,paymentMethod_expiryMonth,paymentMethod_expiryYear,transaction_time
0,7509.74,60848,USD,virtual,chrome,pos,60.153.65.69,9783254097,nfcc,5709-4409-4154-7490,JCB,Student,291,hBzyLyCt,10,2018,1.586313e+09
1,54188.02,19201,USD,virtual,chrome,pos,225.27.75.84,9262168564,magstripe,9417-6350-6922-0775,JCB,Generic,22,KHTBlBmL,10,2020,1.590640e+09
2,62404.26,40308,USD,virtual,chrome,mobile,249.108.146.149,3071761226,magstripe,1998-8623-8303-8070,Discover,Secured,877,IkWtmtvg,11,2025,1.601238e+09
3,52987.92,94833,USD,online,edge,pos,164.32.157.184,107983165,nfcc,5102-5607-0858-9616,JCB,Student,271,FVxdTzZy,1,2022,1.593353e+09
4,51374.02,2487,USD,virtual,chrome,mobile,203.239.20.55,5712142125,magstripe,1972-7985-3867-8601,MasterCard,Student,919,qlflzULi,9,2023,1.599505e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,40037.55,10610,USD,virtual,edge,pc,142.168.141.122,425430001,nfcc,9813-3360-4801-4213,Discover,Student,746,iZoMMDkv,1,2024,1.594639e+09
1996,51862.28,45698,USD,online,edge,pos,2.31.57.91,3111451928,magstripe,4705-8430-1782-4752,JCB,Student,567,EDoUUlBD,11,2025,1.582033e+09
1997,91095.79,69170,USD,online,mozilla,pc,192.222.176.79,9373091523,magstripe,3387-9278-0949-1478,Visa,Prepaid,919,OzeTgEJh,9,2019,1.591192e+09
1998,91809.06,30680,USD,virtual,edge,pc,102.250.251.236,4247395377,nfcc,2706-7824-2228-5011,Visa,Student,982,REGrMGBe,5,2025,1.587012e+09


In [86]:
def tsne_plot(x1, y1, name="graph.png"):
    tsne = TSNE(n_components=2, random_state=0)
    X_t = tsne.fit_transform(x1) # fits the data into a 2D embedded space

    plt.figure(figsize=(12, 8))
    plt.scatter(X_t[np.where(y1 == 0), 0], X_t[np.where(y1 == 0), 1], marker='o', color='g', linewidth='1', alpha=0.8, label='Non Fraud')
    plt.scatter(X_t[np.where(y1 == 1), 0], X_t[np.where(y1 == 1), 1], marker='o', color='r', linewidth='1', alpha=0.8, label='Fraud')

    plt.legend(loc=name)
    plt.savefig("graph.png")
    plt.show()
    

In [87]:
Y.dtype

dtype('int64')

In [88]:
tsne_plot(X_small, Y, "original.png")

ValueError: Unrecognized location 'original.png'. Valid locations are
	best
	upper right
	upper left
	lower left
	lower right
	right
	center left
	center right
	lower center
	upper center
	center


TypeError: must be real number, not str

<Figure size 864x576 with 1 Axes>

In [89]:
len(X.columns)

17

In [108]:
# first NN
#TODO: try other activation functions, see how it performs
input_layer = Input(shape=(X.shape[1], ))

#encoding 
encoded = Dense(16, activation='tanh', activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoded = Dense(8, activation='relu')(encoded)

#decoding
decoded = Dense(8, activation='tanh')(encoded)
decoded = Dense(16, activation='tanh')(decoded)

#output
output_layer = Dense(X.shape[1], activation='relu')(decoded)

In [109]:
#TODO: try other optimizers/loss measurements
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adadelta", loss="mse")

In [110]:
#scale the data
#TODO: try standardization also, check the results
#get list of categorical features
categorical_features = [col for col in list(train_df.columns) if col not in ['amount_value', 'merchantRefTransactionId', 'paymentMethod_cvv', 'paymentMethod_expiryMonth', 'paymentMethod_expiryYear', 'transaction_time']]
categorical_features

#one-hot encoding the categorical features
train_df[['channel', 'deviceDetails_browser', 'deviceDetails_device',
                                           'paymentMethod_apmType', 'paymentMethod_cardType', 'paymentMethod_cardSubType']] = train_df[['channel', 'deviceDetails_browser', 'deviceDetails_device',
                                           'paymentMethod_apmType', 'paymentMethod_cardType', 'paymentMethod_cardSubType']].astype('category')
encoded_feats = pd.get_dummies(train_df[['channel', 'deviceDetails_browser', 'deviceDetails_device',
                                           'paymentMethod_apmType', 'paymentMethod_cardType', 'paymentMethod_cardSubType']])

# print(encoded_feats)
X = train_df.drop(categorical_features, axis=1)
len(X.columns)
X = pd.concat([X, encoded_feats], axis=1)
len(X.columns)

Y = train_df['class'].values

X[X.columns] = preprocessing.MinMaxScaler().fit_transform(X.values)
# X['amount_value'].min()  # just checking
X_norm, X_fraud = X[Y == 0], X[Y == 1]

In [106]:
X_fraud

Unnamed: 0,amount_value,merchantRefTransactionId,paymentMethod_cvv,paymentMethod_expiryMonth,paymentMethod_expiryYear,transaction_time,channel_mobile,channel_online,channel_pos,channel_virtual,...,paymentMethod_cardType_American Express,paymentMethod_cardType_Discover,paymentMethod_cardType_JCB,paymentMethod_cardType_MasterCard,paymentMethod_cardType_Visa,paymentMethod_cardSubType_Business,paymentMethod_cardSubType_Generic,paymentMethod_cardSubType_Prepaid,paymentMethod_cardSubType_Secured,paymentMethod_cardSubType_Student
2,0.411100,0.165995,0.259259,0.000000,0.3,0.784368,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.530323,0.581864,0.418418,0.636364,0.5,0.855083,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.578823,0.448680,0.820821,0.090909,0.2,0.729660,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.604782,0.987825,0.134134,0.000000,0.5,0.159307,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10,0.503659,0.510931,0.888889,0.818182,0.3,0.378697,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99992,0.203729,0.887236,0.532533,0.727273,0.3,0.222995,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99994,0.866648,0.788466,0.619620,0.545455,0.6,0.622538,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99995,0.480823,0.474955,0.946947,0.454545,0.7,0.413560,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99997,0.587414,0.708815,0.447447,0.727273,0.7,0.002244,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [111]:
# train the network
autoencoder.fit(X_norm[0:2000], X_norm[0:2000], batch_size = 256,
                epochs = 5, validation_split = 0.20) # 20% of the data will be used for validation

Train on 1600 samples, validate on 400 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x174fa5e1c88>

In [112]:
# obtain the latent representation
hidden_representation = Sequential()
hidden_representation.add(autoencoder.layers[0])
hidden_representation.add(autoencoder.layers[1])
hidden_representation.add(autoencoder.layers[2])

In [113]:
norm_hid_rep = hidden_representation.predict(X_norm[2000:4000])
fraud_hid_rep = hidden_representation.predict(X_fraud[0:2000])

In [114]:
rep_x = np.append(norm_hid_rep, fraud_hid_rep, axis = 0)
y_n = np.zeros(norm_hid_rep.shape[0])
y_f = np.ones(fraud_hid_rep.shape[0])
rep_y = np.append(y_n, y_f)
tsne_plot(rep_x, rep_y, "latent_representation.png")

ValueError: Unrecognized location 'latent_representation.png'. Valid locations are
	best
	upper right
	upper left
	lower left
	lower right
	right
	center left
	center right
	lower center
	upper center
	center


TypeError: must be real number, not str

<Figure size 864x576 with 1 Axes>