In [25]:
import pandas as pd
from tensorflow import keras
import tensorflow as tf
from keras import metrics, models, layers
from keras.callbacks import EarlyStopping
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [26]:
data=pd.read_csv('Base.csv',encoding='utf8')

In [27]:
data.dropna(inplace=True)

In [28]:
data.dtypes

fraud_bool                            int64
income                              float64
name_email_similarity               float64
prev_address_months_count             int64
current_address_months_count          int64
customer_age                          int64
days_since_request                  float64
intended_balcon_amount              float64
payment_type                         object
zip_count_4w                          int64
velocity_6h                         float64
velocity_24h                        float64
velocity_4w                         float64
bank_branch_count_8w                  int64
date_of_birth_distinct_emails_4w      int64
employment_status                    object
credit_risk_score                     int64
email_is_free                         int64
housing_status                       object
phone_home_valid                      int64
phone_mobile_valid                    int64
bank_months_count                     int64
has_other_cards                 

In [29]:
data

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,1,0.9,0.166828,-1,88,50,0.020925,-1.331345,AA,769,...,0,500.0,0,INTERNET,3.888115,windows,0,1,0,7
1,1,0.9,0.296286,-1,144,50,0.005418,-0.816224,AB,366,...,0,1500.0,0,INTERNET,31.798819,windows,0,1,0,7
2,1,0.9,0.044985,-1,132,40,3.108549,-0.755728,AC,870,...,0,200.0,0,INTERNET,4.728705,other,0,1,0,7
3,1,0.9,0.159511,-1,22,50,0.019079,-1.205124,AB,810,...,1,200.0,0,INTERNET,2.047904,linux,0,1,0,7
4,1,0.9,0.596414,-1,218,50,0.004441,-0.773276,AB,890,...,0,1500.0,0,INTERNET,3.775225,macintosh,1,1,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,0.6,0.192631,-1,104,40,0.030592,-1.044454,AB,804,...,0,200.0,0,INTERNET,8.511502,linux,1,1,0,4
999996,0,0.8,0.322989,148,9,50,1.628119,-1.409803,AC,3306,...,0,200.0,0,INTERNET,8.967865,windows,0,1,0,4
999997,0,0.8,0.879403,-1,30,20,0.018563,34.692760,AA,1522,...,0,200.0,0,INTERNET,8.195531,other,0,1,0,4
999998,0,0.9,0.762112,-1,189,20,0.015352,94.661055,AA,1418,...,0,500.0,0,INTERNET,4.336064,windows,1,1,0,4


In [30]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1) 
    return(res) 

features_to_encode = ['payment_type', 'employment_status', 'housing_status','source','device_os']
for feature in features_to_encode:
    data = encode_and_bind(data, feature)

In [31]:
data=data.sample(frac=1,random_state=468)

fraud=(data-data.min())/(data.max()-data.min())

fraud['fraud_bool']=data['fraud_bool']

In [32]:
fraud.fillna(0,inplace=True)


In [50]:
x, x_test = train_test_split(fraud,test_size=0.1,train_size=0.9,random_state=6544)
x_train, x_validation = train_test_split(x,test_size = 0.3,train_size =0.7,random_state=6544)

In [51]:
train_targets=x_train.pop('fraud_bool').astype('float64')
val_targets=x_validation.pop('fraud_bool').astype('float64')
test_targets=x_test.pop('fraud_bool').astype('float64')

In [53]:
METRICS= [
    metrics.FalseNegatives(name="fn"),
    metrics.FalsePositives(name="fp"),
    metrics.TrueNegatives(name="tn"),
    metrics.TruePositives(name="tp"),
    metrics.Precision(name="precision"),
    metrics.Recall(name="recall"),
]
class_weights = {0:1,
                 1:100}
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [54]:
model=models.Sequential()
model.add(layers.Dense(128, activation='relu',input_shape=(52,)))
model.add(layers.Dropout(0.15))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 128)               6784      
                                                                 
 dropout_10 (Dropout)        (None, 128)               0         
                                                                 
 dense_16 (Dense)            (None, 64)                8256      
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 15,105
Trainable params: 15,105
Non-trainable params: 0
_________________________________________________________________


In [55]:
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=METRICS)

In [56]:
history=model.fit(x=x_train,
                  y=train_targets,
                  epochs=20,
                  verbose=1,
                  batch_size=128,
                  callbacks=early_stopping,
                  class_weight=class_weights,
                  validation_data=(x_validation,val_targets))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


In [47]:
prediction = model.evaluate(x_test,test_targets,batch_size=32)
print(prediction)

[0.5817806124687195, 132.0, 28993.0, 69884.0, 991.0, 0.033050961792469025, 0.8824576735496521]


In [48]:
fraud.describe()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,housing_status_BE,housing_status_BF,housing_status_BG,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,...,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,0.011029,0.578369,0.493694,0.046142,0.204168,0.296113,0.013073,0.188283,0.234616,0.345602,...,0.169135,0.001669,0.000252,0.992952,0.007048,0.332712,0.053826,0.342728,0.263506,0.007228
std,0.104438,0.362928,0.289125,0.114704,0.206076,0.150322,0.068596,0.157495,0.150078,0.178216,...,0.374871,0.040819,0.015873,0.083656,0.083656,0.471185,0.225674,0.474622,0.440535,0.08471
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.25,0.225215,0.0,0.04662,0.125,9.2e-05,0.111677,0.133303,0.213605,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.625,0.492152,0.0,0.123543,0.25,0.000193,0.114408,0.188386,0.32514,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.875,0.755567,0.033854,0.305361,0.375,0.000336,0.159663,0.290043,0.464956,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
