In [60]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.utils import SingleFeat
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import random
from sklearn.metrics import classification_report, confusion_matrix

In [105]:
#preprocessing data
# Load the labelled training set and prepare the feature columns for DeepFM.
data = pd.read_csv("train.csv")
# Ordinal-encode the categorical columns as integer codes (this is NOT one-hot).
data['Geography'] = data['Geography'].map({'S0':0, 'S1':1, 'S2':2})
data['Gender'] = data['Gender'].map({'Male':0, 'Female':1})
#define sparse and dense features
sparse_features = ['Geography']
# NOTE(review): 'Gender' and 'Tenure' are excluded here and are not listed as
# sparse either, so the model never sees them -- confirm this is intentional.
dense_features = [x for x in data.columns if x not in \
          ['U', 'RowNumber', 'CustomerId', 'Surname', 'Exited', 'Geography', 'Gender', 'Tenure']]
target = ['Exited']

predictors = [x for x in data.columns if x not in \
          ['U', 'RowNumber', 'CustomerId', 'Surname', 'Exited']]
# The `np.float` alias was removed in NumPy 1.24; builtin `float` is the same dtype.
data[predictors] = data[predictors].astype(float)

# Sparse columns carry embedding ids and must stay 0..n-1: min-max scaling
# would turn {0, 1, 2} into {0, 0.5, 1}, which no longer addresses the three
# embedding rows declared below. Only the remaining predictors are rescaled.
scaled_features = [x for x in predictors if x not in sparse_features]
# Instantiate the scaler here so the cell runs on a fresh kernel.
# NOTE(review): the scaler is fit on the whole frame, i.e. before the hold-out
# split performed later, so hold-out rows influence the min/max.
min_max_scaler = MinMaxScaler()
data[scaled_features] = min_max_scaler.fit_transform(data[scaled_features])

# Feature descriptors consumed by DeepFM: a sparse feature declares its
# cardinality, a dense feature is declared with dimension 0.
sparse_feature_list = [SingleFeat(feat, data[feat].nunique()) for feat in sparse_features]
dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

In [114]:
#model training
# Hold out 20% of the labelled data for evaluation. The seed pins the split so
# the evaluation set is the same on every run (model weight initialisation is
# not seeded here, so the losses can still vary slightly between runs).
train, test = train_test_split(data, test_size=0.2, random_state=42)

# DeepFM takes one array per feature, sparse features first, then dense ones.
feature_names = [feat.name for feat in sparse_feature_list + dense_feature_list]
train_model_input = [train[name].values for name in feature_names]
test_model_input = [test[name].values for name in feature_names]

model = DeepFM({"sparse": sparse_feature_list,
                    "dense": dense_feature_list}, final_activation='sigmoid')
model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'], )
# A further 20% of the training part is used by Keras as a validation split.
history = model.fit(train_model_input, train[target].values,
                    batch_size=500, epochs=20, verbose=2, validation_split=0.2, )
# Predicted probabilities for the held-out rows.
pred_ans = model.predict(test_model_input, batch_size=500)


Train on 5120 samples, validate on 1280 samples
Epoch 1/20
 - 1s - loss: 5.5173 - binary_crossentropy: 5.5173 - val_loss: 4.3903 - val_binary_crossentropy: 4.3903
Epoch 2/20
 - 0s - loss: 3.8929 - binary_crossentropy: 3.8929 - val_loss: 2.4130 - val_binary_crossentropy: 2.4130
Epoch 3/20
 - 0s - loss: 1.6083 - binary_crossentropy: 1.6083 - val_loss: 0.8514 - val_binary_crossentropy: 0.8514
Epoch 4/20
 - 0s - loss: 0.8692 - binary_crossentropy: 0.8691 - val_loss: 1.0454 - val_binary_crossentropy: 1.0454
Epoch 5/20
 - 0s - loss: 0.8155 - binary_crossentropy: 0.8155 - val_loss: 0.7701 - val_binary_crossentropy: 0.7701
Epoch 6/20
 - 0s - loss: 0.6945 - binary_crossentropy: 0.6944 - val_loss: 0.7216 - val_binary_crossentropy: 0.7216
Epoch 7/20
 - 0s - loss: 0.6513 - binary_crossentropy: 0.6513 - val_loss: 0.6808 - val_binary_crossentropy: 0.6807
Epoch 8/20
 - 0s - loss: 0.6065 - binary_crossentropy: 0.6065 - val_loss: 0.6342 - val_binary_crossentropy: 0.6341
Epoch 9/20
 - 0s - loss: 0.5705 

In [115]:
#show training result
# Turn the predicted probabilities into hard 0/1 labels in one vectorized step.
# The `<= 0.5 -> 0, otherwise 1` boundary matches the previous element-wise
# loop; ravel() flattens the model output to one label per row.
pred = np.where(pred_ans <= 0.5, 0, 1).ravel()
print(classification_report(test[target], pred))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1271
           1       0.65      0.21      0.32       329

   micro avg       0.81      0.81      0.81      1600
   macro avg       0.74      0.59      0.60      1600
weighted avg       0.79      0.81      0.77      1600



In [110]:
#import test data
# Reload both files from disk so this cell does not depend on the earlier
# hold-out split, and apply the same categorical encoding to each.
test = pd.read_csv('test.csv')
test['Geography'] = test['Geography'].map({'S0':0, 'S1':1, 'S2':2})
test['Gender'] = test['Gender'].map({'Male':0, 'Female':1})

train = pd.read_csv("train.csv")
train['Geography'] = train['Geography'].map({'S0':0, 'S1':1, 'S2':2})
train['Gender'] = train['Gender'].map({'Male':0, 'Female':1})

# The `np.float` alias was removed in NumPy 1.24; builtin `float` is the same dtype.
train[predictors] = train[predictors].astype(float)
test[predictors] = test[predictors].astype(float)

# Sparse columns are embedding ids and must keep their 0..n-1 codes, so only
# the remaining predictors are rescaled.
scaled_features = [x for x in predictors if x not in sparse_features]
# Fit the scaler on the training data ONLY and reuse it for the test data.
# Re-fitting on the test set would map the two files onto different scales
# (test values may legitimately fall slightly outside [0, 1] after transform).
min_max_scaler = MinMaxScaler()
train[scaled_features] = min_max_scaler.fit_transform(train[scaled_features])
test[scaled_features] = min_max_scaler.transform(test[scaled_features])

#training
# DeepFM takes one array per feature, sparse features first, then dense ones.
feature_names = [feat.name for feat in sparse_feature_list + dense_feature_list]
train_model_input = [train[name].values for name in feature_names]
test_model_input = [test[name].values for name in feature_names]

model = DeepFM({"sparse": sparse_feature_list,
                    "dense": dense_feature_list}, final_activation='sigmoid')
model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'], )
# Final fit uses every labelled row (no validation split).
history = model.fit(train_model_input, train[target].values,
                    batch_size=128, epochs=20, verbose=2, validation_split=0.0, )
# Predicted probabilities for the unlabelled test file.
pred_ans = model.predict(test_model_input, batch_size=128)

Epoch 1/20
 - 1s - loss: 1.5655 - binary_crossentropy: 1.5654
Epoch 2/20
 - 0s - loss: 0.5269 - binary_crossentropy: 0.5269
Epoch 3/20
 - 0s - loss: 0.4755 - binary_crossentropy: 0.4754
Epoch 4/20
 - 0s - loss: 0.4536 - binary_crossentropy: 0.4536
Epoch 5/20
 - 0s - loss: 0.4316 - binary_crossentropy: 0.4316
Epoch 6/20
 - 0s - loss: 0.4170 - binary_crossentropy: 0.4169
Epoch 7/20
 - 0s - loss: 0.4037 - binary_crossentropy: 0.4036
Epoch 8/20
 - 0s - loss: 0.3934 - binary_crossentropy: 0.3933
Epoch 9/20
 - 0s - loss: 0.3905 - binary_crossentropy: 0.3904
Epoch 10/20
 - 0s - loss: 0.3777 - binary_crossentropy: 0.3777
Epoch 11/20
 - 0s - loss: 0.3793 - binary_crossentropy: 0.3793
Epoch 12/20
 - 0s - loss: 0.3766 - binary_crossentropy: 0.3765
Epoch 13/20
 - 0s - loss: 0.3699 - binary_crossentropy: 0.3699
Epoch 14/20
 - 0s - loss: 0.3668 - binary_crossentropy: 0.3668
Epoch 15/20
 - 0s - loss: 0.3658 - binary_crossentropy: 0.3658
Epoch 16/20
 - 0s - loss: 0.3655 - binary_crossentropy: 0.3655
E

In [111]:
#show prediction and write the upload file
# Turn the predicted probabilities into hard 0/1 labels in one vectorized step.
# The `<= 0.5 -> 0, otherwise 1` boundary matches the previous element-wise
# loop; ravel() flattens the model output to one label per row.
pred = np.where(pred_ans <= 0.5, 0, 1).ravel()

# Fill the provided template with the predictions; the assignment raises if the
# number of predictions does not match the number of template rows.
df_sample = pd.read_csv('sample_upload.csv')
df_sample['Exited'] = pred
df_sample.to_csv('the_ans.csv', index=False, sep=',')
# Read the file back to eyeball what was actually written.
df_check = pd.read_csv('the_ans.csv')
df_check.head(20)

Unnamed: 0.1,Unnamed: 0,RowNumber,Exited
0,0,2209,0
1,1,9924,0
2,2,4617,0
3,3,6077,0
4,4,9240,0
5,5,4834,0
6,6,8523,0
7,7,2826,0
8,8,871,0
9,9,6698,0
