In [370]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import keras 
import tensorflow as tf
from keras.layers import Dropout

In [371]:
data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
###The following columns have no information provided, and they seems to be incosistent with what 
## some of the data columns in the columns already have
for x in data.columns:
    if x[-4:] == "Rate":
        data = data.drop(x, axis =1)
###The following data does not provide any relevance to the data as they are either 
### all have the same number, or the value does not provide any information
data = data.drop(["EmployeeCount", "EmployeeNumber", "StandardHours", "Over18"], axis =1 )

In [372]:
numerical_columns= data.describe(include = [np.number]).columns
categorical_columns = data.describe(include = ['O']).columns

In [373]:
categorical_columns = categorical_columns.drop('Attrition')

In [374]:
x = data[numerical_columns].values
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)
df.columns = numerical_columns
for x in categorical_columns:
    df[x] = data[x]
df = pd.get_dummies(df, columns = categorical_columns)
mapping = {"Yes": 1, "No":0}
df["Attrition"] = data["Attrition"].map(mapping)
X = df.drop("Attrition", axis =1)
y = df["Attrition"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle = True, stratify = y)

In [375]:
svc = SVC(kernel="linear", C =1).fit(X_train, y_train)
print('train')
print(classification_report(y_train, svc.predict(X_train)))
print('test')
print(classification_report(y_test, svc.predict(X_test)))

train
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       863
           1       0.85      0.45      0.59       166

    accuracy                           0.90      1029
   macro avg       0.88      0.72      0.77      1029
weighted avg       0.90      0.90      0.89      1029

test
              precision    recall  f1-score   support

           0       0.90      0.98      0.93       370
           1       0.76      0.41      0.53        71

    accuracy                           0.88       441
   macro avg       0.83      0.69      0.73       441
weighted avg       0.87      0.88      0.87       441



In [376]:
lr = LogisticRegression().fit(X_train, y_train)
print('train')
print(classification_report(y_train, lr.predict(X_train)))
print('test')
print(classification_report(y_test, lr.predict(X_test)))

train
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       863
           1       0.82      0.43      0.57       166

    accuracy                           0.89      1029
   macro avg       0.86      0.71      0.75      1029
weighted avg       0.89      0.89      0.88      1029

test
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       370
           1       0.72      0.41      0.52        71

    accuracy                           0.88       441
   macro avg       0.81      0.69      0.73       441
weighted avg       0.87      0.88      0.87       441



In [385]:
class_weight = {0:0.85, 1:0.15}
lr = LogisticRegression(class_weight= class_weight).fit(X_train, y_train)
print('train')
print(classification_report(y_train, lr.predict(X_train)))
print('test')
print(classification_report(y_test, lr.predict(X_test)))

train
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       863
           1       1.00      0.02      0.05       166

    accuracy                           0.84      1029
   macro avg       0.92      0.51      0.48      1029
weighted avg       0.87      0.84      0.77      1029

test
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       370
           1       1.00      0.03      0.05        71

    accuracy                           0.84       441
   macro avg       0.92      0.51      0.48       441
weighted avg       0.87      0.84      0.78       441



In [53]:
class_weight = {0:1, 1:100}
lr = LogisticRegression(class_weight= class_weight).fit(X_train, y_train)
print('train')
print(classification_report(y_train, lr.predict(X_train)))
print('test')
print(classification_report(y_test, lr.predict(X_test)))

train
              precision    recall  f1-score   support

           0       1.00      0.28      0.44       863
           1       0.21      1.00      0.35       166

    accuracy                           0.40      1029
   macro avg       0.61      0.64      0.39      1029
weighted avg       0.87      0.40      0.42      1029

test
              precision    recall  f1-score   support

           0       0.97      0.25      0.40       370
           1       0.20      0.96      0.33        71

    accuracy                           0.36       441
   macro avg       0.58      0.60      0.36       441
weighted avg       0.84      0.36      0.38       441



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
class_weight = {0:1, 1:100}
svc = SVC(kernel="linear", C =1, class_weight= class_weight).fit(X_train, y_train)
print('train')
print(classification_report(y_train, svc.predict(X_train)))
print('test')
print(classification_report(y_test, svc.predict(X_test)))

train
              precision    recall  f1-score   support

           0       1.00      0.30      0.46       863
           1       0.22      1.00      0.35       166

    accuracy                           0.41      1029
   macro avg       0.61      0.65      0.41      1029
weighted avg       0.87      0.41      0.44      1029

test
              precision    recall  f1-score   support

           0       0.97      0.31      0.47       370
           1       0.21      0.94      0.34        71

    accuracy                           0.41       441
   macro avg       0.59      0.63      0.41       441
weighted avg       0.84      0.41      0.45       441



In [55]:
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,Male,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,Male,3,...,3,4,1,6,3,3,2,2,2,2


In [56]:
data['TotalJobSatisfaction'] = data['EnvironmentSatisfaction'] + data['JobSatisfaction'] + data['RelationshipSatisfaction']
data['MarriedAndBad_Worklife_Balance'] = np.where(data['MaritalStatus']=='Married', 
                                               data['WorkLifeBalance']-2,
                                               data['WorkLifeBalance']+1)
data['DistanceFromHome_rootedTo_JobSatisfaction'] = data['DistanceFromHome']**(1/data['JobSatisfaction'])
values = ['Married', 'Divorced']
data['Mothers'] = np.where((data['Gender']=='Female') & (data['Age']>=36) & ((data['MaritalStatus'].isin(values))), 1,0)
data['OldLowEmployeeTendToStay'] = data['YearsAtCompany'] / data['JobLevel']
data['JobInvolment_On_Salary']= data['JobInvolvement'] / data['MonthlyIncome'] * 1000
    

In [57]:
numerical_columns= data.describe(include = [np.number]).columns
categorical_columns = data.describe(include = ['O']).columns
categorical_columns = categorical_columns.drop('Attrition')
x = data[numerical_columns].values
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)
df.columns = numerical_columns
for x in categorical_columns:
    df[x] = data[x]
df = pd.get_dummies(df, columns = categorical_columns)
mapping = {"Yes": 1, "No":0}
df["Attrition"] = data["Attrition"].map(mapping)
X = df.drop("Attrition", axis =1)
y = df["Attrition"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle = True, stratify = y)

In [58]:
class_weight = {0:0.85, 1:0.15}
lr = LogisticRegression(class_weight= class_weight).fit(X_train, y_train)
print('train')
print(classification_report(y_train, lr.predict(X_train)))
print('test')
print(classification_report(y_test, lr.predict(X_test)))

train
              precision    recall  f1-score   support

           0       0.84      1.00      0.92       863
           1       1.00      0.04      0.07       166

    accuracy                           0.84      1029
   macro avg       0.92      0.52      0.49      1029
weighted avg       0.87      0.84      0.78      1029

test
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       370
           1       1.00      0.03      0.05        71

    accuracy                           0.84       441
   macro avg       0.92      0.51      0.48       441
weighted avg       0.87      0.84      0.78       441



In [59]:
class_weight = {0: 0.8,
                1: 0.2}
dropout = 0.5
model = keras.models.Sequential()
model.add(keras.layers.Dense(100, kernel_initializer= 'truncated_normal',input_shape = (X.shape[1],)))
model.add(Dropout(dropout))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(100, kernel_initializer= 'truncated_normal'))
model.add(Dropout(dropout))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(100, kernel_initializer= 'truncated_normal'))
model.add(Dropout(dropout))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid' , kernel_initializer= 'truncated_normal'))
model.compile(optimizer = "adam", loss = 'binary_crossentropy', metrics = [tf.keras.metrics.Recall()])
model.fit(X_train, y_train, epochs = 100, batch_size = 20, validation_split=0.1, class_weight= class_weight)

Train on 926 samples, validate on 103 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7f91b9a373a0>

In [60]:
print('train')
print(classification_report(y_train, [x >=0.9 for x in model.predict(X_train)]))
print('test')
print(classification_report(y_test, [x >=0.9 for x in model.predict(X_test)]))

train
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       863
           1       1.00      0.31      0.47       166

    accuracy                           0.89      1029
   macro avg       0.94      0.65      0.70      1029
weighted avg       0.90      0.89      0.86      1029

test
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       370
           1       0.91      0.14      0.24        71

    accuracy                           0.86       441
   macro avg       0.88      0.57      0.58       441
weighted avg       0.87      0.86      0.81       441



In [369]:
class_weight = {0: 0.75,
                1: 0.25}
dropout = 0.2
model = keras.models.Sequential()
model.add(keras.layers.Dense(200, kernel_initializer= 'he_normal',input_shape = (X.shape[1],)))
model.add(Dropout(dropout))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = "adam", loss = 'binary_crossentropy', metrics = [tf.keras.metrics.Precision()])
model.fit(X_train, y_train, epochs = 100, batch_size = 30, validation_split=0.1, class_weight= class_weight)

ValueError: Error when checking input: expected dense_63_input to have shape (55,) but got array with shape (20,)

In [62]:
print('train')
print(classification_report(y_train, [x >=0.9 for x in model.predict(X_train)]))
print('test')
print(classification_report(y_test, [x >=0.9 for x in model.predict(X_test)]))

train
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       863
           1       1.00      0.67      0.81       166

    accuracy                           0.95      1029
   macro avg       0.97      0.84      0.89      1029
weighted avg       0.95      0.95      0.94      1029

test
              precision    recall  f1-score   support

           0       0.86      1.00      0.93       370
           1       0.93      0.18      0.31        71

    accuracy                           0.87       441
   macro avg       0.90      0.59      0.62       441
weighted avg       0.87      0.87      0.83       441



In [63]:
class_weight = {0: 0.8,
                1: 0.2}
dropout = 0.4
model = keras.models.Sequential()
model.add(keras.layers.Dense(100, kernel_initializer= 'truncated_normal',input_shape = (X.shape[1],)))
model.add(Dropout(dropout))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid', kernel_initializer= 'truncated_normal'))
model.compile(optimizer = "adam", loss = 'binary_crossentropy', metrics = [tf.keras.metrics.Precision()])
model.fit(X_train, y_train, epochs = 200, batch_size = 30, validation_split=0.1, class_weight= class_weight)

Train on 926 samples, validate on 103 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200


Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 13

Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.callbacks.History at 0x7f91d9a1f0a0>

In [64]:
print('train')
print(classification_report(y_train, [x >=0.91 for x in model.predict(X_train)]))
print('test')
print(classification_report(y_test, [x >=0.91 for x in model.predict(X_test)]))

train
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       863
           1       1.00      0.53      0.69       166

    accuracy                           0.92      1029
   macro avg       0.96      0.77      0.82      1029
weighted avg       0.93      0.92      0.91      1029

test
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       370
           1       0.76      0.18      0.30        71

    accuracy                           0.86       441
   macro avg       0.81      0.59      0.61       441
weighted avg       0.85      0.86      0.82       441



In [65]:
class_weight = {0: 0.8,
                1: 0.2}
dropout = 0.7
model = keras.models.Sequential()
model.add(keras.layers.Dense(200, kernel_initializer= 'he_normal',input_shape = (X.shape[1],)))
dropout = 0.7
model.add(Dropout(dropout))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid', kernel_initializer= 'he_normal'))
model.compile(optimizer = "adam", loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs = 200, batch_size = 30, validation_split=0.1, class_weight= class_weight)

Train on 926 samples, validate on 103 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
E

Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 

<keras.callbacks.callbacks.History at 0x7f91d4812850>

In [66]:
print('train')
print(classification_report(y_train, [x >=0.91 for x in model.predict(X_train)]))
print('test')
print(classification_report(y_test, [x >=0.9 for x in model.predict(X_test)]))

train
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       863
           1       1.00      0.39      0.56       166

    accuracy                           0.90      1029
   macro avg       0.95      0.69      0.75      1029
weighted avg       0.91      0.90      0.88      1029

test
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       370
           1       0.94      0.21      0.34        71

    accuracy                           0.87       441
   macro avg       0.90      0.60      0.64       441
weighted avg       0.88      0.87      0.83       441



In [209]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.90).fit(X_train)
pca_x_train = pca.transform(X_train)
pca_x_test = pca.transform(X_test)

In [210]:
from sklearn.feature_selection import SelectKBest, chi2

In [226]:
X = df.drop("Attrition", axis =1)
y = df["Attrition"]
X_new = SelectKBest(chi2, k=30).fit_transform(X,y)

In [229]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.30, random_state=42, shuffle = True, stratify = y)

In [236]:
svc = LogisticRegression(class_weight = {0:0.85, 1:0.15}).fit(X_train, y_train)
print('train')
print(classification_report(y_train, svc.predict(X_train)))
print('test')
print(classification_report(y_test, svc.predict(X_test)))

train
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       863
           1       1.00      0.01      0.02       166

    accuracy                           0.84      1029
   macro avg       0.92      0.51      0.47      1029
weighted avg       0.87      0.84      0.77      1029

test
              precision    recall  f1-score   support

           0       0.84      1.00      0.92       370
           1       1.00      0.04      0.08        71

    accuracy                           0.85       441
   macro avg       0.92      0.52      0.50       441
weighted avg       0.87      0.85      0.78       441



In [237]:
class_weight = {0: 0.8,
                1: 0.2}
dropout = 0.7
model = keras.models.Sequential()
model.add(keras.layers.Dense(50, kernel_initializer= 'truncated_normal',input_shape = (X_new.shape[1],)))
# model.add(Dropout(dropout))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid', kernel_initializer= 'truncated_normal'))
model.compile(optimizer = "adam", loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs = 50, batch_size = 30, validation_split=0.1, class_weight= class_weight)

Train on 926 samples, validate on 103 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f91da9759d0>

In [238]:
print('train')
print(classification_report(y_train, [x >=0.8 for x in model.predict(X_train)]))
print('test')
print(classification_report(y_test, [x >=0.8 for x in model.predict(X_test)]))

train
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       863
           1       1.00      0.01      0.01       166

    accuracy                           0.84      1029
   macro avg       0.92      0.50      0.46      1029
weighted avg       0.87      0.84      0.77      1029

test
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       370
           1       1.00      0.03      0.05        71

    accuracy                           0.84       441
   macro avg       0.92      0.51      0.48       441
weighted avg       0.87      0.84      0.78       441



In [244]:
svc = RandomForestClassifier(class_weight = {0: 0.8,
                1: 0.2}, n_estimators= 300, max_depth = 10, min_samples_split=3, min_samples_leaf=7).fit(X_train, y_train)
print('train')
print(classification_report(y_train, svc.predict(X_train)))
print('test')
print(classification_report(y_test, svc.predict(X_test)))

train
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       863
           1       1.00      0.08      0.16       166

    accuracy                           0.85      1029
   macro avg       0.93      0.54      0.54      1029
weighted avg       0.87      0.85      0.80      1029

test
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       370
           1       1.00      0.03      0.05        71

    accuracy                           0.84       441
   macro avg       0.92      0.51      0.48       441
weighted avg       0.87      0.84      0.78       441



In [245]:
from imblearn.over_sampling import SMOTE

In [271]:
sm = SMOTE(random_state=42)
X = df.drop("Attrition", axis =1)
y = df["Attrition"]

In [304]:
# svc = RandomForestClassifier(class_weight = {0: 0.75,
#                 1: 0.25}, n_estimators= 50, max_depth = 5, min_samples_split=5, min_samples_leaf=9).fit(X_res, y_res)
# print('train')
# print(classification_report(y_train, svc.predict(X_train)))
# print('test')
# print(classification_report(y_test, svc.predict(X_test)))

In [303]:
# svc = LogisticRegression(class_weight = {0:0.5, 1:0.5}).fit(X_res, y_res)
# print('train')
# print(classification_report(y_train, svc.predict(X_train)))
# print('test')
# print(classification_report(y_test, svc.predict(X_test)))

In [311]:
import paramiko

In [312]:
host = "transfer.unifida.co.uk"                    #hard-coded
port = 22
transport = paramiko.Transport((host, port))

password = "5tBtDzP!!s"                #hard-coded
username = "DataGrasp"                #hard-coded
transport.connect(username = username, password = password)

In [313]:
sftp = paramiko.SFTPClient.from_transport(transport)

In [319]:
import pysftp

In [355]:
first = pd.read_csv(file_a)

In [357]:
second = pd.read_csv(file_b)

In [360]:
second.tail()

Unnamed: 0,PURN,OrderID,TransactionDate,OrderValue,CustomerType,EventDate,ReferrerType,ReferrerName,referrerUrl,RecencyDays,City,Country,Continent,DeviceBrand,DeviceType,DeviceModel
356543,955298,424936,2019-12-23 23:59:00.000,291.41,Existing,2019-12-01 12:30:30.000,Direct Entry,,,22,Hwaseong-si,South Korea,Asia,Samsung,Smartphone,GALAXY A5 (2017)
356544,119842,384861,2019-06-19 23:59:00.000,27.1584,Existing,2018-11-09 12:29:56.000,Campaigns,great gifts for grandmas,,222,,United Kingdom,Europe,Unknown,Desktop,Generic Desktop
356545,120076,389733,2019-08-23 23:59:00.000,88.4418,Existing,2019-06-21 09:15:47.000,Direct Entry,,,63,Bridgwater,United Kingdom,Europe,Apple,Tablet,iPad
356546,120188,358802,2018-11-25 23:59:00.000,48.75,Existing,2018-10-29 20:03:55.000,Direct Entry,,,27,London,United Kingdom,Europe,Apple,Smartphone,iPhone
356547,120217,448479,2020-04-16 23:59:00.000,49.124,Existing,2019-11-30 08:52:43.000,Campaigns,black friday 1 2019 uk,android-app://com.google.android.gm,138,London,United Kingdom,Europe,Unknown,Smartphone,Generic Smartphone


In [359]:
first.head()

Unnamed: 0,PURN,OrderID,TransactionDate,OrderValue,CustomerType,EventDate,ReferrerType,ReferrerName,referrerUrl,RecencyDays,City,Country,Continent,DeviceBrand,DeviceType,DeviceModel
0,250182,1395213,2020-05-11 23:59:00.000,733.3333,Existing,2020-05-10 18:46:01.000,Websites,webservices.global-e.com,https://www.johnstonsofelgin.com/us/checkout/c...,1,Charlotte,United States,North America,Apple,Smartphone,iPhone
1,250236,1395163,2020-05-10 23:59:00.000,550.07,Existing,2020-05-10 10:37:41.000,Direct Entry,,https://www.johnstonsofelgin.com/retail/checko...,0,Edinburgh,United Kingdom,Europe,Apple,Desktop,Generic Desktop
2,21129,1395368,2020-05-12 23:59:00.000,165.83,Existing,2020-05-11 09:00:54.000,Campaigns,2020_01_mss_09,,1,Etchingham,United Kingdom,Europe,Unknown,Desktop,Generic Desktop
3,21129,1395936,2020-06-03 23:59:00.000,74.16,Existing,2020-05-11 09:00:54.000,Campaigns,2020_01_mss_09,,23,Etchingham,United Kingdom,Europe,Unknown,Desktop,Generic Desktop
4,250779,1395638,2020-05-22 23:59:00.000,344.08,Existing,2020-05-10 15:54:29.000,Direct Entry,,,12,Prestonpans,United Kingdom,Europe,Apple,Smartphone,iPhone


In [361]:
ssh.close()

In [368]:
first['TransactionDate'][0].split(" ")[1]

'23:59:00.000'