In [2]:
import pandas as pd
import numpy as np
# Load the HR dataset (CSV file expected in the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='HR_comma_sep.csv')
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [10]:
# One-hot encode the two categorical columns.
# drop_first=True avoids the dummy-variable trap: if every level of a feature
# kept its own dummy column, the dummies would always sum to 1, so each one is
# fully determined by the others (perfect multicollinearity). Dropping one
# level per feature removes that redundancy without losing information.
# Author's note: `columns` has to be a list -- a tuple raised a KeyError here.
feats = ['sales', 'salary']
df_final = pd.get_dummies(data=df, columns=feats, drop_first=True)
df_final.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


In [36]:
#separate training and testing datasets using the scikit-learn package
from sklearn.model_selection import train_test_split

# 'left' is the output to predict (y), so it is removed from the features (X).
# .values turns the pandas objects into numpy arrays, which is the form keras
# expects its data in.
X = df_final.drop(columns=['left']).values
y = df_final.loc[:, 'left'].values

In [37]:
# Split the data into 70% training / 30% testing (80-20 is worth trying too).
# random_state pins the shuffle, so restarting the kernel and re-running the
# notebook yields the same split and therefore comparable results downstream.
# Naming convention: capital X for the feature matrix, lowercase y for the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [38]:
# scaling/normalizing the data for efficient computing: scikit-learn StandardScaler
# standardization: every feature is rescaled to a mean of 0 and an SD (standard deviation) of 1
# important because you are comparing features that are measured on different scales

from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training rows only.
sc = StandardScaler()
sc.fit(X_train)

# Apply the same, already-fitted scaling to both splits. The test set is only
# transformed (never fitted): fitting on it would leak test-set statistics into
# preprocessing and put the two splits on different scales.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [39]:
import keras #to build deep learning model with tensorflow as backend
from keras.models import Sequential  #to initialize artifical neural network
from keras.layers import Dense # to add layers to the model

# 3-layer-model: input layer (no computation) - hidden layer - output layer (result y)

In [42]:
# Initialize a linear stack of layers; this is a classification problem.
classifier = Sequential()

# Add the first (hidden) layer to the freshly initialized network.
#   units              -> number of nodes; 9 follows one common strategy
#                         (average of the input and output sizes)
#   kernel_initializer -> 'uniform' seeds the weights with small random values
#                         close to (but not exactly) zero
#   activation         -> the most important choice; relu outputs max(x, 0),
#                         i.e. 0 for negative inputs and the input itself
#                         otherwise, which lets the network model non-linear
#                         relations that a linear function could not fit
#   input_dim          -> number of features/dimensions of the dataset
# Activation functions (relu, sigmoid, softmax, ...):
# https://keras.io/api/layers/activations/
classifier.add(
    Dense(
        units=9,
        kernel_initializer='uniform',
        activation='relu',
        input_dim=18,
    )
)

In [44]:
# Add the output layer.
# One output is expected, P(employee leaving), hence a single node.
# Sigmoid squashes that output into (0, 1) so it can be read as a probability.
# With more than two categories you would use softmax, the multi-class
# variant of sigmoid, instead.
classifier.add(
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid')
)

In [47]:
# Optimization step: configure how training reduces the error.
# Gradient descent adjusts the randomly initialized weights so that the
# cost (loss) function moves towards a minimum, where the error is smallest.
#   optimizer -> adam, a popular gradient-descent optimization strategy
#   loss      -> the function being minimized; binary cross-entropy because
#                this is a two-class problem
#   metrics   -> what the model is evaluated on: prediction accuracy
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [48]:
# Fit the composed classifier (2 layers + optimization) to the training data.
#   x          -> training features
#   y          -> the column predictions are made on
#   batch_size -> number of samples pushed through the network per weight update
#   epochs     -> number of full passes over the training set; more passes
#                 take longer but usually give better results
# NOTE(review): in the recorded run a single epoch produced a model that
# predicts "stays" for every test row (see the confusion matrix further down);
# consider training for more epochs.
classifier.fit(x=X_train, y=y_train, batch_size=10, epochs=1)



<keras.callbacks.History at 0x21453d05e70>

In [49]:
# Run predictions on the test set and threshold the predicted probabilities
# at 0.5: True = predicted to leave, False = predicted to stay.
y_pred = classifier.predict(X_test) > 0.5




In [50]:
# show the thresholded test-set predictions (boolean array, one row per test sample)
print(y_pred)

[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [52]:
# evaluate prediction accuracy using confusion/error matrix
# shows # true and false positives, true and false negatives
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[3464,    0],
       [1036,    0]], dtype=int64)

In [59]:
# The diagonal of the confusion matrix holds the correct predictions (true
# negatives and true positives); everything off the diagonal is an error.
# In the recorded run: 3464 + 0 correct predictions and 0 + 1036 wrong ones.
# The accuracy is computed from `cm` itself so the number stays in sync with
# the matrix whenever the notebook is re-run.
print(cm.trace() / cm.sum())

0.7697777777777778


In [60]:
# Prediction for a single employee: is P(employee leaving) above 0.5?
# The row is scaled with the scaler fitted on the training data before it is
# passed to the network.
# NOTE(review): the 18 values presumably follow the column order of
# df_final without 'left' -- verify against df_final.columns.
new_pred = classifier.predict(
    sc.transform(
        np.array([[
            0.26, 0.7, 3., 238., 6., 0., 0.,       # presumably the 7 numeric/flag features
            0., 0., 0., 0., 0., 0., 0., 1., 0.,    # presumably the 9 sales_* dummies
            0., 1.,                                # presumably salary_low, salary_medium
        ]])
    )
) > 0.5
new_pred



array([[False]])

In [68]:
# training the model multiple times gives results with high variance; to
# solve this problem use K-fold cross-validation
# K=10 means: split the training data into 10 folds, train on 9 of them and test on the remaining one
# iterate until every fold has been the test fold once; accuracy is the avg of the accuracies of all iterations
from keras.wrappers.scikit_learn import KerasClassifier
#import scikeras
#from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score

def make_classifier():
    """Build and compile the network designed earlier in the notebook.

    Returns a compiled Sequential model with one hidden layer (9 relu nodes,
    18 inputs) and a single sigmoid output node, using the adam optimizer,
    binary cross-entropy loss and the accuracy metric.
    """
    model = Sequential()
    model.add(
        Dense(units=9, kernel_initializer='uniform', activation='relu', input_dim=18)
    )
    model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Wrap the model builder in KerasClassifier so scikit-learn can treat it as an
# estimator; build_fn is the function that returns the compiled model.
# The number of passes is given as `epochs`, the name Sequential.fit uses.
# The old Keras-1 spelling `nb_epoch` is not an argument of fit, so the wrapper
# does not forward it and the setting had no effect.
classifier = KerasClassifier(build_fn=make_classifier, batch_size=10, epochs=1)

# Apply cross-validation to get one accuracy per fold (this takes a while).
#   cv     -> number of folds
#   n_jobs -> number of CPUs to use (-1 = all of them)
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)

# Average accuracy over the folds.
mean = accuracies.mean()
mean

  classifier = KerasClassifier(build_fn=make_classifier, batch_size=10, nb_epoch=1)


0.8291264295578002

In [69]:
# Spread of the per-fold accuracies (population variance, i.e. ddof=0).
variance = accuracies.var(ddof=0)
variance

0.0026780018634431713

In [71]:
#low variance between the fold accuracies means the model performs consistently across folds (the mean accuracy tells you how well it performs)

In [72]:
# combat overfitting by using a dropout layer: randomly deactivates a fraction (=rate) of the neurons on each training iteration
from keras.layers import Dropout

# Same architecture as before, with a dropout layer (rate 0.1) between the
# hidden layer and the output layer.
# NOTE(review): this model is compiled but not fit anywhere in the visible
# notebook, and `classifier` is reassigned in the grid-search section below.
classifier = Sequential([
    Dense(units=9, kernel_initializer="uniform", activation="relu", input_dim=18),
    Dropout(rate=0.1),
    Dense(units=1, kernel_initializer="uniform", activation="sigmoid"),
])
classifier.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [73]:
# Gridsearch: experiment with different model parameters
from sklearn.model_selection import GridSearchCV

def make_classifier(optimizer="adam"):
    """Build and compile the network with a configurable optimizer.

    Parameters
    ----------
    optimizer : str
        Optimizer name passed to ``compile`` (for example "adam" or
        "rmsprop"). It defaults to "adam", so this definition can also be
        called without arguments, like the earlier ``make_classifier()`` it
        replaces.

    Returns
    -------
    A compiled Sequential model: one hidden layer (9 relu nodes, 18 inputs)
    and a single sigmoid output node, trained with binary cross-entropy and
    evaluated on accuracy.
    """
    model = Sequential()
    model.add(Dense(9, kernel_initializer="uniform", activation="relu", input_dim=18))
    model.add(Dense(1, kernel_initializer="uniform", activation="sigmoid"))
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])
    return model

# wrap the builder as a scikit-learn estimator; no fit/build parameters are fixed here
classifier = KerasClassifier(build_fn = make_classifier)

  classifier = KerasClassifier(build_fn = make_classifier)


In [74]:
# Parameters to tune: every combination of these values is tried.
params = dict(
    batch_size=[20, 35],
    epochs=[2, 3],
    optimizer=['adam', 'rmsprop'],
)

# cv = number of cross-validation folds used to score each combination;
# scoring="accuracy" ranks the combinations by mean accuracy over those folds.
grid_search = GridSearchCV(classifier, params, scoring="accuracy", cv=2).fit(
    X_train, y_train
)

# Best combination found and its mean cross-validated accuracy.
best_param, best_accuracy = grid_search.best_params_, grid_search.best_score_
best_param

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


{'batch_size': 20, 'epochs': 3, 'optimizer': 'adam'}

In [75]:
# mean cross-validated accuracy of the best parameter combination (grid_search.best_score_)
best_accuracy

0.9009434722260022