In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
from pylab import rcParams

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,classification_report

In [3]:
# Load the employee attrition dataset and preview its first 3 records
data = pd.read_csv('Employee_Attrition.csv')
data.head(3)

Unnamed: 0,index,EmployeeID,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent,Job_History,Joining_Date,Designation,Sex,Total_Experience,Left_Company,source
0,0,emp-1,3424.0,2462.0,408.0,2462.0,42,2849,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,3,0.0,train
1,1,emp-10,2006.0,1274.0,328.0,691.0,13,616,"Worked at Company - 816 , Company - 724 , Comp...",2006_02,VP,0,4,1.0,train
2,2,emp-100,3607.0,3275.0,224.0,3235.0,23,3388,"Worked at Company - 562 , Company - 319",2011_12,Senior,1,2,0.0,train


### Label Encoding for 'Designation' field

In [4]:
# Swap the 'Designation' labels for integer codes. The encoder is fitted on the
# whole frame (rows of every 'source' value) and kept in enc_Designation.
enc_Designation = LabelEncoder()
data['Designation'] = enc_Designation.fit_transform(data['Designation'])

## Machine learning model implementation

### Split Train and Test

In [6]:
# Partition the rows by the value of the 'source' column
data_Train = data.loc[data['source'].eq('train')]
data_Test = data.loc[data['source'].eq('test')]

### Get the data for Training and Testing within Actual Train dataset

In [34]:
# Predictor columns used by every model; identifier, date, free-text and
# bookkeeping columns of the raw frame are deliberately left out.
feature_columns = [
    'TotalWorkingHours',
    'Billable_Hours',
    'Hours_off_Duty',
    'Touring_Hours',
    'NoOfProjects',
    'ActualTimeSpent',
    'Designation',
    'Sex',
    'Total_Experience',
]
X = data_Train.loc[:, feature_columns]

# Target: the 'Left_Company' column of the same rows
y = data_Train['Left_Company']

In [35]:
# Preview the first rows of the selected feature columns
X.head()

Unnamed: 0,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent,Designation,Sex,Total_Experience
0,3424.0,2462.0,408.0,2462.0,42,2849,2,1,3
1,2006.0,1274.0,328.0,691.0,13,616,4,0,4
2,3607.0,3275.0,224.0,3235.0,23,3388,3,1,2
3,4109.5,2759.5,192.0,0.0,18,1953,3,1,3
4,3790.0,2720.5,344.0,1861.0,20,2254,3,1,1


In [36]:
# Hold out 20% of the 'train' rows for evaluation; the fixed seed keeps the
# split repeatable across runs.
# NOTE(review): the split is not stratified although the held-out classes are
# imbalanced (42 vs 7 in the recorded outputs) - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

### Training the Model

In [38]:
# Shared reporting helper called by every model cell
def printScores(y_test,y_predict):
    """Print evaluation tables for one classifier and return its accuracy.

    Prints a cross-tabulation of true vs. predicted labels, a separator
    line, and the classification report.

    Parameters
    ----------
    y_test
        True labels of the evaluation rows.
    y_predict
        Labels predicted for the same rows.

    Returns
    -------
    The accuracy expressed as a percentage, rounded to the nearest whole
    number.
    """
    accuracy_pct = round(accuracy_score(y_test, y_predict) * 100)

    # Rows are the true labels, columns the predicted labels
    confusion = pd.crosstab(y_test, y_predict)
    print("\nConfusion Matrix :- \n\n", confusion)
    print("======================================")

    report = classification_report(y_test, y_predict)
    print("\nClassification Report:- \n\n", report)

    return accuracy_pct

#### Different Algorithms used for Prediction:

#### LOGISTIC REGRESSION

In [39]:
# Fit logistic regression on the training split, then score it on the held-out split
LogisticRegression_Model = LogisticRegression(random_state=10).fit(X_train, y_train)
LG_y_predict = LogisticRegression_Model.predict(X_test)

# printScores prints the evaluation tables and returns the rounded accuracy (%)
LogReg_Acc = printScores(y_test, LG_y_predict)
print('Accuracy --> ',LogReg_Acc)


Confusion Matrix :- 

 col_0         0.0  1.0
Left_Company          
0.0            40    2
1.0             4    3

Classification Report:- 

               precision    recall  f1-score   support

         0.0       0.91      0.95      0.93        42
         1.0       0.60      0.43      0.50         7

    accuracy                           0.88        49
   macro avg       0.75      0.69      0.72        49
weighted avg       0.86      0.88      0.87        49

Accuracy -->  88.0




#### DECISION TREE

In [40]:
# Depth-limited decision tree (at most 3 levels), seeded for repeatability
DecisionTree_Model = DecisionTreeClassifier(max_depth=3, random_state=10).fit(X_train, y_train)
DT_y_predict = DecisionTree_Model.predict(X_test)

# printScores prints the evaluation tables and returns the rounded accuracy (%)
DecTree_Acc = printScores(y_test, DT_y_predict)
print('Accuracy --> ',DecTree_Acc)


Confusion Matrix :- 

 col_0         0.0  1.0
Left_Company          
0.0            42    0
1.0             2    5

Classification Report:- 

               precision    recall  f1-score   support

         0.0       0.95      1.00      0.98        42
         1.0       1.00      0.71      0.83         7

    accuracy                           0.96        49
   macro avg       0.98      0.86      0.91        49
weighted avg       0.96      0.96      0.96        49

Accuracy -->  96.0


#### RANDOM FOREST

In [41]:
# Random forest of 20 trees; bootstrap=False means each tree is built from the
# full training split rather than a bootstrap sample.
RandomForest_Model = RandomForestClassifier(
    max_depth=10,
    n_estimators=20,
    random_state=10,
    max_features='sqrt',
    criterion='gini',
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
).fit(X_train, y_train)
RF_y_predict = RandomForest_Model.predict(X_test)

# printScores prints the evaluation tables and returns the rounded accuracy (%)
RandFor_Acc = printScores(y_test, RF_y_predict)
print('Accuracy --> ',RandFor_Acc)


Confusion Matrix :- 

 col_0         0.0  1.0
Left_Company          
0.0            42    0
1.0             3    4

Classification Report:- 

               precision    recall  f1-score   support

         0.0       0.93      1.00      0.97        42
         1.0       1.00      0.57      0.73         7

    accuracy                           0.94        49
   macro avg       0.97      0.79      0.85        49
weighted avg       0.94      0.94      0.93        49

Accuracy -->  94.0


#### XG BOOST

In [42]:
# Gradient-boosted trees: 100 boosting rounds, depth 6, learning rate 0.75
XGBoost_Model = XGBClassifier(
    max_depth=6,
    n_estimators=100,
    learning_rate=0.75,
    random_state=20,
)
XGBoost_Model.fit(X_train, y_train)
XGB_y_predict = XGBoost_Model.predict(X_test)

# printScores prints the evaluation tables and returns the rounded accuracy (%)
XGB_Acc = printScores(y_test, XGB_y_predict)
print('Accuracy --> ',XGB_Acc)


Confusion Matrix :- 

 col_0         0.0  1.0
Left_Company          
0.0            42    0
1.0             2    5

Classification Report:- 

               precision    recall  f1-score   support

         0.0       0.95      1.00      0.98        42
         1.0       1.00      0.71      0.83         7

    accuracy                           0.96        49
   macro avg       0.98      0.86      0.91        49
weighted avg       0.96      0.96      0.96        49

Accuracy -->  96.0


#### SUPPORT VECTOR MACHINE (SVM)

In [43]:
# RBF-kernel support vector classifier.
# NOTE(review): the features are passed in unscaled; RBF kernels are usually
# scale-sensitive - confirm whether a scaler should precede this model.
SVM_Model = SVC(kernel='rbf', C=7, gamma=0.001).fit(X_train, y_train)
SVM_y_predict = SVM_Model.predict(X_test)

# printScores prints the evaluation tables and returns the rounded accuracy (%)
SVM_Acc = printScores(y_test, SVM_y_predict)
print('Accuracy --> ',SVM_Acc)


Confusion Matrix :- 

 col_0         0.0  1.0
Left_Company          
0.0            42    0
1.0             2    5

Classification Report:- 

               precision    recall  f1-score   support

         0.0       0.95      1.00      0.98        42
         1.0       1.00      0.71      0.83         7

    accuracy                           0.96        49
   macro avg       0.98      0.86      0.91        49
weighted avg       0.96      0.96      0.96        49

Accuracy -->  96.0


### Showing the Classifier along with Accuracy.

In [46]:
# Gather each model's rounded accuracy (%) into one table keyed by classifier name
accuracy_by_classifier = {
    'Random Forest': RandFor_Acc,
    'XG Boost': XGB_Acc,
    'Decision Tree': DecTree_Acc,
    'Support Vector Machines': SVM_Acc,
    'Logistic Regression': LogReg_Acc,
}
models = pd.DataFrame({
    'Classifier': list(accuracy_by_classifier.keys()),
    'Accuracy': list(accuracy_by_classifier.values()),
})

# Displayed with the classifier name as the index; `models` itself is not modified
models.set_index(['Classifier'])

Unnamed: 0_level_0,Accuracy
Classifier,Unnamed: 1_level_1
Random Forest,94.0
XG Boost,96.0
Decision Tree,96.0
Support Vector Machines,96.0
Logistic Regression,88.0


### As XG Boost, Decision Tree and SVM give decent accuracy, we export the trained model from one of them (SVM).

In [48]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so importing it fails on current installs. The standalone `joblib`
# package (a scikit-learn dependency) is the replacement; the legacy path is
# kept only as a fallback for old environments without standalone joblib.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted SVM classifier to disk.
# NOTE: joblib files are pickle-based - only load them from trusted sources.
joblib.dump(SVM_Model,'Employee_Attrition_Model.ml')

['Employee_Attrition_Model.ml']