# Logistic Regression Model on Why HR Leaving | Predicting employee attrition using Machine Learning

In [1]:
import numpy as np
import pandas as pd
import sklearn 
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("HR_comma_sep.csv")
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low


In [3]:
df.head(3)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium


In [4]:
df.tail(3)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low
14998,0.37,0.52,2,158,3,0,1,0,support,low


In [5]:
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [6]:
df.min() #the minimum values on specified axis

satisfaction_level       0.09
last_evaluation          0.36
number_project              2
average_montly_hours       96
time_spend_company          2
Work_accident               0
left                        0
promotion_last_5years       0
Department                 IT
salary                   high
dtype: object

In [7]:
df.max() #maximum values on specified axis

satisfaction_level               1
last_evaluation                  1
number_project                   7
average_montly_hours           310
time_spend_company              10
Work_accident                    1
left                             1
promotion_last_5years            1
Department               technical
salary                      medium
dtype: object

In [8]:
df.shape

(14999, 10)

In [9]:
df.size

149990

In [10]:
df.keys #returns the ‘info axis’

<bound method NDFrame.keys of        satisfaction_level  last_evaluation  number_project  \
0                    0.38             0.53               2   
1                    0.80             0.86               5   
2                    0.11             0.88               7   
3                    0.72             0.87               5   
4                    0.37             0.52               2   
...                   ...              ...             ...   
14994                0.40             0.57               2   
14995                0.37             0.48               2   
14996                0.37             0.53               2   
14997                0.11             0.96               6   
14998                0.37             0.52               2   

       average_montly_hours  time_spend_company  Work_accident  left  \
0                       157                   3              0     1   
1                       262                   6              0     1   
2        

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Department']= le.fit_transform(df['Department'])
df['salary'] = le.fit_transform(df['salary'])
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,7,1
1,0.80,0.86,5,262,6,0,1,0,7,2
2,0.11,0.88,7,272,4,0,1,0,7,2
3,0.72,0.87,5,223,5,0,1,0,7,1
4,0.37,0.52,2,159,3,0,1,0,7,1
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,8,1
14995,0.37,0.48,2,160,3,0,1,0,8,1
14996,0.37,0.53,2,143,3,0,1,0,8,1
14997,0.11,0.96,6,280,4,0,1,0,8,1


In [12]:
from sklearn.preprocessing import MinMaxScaler
df['average_montly_hours'] = MinMaxScaler().fit_transform(np.array(df['average_montly_hours']).reshape(-1,1))
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,0.285047,3,0,1,0,7,1
1,0.80,0.86,5,0.775701,6,0,1,0,7,2
2,0.11,0.88,7,0.822430,4,0,1,0,7,2
3,0.72,0.87,5,0.593458,5,0,1,0,7,1
4,0.37,0.52,2,0.294393,3,0,1,0,7,1
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,0.257009,3,0,1,0,8,1
14995,0.37,0.48,2,0.299065,3,0,1,0,8,1
14996,0.37,0.53,2,0.219626,3,0,1,0,8,1
14997,0.11,0.96,6,0.859813,4,0,1,0,8,1


In [13]:
print(df.Department.unique())
print(df.salary.unique())

[7 2 3 9 8 4 0 6 5 1]
[1 2 0]


In [14]:
X = df.drop(['left'],axis = 1)
Y = df['left']
print(X[:5])
print(Y[:5])

   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.38             0.53               2              0.285047   
1                0.80             0.86               5              0.775701   
2                0.11             0.88               7              0.822430   
3                0.72             0.87               5              0.593458   
4                0.37             0.52               2              0.294393   

   time_spend_company  Work_accident  promotion_last_5years  Department  \
0                   3              0                      0           7   
1                   6              0                      0           7   
2                   4              0                      0           7   
3                   5              0                      0           7   
4                   3              0                      0           7   

   salary  
0       1  
1       2  
2       2  
3       1  
4       

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,test_size= 0.2, random_state= 20)
my_model = LogisticRegression()
my_model.fit(X_train,Y_train)

LogisticRegression()

In [16]:
Y_pred = my_model.predict(X_test)
print(Y_test)
print(Y_pred)

6493     0
3568     0
972      1
3260     0
671      1
        ..
13542    0
11146    0
3256     0
14443    1
4566     0
Name: left, Length: 3000, dtype: int64
[0 0 1 ... 0 1 0]


## Confusion Matrix

In [17]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test,Y_pred)
cm

array([[2151,  146],
       [ 516,  187]], dtype=int64)

## Accuracy Level

In [18]:
from sklearn.metrics import accuracy_score 
print(accuracy_score(Y_test,Y_pred))
predictions = my_model.predict(X_test)
score = my_model.score(X_test, Y_test)
print("Accuracy is : ",score*100,"%")

0.7793333333333333
Accuracy is :  77.93333333333334 %
