# Attrition Data Analysis by Logistic Regression Model

In [16]:
import pandas as pd
import numpy as np

# Loading the dataset

In [17]:
# Read the raw attrition records; the relative path is resolved against the
# current working directory.
data = pd.read_csv(filepath_or_buffer="attrition_data.csv")

In [18]:
# Preview the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


# Checking for null values

In [19]:
# Count the missing values in each column (sum of the NaN mask down the rows).
data.isna().sum(axis=0)

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [20]:
# Mean of the observed NumCompaniesWorked values; missing entries are skipped.
data["NumCompaniesWorked"].mean(skipna=True)

2.6948303347756775

In [21]:
# Mean of the observed TotalWorkingYears values; missing entries are skipped.
data["TotalWorkingYears"].mean(skipna=True)

11.279936378095888

In [22]:
# Impute missing NumCompaniesWorked entries with the column mean rounded to the
# nearest whole number. The fill value is derived from the data instead of the
# previously hard-coded constant 3, so it stays correct if the dataset changes.
# np.round propagates NaN, so an all-missing column is left untouched rather
# than raising.
num_companies_fill = np.round(data["NumCompaniesWorked"].mean())
# Kept as a NumPy array under the original name, as in the earlier version.
new_NumCampaniesWorked = data["NumCompaniesWorked"].fillna(num_companies_fill).to_numpy()
data["NumCompaniesWorked"] = new_NumCampaniesWorked

In [23]:
# Impute missing TotalWorkingYears entries with the column mean rounded to the
# nearest whole number. The fill value is derived from the data instead of the
# previously hard-coded constant 11, so it stays correct if the dataset changes.
# np.round propagates NaN, so an all-missing column is left untouched rather
# than raising.
total_years_fill = np.round(data["TotalWorkingYears"].mean())
# Kept as a NumPy array under the original name, as in the earlier version.
new_TotalWorkingYears = data["TotalWorkingYears"].fillna(total_years_fill).to_numpy()
data["TotalWorkingYears"] = new_TotalWorkingYears

In [24]:
# Re-count missing values per column to confirm the imputation left none.
data.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

# Converting the float-typed attributes to integers

In [25]:
# Cast the two imputed columns from float to int directly on `data`, the frame
# that the encoding and modelling cells below read. The earlier version cast
# them on a separate wrapper built with pd.DataFrame(data); whether changes to
# such a wrapper reach `data` is not guaranteed across pandas versions, so the
# conversion is applied explicitly here.
data = data.astype({"NumCompaniesWorked": int, "TotalWorkingYears": int})
# `df` is kept as an alias of the converted frame so code referring to it still works.
df = data

# Converting the text-valued (categorical) attributes to numeric codes

In [26]:
from sklearn.preprocessing import LabelEncoder

# Replace every text-valued column with integer codes. One encoder is re-fitted
# per column, in the same order as before, so after the loop `lb` holds the
# classes of the last column processed ("Over18").
# NOTE(review): integer codes impose an arbitrary order on nominal columns such
# as Department or JobRole; one-hot encoding may suit the model better — confirm.
lb = LabelEncoder()
for column in (
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "MaritalStatus",
    "JobRole",
    "Over18",
):
    data[column] = lb.fit_transform(data[column])

In [27]:
# Drop the employee identifier and the three columns that show a single value
# in the preview above (EmployeeCount, StandardHours, Over18) — presumably
# constant across the dataset; verify before relying on this.
data1 = data.drop(
    columns=["EmployeeID", "EmployeeCount", "StandardHours", "Over18"]
)

In [28]:
# Pairwise correlation matrix of the remaining (now all-numeric) columns.
data1.corr()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.159205,0.024751,-0.010846,0.006963,-0.035706,0.043194,-0.039352,-0.002884,0.011422,-0.095029,-0.044314,0.298346,-0.033137,-0.031753,0.680405,-0.027308,0.311309,0.216513,0.202089
Attrition,-0.159205,1.0,7.4e-05,-0.048206,-0.00973,-0.015111,-0.05794,0.018125,-0.01029,0.025809,0.16207,-0.031176,0.042301,0.032533,-0.006839,-0.170123,-0.049431,-0.134392,-0.033019,-0.156199
BusinessTravel,0.024751,7.4e-05,1.0,-0.010982,0.024616,0.002755,-0.015462,-0.039243,0.046111,-0.027194,0.024001,-0.037597,0.022425,-0.031054,0.003683,0.033966,-0.041888,-0.014575,-0.032591,-0.022636
Department,-0.010846,-0.048206,-0.010982,1.0,0.012134,-0.004597,0.01372,0.003158,0.002923,-0.025466,-0.044619,-0.032237,-0.003942,-0.00784,0.010512,-0.019793,0.012866,0.010078,0.014845,0.018016
DistanceFromHome,0.006963,-0.00973,0.024616,0.012134,1.0,-0.008638,-0.024665,-0.044173,-0.037329,-0.010293,-0.027893,-0.021607,-0.01395,0.038125,0.011169,0.009351,-0.009001,0.031684,0.00229,0.021584
Education,-0.035706,-0.015111,0.002755,-0.004597,-0.008638,1.0,0.012329,-0.016547,0.045746,0.030155,0.024991,0.00641,-0.016228,-0.040531,0.001261,-0.010734,0.010472,0.00608,0.02249,0.005358
EducationField,0.043194,-0.05794,-0.015462,0.01372,-0.024665,0.012329,1.0,-0.005634,-0.019528,0.013106,-0.051409,0.008858,0.009144,-0.011214,0.02162,0.02161,-0.021793,0.02312,0.048181,0.021664
Gender,-0.039352,0.018125,-0.039243,0.003158,-0.044173,-0.016547,-0.005634,1.0,-0.026854,0.003724,-0.009132,0.00753,-0.066646,0.011841,0.019237,-0.027581,-0.030167,-0.017277,-0.021812,0.004438
JobLevel,-0.002884,-0.01029,0.046111,0.002923,-0.037329,0.045746,-0.019528,-0.026854,1.0,-0.014763,-0.022021,0.047316,-0.009724,0.010973,0.000993,-0.036901,-0.0325,-0.064219,-0.060811,-0.055251
JobRole,0.011422,0.025809,-0.027194,-0.025466,-0.010293,0.030155,0.013106,0.003724,-0.014763,1.0,0.022888,0.017072,-0.021826,-0.013465,0.023991,-0.012741,0.051744,-0.012886,-0.014635,0.010695


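## Since no pair of features is very strongly correlated (the largest pairwise correlation shown above is about 0.68, between Age and TotalWorkingYears), there is no strong evidence of multicollinearity, so no features are dropped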

# Applying the Logistic Regression Algorithm

In [29]:
import statsmodels.api as sm

# Target: the encoded Attrition column. Predictors are listed explicitly so the
# coefficient order in the summary table stays fixed.
y = data1["Attrition"]
x = data1[
    [
        "Age",
        "BusinessTravel",
        "Department",
        "DistanceFromHome",
        "Education",
        "EducationField",
        "JobRole",
        "Gender",
        "JobLevel",
        "MaritalStatus",
        "MonthlyIncome",
        "NumCompaniesWorked",
        "PercentSalaryHike",
        "StockOptionLevel",
        "TotalWorkingYears",
        "TrainingTimesLastYear",
        "YearsAtCompany",
        "YearsSinceLastPromotion",
        "YearsWithCurrManager",
    ]
]

# Prepend a constant column so the fitted model includes an intercept term.
x1 = sm.add_constant(x)
logistic = sm.Logit(y, x1)
result = logistic.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.393008
         Iterations 7


0,1,2,3
Dep. Variable:,Attrition,No. Observations:,4410.0
Model:,Logit,Df Residuals:,4390.0
Method:,MLE,Df Model:,19.0
Date:,"Sat, 08 Aug 2020",Pseudo R-squ.:,0.1102
Time:,23:42:31,Log-Likelihood:,-1733.2
converged:,True,LL-Null:,-1947.9
Covariance Type:,nonrobust,LLR p-value:,3.276e-79

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0752,0.414,0.182,0.856,-0.736,0.886
Age,-0.0309,0.007,-4.523,0.000,-0.044,-0.018
BusinessTravel,-0.0177,0.065,-0.270,0.787,-0.146,0.111
Department,-0.2422,0.081,-2.980,0.003,-0.402,-0.083
DistanceFromHome,-0.0013,0.005,-0.247,0.805,-0.012,0.009
Education,-0.0628,0.043,-1.474,0.140,-0.146,0.021
EducationField,-0.0966,0.033,-2.895,0.004,-0.162,-0.031
JobRole,0.0377,0.018,2.108,0.035,0.003,0.073
Gender,0.0860,0.090,0.960,0.337,-0.090,0.261


## Some attributes are significantly associated with attrition in the company

## We can see that the p-values of the attributes Age, Department, EducationField, JobRole, MaritalStatus, MonthlyIncome, NumCompaniesWorked, TotalWorkingYears, TrainingTimesLastYear, and YearsWithCurrManager are less than 0.05

## So Age, Department, EducationField, JobRole, MaritalStatus, MonthlyIncome, NumCompaniesWorked, TotalWorkingYears, TrainingTimesLastYear, and YearsWithCurrManager are statistically significant predictors of attrition in the company (at the 5% significance level).
