## <b> Attrition Rate Prediction </b>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import os

In [3]:
attrdata = pd.read_csv('Table_1.csv')
attrdata

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),Marital Status,Age in YY.,Hiring Source,Promoted/Non Promoted,Job Role Match,Stay/Left
0,1,sid,9876544345,Pune,B2,Operation,Male,0.00,< =1,6.08,Single,27.12,Direct,Non Promoted,Yes,Left
1,2,sid,9876544345,Noida,B7,Support,Male,0.00,< =1,13.00,Marr.,38.08,Direct,Promoted,No,Stay
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,Marr.,36.04,Direct,Promoted,Yes,Stay
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,Marr.,32.07,Direct,Promoted,Yes,Stay
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.00,< =1,7.00,Marr.,32.05,Direct,Non Promoted,Yes,Stay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
897,898,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,21.09,Direct,Non Promoted,Yes,Stay
898,899,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Left
899,900,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.01,Direct,Non Promoted,,Left
900,901,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Stay


In [4]:
attrdata.columns

Index(['table id', 'name', 'phone number', 'Location', 'Emp. Group',
       'Function', 'Gender ', 'Tenure', 'Tenure Grp.', 'Experience (YY.MM)',
       'Marital Status', 'Age in YY.', 'Hiring Source',
       'Promoted/Non Promoted', 'Job Role Match', 'Stay/Left'],
      dtype='object')

In [5]:
attrdata.isnull().sum()

table id                 0
name                     0
phone number             0
Location                 0
Emp. Group               0
Function                 0
Gender                   0
Tenure                   0
Tenure Grp.              0
Experience (YY.MM)       4
Marital Status           0
Age in YY.               0
Hiring Source            0
Promoted/Non Promoted    0
Job Role Match           2
Stay/Left                0
dtype: int64

In [6]:
attrdata.dropna(axis=0, inplace=True)

In [7]:
attrdata.shape

(896, 16)

In [8]:
gender_dict = attrdata['Gender '].value_counts()
gender_dict

Male      656
Female    234
other       6
Name: Gender , dtype: int64

In [9]:
promoted_dict = attrdata['Promoted/Non Promoted'].value_counts()
promoted_dict

Promoted        457
Non Promoted    439
Name: Promoted/Non Promoted, dtype: int64

In [16]:
marital_dict = attrdata['Marital Status'].value_counts()
marital_dict

Single    534
Marr.     356
Div.        2
NTBD        2
Sep.        2
Name: Marital Status, dtype: int64

In [22]:
emp_dict = attrdata['Emp. Group'].value_counts()
emp_dict

B1    537
B2    276
B3     59
B0      8
B4      7
B5      4
B7      2
B6      1
C3      1
D2      1
Name: Emp. Group, dtype: int64

Turning the location feature to a categorical one

In [12]:
location_dict = attrdata['Location'].value_counts()
print(location_dict)

location_dict_new = {
    'Chennai':  7,
    'Noida':    6,
    'Bangalore':5,
    'Hyderabad':4,
    'Pune':     3,
    'Madurai':  2,
    'Lucknow':  1,
    'Other place':0
}

Chennai       255
Noida         236
Bangalore     210
Hyderabad      62
Pune           56
Madurai        29
Lucknow        20
Nagpur         14
Vijayawada      6
Mumbai          4
Gurgaon         3
Kolkata         1
Name: Location, dtype: int64


Applying the categorical nature of the location feature

In [11]:
def location(x):
    if str(x) in location_dict_new.keys():
        return location_dict_new[str(x)]
    else:
        return location_dict_new['Other place']

data_1 = attrdata['Location'].apply(location)
attrdata['New Location'] = data_1
attrdata

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),Marital Status,Age in YY.,Hiring Source,Promoted/Non Promoted,Job Role Match,Stay/Left,New Location
0,1,sid,9876544345,Pune,B2,Operation,Male,0.00,< =1,6.08,Single,27.12,Direct,Non Promoted,Yes,Left,3
1,2,sid,9876544345,Noida,B7,Support,Male,0.00,< =1,13.00,Marr.,38.08,Direct,Promoted,No,Stay,6
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,Marr.,36.04,Direct,Promoted,Yes,Stay,5
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,Marr.,32.07,Direct,Promoted,Yes,Stay,6
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.00,< =1,7.00,Marr.,32.05,Direct,Non Promoted,Yes,Stay,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,897,Rahul,9876544345,Vijayawada,B1,Operation,Male,0.03,< =1,0.03,Single,24.06,Direct,Non Promoted,Yes,Stay,0
897,898,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,21.09,Direct,Non Promoted,Yes,Stay,0
898,899,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Left,0
900,901,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Stay,0


In [13]:
attrdata['Function'].value_counts()

Operation    832
Support       52
Sales         12
Name: Function, dtype: int64

In [14]:
gen = pd.get_dummies(attrdata['Function'])
hr = pd.get_dummies(attrdata['Hiring Source'])

In [19]:
# Marital Status
def mar(x):
    if str(x) in marital_dict.keys() and marital_dict[str(x)] > 100:
        return str(x)
    else:
        return 'other'

data_1 = attrdata['Marital Status'].apply(mar)
attrdata['New Marital'] = data_1
attrdata

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),Marital Status,Age in YY.,Hiring Source,Promoted/Non Promoted,Job Role Match,Stay/Left,New Location,New Marital
0,1,sid,9876544345,Pune,B2,Operation,Male,0.00,< =1,6.08,Single,27.12,Direct,Non Promoted,Yes,Left,3,Single
1,2,sid,9876544345,Noida,B7,Support,Male,0.00,< =1,13.00,Marr.,38.08,Direct,Promoted,No,Stay,6,Marr.
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,Marr.,36.04,Direct,Promoted,Yes,Stay,5,Marr.
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,Marr.,32.07,Direct,Promoted,Yes,Stay,6,Marr.
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.00,< =1,7.00,Marr.,32.05,Direct,Non Promoted,Yes,Stay,1,Marr.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,897,Rahul,9876544345,Vijayawada,B1,Operation,Male,0.03,< =1,0.03,Single,24.06,Direct,Non Promoted,Yes,Stay,0,Single
897,898,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,21.09,Direct,Non Promoted,Yes,Stay,0,Single
898,899,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Left,0,Single
900,901,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Stay,0,Single


In [20]:
Mr = pd.get_dummies(attrdata['New Marital'])

In [21]:
# Promoted/ Not Promoted
def Promoted(x):
    if x == 'Promoted':
        return int(1)
    else:
        return int(0)

data_1 = attrdata['Promoted/Non Promoted'].apply(Promoted)
attrdata['New Promoted'] = data_1
attrdata

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),Marital Status,Age in YY.,Hiring Source,Promoted/Non Promoted,Job Role Match,Stay/Left,New Location,New Marital,New Promoted
0,1,sid,9876544345,Pune,B2,Operation,Male,0.00,< =1,6.08,Single,27.12,Direct,Non Promoted,Yes,Left,3,Single,0
1,2,sid,9876544345,Noida,B7,Support,Male,0.00,< =1,13.00,Marr.,38.08,Direct,Promoted,No,Stay,6,Marr.,1
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,Marr.,36.04,Direct,Promoted,Yes,Stay,5,Marr.,1
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,Marr.,32.07,Direct,Promoted,Yes,Stay,6,Marr.,1
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.00,< =1,7.00,Marr.,32.05,Direct,Non Promoted,Yes,Stay,1,Marr.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,897,Rahul,9876544345,Vijayawada,B1,Operation,Male,0.03,< =1,0.03,Single,24.06,Direct,Non Promoted,Yes,Stay,0,Single,0
897,898,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,21.09,Direct,Non Promoted,Yes,Stay,0,Single,0
898,899,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Left,0,Single,0
900,901,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,Single,22.02,Direct,Non Promoted,Yes,Stay,0,Single,0


In [37]:
# Employee Group
emp_dict_new = {
    'B1': 4,
    'B2': 3,
    'B1': 2,
    'Other group': 1
}

def emp(x):
    if str(x) in emp_dict_new.keys():
        return str(x)
    else:
        return 'other group'

data_1 = attrdata['Emp. Group'].apply(emp)
attrdata['New Emp'] = data_1

new_emp = pd.get_dummies(attrdata['New Emp'])

In [25]:
# Job Match
def job(x):
    if x == 'Yes':
        return int(1)
    else:
        return int(0)

data_1 = attrdata['Job Role Match'].apply(job)
attrdata['New Job Role Match'] = data_1

jobs = pd.get_dummies(attrdata['New Job Role Match'])

In [30]:
# Gender
def gen(x):
    if x in gender_dict.keys():
        return str(x)
    else:
        return 'other'

data_1 = attrdata['Gender '].apply(gen)
attrdata['New Gender'] = data_1

gend = pd.get_dummies(attrdata['New Gender'])

In [34]:
attrdata.drop('New Genger', axis=1, inplace=True)

In [35]:
tengrp = pd.get_dummies(attrdata['Tenure Grp.'])

In [47]:
dataset = pd.concat([attrdata, hr, Mr, new_emp, tengrp, gend], axis=1)
dataset

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),...,Single,other,B1,B2,other group,< =1,> 1 & < =3,Female,Male,other.1
0,1,sid,9876544345,Pune,B2,Operation,Male,0.00,< =1,6.08,...,1,0,0,1,0,1,0,0,1,0
1,2,sid,9876544345,Noida,B7,Support,Male,0.00,< =1,13.00,...,0,0,0,0,1,1,0,0,1,0
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,...,0,0,0,0,1,1,0,0,1,0
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,...,0,0,0,1,0,1,0,0,1,0
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.00,< =1,7.00,...,0,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,897,Rahul,9876544345,Vijayawada,B1,Operation,Male,0.03,< =1,0.03,...,1,0,1,0,0,1,0,0,1,0
897,898,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,...,1,0,1,0,0,1,0,1,0,0
898,899,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,...,1,0,1,0,0,1,0,1,0,0
900,901,Rahul,9876544345,Vijayawada,B1,Operation,Female,0.03,< =1,0.03,...,1,0,1,0,0,1,0,1,0,0


In [48]:
dataset.drop(['table id', 'name', 'Location', 'Emp. Group',
       'Function', 'Gender ', 'Tenure',
       'Marital Status', 'Hiring Source',
       'Promoted/Non Promoted', 'Job Role Match', 'New Gender', 
       'New Marital', 'New Emp', 'New Job Role Match'], axis=1, inplace=True)

In [49]:
dataset1 = dataset.drop(['Tenure Grp.', 'phone number'], axis=1)

In [50]:
dataset1.columns

Index(['Experience (YY.MM)', 'Age in YY.', 'Stay/Left', 'New Location',
       'New Promoted', 'Agency', 'Direct', 'Employee Referral', 'Marr.',
       'Single', 'other', 'B1', 'B2', 'other group', '< =1', '> 1 & < =3',
       'Female', 'Male', 'other'],
      dtype='object')

In [52]:
y = dataset1['Stay/Left']
X = dataset1.drop('Stay/Left', axis=1)

In [53]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

((716, 18), (180, 18), (716,), (180,))

### Model Comparison

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

Instantiating all the imported models

In [56]:
lr = LogisticRegression(C = 0.1, random_state=42, solver = 'lbfgs')
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gnb = GaussianNB()

Training and making predictions with the models instantiated above

In [57]:
for a,b in zip([lr, dt, rf, gnb], ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Naive Bayes']):
    a.fit(Xtrain, ytrain)
    predictions = a.predict(Xtrain)
    y_pred = a.predict(Xtest)
    score1 = accuracy_score(ytrain, predictions)
    score2 = accuracy_score(ytest, y_pred)
    msg1 = f'{b} training score is {score1}'
    msg2 = f'{b} test score is {score2}'
    print(msg1)
    print(msg2)

Logistic Regression training score is 0.6829608938547486
Logistic Regression test score is 0.7388888888888889
Decision Trees training score is 0.9986033519553073
Decision Trees test score is 0.55
Random Forest training score is 0.9986033519553073
Random Forest test score is 0.65
Naive Bayes training score is 0.6201117318435754
Naive Bayes test score is 0.5555555555555556


We can see the best-performing model is Logistic Regression, hence we will save that model to the drive

In [58]:
model = lr.fit(Xtrain, ytrain)

import pickle
pickle.dump(model, open('final_model.pickle', 'wb'))

### Conclusion

This project was done to show proficiency in Exploratory Data Analysis, Feature Engineeering, and model comparison. The best model in this project can be made better by performing hyper parameter tuning.