Employee Attrition Prediction with Python

Machine Learning Project on Employee Attrition Prediction with Python

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, log_loss, classification_report)
from imblearn.over_sampling import SMOTE
import xgboost

In [33]:
# Path of the employee-attrition CSV. The original hard-coded location is kept
# as the default so existing setups keep working; set the ATTRITION_CSV
# environment variable to point the notebook at the file on another machine.
attrition_csv_path = os.environ.get(
    "ATTRITION_CSV",
    "C:\\Users\\User\\Desktop\\Github\\World of programming\\Machine learning projects\\Project 3\\Employee-Attrition.csv",
)

# Raw dataset; later cells read columns such as 'Age' and 'Attrition' from it.
attrition = pd.read_csv(attrition_csv_path)

In [None]:
# Bivariate KDE panels drawn on a 3x3 grid, in row-major order.
# Each entry: (x column, y column, panel title, extra kdeplot keyword arguments).
# Only the first panel passes cut=5, matching the original cell.
kde_panels = [
    ('Age', 'TotalWorkingYears', 'Age against Total working years', {'cut': 5}),
    ('Age', 'DailyRate', 'Age against Daily Rate', {}),
    ('YearsInCurrentRole', 'Age', 'Years in role against Age', {}),
    ('DailyRate', 'DistanceFromHome', 'Daily Rate against Distancefromhome', {}),
    ('DailyRate', 'JobSatisfaction', 'Daily Rate against Job satisfaction', {}),
    # Title corrected: this panel plots YearsAtCompany vs JobSatisfaction,
    # it was previously mislabelled 'Daily Rate against distance'.
    ('YearsAtCompany', 'JobSatisfaction', 'Years at company against Job satisfaction', {}),
    ('YearsAtCompany', 'DailyRate', 'Years at company against Daily Rate', {}),
    ('RelationshipSatisfaction', 'YearsWithCurrManager',
     'Relationship Satisfaction vs years with manager', {}),
    ('WorkLifeBalance', 'JobSatisfaction', 'WorklifeBalance against Satisfaction', {}),
]

f, axes = plt.subplots(3, 3, figsize=(10, 8), sharex=False, sharey=False)

for panel_idx, (ax, (x_col, y_col, panel_title, extra_kwargs)) in enumerate(
        zip(axes.flat, kde_panels)):
    # Step the cubehelix start hue by 1/3 per panel (0, 1/3, 2/3, ... 8/3)
    # so every panel gets its own colour ramp.
    cmap = sns.cubehelix_palette(start=panel_idx / 3, light=1, as_cmap=True)
    # fill=True replaces the deprecated shade=True keyword of kdeplot.
    sns.kdeplot(
        x=attrition[x_col].values,
        y=attrition[y_col].values,
        cmap=cmap,
        fill=True,
        ax=ax,
        **extra_kwargs,
    )
    ax.set(title=panel_title)

f.tight_layout()

Finding Correlation

In [None]:
# Integer encoding of the Attrition labels, stored in a helper column.
target_map = {'Yes': 1, 'No': 0}
attrition["Attrition_numerical"] = attrition["Attrition"].apply(lambda label: target_map[label])

# Columns that take part in the correlation matrix.
numerical = [
    'Age', 'DailyRate', 'DistanceFromHome',
    'Education', 'EmployeeNumber', 'EnvironmentSatisfaction',
    'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
    'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
    'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
    'StockOptionLevel', 'TotalWorkingYears',
    'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]

# Build the pairwise correlation matrix once and reuse the column labels
# for both heatmap axes.
numeric_frame = attrition[numerical]
feature_labels = numeric_frame.columns.values
correlation_values = numeric_frame.astype(float).corr().values

heatmap_trace = go.Heatmap(
    z=correlation_values,
    x=feature_labels,
    y=feature_labels,
    colorscale='Viridis',
    reversescale=False,
    opacity=1.0,
)
data = [heatmap_trace]

layout = go.Layout(
    title='Pearson Correlation of numerical features',
    xaxis={'ticks': '', 'nticks': 36},
    yaxis={'ticks': ''},
    width=900,
    height=700,
)

# Render the interactive heatmap inline in the notebook.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')


Feature Engineering

In [None]:
# Remove the helper label column added for the correlation step.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
attrition = attrition.drop(columns=['Attrition_numerical'], errors='ignore')

# Names of all object-dtype columns; these are treated as categorical.
# DataFrame.items() is used because DataFrame.iteritems() was removed in pandas 2.0.
categorical = [col for col, value in attrition.items() if value.dtype == 'object']

# Every remaining column is treated as numerical (an Index, sorted by name).
numerical = attrition.columns.difference(categorical)

In [None]:
# One-hot encode the categorical columns. The 'Attrition' column is dropped
# first so it is not part of the encoded feature matrix.
attrition_cat = pd.get_dummies(attrition[categorical].drop(columns=['Attrition']))

# Numerical columns are taken over unchanged.
attrition_num = attrition[numerical]

# Final feature matrix: numerical columns followed by the one-hot columns.
# (The previous mid-cell `attrition_cat.head(3)` was removed: as a non-final
# expression its result was discarded and never displayed.)
attrition_final = pd.concat([attrition_num, attrition_cat], axis=1)


Machine Learning for Employee Attrition Prediction with Python

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Label vector for the split. `target` was previously used without ever being
# defined, so a fresh Restart & Run All failed with NameError. It is derived
# here from the 'Attrition' column with the same Yes/No -> 1/0 mapping used for
# the correlation step; an unexpected label raises KeyError instead of
# silently becoming NaN.
target = attrition["Attrition"].apply(lambda label: target_map[label])

# 80/20 split of features and labels; random_state=0 makes the split repeatable.
train, test, target_train, target_val = train_test_split(
    attrition_final, target, train_size=0.80, random_state=0
)