In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Statement

A large company named XYZ, employs, at any given point of time, around 4000 employees. However, every year, around 15% of its employees leave the company and need to be replaced with the talent pool available in the job market. The management believes that this level of attrition (employees leaving, either on their own or because they got fired) is bad for the company, because of the following reasons -


-  The former employees’ projects get delayed, which makes it difficult to meet timelines, resulting in a reputation loss among consumers and partners
- A sizeable department has to be maintained, for the purposes of recruiting new talent
- More often than not, the new employees have to be trained for the job and/or given time to acclimatise themselves to the company

Hence, the management has contracted an HR analytics firm to understand what factors they should focus on, in order to curb attrition. In other words, they want to know what changes they should make to their workplace, in order to get most of their employees to stay. Also, they want to know which of these variables is most important and needs to be addressed right away.


Since you are one of the star analysts at the firm, this project has been given to you.

# Goal of the case study

You are required to model the probability of attrition using a logistic regression. The results thus obtained will be used by the management to understand what changes they should make to their workplace, in order to get most of their employees to stay.

# Step 1: Importing and Merging Data

In [None]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing Pandas and NumPy
import pandas as pd, numpy as np, seaborn as sns,matplotlib.pyplot as plt

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# Load every file of the HR analytics case study.
# in_time / out_time are later converted column-by-column to datetimes and
# subtracted from each other; the survey and general tables are merged on EmployeeID.
in_time = pd.read_csv('/kaggle/input/hr-analytics-case-study/in_time.csv')
manager_survey_data = pd.read_csv('/kaggle/input/hr-analytics-case-study/manager_survey_data.csv')
employee_survey_data = pd.read_csv('/kaggle/input/hr-analytics-case-study/employee_survey_data.csv')
# The data dictionary is only a reference for decoding the coded columns.
data_dictionary= pd.read_excel('/kaggle/input/hr-analytics-case-study/data_dictionary.xlsx')
out_time = pd.read_csv('/kaggle/input/hr-analytics-case-study/out_time.csv')
general_data = pd.read_csv('/kaggle/input/hr-analytics-case-study/general_data.csv')


In [None]:
in_time.head()

In [None]:
in_time.shape

In [None]:
# Replace missing check-in entries with 0 before the datetime conversion below.
# NOTE(review): the 0 placeholder is later passed through pd.to_datetime; presumably
# it ends up as the same timestamp in in_time and out_time, so a missing day yields
# a zero duration and lowers the employee's average - confirm this is intended
# rather than leaving the entry missing (NaT).
in_time=in_time.replace(np.nan,0)
in_time.head()

In [None]:

in_time.iloc[:, 1:] = in_time.iloc[:, 1:].apply(pd.to_datetime, errors='coerce')

In [None]:
in_time.head()

In [None]:
# Replace missing check-out entries with 0, mirroring the treatment of in_time.
# NOTE(review): see how pd.to_datetime handles this 0 placeholder in the next
# cell - confirm a missing day is meant to count as a zero-length working day.
out_time=out_time.replace(np.nan,0)
out_time.head()

In [None]:
out_time.iloc[:, 1:] = out_time.iloc[:, 1:].apply(pd.to_datetime, errors='coerce')

In [None]:
out_time.head()

In [None]:
# Stack the check-out rows underneath the check-in rows (original indices are kept).
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
in_time = pd.concat([in_time, out_time])

In [None]:
in_time.head()

In [None]:
# The frame holds the check-in rows followed by the same number of check-out rows,
# so differencing with a lag of half its length subtracts every check-in row from
# the matching check-out row. The lag is derived from the data instead of being
# hard-coded to the employee count.
n_rows_per_table = len(in_time) // 2
in_time = in_time.diff(periods=n_rows_per_table)
# The first half has no earlier row to subtract, so only the second half is kept.
in_time = in_time.iloc[n_rows_per_table:]
in_time.reset_index(inplace=True)
in_time.head()

In [None]:
# Remove the two non-date columns: the 'index' column created by reset_index and
# the 'Unnamed: 0' column carried over from the CSV files.
in_time = in_time.drop(columns=['index', 'Unnamed: 0'])
in_time.head()

In [None]:
in_time.shape

In [None]:
# Date columns excluded from the average working time.
# NOTE(review): these look like company-wide holidays - confirm against the data.
excluded_dates = [
    '2015-01-01', '2015-01-14', '2015-01-26', '2015-03-05',
    '2015-05-01', '2015-07-17', '2015-09-17', '2015-10-02',
    '2015-11-09', '2015-11-10', '2015-11-11', '2015-12-25',
]
in_time = in_time.drop(columns=excluded_dates)

In [None]:
in_time.head()

In [None]:
in_time['Actual Time']=in_time.mean(axis=1)

In [None]:
in_time['Actual Time'].head()

In [None]:
# Express the average duration as a floating-point number of hours.
one_hour = pd.Timedelta(hours=1)
in_time['hrs'] = in_time['Actual Time'] / one_hour
in_time.head()

In [None]:
# Label every row with its employee id before turning the index into a column.
# The positional index left by the earlier reset is 0-based; renaming it to
# EmployeeID would pair each employee with another employee's hours in the
# merge on EmployeeID and silently drop one employee from the inner join.
# NOTE(review): assumes the rows are still in out_time's original order and that
# out_time's 'Unnamed: 0' column holds the employee id - confirm against the CSV.
in_time.index = out_time['Unnamed: 0'].values
in_time.reset_index(inplace=True)
in_time.head()

In [None]:
# Keep only the identifier column and the average daily hours.
# The axis is no longer accepted positionally by DataFrame.drop in pandas 2.x,
# so the columns to remove are passed by keyword.
in_time.drop(columns=in_time.columns.difference(['index', 'hrs']), inplace=True)

In [None]:
# Give the identifier column the name used as the merge key in the other tables.
in_time = in_time.rename(columns={'index': 'EmployeeID'})
in_time.head()

In [None]:
general_data.head()

In [None]:
employee_survey_data.head()

# Combining all data files into one consolidated dataframe

In [None]:
# Join the survey, general and attendance tables on EmployeeID. Inner joins are
# used throughout, so only employees present in every table are kept.
df_1 = employee_survey_data.merge(general_data, how='inner', on='EmployeeID')
hr = manager_survey_data.merge(df_1, how='inner', on='EmployeeID')
hr = in_time.merge(hr, how='inner', on='EmployeeID')
hr.head()

In [None]:
hr.describe()

# Correcting Datatype for the variable

In [None]:
hr['JobLevel']=hr['JobLevel'].astype('object')

# Decoding Values

In [None]:
# Translate the numeric codes of the survey/rating columns into readable labels.
# Values without an entry in a mapping (including NaN) are left unchanged.
four_point_scale = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
label_maps = {
    'Education': {1: 'Below College', 2: 'College', 3: 'Bachelor', 4: 'Master', 5: 'Doctor'},
    'EnvironmentSatisfaction': four_point_scale,
    'JobInvolvement': four_point_scale,
    'JobSatisfaction': four_point_scale,
    # RelationshipSatisfaction was left undecoded (disabled) in the original cell.
    'PerformanceRating': {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'},
    'WorkLifeBalance': {1: 'Bad', 2: 'Good', 3: 'Better', 4: 'Best'},
}
for column_name, code_to_label in label_maps.items():
    hr[column_name] = hr[column_name].replace(code_to_label)

In [None]:
hr.head()

In [None]:
hr['EmployeeCount'].value_counts(ascending=False)

In [None]:
hr['Over18'].value_counts(ascending=False)

In [None]:
hr['StandardHours'].value_counts(ascending=False)

# Drop Non Required Columns

In [None]:
# Remove the identifier and the three columns inspected with value_counts above;
# none of them is used for modelling.
columns_not_needed = ['EmployeeID', 'EmployeeCount', 'StandardHours', 'Over18']
hr = hr.drop(columns=columns_not_needed)

# Step 2: Inspecting the Dataframe

In [None]:
# Let's see the head of our master dataset
hr.head()

In [None]:
# Let's check the dimensions of the dataframe
hr.shape

In [None]:
# let's look at the statistical aspects of the dataframe
hr.describe()

In [None]:
# Let's see the type of each column
hr.info()

In [None]:
hr['EnvironmentSatisfaction'].value_counts(ascending=False)

In [None]:
sns.countplot(x='EnvironmentSatisfaction',data=hr);

- The mean and median of EnvironmentSatisfaction are 2.72 and 3. Since the value must be a whole number, we impute the missing values with the median, 3.
- From data_dictionary, we know 3 means High.  

Impute missing values

In [None]:
hr['EnvironmentSatisfaction'] = hr['EnvironmentSatisfaction'].fillna('High')
hr['EnvironmentSatisfaction'].isnull().sum()

In [None]:
hr['JobSatisfaction'].value_counts(ascending=False)

In [None]:
sns.countplot(x='JobSatisfaction',data=hr);

- The mean and median of JobSatisfaction are 2.72 and 3. Since the value must be a whole number, we impute the missing values with the median, 3.
- From data_dictionary, we know 3 means High.

Impute missing values

In [None]:
hr['JobSatisfaction'] = hr['JobSatisfaction'].fillna('High')
hr['JobSatisfaction'].isnull().sum()

In [None]:
hr['WorkLifeBalance'].value_counts(ascending=False)

In [None]:
sns.countplot(x='WorkLifeBalance',data=hr);

- The mean and median of WorkLifeBalance are 2.76 and 3. Since the value must be a whole number, we impute the missing values with the median, 3.
- From data_dictionary, we know 3 means Better.

Impute missing values

In [None]:
hr['WorkLifeBalance'] = hr['WorkLifeBalance'].fillna('Better')
hr['WorkLifeBalance'].isnull().sum()

In [None]:
hr['NumCompaniesWorked'].value_counts(ascending=False)

In [None]:
sns.countplot(x='NumCompaniesWorked',data=hr);

In [None]:
sns.boxplot(x='NumCompaniesWorked',data=hr);

- The mean and median of NumCompaniesWorked are 2.69 and 2. Since the value must be a whole number and the distribution contains outliers, we impute the missing values with the median, 2.


Impute missing values

In [None]:
hr['NumCompaniesWorked'] = hr['NumCompaniesWorked'].fillna(2)
hr['NumCompaniesWorked'].isnull().sum()

In [None]:
hr['TotalWorkingYears'].value_counts(ascending=False)

In [None]:
# Histogram of TotalWorkingYears with 36 bins and no KDE overlay.
plt.figure(figsize=(8, 8))
hist_settings = dict(
    hist=True,
    kde=False,
    bins=36,
    color='darkblue',
    hist_kws={'edgecolor': 'black'},
    kde_kws={'linewidth': 4},
)
ax = sns.distplot(hr['TotalWorkingYears'], **hist_settings)
ax.set(ylabel='# of Employees', xlabel='TotalWorkingYears');


In [None]:
sns.boxplot(x='TotalWorkingYears',data=hr);

- The mean and median of TotalWorkingYears are 11.27 and 10. Since the value must be a whole number and the distribution contains outliers, we impute the missing values with the median, 10.

In [None]:
# Impute missing TotalWorkingYears with 10, the median accepted in the analysis
# just above this cell. The previous fill value of 2 was the value chosen for
# NumCompaniesWorked and contradicted that analysis.
hr['TotalWorkingYears'] = hr['TotalWorkingYears'].fillna(10)
# Should now report 0 remaining missing values.
hr['TotalWorkingYears'].isnull().sum()

In [None]:
hr.info()

# EDA

In [None]:
# Attrition counts per WorkLifeBalance level, each bar labelled with its share
# of the level's total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='WorkLifeBalance',data=hr,hue="Attrition")
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

# Insights
- Attrition : Whether the employee left in the previous year or not
1. Among the 1019 employees who rate WorkLifeBalance as Good, 17% (174) left in the previous year.
2. Among the 454 employees who rate WorkLifeBalance as Best, 18% (81) left in the previous year.
3. Among the 239 employees who rate WorkLifeBalance as Bad, 34% (81) left in the previous year.
4. Among the 2698 employees who rate WorkLifeBalance as Better, 14% (378) left in the previous year.
- Of all employees who left in the previous year, 52% rated WorkLifeBalance as Better, followed by 24% who rated it as Good.

In [None]:
# Attrition counts per PerformanceRating level, each bar labelled with its share
# of the level's total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='PerformanceRating', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

# Insights
- Attrition : Whether the employee left in the previous year or not
1. Among the 3732 employees whose PerformanceRating was Excellent, 16% (597) left in the previous year.
2. Among the 678 employees whose PerformanceRating was Outstanding, 18% (122) left in the previous year.
- Of all employees who left in the previous year, 83% had a PerformanceRating of Excellent.

In [None]:
# Attrition counts per EnvironmentSatisfaction level, each bar labelled with its
# share of the level's total.
plt.figure(figsize=(8,10))
ax = sns.countplot(x='EnvironmentSatisfaction', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    # The two labels use slightly different vertical offsets (30 and 40).
    for bar, y_offset in ((first_bar, 30), (second_bar, 40)):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + y_offset,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

# Insights
- Attrition : Whether the employee left in the previous year or not
1. Among the 1375 employees who rate EnvironmentSatisfaction as High, 14% (192) left in the previous year.
2. Among the 856 employees who rate EnvironmentSatisfaction as Medium, 15% (129) left in the previous year.
3. Among the 1334 employees who rate EnvironmentSatisfaction as Very High, 13% (173) left in the previous year.
4. Among the 845 employees who rate EnvironmentSatisfaction as Low, 25% (211) left in the previous year.
- Of all employees who left in the previous year, 30% rated EnvironmentSatisfaction as Low, followed by those who rated it as High.

In [None]:
# Attrition counts per JobSatisfaction level, each bar labelled with its share
# of the level's total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='JobSatisfaction', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
# Attrition counts per WorkLifeBalance level (same chart as at the start of the
# EDA section, here with a y-axis label).
plt.figure(figsize=(8,8))
ax = sns.countplot(x='WorkLifeBalance', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='Age',x='Attrition',data=hr)
plt.show()

In [None]:
# Attrition counts per BusinessTravel category, each bar labelled with its share
# of the category's total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='BusinessTravel', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one category.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
# Attrition counts per Department, each bar labelled with its share of the
# department's total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='Department', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one department.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='DistanceFromHome',x='Attrition',data=hr)

plt.show()

In [None]:
# Attrition counts per Education level, each bar labelled with its share of the
# level's total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='Education', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
# Attrition counts per EducationField (wider figure), each bar labelled with its
# share of the field's total.
plt.figure(figsize=(15,8))
ax = sns.countplot(x='EducationField', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one field.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
# Attrition counts per Gender, each bar labelled with its share of the gender's
# total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='Gender', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one gender.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
# Attrition counts per JobLevel, each bar labelled with its share of the level's
# total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='JobLevel', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
# Attrition counts per JobRole (wide figure), each bar labelled with its share
# of the role's total.
plt.figure(figsize=(20,8))
ax = sns.countplot(x='JobRole', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one role.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
# Attrition counts per MaritalStatus, each bar labelled with its share of the
# status total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='MaritalStatus', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one status.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='MonthlyIncome',x='Attrition',data=hr)

plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='PercentSalaryHike',x='Attrition',data=hr)

plt.show()

In [None]:
# Attrition counts per StockOptionLevel, each bar labelled with its share of the
# level's total.
plt.figure(figsize=(8,8))
ax = sns.countplot(x='StockOptionLevel', data=hr, hue="Attrition")
ax.set_ylabel('# of Employee')
bars = list(ax.patches)
n_levels = len(bars) // 2
# Bar i is paired with bar i + n_levels, i.e. the two hue bars of one level.
for first_bar, second_bar in zip(bars[:n_levels], bars[n_levels:]):
    level_total = first_bar.get_height() + second_bar.get_height()
    for bar in (first_bar, second_bar):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 40,
                '{0:.0%}'.format(bar.get_height()/level_total), ha="center")

In [None]:

plt.figure(figsize=(8,8))
sns.violinplot(y='TotalWorkingYears',x='Attrition',data=hr)

plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='TrainingTimesLastYear',x='Attrition',data=hr)

plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='YearsAtCompany',x='Attrition',data=hr)

plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='YearsSinceLastPromotion',x='Attrition',data=hr)

plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='YearsWithCurrManager',x='Attrition',data=hr)

plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.violinplot(y='hrs',x='Attrition',data=hr)

plt.show()

In [None]:
plt.figure(figsize=(20,18))
# hr still contains text columns at this point (e.g. Attrition and the decoded
# survey labels). Older pandas silently skipped them in corr(); pandas >= 2.0
# raises instead, so the numeric/boolean columns are selected explicitly.
numeric_hr = hr.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_hr.corr(), annot = True, cmap="Accent");

In [None]:
# Pairwise scatter plots (KDE on the diagonal) for the continuous features.
continuous_features = [
    'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'DistanceFromHome', 'Age', 'hrs',
]
hr_num = hr[continuous_features]

sns.pairplot(hr_num, diag_kind='kde')
plt.show()

For categorical variables with multiple levels, create dummy features (one-hot encoded)

In [None]:
hr.info()

In [None]:
hr.columns

In [None]:
# One-hot encode the categorical variables, dropping the first level of each to
# avoid a redundant column per variable.
categorical_columns = ['JobInvolvement', 'PerformanceRating', 'EnvironmentSatisfaction',
                       'JobSatisfaction', 'WorkLifeBalance', 'BusinessTravel', 'Department',
                       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
                       'MaritalStatus']
# The dtype is pinned to an integer type: pandas >= 2.0 defaults to bool dummies,
# which would turn X_train.values into an object array in the later VIF step.
dummy1 = pd.get_dummies(hr[categorical_columns], drop_first=True, dtype=np.uint8)

# Adding the results to the master dataframe
hr = pd.concat([hr, dummy1], axis=1)

In [None]:
hr.head()

In [None]:
hr.shape

# Dropping the repeated variables

In [None]:
# The original categorical columns are now represented by their dummy columns,
# so they can be removed. The columns are passed by keyword because pandas 2.x
# no longer accepts the axis as a positional argument to drop().
encoded_columns = ['JobInvolvement', 'PerformanceRating', 'EnvironmentSatisfaction',
                   'JobSatisfaction', 'WorkLifeBalance', 'BusinessTravel', 'Department',
                   'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
                   'MaritalStatus']
hr = hr.drop(columns=encoded_columns)

In [None]:
hr.head()

In [None]:
hr.shape

# Mapping Attrition to 1/0 

In [None]:
hr['Attrition'] = hr['Attrition'].replace({'Yes': 1, "No": 0})

In [None]:
hr.head()

In [None]:
hr.shape

 # Step 3: Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split
# Putting feature variable to X
X = hr.drop(['Attrition'], axis=1)

X.head()

In [None]:
X.shape

In [None]:
# Putting response variable to y
y = hr['Attrition']

y.head()

In [None]:
y.shape

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)


In [None]:
X_train.head()

In [None]:
X_train.shape

In [None]:
 X_test.head()

In [None]:
 X_test.shape

In [None]:
y_train.head()

In [None]:
y_train.shape

In [None]:
y_test.head()

In [None]:
y_test.shape

# Step 5: Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features; the scaler is fitted on the training
# split only and reused unchanged for the test split.
scaler = StandardScaler()
train_scaled_columns = [
    'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'DistanceFromHome', 'Age', 'hrs',
]
X_train[train_scaled_columns] = scaler.fit_transform(X_train[train_scaled_columns])

X_train.head()

In [None]:
# Apply the scaler fitted on the training split to the same columns of the test
# split (transform only, no re-fitting).
test_scaled_columns = [
    'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'DistanceFromHome', 'Age', 'hrs',
]
X_test[test_scaled_columns] = scaler.transform(X_test[test_scaled_columns])

X_test.head()

In [None]:
# Attrition rate: employees who left (Attrition == 1) as a percentage of all rows.
n_left = hr['Attrition'].sum()
n_total = len(hr.index)
Attrition = (n_left / n_total) * 100
Attrition

The overall attrition rate is approximately 16.13%.

# Step 6: Looking at Correlations

In [None]:
# Let's see the correlation matrix 
plt.figure(figsize = (50,40))   
sns.heatmap(hr.corr(),annot = True,cmap="tab20c")
plt.show()

In [None]:
plt.figure(figsize=(20,8))
hr.corr()['Attrition'].sort_values(ascending = False).plot(kind='bar');

Dropping highly correlated dummy variables

In [None]:
corrmat = X_train.corr()
# Keep only the strict upper triangle so every pair of variables is listed once.
# The builtin bool replaces np.bool, which was removed from NumPy.
upper_triangle = np.triu(np.ones(corrmat.shape), k=1).astype(bool)
corrdf = corrmat.where(upper_triangle)
# Reshape the matrix into one row per (Var1, Var2) pair.
corrdf = corrdf.unstack().reset_index()
corrdf.columns = ['Var1', 'Var2', 'Correlation']
corrdf.dropna(subset = ['Correlation'], inplace = True)
# Rank by the magnitude of the correlation, regardless of sign.
corrdf['Correlation'] = round(corrdf['Correlation'], 2)
corrdf['Correlation'] = abs(corrdf['Correlation'])
# The 50 most strongly correlated pairs.
matrix= corrdf.sort_values(by = 'Correlation', ascending = False).head(50)
matrix

In [None]:
unique=list(set(matrix.Var2))
len(unique)

Dropping highly correlated dummy variables

In [None]:
# Remove the variables collected from the most correlated pairs from both splits.
# The labels are passed by keyword because pandas 2.x no longer accepts the axis
# as a positional argument to drop().
X_test = X_test.drop(columns=unique)
X_train = X_train.drop(columns=unique)

In [None]:
X_test.shape

In [None]:
X_test.head() 

In [None]:
X_train.head()

In [None]:
X_train.shape

### Checking the Correlation Matrix

After dropping highly correlated variables now let's check the correlation matrix again.

In [None]:
plt.figure(figsize = (50,25))
sns.heatmap(X_train.corr(),annot = True,cmap="tab20c")
plt.show()

 # Step 7: Model Building


The data was already split into a training set and a test set in Step 3, so we can fit our first model directly on the training set.
- Running Your First Training Model

# SVM (Support Vector Machine)

It is a classification method. In this algorithm, we plot each data item as a point in n-dimensional space (where n is number of features you have) with the value of each feature being the value of a particular coordinate.

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
model = SVC()

In [None]:
# fit the model with the training data
model.fit(X_train,y_train)

In [None]:
# predict the target on the train dataset
predict_train = model.predict(X_train)
predict_train

In [None]:
trainaccuracy = accuracy_score(y_train,predict_train)
print('accuracy_score on train dataset : ', trainaccuracy)

# VIF

In [None]:
# Variance inflation factor of every feature currently in X_train.
from statsmodels.stats.outliers_influence import variance_inflation_factor

design_matrix = X_train.values
vif_values = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(X_train.shape[1])]
# One row per feature, rounded to two decimals, highest VIF first.
vif = (
    pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
    .assign(VIF=lambda frame: frame['VIF'].round(2))
    .sort_values(by="VIF", ascending=False)
)
vif.tail()

In [None]:
# Features whose VIF is at or above the 4.99 cut-off are marked for removal.
high_vif_mask = vif['VIF'] >= 4.99
features_to_remove = vif.loc[high_vif_mask, 'Features'].tolist()
print(features_to_remove)


In [None]:
# Remove the high-VIF features from the training split.
X_train = X_train.drop(columns=features_to_remove)
X_train.head()

In [None]:
# Remove the same high-VIF features from the test split.
X_test = X_test.drop(columns=features_to_remove)
X_test.head()


In [None]:
# fit the model with the training data
model.fit(X_train,y_train)

In [None]:
# predict the target on the train dataset
predict_train = model.predict(X_train)
predict_train

In [None]:
trainaccuracy = accuracy_score(y_train,predict_train)
print('accuracy_score on train dataset : ', trainaccuracy)

In [None]:
# VIF

In [None]:
# Recompute the variance inflation factors on the reduced feature set.
from statsmodels.stats.outliers_influence import variance_inflation_factor

design_matrix = X_train.values
vif_values = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(X_train.shape[1])]
# One row per feature, rounded to two decimals, highest VIF first.
vif = (
    pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
    .assign(VIF=lambda frame: frame['VIF'].round(2))
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
from sklearn import metrics
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train, predict_train )
print(confusion)

In [None]:
# Unpack the 2x2 training confusion matrix row by row:
# row 0 = actual 0 -> (true negatives, false positives)
# row 1 = actual 1 -> (false negatives, true positives)
TN, FP, FN, TP = confusion.ravel()

In [None]:
# Let's see the sensitivity of our model
trainsensitivity= TP / float(TP+FN)
trainsensitivity

In [None]:
# Let us calculate specificity
trainspecificity= TN / float(TN+FP)
trainspecificity


In [None]:
# False positive rate on the training set: the share of employees who did not
# leave (actual 0) that the model nevertheless predicted as attrition.
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value 
print (TP / float(TP+FP))

# Plotting the ROC Curve

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve of `probs` against `actual` and show the figure.

    Parameters
    ----------
    actual : true binary labels, passed to metrics.roc_curve / roc_auc_score.
    probs : scores for the positive class, passed as the second argument to
        the same two functions.

    Returns
    -------
    None. The curve is drawn on a new 5x5 figure and the AUC is shown in the legend.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area = metrics.roc_auc_score(actual, probs)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line of a classifier with no discriminative power.
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# A ROC curve needs a continuous score per sample. Passing the hard 0/1
# predictions collapses the curve to a single operating point, so the SVC's
# signed distance to the decision boundary is used as the score instead.
train_scores = model.decision_function(X_train)
draw_roc(y_train, train_scores)

# Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_train,predict_train)

In [None]:
recall_score(y_train,predict_train)

# Making predictions on the test set

In [None]:
# predict the target on the test dataset
predict_test = model.predict(X_test)
print('Target on test data\n\n',predict_test)

In [None]:
confusion2 = metrics.confusion_matrix(y_test, predict_test )
print(confusion2)

In [None]:
# Let's check the overall accuracy.
testaccuracy= accuracy_score(y_test,predict_test)
testaccuracy

In [None]:
# Sensitivity of the model on the test set, read from the test confusion matrix.
# The TP/FN variables defined earlier hold the TRAINING counts, so reusing them
# here would simply repeat the training sensitivity.
test_true_pos = confusion2[1,1]
test_false_neg = confusion2[1,0]
testsensitivity = test_true_pos / float(test_true_pos + test_false_neg)
testsensitivity

In [None]:
# Specificity of the model on the test set, read from the test confusion matrix.
# The TN/FP variables defined earlier hold the TRAINING counts, so reusing them
# here would simply repeat the training specificity.
test_true_neg = confusion2[0,0]
test_false_pos = confusion2[0,1]
testspecificity = test_true_neg / float(test_true_neg + test_false_pos)
testspecificity

# Final Observation:

In [None]:
# Side-by-side summary of the train and test metrics, as percentages rounded to
# two decimals.
summary_rows = [
    ("Train Data Accuracy    :{} %", trainaccuracy),
    ("Train Data Sensitivity :{} %", trainsensitivity),
    ("Train Data Specificity :{} %", trainspecificity),
    ("Test Data Accuracy     :{} %", testaccuracy),
    ("Test Data Sensitivity  :{} %", testsensitivity),
    ("Test Data Specificity  :{} %", testspecificity),
]
for template, metric_value in summary_rows:
    print(template.format(round((metric_value*100),2)))