# Problem Statement
Prepare a model for the HR Department to predict the Attrition and give insights from the data about important
factors associated with the Attrition so that HR can take corrective or preventive measures to stop or control
the Attrition


# Random Forest Model

# Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from urllib.request import urlopen 
from numpy import mean
from numpy import std
from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn import preprocessing
import time

plt.style.use('ggplot')


In [None]:
# Show every column and every row whenever a frame is displayed (no truncation)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read The Data From csv

In [None]:
# Location of the raw HR attrition export on the author's machine
data_path = '/Users/priyankac/Downloads/Capstone Project/HR_Employee_Attrition_Data.csv'

# Load the CSV into a DataFrame and preview the first rows
attrition_df = pd.read_csv(data_path)
attrition_df.head()

In [None]:
# Getting dimension of the data
print('Here is the dimension of our data: ' , attrition_df.shape)


In [None]:
# Getting data types of all the columns
print('Here is the data types of all our columns: \n' , attrition_df.dtypes)

In [None]:
# Getting concise summary of the data
attrition_df.info()

In [None]:
# Descriptive Statistics of the given data
attrition_df.describe()

# Insights:
'''     1. It can be seen that every numeric column has a count of 2940 values

        2. The describe function summarises each numerical column with its count, mean and standard deviation
           together with the five number summary: minimum value, lower quartile (25%), median (50%),
           upper quartile (75%) and maximum value'''

# Missing Value Check

In [None]:
# Check total number of missing value in the data set
attrition_df.isnull().sum()

# Insight:
     '''There are no missing values in the data'''

# Duplicate row check

In [None]:
# Checking for any duplicate rows present in the given data
df_duplicate = attrition_df.duplicated()
df_duplicate.any()

# Insight:
       '''No duplicate rows present'''

# Zero Variance Check

In [None]:
# Checking the columns that have one unique value
attrition_df.nunique()

# Insight:
        '''Variables 'EmployeeCount','Over18' and 'StandardHours' are zero variance columns
          These columns need to be dropped before model building'''

In [None]:
# Remove the zero-variance columns from the working frame (in place)
cols_to_drop = ['EmployeeCount', 'Over18', 'StandardHours']
attrition_df.drop(columns=cols_to_drop, inplace=True)

In [None]:
# After dropping the unnecessry columns checking the shape of the data
attrition_df.shape


# Insight:
      '''We can see that three columns have been dropped and the total number of columns is reduced to 32'''

# Categorical Columns

In [None]:
# Getting all the categorical variables
df_category = attrition_df.select_dtypes(include = 'object')
df_category.head()


In [None]:
# Inspect the raw labels of 'BusinessTravel' before cleaning
attrition_df['BusinessTravel'].value_counts()

# Normalise the labels: every '-' is replaced by '_' (literal replacement, no regex)
travel_labels = attrition_df['BusinessTravel'].str.replace('-', '_', regex=False)
attrition_df['BusinessTravel'] = travel_labels
attrition_df['BusinessTravel'].value_counts()


In [None]:
# Note:
# Columns 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
# 'StockOptionLevel', 'WorkLifeBalance', 'PerformanceRating' and 'RelationshipSatisfaction'
# are all categorical columns.
# (Written as comments: an indented bare string in a code cell raises an IndentationError.)

# Numerical columns

In [None]:
# Getting all the numerical variables
df_numerical = attrition_df.select_dtypes(exclude = 'object')
df_numerical.head()

# Insight:
       """From the description we know that variables 'Education','EnvironmentSatisfaction','JobInvolvement','JobLevel',
        'JobSatisfaction','StockOptionLevel' and 'WorkLifeBalance' are categorical columns, not numerical"""

In [None]:
# Columns that hold category codes rather than true measurements;
# they are removed from the numerical-only frame (in place)
new_cols_to_drop = [
    'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
    'StockOptionLevel', 'WorkLifeBalance', 'PerformanceRating', 'RelationshipSatisfaction',
]
df_numerical.drop(columns=new_cols_to_drop, inplace=True)

In [None]:
df_numerical.head()

# Correlation check

In [None]:
# Converting the variable 'Attrition' to binary for plots and model ('Yes' -> 1, 'No' -> 0)
# Guarded so that re-running this cell does not map the already-numeric 0/1 values to NaN.
if not pd.api.types.is_numeric_dtype(attrition_df['Attrition']):
    attrition_df['Attrition'] = attrition_df['Attrition'].map({'Yes': 1, 'No': 0})

In [None]:
# Computing the correlation matrix
# Only numeric columns are used: the frame still contains text columns, and
# DataFrame.corr() raises on non-numeric data in recent pandas releases
# (older releases silently dropped those columns, so the result is the same).
corr_matrix = attrition_df.select_dtypes(include='number').corr()
print(corr_matrix)

In [None]:
# Plotting the heatmap to check the correlation
fig, ax = plt.subplots(figsize=(20, 15))
# Correlations range from -1 to 1, so the colour scale is symmetric and centred on 0;
# a 0..1 scale would paint every negative correlation with the same lowest colour.
sns.heatmap(corr_matrix, annot=True, fmt='.2g', vmin=-1, vmax=1, center=0,
            cmap='coolwarm', ax=ax)

# Insight:
       """1. Following variables are strongly correlated:

             'TotalWorkingYears' and 'JobLevel' having strong positive correlation(0.78)
              'TotalWorkingYears' and 'MonthlyIncome' having strong positive correlation(0.77) 
              'YearsAtCompany' and 'YearsWithManager' having strong positive correlation(0.77) 
              'YearsAtCompany' and 'YearsInCurrentRole' having strong positive correlation(0.76)
              'PercentSalaryHike' and 'PerformanceRating' having strong positive correlation(0.77)

        2. Columns that have strong correlation need to be dropped as they affect the model
       3. Columns 'TotalWorkingYears' , 'YearsAtCompany' ,'PerformanceRating' need to be dropped
           (needs to be checked for relationship with 'Attrition' in visualizations before dropping them) """

### Analysis on 'Attrition' (dependent variable)

In [None]:
# Finding the count of 'Attrition'
# Stored as attrition_counts so that the scipy `stats` module imported at the
# top of the notebook is not overwritten by this Series.
attrition_counts = attrition_df['Attrition'].value_counts()
attrition_counts

# Insight:
   """ 1. There are total 2940 records, out of which total count of Attrition status 'No' is 2466 and 'Yes' is 474
        2. The data is not balanced,that is,our target classes,'No' and 'Yes' attrition status is not represented 
         equally in the data set"""

In [None]:
# Visualizing variable 'Attrition'
# The column is passed through the x keyword: recent seaborn releases accept
# only `data` positionally, so a positional column name fails there.
sns.countplot(x='Attrition', data=attrition_df).set(title='Count of 0 and 1')
plt.show()

# Insight:
      ''' From the countplot we can observe that the count of Attrition 'No' is more than count of'Yes'   '''

In [None]:
# Calculate the attrition percentage
# 'Attrition' was converted to 0/1 above, so a 'Yes' lookup would raise a KeyError;
# the mean of a 0/1 column is the share of employees who left.
attrition_pct = attrition_df['Attrition'].mean() * 100
print("Attrition % in the company is::", attrition_pct)


In [None]:
# Plotting Pie chart for 'Attrition'
attrition_df['Attrition'].value_counts().plot.pie(autopct = '%1.1f%%')

# Insight:
       '''From the pie chart we can see attrition status is about 16.1% '''

### Analysis on 'Age'

In [None]:
# Discriptive Statistics on 'Age'
attrition_df['Age'].describe()

# Insight:
    ''' From Five number summary,age is in symmetric distribution.Most of the data lies around the age of 36.
      Maximum age of employees is 60 and minimum age is 18'''

In [None]:
# Plot Age distribution: histogram with density curve (left) and boxplot (right)
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(15, 5))

sns.distplot(attrition_df['Age'], ax=ax_hist)

# The Series is passed as x explicitly; recent seaborn releases treat the first
# positional argument as `data`, which would change the orientation of the box.
sns.boxplot(x=attrition_df['Age'], ax=ax_box)
plt.show()

# Insight:
     '''From the histogram we can see that we have an almost symmetrical distribution curve
      

      From the boxplot we can see that most of the employees are between the age of 30 to 42 and
      there are no outliers present '''

In [None]:
# Relationship between 'Attrition' and 'Age'
plt.figure(figsize = (15,6))
sns.swarmplot(x = 'Attrition', y = 'Age', data = attrition_df, hue = 'Attrition')
plt.show()


# Insight:
'''From the swarm plot we can see that there is no linear relationship between 'Attrition' and 'Age'. Maximum
      attrition is happening between the age of 25 to 35'''

### Analysis on 'BusinessTravel'

In [None]:
# Getting the count of different categories in 'BusinessTravel'
attrition_df['BusinessTravel'].value_counts()


In [None]:
# Grouping this data according to attrition
attrition_df.groupby(['BusinessTravel', 'Attrition'])['Attrition'].count()

In [None]:
# Visualizing the count of 'BusinessTravel'
plt.figure(figsize=(15, 6))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='BusinessTravel', data=attrition_df).set(title='Business Travel')
plt.show()


In [None]:
# Encode 'BusinessTravel' as a numeric category in the new column 'BusinessTravel_cat'
#   Travel_Rarely     -> 1
#   Travel_Frequently -> 2
#   Non_Travel        -> 3
travel_codes = {'Travel_Rarely': 1, 'Travel_Frequently': 2, 'Non_Travel': 3}

# Labels that are not in the mapping become NaN; the column is stored as float
attrition_df['BusinessTravel_cat'] = attrition_df['BusinessTravel'].map(travel_codes).astype('float64')
attrition_df.head()

In [None]:
# Relationship between 'Attrition' and 'Age' with respect to 'BusinessTravel'

# Travel_Rarely = 1
# Travel_Frequently = 2
# Non_Travel = 3

plt.figure(figsize = (15,6))
sns.swarmplot(x = 'Attrition' , y = 'Age' , data = attrition_df , hue = 'BusinessTravel_cat')
plt.show()

# Insight:
''' From the plot above we can see that attrition is high between age 25 to 35. It can also be seen that this 
    age group of employees is travelling frequently'''

### Analysis on 'Department'


In [None]:
# Getting count of different categories in 'Department'
attrition_df['Department'].value_counts()

In [None]:
# Visualizing the count of 'Department'
plt.figure(figsize=(12, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='Department', data=attrition_df).set(title='Count by Departments')
plt.show()


In [None]:
# Encode 'Department' as a number in the new column 'Department_ord'
#   Sales                  -> 1
#   Research & Development -> 2
#   Human Resources        -> 3
department_codes = {'Sales': 1, 'Research & Development': 2, 'Human Resources': 3}

# Departments that are not in the mapping become NaN; the column is stored as float
attrition_df['Department_ord'] = attrition_df['Department'].map(department_codes).astype('float64')
attrition_df.head()

In [None]:
# Getting count of 'BusinessTravel' by each 'Department'
# Grouping by a plain column name (not a one-element list) keeps
# get_group('<department>') valid on recent pandas releases.
department_group = attrition_df.groupby('Department')

# Count with respect to 'Sales'
sales_counts = department_group.get_group('Sales')['BusinessTravel'].value_counts()
# Name the columns explicitly ('index' = travel category, 'BusinessTravel' = count);
# the default names produced by reset_index() differ between pandas versions.
df_sales = sales_counts.rename_axis('index').reset_index(name='BusinessTravel')
df_sales['Department'] = 'Sales'
df_sales


In [None]:
# Count with respect to Research & Development
rd_counts = department_group.get_group('Research & Development')['BusinessTravel'].value_counts()
# Explicit column names ('index' = travel category, 'BusinessTravel' = count),
# independent of the pandas version
df_rd = rd_counts.rename_axis('index').reset_index(name='BusinessTravel')
df_rd['Department'] = 'Research & Development'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
df1 = pd.concat([df_sales, df_rd], ignore_index=True)
df1

In [None]:
# Count with respect to Human Resources
hr_counts = department_group.get_group('Human Resources')['BusinessTravel'].value_counts()
# Explicit column names ('index' = travel category, 'BusinessTravel' = count),
# independent of the pandas version
df_hr = hr_counts.rename_axis('index').reset_index(name='BusinessTravel')
df_hr['Department'] = 'Human Resources'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
df2 = pd.concat([df1, df_hr], ignore_index=True)
df2


In [None]:
# Printing the percentage of people who travel frequently by 'Department'
# df2 holds one row per (department, travel category): the category is in
# column 'index' and its head count in column 'BusinessTravel'.
for dept in ('Sales', 'Research & Development', 'Human Resources'):
    dept_rows = df2.loc[df2['Department'] == dept]
    frequent = dept_rows.loc[dept_rows['index'] == 'Travel_Frequently', 'BusinessTravel'].sum()
    # Multiplied by 100 so the printed value really is a percentage, as the label says
    print(f'% Frequent Travel in {dept}: ', frequent / dept_rows['BusinessTravel'].sum() * 100)


In [None]:
# Plotting 'Department' with 'Businesstravel'
plt.figure(figsize = (15,5))
sns.barplot(x = 'Department', y = 'BusinessTravel', data = df2, hue = 'index')
plt.show()

In [None]:
# Grouping 'Department' according to 'Attrition'
attrition_df.groupby(['Department', 'Attrition'])['Attrition'].count()

In [None]:
# Calculating 'Attrition' % for each 'Department'
# 'Attrition' is stored as 0/1 at this point, so leavers are looked up with the
# label 1 (a 'Yes' lookup raises a KeyError).

# Attrition for Sales
att_by_sales = department_group.get_group('Sales')['Attrition'].value_counts()
att_by_sales

# Attrition for Research & Development
att_by_rd = department_group.get_group('Research & Development')['Attrition'].value_counts()
att_by_rd

# Attrition for Human Resources
att_by_hr = department_group.get_group('Human Resources')['Attrition'].value_counts()
att_by_hr

# Attrition by Department (all three rows on the same percent scale)
att_by_dept = pd.DataFrame(columns=['Department', 'Attrition'])

att_by_dept.loc[len(att_by_dept)] = ['Sales', att_by_sales.loc[1] / att_by_sales.sum() * 100]
att_by_dept.loc[len(att_by_dept)] = ['Research & Development', att_by_rd.loc[1] / att_by_rd.sum() * 100]
# NOTE(review): this row used to be a fraction (no * 100), which made Human Resources
# look far lower than the other departments; re-check the department ranking in the insight.
att_by_dept.loc[len(att_by_dept)] = ['Human Resources', att_by_hr.loc[1] / att_by_hr.sum() * 100]
att_by_dept

# Insight:
'''Attrition is highest in the Sales Department and lowest in the Human Resources '''


In [None]:
# Plotting Attrition % for each Department
plt.figure(figsize = (15,5))
sns.barplot(x = 'Department', y = 'Attrition' , data = att_by_dept)
plt.show()

### Analysis on 'EnvironmentSatisfaction'

In [None]:
# Getting count of different Categories in 'EnvironmentSatisfaction'
attrition_df['EnvironmentSatisfaction'].value_counts()

In [None]:
# Calculating 'Attrition' vs 'EnvironmentSatisfaction' 
envsatisfaction_group = attrition_df.groupby(['EnvironmentSatisfaction', 'Attrition'])
envsatisfaction_group.groups.keys()


In [None]:
count = envsatisfaction_group.get_group((1,'Yes'))['EnvironmentSatisfaction'].count()
count

att_by_envsatisfaction = pd.DataFrame(columns = ['EnvironmentSatisfaction','Attrition','Count'])
att_by_envsatisfaction.loc[len(att_by_envsatisfaction)] = ['1', 'Yes' ,count]
att_by_envsatisfaction

In [None]:
count = envsatisfaction_group.get_group((2,'Yes'))['EnvironmentSatisfaction'].count()
att_by_envsatisfaction.loc[len(att_by_envsatisfaction)] = ['2', 'Yes', count]
att_by_envsatisfaction

In [None]:
count = envsatisfaction_group.get_group((3, 'Yes'))['EnvironmentSatisfaction'].count()
att_by_envsatisfaction.loc[len(att_by_envsatisfaction)] = ['3', 'Yes', count]
att_by_envsatisfaction

In [None]:
count = envsatisfaction_group.get_group((4, 'Yes'))['EnvironmentSatisfaction'].count()
att_by_envsatisfaction.loc[len(att_by_envsatisfaction)] = ['4','Yes', count]
att_by_envsatisfaction

# Insight:
'''Attrition is highest for environment satisfaction 1 followed closely by environment satisfaction 3 and 4'''

In [None]:
# Plotting 'Attrition' vs 'EnvironmentSatisfaction'
plt.figure(figsize = (16,6))
sns.barplot(x = 'EnvironmentSatisfaction', y = 'Count', data = att_by_envsatisfaction)
plt.show()

### Analysis on variable 'Gender'


In [None]:
# Getting count total males and females
attrition_df['Gender'].value_counts()


In [None]:
# Visualizing count of 'Gender'
plt.figure(figsize=(15, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='Gender', data=attrition_df).set(title='Count by Gender')
plt.show()

In [None]:
# Grouping Gender according to attrition
attrition_df.groupby(['Gender', 'Attrition'])['Attrition'].count()

In [None]:
# Calculating Attrition by Gender
gender_group = attrition_df.groupby(['Gender','Attrition'])
gender_group.groups.keys()

In [None]:
att_by_gender = pd.DataFrame(columns = ['Gender','Attrition','Count'])
att_by_gender

In [None]:
count = gender_group.get_group(('Female','Yes'))['Attrition'].count()
att_by_gender.loc[(len(att_by_gender))] = ['Female','Yes',count]
att_by_gender

In [None]:
count = gender_group.get_group(('Male', 'Yes'))['Attrition'].count()
att_by_gender.loc[len(att_by_gender)] = ['Male', 'Yes', count]
att_by_gender

# Insight:
'''More number of men have left the company than women'''

In [None]:
# Plotting 'Attrition' vs 'Gender'
plt.figure(figsize = (15,5))
sns.barplot(x = 'Gender', y = 'Count', data = att_by_gender)
plt.show()

### Analysis on 'JobInvolvement'

In [None]:
# Getting total count in each category
attrition_df['JobInvolvement'].value_counts()

In [None]:
# Grouping 'JobInvolvemnt' according to 'Attrition'
attrition_df.groupby(['JobInvolvement','Attrition'])['Attrition'].count()

In [None]:
# Calculating 'Attrition' by 'JobInvolvement'
jobInv_group = attrition_df.groupby(['JobInvolvement', 'Attrition'])
jobInv_group.groups.keys()

In [None]:
att_by_jobInv = pd.DataFrame(columns =['JobInvolvement', 'Attrition', 'Count', 'Attrition%'])
att_by_jobInv

In [None]:
count = jobInv_group.get_group((1, 'Yes'))['Attrition'].count()
att_by_jobInv.loc[len(att_by_jobInv)] = ['1', 'Yes', count, (count/attrition_df['JobInvolvement'].value_counts()[1])*100]
att_by_jobInv

In [None]:
count = jobInv_group.get_group((2, 'Yes'))['Attrition'].count()
att_by_jobInv.loc[len(att_by_jobInv)] = ['2', 'Yes', count, (count/attrition_df['JobInvolvement'].value_counts()[2])*100]
att_by_jobInv

In [None]:
count = jobInv_group.get_group((3, 'Yes'))['Attrition'].count()
att_by_jobInv.loc[len(att_by_jobInv)] = ['3','Yes',count,(count/attrition_df['JobInvolvement'].value_counts()[3])*100]
att_by_jobInv

In [None]:
count = jobInv_group.get_group((4, 'Yes'))['Attrition'].count()
att_by_jobInv.loc[len(att_by_jobInv)] = ['4','Yes',count,(count/attrition_df['JobInvolvement'].value_counts()[4])*100]
att_by_jobInv

# Insight:
'''Employees who have job involvement rating of 1 have highest attrition'''

In [None]:
# Plotting 'Attrition%' vs 'JobInvolvement'
plt.figure(figsize = (15,5))
sns.barplot(x = 'JobInvolvement', y = 'Attrition%', data = att_by_jobInv)
plt.show()

### Analysis on 'JobLevel'


In [None]:
# Getting total count in each category
attrition_df['JobLevel'].value_counts()

In [None]:
# Grouping JobLevel according to attrition
attrition_df.groupby(['JobLevel', 'Attrition'])['Attrition'].count()

In [None]:
# Calculating 'Attrition' by 'Joblevel'
joblvl_group = attrition_df.groupby(['JobLevel', 'Attrition'])
joblvl_group.groups.keys()

In [None]:
att_by_joblvl = pd.DataFrame(columns = ['JobLevel' , 'Attrition', 'Count', 'Attrition%'])
att_by_joblvl

In [None]:
count = joblvl_group.get_group((1, 'Yes'))['Attrition'].count()
att_by_joblvl.loc[len(att_by_joblvl)] = ['1', 'Yes', count, (count/attrition_df['JobLevel'].value_counts()[1])*100]
att_by_joblvl

In [None]:
count = joblvl_group.get_group((2, 'Yes'))['Attrition'].count()
att_by_joblvl.loc[len(att_by_joblvl)] = ['2' , 'Yes', count ,(count/attrition_df['JobLevel'].value_counts()[2])*100]
att_by_joblvl

In [None]:
count = joblvl_group.get_group((3, 'Yes'))['Attrition'].count()
att_by_joblvl.loc[len(att_by_joblvl)] = ['3', 'Yes', count, (count/attrition_df['JobLevel'].value_counts()[3])*100]
att_by_joblvl

In [None]:
count = joblvl_group.get_group((4, 'Yes'))['Attrition'].count()
att_by_joblvl.loc[len(att_by_joblvl)] = ['4', 'Yes', count,(count/attrition_df['JobLevel'].value_counts()[4])*100]
att_by_joblvl

In [None]:
count = joblvl_group.get_group((5, 'Yes'))['Attrition'].count()
att_by_joblvl.loc[len(att_by_joblvl)] = ['5', 'Yes', count,(count/attrition_df['JobLevel'].value_counts()[5])*100]
att_by_joblvl

# Insight:
'''Attrition is highest in job level 1(26%) followed by job level 3(14.6%)'''

In [None]:
# Plotting 'Attrition' vs 'JobLevel'
plt.figure(figsize = (15,5))
sns.barplot(x = 'JobLevel', y = 'Attrition%', data = att_by_joblvl)
plt.show()

### Analysis on 'JobSatisfaction'

In [None]:
# Getting total count by category
attrition_df['JobSatisfaction'].value_counts()

In [None]:
# Grouping data according to 'Attrition'
attrition_df.groupby(['JobSatisfaction', 'Attrition'])['Attrition'].count()

In [None]:
# Visualizing total count of 'JobSatisfaction'
plt.figure(figsize=(15, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='JobSatisfaction', data=attrition_df).set(title='Count By Job satisfaction Level')
plt.show()

In [None]:
# Calculating 'Attrition' by 'JobSatisfaction'
jobsatisfaction_group = attrition_df.groupby(['JobSatisfaction', 'Attrition'])
jobsatisfaction_group.groups.keys()

In [None]:
att_by_jobsatisfaction = pd.DataFrame(columns = ['JobSatisfaction', 'Attrition', 'Count', 'Attrition%'])
att_by_jobsatisfaction

In [None]:
count1 = jobsatisfaction_group.get_group((1,1))['Attrition'].count()
att_by_jobsatisfaction.loc[len(att_by_jobsatisfaction)] = ['1','1',count1,
                                                    (count1/attrition_df['JobSatisfaction'].value_counts()[1])*100]
att_by_jobsatisfaction

In [None]:
count1 = jobsatisfaction_group.get_group((2,1))['Attrition'].count()
att_by_jobsatisfaction.loc[len(att_by_jobsatisfaction)] = ['2','1',count1,
                                                    (count1/attrition_df['JobSatisfaction'].value_counts()[2])*100]
att_by_jobsatisfaction

In [None]:
count1 = jobsatisfaction_group.get_group((3,1))['Attrition'].count()
att_by_jobsatisfaction.loc[len(att_by_jobsatisfaction)] = ['3','1',count1,
                                                    (count1/attrition_df['JobSatisfaction'].value_counts()[3])*100]
att_by_jobsatisfaction

In [None]:
count1 = jobsatisfaction_group.get_group((4,1))['Attrition'].count()
att_by_jobsatisfaction.loc[len(att_by_jobsatisfaction)] = ['4','1',count1,
                                                    (count1/attrition_df['JobSatisfaction'].value_counts()[4])*100]
att_by_jobsatisfaction

# Insight:
'''Attrition is highest in employees who have job satisfaction level with 22.8%'''

In [None]:
# Plotting 'Attrition%' vs 'JobSatisfaction'
plt.figure(figsize = (15,5))
sns.barplot(x = 'JobSatisfaction', y = 'Attrition%', data = att_by_jobsatisfaction)
plt.show()


### Analysis on variable 'MaritalStatus'

In [None]:
# Getting total count by category
attrition_df['MaritalStatus'].value_counts()

# Insight:
#        Total count of married employees is 1346 which is higher as compared to single and divorced employees

In [None]:
# Grouping data according to attrition
attrition_df.groupby(['MaritalStatus','Attrition'])['Attrition'].count()

In [None]:
# Visualizing total count of 'MaritalStatus'
plt.figure(figsize=(15, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='MaritalStatus', data=attrition_df).set(title='Count by Marital Status')
plt.show()

In [None]:
# Visualizing 'Attrition' in each 'MaritalStatus' category (one bar per attrition value)
plt.figure(figsize=(15, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='MaritalStatus', data=attrition_df, hue='Attrition')
plt.show()

# Insight:
'''From the plot above we can see that attrition is more in employees who are single as compared to employees
  who are married or divorced'''

### Analysis on variable 'OverTime'

In [None]:
# Getting total count by category
attrition_df['OverTime'].value_counts()

In [None]:
# Grouping data by 'Attrition'
attrition_df.groupby(['OverTime', 'Attrition'])['Attrition'].count()

In [None]:
# Visualizing total count of 'OverTime'
plt.figure(figsize=(15, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='OverTime', data=attrition_df).set(title='Count by Over Time')
plt.show()

In [None]:
# Calculating 'Attrition' by 'OverTime'
overtime_group = attrition_df.groupby(['OverTime', 'Attrition'])
overtime_group.groups.keys()

In [None]:
att_by_overtime = pd.DataFrame(columns = ['OverTime', 'Attrition', 'Count', 'Attrition%'])
att_by_overtime

In [None]:
count = overtime_group.get_group(('Yes',1))['Attrition'].count()
att_by_overtime.loc[len(att_by_overtime)] = ['Yes','1',count,(count/attrition_df['OverTime'].value_counts()[1])*100]
att_by_overtime

In [None]:
count = overtime_group.get_group(('No', 1))['Attrition'].count()
att_by_overtime.loc[len(att_by_overtime)] = ['No','1',count,(count/attrition_df['OverTime'].value_counts()[1])*100]
att_by_overtime

# Insight:
'''Attrition is higher in people who do overtime'''

In [None]:
# Plotting 'Attrition%' vs 'OverTime'
plt.figure(figsize = (15,5))
sns.barplot(x = 'OverTime' , y = 'Attrition%', data = att_by_overtime)
plt.show()

### Analysis on 'MonthlyIncome'

In [None]:
# Descriptive statistics on 'MonthlyIncome'
attrition_df['MonthlyIncome'].describe()
 
# Insight:
'''From the five number summary, we can see that mean monthly income is 6502.9 and median of monthly income is 4919.
   Since the mean is well above the median, the distribution is right skewed and high-value outliers are likely present.
   Minimum monthly income of employees is 1009.0 and maximum income is 199999.0.
   InterQuartile Range of monthly income is 8380.0 - 2911.0 = 5469.0'''

In [None]:
# Plotting distribution
plt.figure(figsize = (12,5))
sns.distplot(attrition_df['MonthlyIncome'])
plt.show()

# Insight:
'''The distribution plot for monthly income is right skewed'''

In [None]:
# Plotting boxplot of 'MonthlyIncome'
plt.figure(figsize=(12, 5))
# The Series is passed as x explicitly; recent seaborn releases treat the first
# positional argument as `data`, which would change the orientation of the box.
sns.boxplot(x=attrition_df['MonthlyIncome'])
plt.show()

# Insight:
'''From the boxplot we can see that the median of monthly income is near 5000. Also, there are outliers present'''

In [None]:
# Plotting 'Attrition' vs 'MonthlyIncome'
plt.figure(figsize = (15,8))
sns.swarmplot(x = 'Attrition', y = 'MonthlyIncome' ,data = attrition_df , hue = 'MaritalStatus')
plt.show()

# Insight:
'''Attrition is highest in employees where monthly income is between 2500 to 4000'''


In [None]:
# Plotting 'MonthlyIncome' vs 'JobLevel'
plt.figure(figsize = (15,5))
sns.swarmplot(x = 'JobLevel', y = 'MonthlyIncome', data = attrition_df, hue = 'Department')
plt.show()

In [None]:
# Plotting 'MonthlyIncome' vs 'Age'
plt.figure(figsize = (15,5))
sns.regplot(x = 'Age', y = 'MonthlyIncome', data = attrition_df)
plt.show()

# Insight:
'''There is linear relationship between monthly income and age.As the age is increasing monthly income
   is also increasing'''

### Analysis on 'PercentSalaryHike'

In [None]:
# Descriptive statistics on 'PercentSalaryHike'
attrition_df['PercentSalaryHike'].describe()

# Insight:
'''From the five number summary, we can see that mean of salary hike is 15.2 and median is 14.0
   These values are close to each other, which suggests there are no extreme outliers.
   Minimum percent salary hike is 11.0 and maximum is 25.0
   InterQuartile Range for percent salary hike is 18.0 - 12.0 = 6.0'''

In [None]:
# Plotting distribution
plt.figure(figsize = (15,5))
sns.distplot(attrition_df['PercentSalaryHike'])
plt.show()

# Insight:
'''The distribution plot for salary hike is right skewed'''


In [None]:
# Plotting boxplot of 'PercentSalaryHike'
plt.figure(figsize=(15, 5))
# The Series is passed as x explicitly; recent seaborn releases treat the first
# positional argument as `data`, which would change the orientation of the box.
sns.boxplot(x=attrition_df['PercentSalaryHike'])
plt.show()

# Insight:
'''From the boxplot we can see that maximum number of hike is given between 12% and 18%. Median salary hike is 14%'''

In [None]:
# Plotting 'PercentSalaryHike' for each 'Department'
plt.figure(figsize =(15,6))
sns.boxplot(x = 'Department', y = 'PercentSalaryHike', data = attrition_df)
plt.show()

# Insight:
"""From the boxplot we can see that all three department have the same median value of 14% but maximum salary hike
   percent for Human Resources is less as compared to Sales and Research & Development"""

In [None]:
# Plotting 'PercentSalaryHike' for 'JobLevel'
plt.figure(figsize = (15,6))
sns.boxplot(x = 'JobLevel', y = 'PercentSalaryHike', data = attrition_df)
plt.show()

# Insight:
       """From the boxplot we can see that all job level 1 to 5 have same median value of 14% but the maximum
       hike for job level 5 is less as compared to the other four job levels. job level 5 shows outlier"""

In [None]:
# Plotting 'PercentSalaryHike' for 'JobSatiSfaction'
plt.figure(figsize = (15,6))
sns.boxplot(x = 'JobSatisfaction', y = 'PercentSalaryHike', data = attrition_df)
plt.show()

# Insight:
       """From the boxplot we can see that employees who have given job satisfaction as 1,2 and 4 have the maximum
         percent hike in their salary(25%)"""

In [None]:
# Plotting 'PercentSalaryHike' vs 'YearsAtCompany'
plt.figure(figsize = (18,5))
sns.boxplot(x = 'YearsAtCompany', y = 'PercentSalaryHike', data = attrition_df)
plt.show()

# Insight:
'''We can see that the median hike % from 1 year to 10 years at the company is almost the same'''


In [None]:
# Plotting 'PercentSalaryHike' vs 'PerformanceRating'
plt.figure(figsize = (15,5))
sns.boxplot(x = 'PerformanceRating', y = 'PercentSalaryHike', data = attrition_df)
plt.show()

# Insight:
'''Performance rating 4 has higher salary hike between 20-25 as compared to rating 3 which has salary hike 
   between 11-19'''

### Analysis on 'PerformanceRating'



In [None]:
# Getting total count of 'Performancerating'
attrition_df['PerformanceRating'].value_counts()

In [None]:
# Grouping with respect to 'Attrition'
attrition_df.groupby(['PerformanceRating','Attrition'])['Attrition'].count()

In [None]:
# Visualizing total count of 'PerformanceRating'
plt.figure(figsize=(15, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='PerformanceRating', data=attrition_df).set(title='Count by Performance Rating')
plt.show()

In [None]:
# Calculating 'Attrition' with respect to 'PerformanceRating'
perform_group = attrition_df.groupby(['PerformanceRating', 'Attrition'])
perform_group.groups.keys()

In [None]:
att_by_performance = pd.DataFrame(columns = ['PerformanceRating', 'Attrition', 'Count', 'Attrition%'])
att_by_performance

In [None]:
count1 = perform_group.get_group((3,1))['Attrition'].count()
att_by_performance.loc[len(att_by_performance)] = ['3','1',count1,
                                            (count1/attrition_df['PerformanceRating'].value_counts()[3])*100]
att_by_performance

In [None]:
count1 = perform_group.get_group((4,1))['Attrition'].count()
att_by_performance.loc[len(att_by_performance)] = ['4','1',count1,
                                            (count1/attrition_df['PerformanceRating'].value_counts()[4])*100]
att_by_performance


# Insight:
'''Attrition is statistically same for both performance rating '''

In [None]:
# Drop a stray third row (index 2) if present — presumably left over from re-running
# one of the cells that append rows above. errors='ignore' makes this a no-op (instead
# of a KeyError) when the frame has only the two expected rows, and the result is
# assigned back so the cleaned frame is what gets plotted.
att_by_performance = att_by_performance.drop(index=[2], errors='ignore')
att_by_performance

In [None]:
# Plotting 'Attrition%' vs 'PerformanceRating'
plt.figure(figsize = (15,5))
sns.barplot(x = 'PerformanceRating', y = 'Attrition%', data = att_by_performance)
plt.show()


In [None]:
attrition_df.head()

### Analysis on 'RelationshipSatisfaction'

In [None]:
# Getting total count of 'RelationshipSatisfaction'
attrition_df['RelationshipSatisfaction'].value_counts()

In [None]:
# Grouping with respect to 'Attrition'
attrition_df.groupby(['RelationshipSatisfaction', 'Attrition'])['Attrition'].count()

In [None]:
# Visualizing total count of 'RelationshipSatisfaction'
plt.figure(figsize=(15, 5))
# Column passed via the x keyword (a positional column name fails on recent seaborn releases)
sns.countplot(x='RelationshipSatisfaction', data=attrition_df).set(title='Count by Relationship Satisfaction')
plt.show()

In [None]:
# Calculating 'Attrition' with respect to 'RelationshipSatisfaction'
relation_group = attrition_df.groupby(['RelationshipSatisfaction','Attrition'])
relation_group.groups.keys()

In [None]:
att_by_relation = pd.DataFrame(columns = ['RelationshipSatisfaction','Attrition','Count','Attrition%'])
att_by_relation

In [None]:
count1 = relation_group.get_group((1,1,))['Attrition'].count()
att_by_relation.loc[len(att_by_relation)] = ['1','1',count1,
                                            (count1/attrition_df['RelationshipSatisfaction'].value_counts()[1])*100]
att_by_relation

In [None]:
count1 = relation_group.get_group((2,1,))['Attrition'].count()
att_by_relation.loc[len(att_by_relation)] = ['2','1',count1,
                                            (count1/attrition_df['RelationshipSatisfaction'].value_counts()[2])*100]
att_by_relation

In [None]:
# Append the (RelationshipSatisfaction 3, Attrition 1) row: the non-null 'Attrition'
# count of that group and the same count as a percentage of all level-3 rows
count1 = relation_group.get_group((3, 1))['Attrition'].count()
att_by_relation.loc[len(att_by_relation)] = [
    '3',
    '1',
    count1,
    (count1 / attrition_df['RelationshipSatisfaction'].value_counts()[3]) * 100,
]
att_by_relation

In [None]:
# Append the (RelationshipSatisfaction 4, Attrition 1) row: the non-null 'Attrition'
# count of that group and the same count as a percentage of all level-4 rows
count1 = relation_group.get_group((4, 1))['Attrition'].count()
att_by_relation.loc[len(att_by_relation)] = [
    '4',
    '1',
    count1,
    (count1 / attrition_df['RelationshipSatisfaction'].value_counts()[4]) * 100,
]
att_by_relation

# Insight:
'''Attrition is highest among employees with a relationship satisfaction rating of 1'''

In [None]:
# Plotting 'Attrition%' vs 'RelationshipSatisfaction'
plt.figure(figsize = (15,5))
sns.barplot(x = 'RelationshipSatisfaction', y = 'Attrition%', data = att_by_relation)
plt.show()

### Analysis on 'TotalWorkingYears'


In [None]:
# Descriptive statistics on 'TotalWorkingYears'
attrition_df['TotalWorkingYears'].describe()

# Insight:
'''From the five number summary, we can see that the mean of total working years of employees is 11.27 and the median is 10.0.
   These values are close to each other, so extreme values do not pull the mean far from the median
   (outliers themselves are checked with the boxplot below).
   Maximum year shown is 40'''

In [None]:
# Plotting the distribution of 'TotalWorkingYears'
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# replace it) — migrate once the installed seaborn version is confirmed.
plt.figure(figsize = (15,5))
sns.distplot(attrition_df['TotalWorkingYears'])
plt.show()

# Insight:
'''The distribution is right skewed'''

In [None]:
# Plotting boxplot of 'TotalWorkingYears'
# The Series is passed as x= so the box stays horizontal on every seaborn
# version; newer releases read a positional argument as `data` instead.
plt.figure(figsize = (15,5))
sns.boxplot(x = attrition_df['TotalWorkingYears'])
plt.show()

# Insight:
'''Median working years is 10 years.Also,there are outliers present'''

In [None]:
# Plotting 'Attrition' vs 'TotalWorkingYears'
plt.figure(figsize = (15,7))
sns.swarmplot(x = 'Attrition', y = 'TotalWorkingYears', data = attrition_df)
plt.show()

# Insight:
'''Attrition is high for total working years of employees between 2 to 10 years'''

### Analysis on 'TrainingTimesLastYear'

In [None]:
# Descriptive statistics on 'TrainingTimesLastYear'
attrition_df['TrainingTimesLastYear'].describe()

# Insight:
'''From the five number summary, the mean number of trainings given to employees last year is 2.7.
   The maximum number of trainings given was 6'''

In [None]:
# Plotting boxplot of 'TrainingTimesLastYear'
# The Series is passed as x= so the box stays horizontal on every seaborn
# version; newer releases read a positional argument as `data` instead.
plt.figure(figsize = (8,5))
sns.boxplot(x = attrition_df['TrainingTimesLastYear'])
plt.show()

# Insight:
'''The range of training given to employees in last year is between 1-4.Outliers are seen'''

In [None]:
# Plotting 'Attrition' vs 'TrainingTimesLastYear'
plt.figure(figsize = (12,4))
sns.swarmplot(x = 'Attrition', y = 'TrainingTimesLastYear', data = attrition_df)
plt.show()

# Insight:
'''Attrition is high in employees who either got no training or who got 2 to 4 times training last year'''


### Analysis on 'WorkLifeBalance'

In [None]:
# Getting total count of 'WorkLifeBalance'
attrition_df['WorkLifeBalance'].value_counts()

In [None]:
# Grouping this data by 'Attrition'
attrition_df.groupby(['WorkLifeBalance', 'Attrition'])['Attrition'].count()

In [None]:
# Visualizing total count per 'WorkLifeBalance' level
# The column is passed as x= because newer seaborn releases accept only
# `data` positionally; a positional column name then raises a TypeError.
plt.figure(figsize = (15,5))
sns.countplot(x = 'WorkLifeBalance', data = attrition_df).set(title = 'Count by Work Life Balance')
plt.show()

In [None]:
# Calculating 'Attrition' with respect to 'WorkLifeBalance'
balance_group = attrition_df.groupby(['WorkLifeBalance','Attrition'])
balance_group.groups.keys()

In [None]:
att_by_balance = pd.DataFrame(columns = ['WorkLifeBalance','Attrition','Count','Attrition%'])
att_by_balance

In [None]:
# Append the (WorkLifeBalance 1, Attrition 1) row: the non-null 'Attrition'
# count of that group and the same count as a percentage of all level-1 rows
count1 = balance_group.get_group((1, 1))['Attrition'].count()
att_by_balance.loc[len(att_by_balance)] = [
    '1',
    '1',
    count1,
    (count1 / attrition_df['WorkLifeBalance'].value_counts()[1]) * 100,
]
att_by_balance

In [None]:
# Append the (WorkLifeBalance 2, Attrition 1) row: the non-null 'Attrition'
# count of that group and the same count as a percentage of all level-2 rows
count1 = balance_group.get_group((2, 1))['Attrition'].count()
att_by_balance.loc[len(att_by_balance)] = [
    '2',
    '1',
    count1,
    (count1 / attrition_df['WorkLifeBalance'].value_counts()[2]) * 100,
]
att_by_balance

In [None]:
# Append the (WorkLifeBalance 3, Attrition 1) row: the non-null 'Attrition'
# count of that group and the same count as a percentage of all level-3 rows
count1 = balance_group.get_group((3, 1))['Attrition'].count()
att_by_balance.loc[len(att_by_balance)] = [
    '3',
    '1',
    count1,
    (count1 / attrition_df['WorkLifeBalance'].value_counts()[3]) * 100,
]
att_by_balance

In [None]:
# Append the (WorkLifeBalance 4, Attrition 1) row: the non-null 'Attrition'
# count of that group and the same count as a percentage of all level-4 rows
count1 = balance_group.get_group((4, 1))['Attrition'].count()
att_by_balance.loc[len(att_by_balance)] = [
    '4',
    '1',
    count1,
    (count1 / attrition_df['WorkLifeBalance'].value_counts()[4]) * 100,
]
att_by_balance

# Insight:
'''Attrition is highest where employees have given rating 1 to work life balance'''

In [None]:
# Plotting 'Attrition%' vs 'WorkLifeBalance'
plt.figure(figsize = (15,5))
sns.barplot(x = 'WorkLifeBalance', y = 'Attrition%', data = att_by_balance)
plt.show()

### Analysis on 'YearsAtCompany'

In [None]:
# Descriptive statistics on 'YearsAtCompany'
attrition_df['YearsAtCompany'].describe()

# Insight:
'''From the five number summary, the mean number of years an employee has been at the company is 7 years.
   The maximum number of years an employee has been at the company is 40'''

In [None]:
# Plotting the distribution of 'YearsAtCompany'
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# replace it) — migrate once the installed seaborn version is confirmed.
plt.figure(figsize = (15,5))
sns.distplot(attrition_df['YearsAtCompany'])
plt.show()

# Insight:
'''The distribution is right skewed'''

In [None]:
# Plotting boxplot of 'YearsAtCompany'
# The Series is passed as x= so the box stays horizontal on every seaborn
# version; newer releases read a positional argument as `data` instead.
plt.figure(figsize = (15,5))
sns.boxplot(x = attrition_df['YearsAtCompany'])
plt.show()

# Insight:
'''From the boxplot above we can see that most of the employees have spent between 3-9 years in the company. There are
   outliers present'''

In [None]:
# Plotting 'Attrition' vs 'YearsAtCompany'
plt.figure(figsize = (15,8))
sns.swarmplot(x = 'Attrition', y = 'YearsAtCompany', data = attrition_df, hue = 'StockOptionLevel')
plt.show()

# Insight:
'''Attrition is highest in employees who have had between 1-10 years at the company'''

### Analysis on 'YearsInCurrentRole'

In [None]:
# Descriptive statistics on 'YearsInCurrentRole'
attrition_df['YearsInCurrentRole'].describe()

# Insight:
'''From the five number summary, the mean number of years employees have spent in their current role is 4 years.
   Maximum years in current role is 18'''

In [None]:
# Plotting the distribution of 'YearsInCurrentRole'
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# replace it) — migrate once the installed seaborn version is confirmed.
plt.figure(figsize = (15,5))
sns.distplot(attrition_df['YearsInCurrentRole'])
plt.show()

# Insight:
'''The distribution is right skewed'''

In [None]:
# Plotting boxplot of 'YearsInCurrentRole'
# The Series is passed as x= so the box stays horizontal on every seaborn
# version; newer releases read a positional argument as `data` instead.
plt.figure(figsize = (15,5))
sns.boxplot(x = attrition_df['YearsInCurrentRole'])
plt.show()

# Insight:
'''From the plot above we can see that most of the employees have been in the current role for
   somewhere between 1-7.5 years. There are outliers present as well'''

In [None]:
# Plotting 'Attrition' vs 'YearsInCurrentRole'
plt.figure(figsize = (15,5))
sns.swarmplot(x = 'Attrition', y = 'YearsInCurrentRole', data = attrition_df, hue = 'PerformanceRating')
plt.show()

# Insight:
'''From the plot above we can see that maximum attrition has happened for employees who have worked in their
   current role for 1 year, 2.5 years and 7.5 years.'''

### Analysis on 'YearsSinceLastPromotion'

In [None]:
# Descriptive statistics on 'YearsSinceLastPromotion'
attrition_df['YearsSinceLastPromotion'].describe()

# Insight:
'''From the five number summary, the mean number of years since employees' last promotion is 2.
   The maximum number of years since an employee's last promotion is 15 years'''

In [None]:
# Plotting the distribution of 'YearsSinceLastPromotion'
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# replace it) — migrate once the installed seaborn version is confirmed.
plt.figure(figsize = (15,5))
sns.distplot(attrition_df['YearsSinceLastPromotion'])
plt.show()

# Insight:
'''The distribution is right skewed'''

In [None]:
# Plotting boxplot of 'YearsSinceLastPromotion'
# The Series is passed as x= so the box stays horizontal on every seaborn
# version; newer releases read a positional argument as `data` instead.
plt.figure(figsize = (15,5))
sns.boxplot(x = attrition_df['YearsSinceLastPromotion'])
plt.show()

# Insight:
'''From the boxplot we can see that years since most employees got their last promotion is between 0-5 years '''

In [None]:
# Plotting 'Attrition' vs 'YearsSinceLastPromotion'
plt.figure(figsize = (15,5))
sns.swarmplot(x = 'Attrition', y = 'YearsSinceLastPromotion', data = attrition_df)
plt.show()

#Insight:
'''Attrition is higher in employees who got their last promotion 0-3 years ago'''


### Analysis on 'YearsWithCurrManager'

In [None]:
# Descriptive statistics on 'YearsWithCurrManager'
attrition_df['YearsWithCurrManager'].describe()

# Insight:
'''From the five number summary, the mean number of years an employee has spent with the current manager is 4 years.
   The maximum number of years an employee has spent with the current manager is 17 years'''

In [None]:
# Plotting the distribution of 'YearsWithCurrManager'
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# replace it) — migrate once the installed seaborn version is confirmed.
plt.figure(figsize = (15,5))
sns.distplot(attrition_df['YearsWithCurrManager'])
plt.show()

# Insight:
'''The distribution is right skewed'''

In [None]:
# Plotting boxplot of 'YearsWithCurrManager'
# The Series is passed as x= so the box stays horizontal on every seaborn
# version; newer releases read a positional argument as `data` instead.
plt.figure(figsize = (15,5))
sns.boxplot(x = attrition_df['YearsWithCurrManager'])
plt.show()

# Insight:
'''From the plot above we can see that most of the employees have had between 2-7 years with
   their current manager. There are outliers present'''

In [None]:
# Plotting 'Attrition' vs 'YearsWithCurrManager'
plt.figure(figsize = (15,5))
sns.swarmplot(x = 'Attrition', y = 'YearsWithCurrManager', data = attrition_df, hue = 'RelationshipSatisfaction')
plt.show()

# Insight:
'''Attrition is high in employees who have had between 1-7.5 years with their current managers'''


# Outlier Removal

In [None]:
# There are outliers present in 'YearsWithCurrManager','YearsSinceLastPromotion','YearsInCurrentRole'
# 'YearsAtCompany','TrainingTimesLastYear','TotalWorkingYears','MonthlyIncome', variables/columns.

# For model building these outliers are not removed as Random Forest model can handle outliers.
# The accuracy is coming to 90% when all the data is used


# Data Pre-Processing

In [None]:
# NOTE: plain assignment — contact_df is a second name for the same DataFrame object as
# attrition_df, not an independent copy.
contact_df = attrition_df

In [None]:
def preprocessor(df):
    """Return a copy of ``df`` with its text-valued columns label-encoded.

    The input frame is left untouched. The columns 'BusinessTravel',
    'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus'
    and 'OverTime' are each replaced by the integer codes produced by a
    scikit-learn ``LabelEncoder``.
    """
    categorical_columns = (
        'BusinessTravel',
        'Department',
        'EducationField',
        'Gender',
        'JobRole',
        'MaritalStatus',
        'OverTime',
    )
    encoder = preprocessing.LabelEncoder()
    encoded = df.copy()

    for column in categorical_columns:
        # One encoder object is refitted per column, so only the codes are
        # kept; the label-to-code mapping of earlier columns is overwritten.
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded

In [None]:
# Encode the text columns, then separate the target vector from the feature matrix
encoded_df = preprocessor(contact_df)

y = encoded_df['Attrition'].values
X = encoded_df.drop(columns = ['Attrition']).values


# Splitting Training and Testing Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2, random_state = 200)

In [None]:
# Looking at the shape of all the training and testing data
print('Training Features Shape: ', X_train.shape)
print('Training Labels shape: ', y_train.shape)
print('Testing Features shape: ', X_test.shape)
print('Testing Labels shape: ', y_test.shape)

# Random Forest Classifier

In [None]:
# Set the random state 
fit_rf = RandomForestClassifier(random_state = 200)

# OOB(Out Of Bag) Error

In [None]:
# Record the out-of-bag error of the forest for every size from
# min_estimators to max_estimators (inclusive).
# warm_start=True lets each fit() reuse the already-built trees and only add new ones;
# oob_score=True makes fit() populate the oob_score_ attribute read below.
fit_rf.set_params(warm_start=True, 
                  oob_score=True)

min_estimators = 100
max_estimators = 1000

# Maps n_estimators -> OOB error (1 - oob_score_)
error_rate = {}

for i in range(min_estimators, max_estimators + 1):
    fit_rf.set_params(n_estimators=i)
    fit_rf.fit(X_train, y_train)

    oob_error = 1 - fit_rf.oob_score_
    error_rate[i] = oob_error

In [None]:
# Convert dictionary to a pandas series for easy plotting 
oob_series = pd.Series(error_rate)

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

ax.set_facecolor('#fafafa')

oob_series.plot(kind='line',color = 'red')
plt.axhline(0.0375, color='#875FDB',linestyle='--')
plt.axhline(0.0360, color='#875FDB',linestyle='--')
plt.xlabel('n_estimators')
plt.ylabel('OOB Error Rate')
plt.title('OOB Error Rate Across various Forest sizes \n(From 100 to 1000 trees)')

In [None]:
# n_estimators = 800 was picked from the OOB error curve; report the error at that size
oob_error_800 = oob_series[800]
print(f'OOB Error rate for 800 trees is: {oob_error_800:.5f}')

# Hyper-Parameter Optimization Using GridSearchCV

In [None]:
np.random.seed(42)
start = time.time()

# fit_rf still carries oob_score=True / warm_start=True from the OOB loop.
# With oob_score=True every bootstrap=False candidate fails to fit (OOB
# estimation needs bootstrap sampling) and is silently scored as NaN, so
# both flags are switched off before the search.
fit_rf.set_params(warm_start=False,
                  oob_score=False)

# 'auto' is left out of max_features: for classifiers it is the same setting
# as 'sqrt', and newer scikit-learn releases no longer accept it.
param_dist = {'max_depth': [2, 3, 4, 5, 6],
              'bootstrap': [True, False],
              'max_features': ['sqrt', 'log2', None],
              'criterion': ['gini', 'entropy']}

# Exhaustive search over param_dist with 10-fold cross-validation
cv_rf = GridSearchCV(fit_rf, cv = 10,
                     param_grid=param_dist,
                     n_jobs = 3)

cv_rf.fit(X_train, y_train)
print('Best Parameters using grid search: \n', cv_rf.best_params_)
end = time.time()
print('Time taken in grid search: {0: .2f}'.format(end - start))

In [None]:
# Set best parameters given by grid search
# NOTE(review): these values are typed in by hand rather than read from cv_rf.best_params_,
# so they will not follow a re-run of the search — confirm they still match its output.
fit_rf.set_params(criterion = 'gini',
                  max_features = None, 
                  max_depth = 6)

In [None]:
# Refine the tree via OOB Output
fit_rf.set_params(n_estimators=800,
                  bootstrap = True,
                  warm_start=False, 
                  oob_score=False)

# Train Random Forest

In [None]:
fit_rf.fit(X_train, y_train)

# Predictions

In [None]:
prediction_rf = fit_rf.predict(X_test)

# Confusion Matrix

In [None]:
print (confusion_matrix(y_test, prediction_rf))

In [None]:
import itertools
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(model, normalize=False, classes=None):
    """Plot the confusion matrix of test-set predictions as a heat map.

    Parameters
    ----------
    model : array-like
        Predicted labels for the test set. Despite the name this is the
        prediction array, not an estimator; it is compared with the
        module-level ``y_test``.
    normalize : bool, default False
        When True, each row is divided by its total so the cells show the
        share of each true class, rounded to 3 decimals.
    classes : sequence of str, optional
        Tick labels for label 0 and label 1, in that order. Defaults to
        ``["No Attrition", "Attrition"]``.

    The matrix is drawn on the current matplotlib figure; the caller is
    responsible for creating and showing the figure.
    """
    cm = confusion_matrix(y_test, model, labels=[0, 1])
    if classes is None:
        # Labels 0 and 1 are the two values of the 'Attrition' target.
        classes = ["No Attrition", "Attrition"]
    cmap = plt.cm.Reds
    title = "Confusion Matrix"
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A true class with no test rows would divide by zero; keep it at 0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        cm = np.around(cm, decimals=3)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Cells darker than half the maximum get white text so it stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plt.figure(figsize=(6,6))
plot_confusion_matrix(prediction_rf, normalize=False)
plt.show()

# Accuracy

In [None]:
# Mean accuracy of the fitted forest on the held-out test split
accuracy_rf = fit_rf.score(X_test, y_test)

print(f"Here is our mean accuracy on the test set:\n {accuracy_rf:.3f}")

# Error Rate

In [None]:
# Test error rate is the complement of the test accuracy
test_error_rate_rf = 1 - accuracy_rf
print(f"The test error rate for our model is:\n {test_error_rate_rf: .4f}")

# Area Under Curve(AUC)

In [None]:
predictions_prob = fit_rf.predict_proba(X_test)[:, 1]

fpr2, tpr2, _ = roc_curve(y_test,
                          predictions_prob,
                          pos_label = 1)

In [None]:
auc_rf = auc(fpr2, tpr2)

In [None]:
def plot_roc_curve(fpr, tpr, auc, estimator, xlim=None, ylim=None):
    """
    Purpose
    ----------
    Draw and show the ROC curve of one model on a new 10x10 figure, with the
    AUC value in the title. Optional x and y limits zoom into the graph.

    Parameters
    ----------
    * fpr: Array returned from sklearn.metrics.roc_curve for increasing
            false positive rates
    * tpr: Array returned from sklearn.metrics.roc_curve for increasing
            true positive rates
    * auc: Float returned from sklearn.metrics.auc (Area under Curve)
    * estimator: String representation of appropriate model, can only contain the
    following: ['knn', 'rf', 'nn']
    * xlim: Set upper and lower x-limits
    * ylim: Set upper and lower y-limits

    Raises
    ----------
    * KeyError: if `estimator` is not one of the keys listed above (a hint is
    printed before the error is re-raised)
    """
    # Maps the estimator key to [title text, line colour]
    my_estimators = {'knn': ['Kth Nearest Neighbor', 'deeppink'],
                     'rf': ['Random Forest', 'red'],
                     'nn': ['Neural Network', 'purple']}

    try:
        plot_title, color_value = my_estimators[estimator]
    except KeyError:
        print("'{0}' does not correspond with the appropriate key inside the estimators dictionary. "
              "\nPlease refer to function to check `my_estimators` dictionary.".format(estimator))
        raise

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_facecolor('#fafafa')

    ax.plot(fpr, tpr,
            color=color_value,
            linewidth=1)
    ax.set_title('ROC Curve For {0} (AUC = {1: 0.3f})'
                 .format(plot_title, auc))

    ax.plot([0, 1], [0, 1], 'k--', lw=2)  # Add Diagonal line
    # Left and top edges of the unit square. 'k--' already means black dashes,
    # so no separate color keyword is passed (matplotlib warns about the
    # redundant definition otherwise).
    ax.plot([0, 0], [1, 0], 'k--', lw=2)
    ax.plot([1, 0], [1, 1], 'k--', lw=2)
    if xlim is not None:
        ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    plt.show()
    plt.close(fig)


In [None]:
plot_roc_curve(fpr2, tpr2, auc_rf, 'rf',
               xlim=(-0.01, 1.05), 
               ylim=(0.001, 1.05))

# Variable Importance

In [None]:
names_index = ['Age', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [None]:
# Take the names from the frame X was built from, so every importance value is
# labelled with the column it was actually computed on (a hand-maintained list
# can silently drift in order or content).
feature_names = list(encoded_df.drop(['Attrition'], axis = 1).columns)

In [None]:
importances = fit_rf.feature_importances_

In [None]:
X_train = pd.DataFrame(X_train, columns=feature_names)

In [None]:
# Sort the feature importance in descending order
sorted_indices = np.argsort(importances)[::-1]

In [None]:
plt.figure(figsize = (12,5))
plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]), importances[sorted_indices], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()

# Conclusion:

### The accuracy of the model on the test set is 89.8% and the Area Under Curve is 94.3%. The test error rate is 10.2% (0.102). From this we can say that there is no obvious sign of overfitting or underfitting. Important features that are contributing to the attrition of the employees are 'MonthlyIncome', 'OverTime', 'Age', 'DailyRate', 'DistanceFromHome', 'TotalWorkingYears', 'StockOptionLevel' and 'YearsAtCompany'.