## Problem Statement

The objective of the project is to uncover the drivers of attrition and build a model to predict the propensity of employee attrition.

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Importing the Data for ML Project

In [None]:
df = pd.read_csv('/Users/priyankac/Downloads/employee_attrition.csv', header = 0)

In [None]:
# Checking the number of rows and columns in the dataset
df.shape

In [None]:
# Check the first 5 records in the dataset
df.head()

In [None]:
# Check the data type of different columns
df.dtypes

In [None]:
#df['EmployeeCount'].sum()

### Setting Display options to ensure feature name visibility

In [None]:
pd.set_option('display.max_columns', None)

### Warning Suppression

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Drop ID Feature from the dataset

In [None]:
df = df.drop(['EmployeeNumber'], axis = 1)

### Create Target feature

In [None]:
df['target'] = np.where(df['Attrition'] == 'Yes', 1, 0)

In [None]:
df = df.drop(['Attrition'], axis = 1)

### Defining Target and Independent Features

In [None]:
Y = df[['target']]
X = df.drop(['target'], axis = 1)

### Get the Attrition(Event) Rate

In [None]:
Y.mean()

### Split features into Numerical and Categorical

In [None]:
num = X.select_dtypes(include = 'number')
char = X.select_dtypes(include = 'object')

In [None]:
def unique_levels(x):
    """Return the number of distinct non-null values in the Series ``x``."""
    return x.nunique()

df_value_counts = pd.DataFrame(num.apply(lambda x : unique_levels(x)))

In [None]:
df_value_counts.columns = ['feature_levels']
df_value_counts.head()

In [None]:
slice1 = df_value_counts.loc[df_value_counts['feature_levels'] <= 20]
cat_list = slice1.index
cat = num.loc[:, cat_list]
cat.dtypes

In [None]:
slice2 = df_value_counts.loc[df_value_counts['feature_levels'] > 20]
num_list = slice2.index
num = num.loc[:, num_list]

In [None]:
num.dtypes

In [None]:
num.shape

In [None]:
# Joining the cat dataframe with the char dataframe
char = pd.concat([char, cat] , axis = 1, join = 'inner')

In [None]:
char.head()

In [None]:
char.shape

### Outlier Analysis of Numerical Features

In [None]:
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.90,0.99])

### Capping and Flooring of Outliers

In [None]:
def outlier_cap(x, lower_q = 0.01, upper_q = 0.99):
    """Floor and cap a numeric Series at two of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to be capped.
    lower_q : float, default 0.01
        Quantile level used as the floor.
    upper_q : float, default 0.99
        Quantile level used as the cap.

    Returns
    -------
    pandas.Series
        A copy of ``x`` with values below the floor raised to it and values
        above the cap lowered to it.
    """
    # Compute both bounds from the untouched data; deriving the cap from an
    # already-floored series shifts it on short inputs.
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower = lower_bound, upper = upper_bound)

In [None]:
num = num.apply(lambda x : outlier_cap(x))

In [None]:
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.90,0.99])

### Missing Values Analysis

In [None]:
num.isnull().mean()

In [None]:
char.isnull().mean()

## Feature Selection - Numerical Features

### Part 1 : Remove Features with 0 Variance


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a zero-variance filter on the numerical features; only the fitted
# support mask is needed, so the transformed array is not materialised.
varselector = VarianceThreshold(threshold = 0).fit(num)

# Positions of the columns that survive the filter, then the reduced frame
cols = varselector.get_support(indices = True)
num_1 = num.iloc[:, cols]

In [None]:
num.iloc[0]

### Part 2 : Bivariate Analysis (Feature Discretization)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Bucket each numerical feature into quantile bins (10 requested) encoded as
# ordinal ranks, keeping the original row index and suffixing the column names.
# NOTE(review): this is fitted on `num`, not on the variance-filtered `num_1`
# - confirm that is intended.
discrete = KBinsDiscretizer(n_bins = 10, encode = 'ordinal', strategy = 'quantile')
binned_values = discrete.fit_transform(num)
num_binned = pd.DataFrame(binned_values, index = num.index, columns = num.columns).add_suffix('_Rank')
num_binned.tail()

In [None]:
# Check whether each binned feature shows a slope in its event rate.
# If it does, are some deciles below the population average and others above it?
# If so, the slope is strong.

# Conclusion: a strong slope indicates the feature's ability to discriminate
#             events from non-events, making it a good predictor.

X_bin_combined = pd.concat([Y, num_binned], axis = 1, join = 'inner')

# Population-average event rate; it does not change per feature, so compute it once.
overall_event_rate = X_bin_combined['target'].mean()

for col in num_binned.columns:
    plt.figure()
    # One bar per bin showing the mean of the target (event rate) in that bin
    ax = sns.barplot(x = col, y = 'target', data = X_bin_combined, estimator = np.mean)
    # Reference line at the population average. axhline spans the whole axis,
    # so it does not depend on how the bar positions are laid out.
    ax.axhline(overall_event_rate, color = 'red')
plt.show()

In [None]:
num_varlist = ['DailyRate', 'HourlyRate', 'MonthlyRate']

In [None]:
num_1 = num_1.drop(num_varlist, axis = 1)

In [None]:
num_1.dtypes

### Part 3 : Select K Best

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score the remaining numerical features against the target with the
# chi-squared statistic and keep the 4 highest-scoring ones.
selector = SelectKBest(chi2, k = 4).fit(num_1, Y)

# Positions of the selected columns, then the reduced frame
cols = selector.get_support(indices = True)
select_features_df_num = num_1.iloc[:, cols]

In [None]:
select_features_df_num.iloc[0]

In [None]:
select_features_df_num.shape

## Feature Selection - Categorical Features

In [None]:
def unique_levels(x):
    """Count the distinct values of the Series ``x``, ignoring missing entries."""
    distinct_values = x.dropna().unique()
    return len(distinct_values)

char_unique_levels = pd.DataFrame(char.apply(lambda x : unique_levels(x)))

In [None]:
char_unique_levels.columns = ['feature_levels']
slice1 = char_unique_levels.loc[char_unique_levels['feature_levels'] > 1]
cat_list = slice1.index
char = char.loc[:, cat_list]

### Part 1 : Bivariate Analysis

In [None]:
# Event rate per level of every categorical feature, compared with the
# population-average event rate.
X_char_merged = pd.concat([Y, char], axis = 1, join = 'inner')

# Population-average event rate; it does not change per feature, so compute it once.
overall_event_rate = X_char_merged['target'].mean()

for col in char.columns:
    plt.figure()
    # One bar per level showing the mean of the target (event rate) in that level
    ax = sns.barplot(x = col, y = 'target', data = X_char_merged, estimator = np.mean)
    # Reference line at the population average. axhline spans the whole axis,
    # so it stays aligned with the bars even for integer-coded levels.
    ax.axhline(overall_event_rate, color = 'red')
plt.show()

In [None]:
charlist = ['Gender', 'Education', 'PerformanceRating']
char_1 = char.drop(charlist, axis = 1)

In [None]:
char_1.dtypes

In [None]:
# Before calling get_dummies on the categorical features, convert the
# integer-typed features to object type.

int_coded_features = [
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# Cast one column at a time, in the same order as listed above
for feature in int_coded_features:
    char_1[feature] = char_1[feature].astype('object')

In [None]:
char_1.dtypes

In [None]:
# Create dummy features with n-1 levels
X_char_dum = pd.get_dummies(char_1, drop_first = True)
X_char_dum.shape

In [None]:
X_char_dum.head()

### Part 2 : Select K Best

In [None]:
# Select K Best for Categorical Features
from sklearn.feature_selection import SelectKBest, chi2

# Score every dummy column against the target with the chi-squared statistic
# and keep the 110 highest-scoring ones.
selector = SelectKBest(chi2, k = 110).fit(X_char_dum, Y)

# Positions of the selected columns, then the reduced frame
cols = selector.get_support(indices = True)
select_features_df_char = X_char_dum.iloc[:, cols]

In [None]:
select_features_df_char.iloc[0]

## Creating the Master Feature Set for Model Development

In [None]:
X_all = pd.concat([select_features_df_char, select_features_df_num], axis = 1, join = 'inner')

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_all, Y, test_size = 0.3, random_state = 20)

In [None]:
# Report the size of each split and the attrition (event) rate inside it,
# so the two partitions can be compared.
print('Shape of Training Data : ', X_train.shape)
print('Shape of Testing Data : ', X_test.shape)
print('Attrition Rate in Training Data : ', y_train.mean())
print('Attrition Rate in Testing Data : ', y_test.mean())

## Model Development

In [None]:
# Non-linearity is observed in the feature relationships, which makes tree methods a good choice

# There are few options to consider among tree methods:
# White box(Completely Explainable Set of Rules) - Decision Tree
# Ensemble methods - Random Forest(with Bagging)
# Ensemble methods - GBM/XGBoost(Boosting)

In [None]:
# Building a Decision Tree Model
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion = 'gini', random_state = 20)

In [None]:
# Using the Grid Search to find the best parameters
np.random.seed(44)
from sklearn.model_selection import GridSearchCV
param_dist = {'max_depth' : [3,4,6], 'min_samples_split' : [50,120,180,250]}
tree_grid = GridSearchCV(dtree, cv = 10, param_grid = param_dist, n_jobs = -1)
tree_grid.fit(X_train, y_train)

print('Best parameters using Grid Search : \n', tree_grid.best_params_)

In [None]:
dtree = DecisionTreeClassifier(criterion = 'gini', random_state = 20, max_depth = 4, min_samples_split = 50)
dtree.fit(X_train, y_train)

In [None]:
# Building a Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion = 'gini', random_state = 20, max_depth = 4, min_samples_split = 50)
rf.fit(X_train, y_train)

In [None]:
# Building a Gradient Boosting Model
from sklearn.ensemble import GradientBoostingClassifier
# 'squared_error' is the current spelling of the criterion formerly called
# 'mse'; the old name was deprecated in scikit-learn 1.0 and removed in 1.2.
# NOTE(review): requires scikit-learn >= 1.0.
gbm = GradientBoostingClassifier(criterion = 'squared_error', random_state = 20, max_depth = 4, min_samples_split = 50)
gbm.fit(X_train, y_train)

### Model Evaluation

In [None]:
# For Decision Tree model
y_pred_tree = dtree.predict(X_test)

# for Random Forest model
y_pred_rf = rf.predict(X_test)

# For Gradient Boosting model
y_pred_gbm = gbm.predict(X_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
# For Decision Tree
# plot_confusion_matrix was removed in scikit-learn 1.2; from_estimator draws
# the same confusion matrix from the fitted model and the test data.
# NOTE(review): requires scikit-learn >= 1.0.
metrics.ConfusionMatrixDisplay.from_estimator(dtree, X_test, y_test)

In [None]:
# For Decision Tree

print('Accuracy : ',metrics.accuracy_score(y_test, y_pred_tree))
print('Precision : ',metrics.precision_score(y_test, y_pred_tree))
print('Recall : ',metrics.recall_score(y_test, y_pred_tree))
print('f1_score : ',metrics.f1_score(y_test, y_pred_tree))

In [None]:
# For Random Forest
# plot_confusion_matrix was removed in scikit-learn 1.2; from_estimator draws
# the same confusion matrix from the fitted model and the test data.
# NOTE(review): requires scikit-learn >= 1.0.
metrics.ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test)

In [None]:
# For Random Forest

print('Accuracy : ',metrics.accuracy_score(y_test, y_pred_rf))
print('Precision : ',metrics.precision_score(y_test, y_pred_rf))
print('Recall : ',metrics.recall_score(y_test, y_pred_rf))
print('f1_score : ',metrics.f1_score(y_test, y_pred_rf))

In [None]:
# For Gradient boosting
# plot_confusion_matrix was removed in scikit-learn 1.2; from_estimator draws
# the same confusion matrix from the fitted model and the test data.
# NOTE(review): requires scikit-learn >= 1.0.
metrics.ConfusionMatrixDisplay.from_estimator(gbm, X_test, y_test)

In [None]:
# For Gradient Boosting

print('Accuracy : ',metrics.accuracy_score(y_test, y_pred_gbm))
print('Precision : ',metrics.precision_score(y_test, y_pred_gbm))
print('Recall : ',metrics.recall_score(y_test, y_pred_gbm))
print('f1_score : ',metrics.f1_score(y_test, y_pred_gbm))

In [None]:
# Rank the model features by the importance scores of the fitted GBM
importance_table = pd.DataFrame({'importance': gbm.feature_importances_}, index = X_train.columns)
feature_importances = importance_table.sort_values('importance', ascending = False)

# Horizontal bar chart of the ten highest-ranked features
top_features = feature_importances.head(10)
ax = sns.barplot(x = 'importance', y = top_features.index, data = top_features)

In [None]:
# Probability Decile Analysis

In [None]:
# Score every row of the master feature set with the GBM, bucket the scores
# into ten rank-based groups, and summarise per group: counts, actual event
# rate, cumulative capture of events / non-events, KS and lift.
# NOTE(review): X_all also contains the rows the model was trained on, so
# these figures are not a pure hold-out measurement.
y_pred_prob = gbm.predict_proba(X_all)[:, 1]

# Attach the scores by X_all's own index rather than through a fresh
# RangeIndex that merely happens to match df's index.
df['y_pred_P'] = pd.Series(y_pred_prob, index = X_all.index)

# Rank first (ties broken by order of appearance) so qcut can always form
# equal-sized groups; codes are 0-based, hence the +1.
df['P_Rank_gbm'] = pd.qcut(df['y_pred_P'].rank(method = 'first').values, 10, duplicates = 'drop').codes + 1

rank_df_actuals = df.groupby('P_Rank_gbm')['target'].agg(['count', 'mean'])
rank_df_actuals = rank_df_actuals.rename(columns = {'mean': 'Actual_event_rate'})

# Highest-scored group first, so cumulative columns read from the top down
sorted_rank_df = rank_df_actuals.sort_values(by = 'P_Rank_gbm', ascending = False)

# Events
sorted_rank_df['N_events'] = sorted_rank_df['count'] * sorted_rank_df['Actual_event_rate']
total_events = sorted_rank_df['N_events'].sum()
sorted_rank_df['cum_events'] = sorted_rank_df['N_events'].cumsum()
sorted_rank_df['event_cap'] = sorted_rank_df['N_events'] / total_events
sorted_rank_df['cum_event_cap'] = sorted_rank_df['event_cap'].cumsum()

# Non-events
sorted_rank_df['N_non_events'] = sorted_rank_df['count'] - sorted_rank_df['N_events']
total_non_events = sorted_rank_df['N_non_events'].sum()
sorted_rank_df['cum_non_events'] = sorted_rank_df['N_non_events'].cumsum()
sorted_rank_df['non_event_cap'] = sorted_rank_df['N_non_events'] / total_non_events
sorted_rank_df['cum_non_event_cap'] = sorted_rank_df['non_event_cap'].cumsum()

# KS: gap between cumulative event capture and cumulative non-event capture
sorted_rank_df['KS'] = (sorted_rank_df['cum_event_cap'] - sorted_rank_df['cum_non_event_cap']).round(4)

# Decile 1 is the highest-scored group; lift is the group's event rate
# relative to the overall event rate.
sorted_reindexed = sorted_rank_df.reset_index()
sorted_reindexed['Decile'] = sorted_reindexed.index + 1
overall_event_rate = total_events / sorted_reindexed['count'].sum()
sorted_reindexed['Lift_over_Avg'] = sorted_reindexed['Actual_event_rate'] / overall_event_rate
sorted_reindexed

In [None]:
fig, axes = plt.subplots(1, 3, sharex=True, figsize=(15,5))
fig.suptitle('Effectiveness of Deciles based on Model Probabilities')
axes[0].set_title('Rank Ordering of Actual Event Rate')
axes[1].set_title('Lift over Mean Event Rate')
axes[2].set_title('Gains Chart')
sns.lineplot(ax=axes[0],  x="Decile", y="Actual_event_rate", data=sorted_reindexed,color='red')
sns.barplot(ax=axes[1],  x="Decile", y="Lift_over_Avg", data=sorted_reindexed,color='green')
sns.lineplot(ax=axes[2],  x="Decile", y="cum_event_cap", data=sorted_reindexed,color='blue')
sns.lineplot(ax=axes[2],  x="Decile", y="cum_non_event_cap", data=sorted_reindexed,color='black')

In [None]:
# Target Audience
df['predicted_cancel_Rank'] = np.where(df['P_Rank_gbm'] < 8, 'Bottom7', 'Top3')
df.predicted_cancel_Rank.value_counts()

In [None]:
# Keep only the rows flagged 'Top3'. Take an explicit copy because new columns
# are added to df_top3 afterwards; without it those assignments target a
# slice of df (chained assignment).
df_top3 = df.loc[df['predicted_cancel_Rank'] == 'Top3', :].copy()

In [None]:
df_top3.shape

In [None]:
df_top3['income_RANK']=pd.qcut(df_top3['MonthlyIncome'].rank(method='first').values,10,duplicates='drop').codes+1

In [None]:
df_top3.groupby('income_RANK')['MonthlyIncome'].agg(['min','mean','max'])

In [None]:
df_top3.MonthlyIncome.mean()

In [None]:
df_top3['income_segment']=np.where(df_top3['income_RANK']>=8,"High Income","Low Income")
df_top3.income_segment.value_counts()

In [None]:
# Other way to do the above step using quantile
#df_top3['income_group'] = np.where(df_top3['MonthlyIncome']>=df_top3['MonthlyIncome'].quantile(0.70),'High_Income',
#                                  'Low_Income')


In [None]:
#df_top3['income_group'].value_counts()

In [None]:
df_top3['tot_work_exp_rank']=pd.qcut(df_top3['TotalWorkingYears'].rank(method='first').values,10,duplicates='drop').codes+1

In [None]:
df_top3.groupby('tot_work_exp_rank')['TotalWorkingYears'].agg(['min','mean','max'])

In [None]:
df_top3.TotalWorkingYears.mean()

In [None]:
df_top3['Work_Exp_Segment']=np.where(df_top3['tot_work_exp_rank']>=7,"High Work Exp","Low Work Exp")
df_top3.Work_Exp_Segment.value_counts()

In [None]:
pd.crosstab(index=df_top3['Work_Exp_Segment'], columns=df_top3['income_segment'],values=df_top3['TotalWorkingYears'],aggfunc='count')

In [None]:
pd.crosstab(index=df_top3['Work_Exp_Segment'], columns=df_top3['income_segment'],values=df_top3['y_pred_P'],aggfunc='mean')