# Attrition Prediction
### By Sejal, Ethan, Srivignesh, Nurzhan, Sigit, Mihier

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection  import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.multiclass import unique_labels
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Load the raw employee records from the exported CSV file
csv_path = '/Users/sri/Downloads/IMA_2.csv'
employee_df = pd.read_csv(csv_path)

# Per-column tally of missing entries
employee_df.isna().sum()

In [None]:
# Bar-chart helper used for the exploratory plots
def barplot(var_select, x_no_numeric) :
    """Plot attrition counts and attrition rate for one column of employee_df.

    Reads the module-level ``employee_df`` (whose 'Attrition' column must still
    hold the strings 'Yes' / 'No') and renders, via plotly:
      * one bar series with the value counts of ``var_select`` among 'Yes' rows,
      * one bar series with the value counts among 'No' rows,
      * a scatter series on a secondary right-hand axis (fixed range 0-75)
        with the percentage of 'Yes' rows per category.

    Args:
        var_select: name of the column to break attrition down by.
        x_no_numeric: when equal to True, the percentage table is sorted by
            descending 'Yes' count before plotting.
    """
    attrition = employee_df['Attrition']

    # Category frequencies among leavers and stayers, computed once each
    leaver_counts = employee_df[attrition == 'Yes'][var_select].value_counts()
    stayer_counts = employee_df[attrition == 'No'][var_select].value_counts()

    # Cross-tabulate the column against Attrition and derive the leaver share
    rate_table = pd.DataFrame(pd.crosstab(employee_df[var_select], attrition))
    rate_table['Attr%'] = (
        rate_table['Yes'] / (rate_table['Yes'] + rate_table['No']) * 100
    )
    if x_no_numeric == True:
        rate_table = rate_table.sort_values('Yes', ascending=False)

    def _count_bar(counts, label, colour):
        # One outlined bar series built from a value_counts() result
        return go.Bar(
            x=counts.keys().tolist(),
            y=counts.values.tolist(),
            name=label,
            opacity=0.8,
            marker=dict(color=colour, line=dict(color='#000000', width=1)),
        )

    leaver_bars = _count_bar(leaver_counts, 'Yes_Attrition', 'gold')
    stayer_bars = _count_bar(stayer_counts, 'No_Attrition', 'lightskyblue')

    # Attrition percentage per category, drawn against the second y-axis
    rate_points = go.Scatter(
        x=rate_table.index,
        y=rate_table['Attr%'],
        yaxis='y2',
        name='% Attrition',
        opacity=0.6,
        marker=dict(color='black', line=dict(color='#000000', width=0.5)),
    )

    secondary_axis = dict(
        range=[0, 75],
        overlaying='y',
        anchor='x',
        side='right',
        zeroline=False,
        showgrid=False,
        title='% Attrition',
    )
    layout = dict(
        title=str(var_select),
        xaxis=dict(),
        yaxis=dict(title='Count'),
        yaxis2=secondary_axis,
    )

    fig = go.Figure(data=[leaver_bars, stayer_bars, rate_points], layout=layout)
    py.iplot(fig)

## Data visualisation

In [None]:
# Data visualisation
# Overall class balance: number of rows per Attrition value
attrition_counts = employee_df['Attrition'].value_counts()
trace1 = go.Bar(
    x=attrition_counts.keys().tolist(),
    y=attrition_counts.values.tolist(),
    name='Attrition count',
    opacity=0.8,
    marker=dict(color=['lightskyblue', 'gold']),
)
fig = go.Figure(data=[trace1])
py.iplot(fig)

# One attrition breakdown per feature; the flag is barplot's x_no_numeric
feature_plots = [
    ('Age', False),
    ('Department', True),
    ('EducationField', True),
    ('Education', True),
    ('MaritalStatus', True),
    ('EnvironmentSatisfaction', True),
    ('JobSatisfaction', True),
    ('WorkLifeBalance', True),
    ('NumCompaniesWorked', False),
]
for column_name, sort_flag in feature_plots:
    barplot(column_name, sort_flag)

# Data Cleaning & Feature Engineering

In [None]:
# Data Cleaning
# NOTE: an EmploymentPeriod feature (whole months between StartDate and a
# fixed EndDate of '27/01/2022', i.e.
# (EndDate - StartDate) / np.timedelta64(1, 'M') cast to int) was prototyped
# here and is currently disabled; StartDate is therefore parsed and then
# dropped below.
# `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed.
employee_df['StartDate'] = pd.to_datetime(employee_df['StartDate'], errors='coerce')

## Convert attrition to numeric values (1 = 'Yes', 0 = anything else)
employee_df['Attrition'] = np.where(employee_df.Attrition == 'Yes', 1, 0)

## Remove columns that are not used as features
employee_df = employee_df.drop(columns=['StartDate', 'ID'])

## Convert distance to numeric by stripping the ' miles' suffix
employee_df['DistanceFromHome'] = (
    employee_df['DistanceFromHome']
    .astype(str)
    .str.replace(' miles', '', regex=False)
    .astype(int)
)

## Convert income to float by stripping the currency sign and thousands separator
employee_df['MonthlyIncome'] = (
    employee_df['MonthlyIncome']
    .astype(str)
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

## Convert ordinal scale labels to integers
# The labels are mapped to real integers (not the strings "0".."4") so the
# affected columns become numeric for the correlation matrix and the models.
# Matching is on whole cell values across the entire frame, exactly as the
# previous chain of per-label replace() calls did.
ordinal_scale = {
    # satisfaction-style scale
    "Very High": 3, "High": 2, "Medium": 1, "Low": 0,
    # quality-style scale
    "Best": 3, "Better": 2, "Good": 1, "Bad": 0,
    # education level
    "PhD": 4, "Masters": 3, "Bachelors": 2, "College": 1, "Below College": 0,
}
employee_df = employee_df.replace(ordinal_scale)
# Make the dtype conversion explicit instead of relying on replace() to downcast
employee_df = employee_df.infer_objects()

# One-hot encode the nominal columns; dummy columns are appended in this order
employee_df = pd.get_dummies(
    employee_df,
    columns=["MaritalStatus", "Department", "EducationField"],
    prefix=["TypeMaritalStatus", "TypeDepartment", "TypeEducationField"],
)

employee_df

### Correlation matrix

In [None]:
# Annotated heatmap of the pairwise column correlations
fig = plt.figure(figsize=(15, 25), dpi=480)
correlations = employee_df.corr()
sns.heatmap(correlations, annot=True, fmt='.2f')

### Prepare target variable

In [None]:
# Keep the label column, both as a Series and as an {index: label} dict
model_target = employee_df['Attrition']
target_dict = model_target.to_dict()

# The feature frame must not contain the label
employee_df = employee_df.drop(columns='Attrition')

## Scale dataframe

In [None]:
# Fit a RobustScaler on every feature column and rebuild a labelled frame
scaler = RobustScaler()
x = scaler.fit_transform(employee_df)
scaled_df = pd.DataFrame(data=x, columns=employee_df.columns)
scaled_df

#### Function to plot confusion matrix

In [None]:
np.set_printoptions(precision=2)

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalise=False,
                          title=None,
                          cmap=plt.cm.Blues,
                          multi=False):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: true labels, as accepted by sklearn's ``confusion_matrix``.
        y_pred: predicted labels, same length as ``y_true``.
        classes: display names used as tick labels on both axes.
        normalise: when true, each row is divided by its sum so cells show the
            fraction of that true class; rows with no samples are shown as 0.
        title: figure title; a default depending on ``normalise`` is used when
            empty or None.
        cmap: matplotlib colormap for the cells.
        multi: when true, ``classes`` is reduced to the entries whose positions
            are the labels present in ``y_true`` / ``y_pred`` (the labels are
            used as integer indices into ``classes``).

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        if normalise:
            title = 'Normalised confusion matrix'
        else:
            title = 'Confusion matrix, without normalisation'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Only use the labels that appear in the data
    if multi:
        # Index through an array: a plain list cannot be indexed by the label
        # array returned by unique_labels and would raise TypeError.
        classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalise:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Leave rows without any true samples at 0 instead of dividing by zero
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; switch to white text on cells darker than mid-scale
    fmt = '.2f' if normalise else 'd'
    thresh = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    return ax

## Create Test-Train split and oversample train split 

In [None]:
# Create train-test split
# The label Series is passed directly: the {index: label} dict was only
# accepted through scikit-learn's list-indexing fallback and silently relied
# on the index being 0..n-1. A fixed seed makes the split reproducible and
# stratifying keeps the attrition rate equal in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    scaled_df, model_target,
    test_size=0.2,
    random_state=31081996,
    stratify=model_target,
)

# Oversampling using SMOTE (training part only, so the test set stays untouched)
X_resampled, Y_resampled = SMOTE(random_state=31081996).fit_resample(X_train, Y_train)

print(f'x_train: {X_train.shape}')
print(f'x resampled: {X_resampled.shape}')

# Class balance after oversampling
resampled_labels = np.asarray(Y_resampled)
print(f'Yes : {(resampled_labels == 1).sum()}. No : {(resampled_labels == 0).sum()}')

## Checking hyper-parameters

In [None]:
# Tuning hyperparameters for decision trees
# NOTE: the grid search below is wrapped in a triple-quoted string, so it is
# not executed. When enabled it loops over three scoring metrics and, for each
# one, runs a 5-fold GridSearchCV over DTC() on the resampled training data and
# prints the best parameter set found.
'''
tuned_parameters = [{'criterion': ['gini', 'entropy'],
                     'max_depth': [7, 9, 11, 13, 15,17, 19, 21],
                     'min_samples_split': [3, 5, 7, 9],
                     'max_features': ["sqrt", "log2", None]}]

scores = ['accuracy', 'f1_macro', 'recall']

for score in scores:
    print("# Tuning hyperparameters for %s" % score)
    print("\n")
    clf = GridSearchCV(DTC(), tuned_parameters, cv=5,
                       scoring= score)
    clf.fit(X_resampled, Y_resampled)
    print("Best parameters set found on the training set:")
    print(clf.best_params_)
    print("\n")
'''

## Modeling and Metrics

In [None]:
# Decision-tree alternative (disabled):
#model = DTC(criterion='entropy', max_depth=19 , max_features=None, min_samples_split=3)

# Active estimator: logistic regression with default settings
model = LogisticRegression()

# Train on the oversampled training data
model_fit = model.fit(X_resampled, Y_resampled)

# Per-class precision / recall / F1 on the held-out test rows
divider = '-----------------'
print(divider)
print(classification_report(Y_test, model_fit.predict(X_test)))
print(divider)

## Prediction & confusion matrix

In [None]:
# Class predictions for the held-out test rows
predicted = model_fit.predict(X_test)

# Tick labels shared by both matrices (position 0 = class 0, position 1 = class 1)
attrition_labels = ["No Attrition", "Yes Attrition"]

# Raw counts
plot_confusion_matrix(Y_test, predicted, classes=attrition_labels)

# Row-normalised rates
plot_confusion_matrix(Y_test, predicted, classes=attrition_labels, normalise=True)

# Improvements to the model
- Try different ensemble learning algorithms
    - Random forests, XGBoost
- More data, ideally with balanced classes
- More feature engineering
- Further refining the hyper-parameters


# Features to improve model
- Gender
- Job role
- Overtime information
- Last salary hike %
- Total time working at the company
- Time in current role
- Time since last promotion
- Time since last training provided by company

# XGBoost

In [None]:
# Work on real copies of the scaled features and the labels, so nothing done
# in the XGBoost section can alter the frames used by the earlier models
# (plain assignment would only create a second name for the same object).
scaled_df_xg = scaled_df.copy()
target_xg = model_target.copy()

# data_dmatrix = xgb.DMatrix(data=scaled_df_xg,label=target_xg)

# Create train-test split (seeded for reproducibility)
X_train_xg, X_test_xg, Y_train_xg, Y_test_xg = train_test_split(
    scaled_df_xg, target_xg, test_size=0.2, random_state=31081996)

# Oversampling using SMOTE on the training part only; seeded like the split
# so the synthetic samples are the same on every run
X_resampled_xg, Y_resampled_xg = SMOTE(random_state=31081996).fit_resample(X_train_xg, Y_train_xg)




## Checking hyper-parameters

In [None]:

# Tuning hyperparameters for XGBoost
X, y = X_resampled_xg, Y_resampled_xg

# Candidate values per hyperparameter; each list currently holds the single
# best value found so far, so the search evaluates one configuration.
params = {
    'max_depth': [15],
    'learning_rate': [0.01],
    'gamma': [0.1],
    'reg_lambda': [2],
    'n_estimators': [800],
    'scale_pos_weight': [12],
    'colsample_bytree': [0.5],
}
scoring = 'roc_auc'

# NOTE(review): the folds are cut from data that was SMOTE-resampled
# beforehand, so synthetic neighbours of validation rows can sit in the
# training folds and the CV score is probably optimistic - confirm, and
# consider resampling inside an imblearn Pipeline instead.
xgb_cl = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
clf = GridSearchCV(estimator=xgb_cl,
                   param_grid=params,
                   scoring=scoring,
                   verbose=1)
clf.fit(X, y)

print("Best parameters:", clf.best_params_)

# GridSearchCV refits the best configuration on all of X, y by default, so the
# tuned model is taken from the search. Calling xgb_cl.fit(X, y) here would
# train the untuned base estimator and throw the search result away.
xgb_cl = clf.best_estimator_

# xgb_cl.best_score_

# Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000, 'reg_lambda': 1, 'scale_pos_weight': 5}
#Applying tuned hyperparameter

# Fitting 5 folds for each of 72 candidates, totalling 360 fits
# Best parameters: {'colsample_bytree': 0.6, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 13, 'n_estimators': 900, 'reg_lambda': 2, 'scale_pos_weight': 6}

# Fitting 5 folds for each of 144 candidates, totalling 720 fits
# Best parameters: {'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 13, 'n_estimators': 900, 'reg_lambda': 2, 'scale_pos_weight': 10}

# Fitting 5 folds for each of 6 candidates, totalling 30 fits
# Best parameters: {'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 900, 'reg_lambda': 2, 'scale_pos_weight': 12}

# Fitting 5 folds for each of 4 candidates, totalling 20 fits
# Best parameters: {'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 900, 'reg_lambda': 2, 'scale_pos_weight': 12}

# Fitting 5 folds for each of 6 candidates, totalling 30 fits
# Best parameters: {'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 800, 'reg_lambda': 2, 'scale_pos_weight': 12}

## Prediction & confusion matrix

In [None]:
# Result of an earlier exhaustive search, kept for reference:
# Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
# Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000, 'reg_lambda': 1, 'scale_pos_weight': 5}

# Applying tuned hyperparameters
# eval_metric accepts e.g. 'logloss', 'error' or 'auc'
# ('binary:logistic' is an objective, not an evaluation metric).
# gamma=0.1 is passed explicitly so the final model uses every value from the
# tuning grid instead of falling back to the library default for gamma.
xg_classification = xgb.XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    colsample_bytree=0.5,
    gamma=0.1,
    learning_rate=0.01,
    max_depth=15,
    n_estimators=800,
    reg_lambda=2,
    scale_pos_weight=12,
)

# https://xgboost.readthedocs.io/en/stable/parameter.html

# Train on the SMOTE-resampled training data
xg_classification.fit(X_resampled_xg, Y_resampled_xg)
# xg_reg.fit(X_train_xg, Y_train_xg)

# Predict the classes of the held-out test rows
Y_prediction_xg = xg_classification.predict(X_test_xg)

print(f'XGBoost: {classification_report(Y_test_xg,Y_prediction_xg)}')
# print(f'x resampled: {X_resampled.shape}')
# print()

# # Plot non-normalised confusion matrix
# plot_confusion_matrix(Y_test_xg, Y_prediction_xg, classes=["No Attrition", "Yes Attrition"])

# Plot normalised confusion matrix
plot_confusion_matrix(Y_test_xg, Y_prediction_xg, classes=["No Attrition", "Yes Attrition"], normalise=True)

## Tree

In [None]:
# Draw the first boosted tree on an explicitly sized axes. The size has to be
# chosen before drawing: changing rcParams['figure.figsize'] afterwards does
# not resize this figure, and an 800 x 800 inch default would be left behind
# for every later figure in the notebook.
fig, ax = plt.subplots(figsize=(40, 20))
xgb.plot_tree(xg_classification, num_trees=0, ax=ax)
# fig.savefig('tree.png')  # save before plt.show() if a file is needed
plt.show()

## Feature importance table

In [None]:
# xgb.plot_importance(xg_classification)
# plt.rcParams['figure.figsize'] = [10, 10]
# plt.show()