# IBM HR Analytics Employee Attrition & Performance¶

## Algorithms

In [16]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import tarfile
from six.moves import urllib
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score,precision_recall_curve,confusion_matrix,precision_score,confusion_matrix




# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [29]:
#Reading the data
ibm=pd.read_csv("ibm.csv")

In [3]:
ibm.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
ibm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [5]:
ibm["Attrition"].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

## Data Preprocessing

In the dataset there are 4 irrelevant columns, i.e:EmployeeCount, EmployeeNumber, Over18 and StandardHour. So, we have to remove these for more accuracy.

In [24]:
ibm.drop('EmployeeCount', axis = 1, inplace = True)
ibm.drop('StandardHours', axis = 1, inplace = True)
ibm.drop('EmployeeNumber',axis = 1, inplace = True)
ibm.drop('Over18', axis = 1, inplace = True)
print(ibm.shape)

(1470, 31)


In [30]:
#Convert data to integerr
Attrition={'Yes':1,'No':0}
ibm.Attrition=[Attrition[item] for item in ibm.Attrition]


Gender={'Male':1,'Female':0}
ibm.Gender=[Gender[item] for item in ibm.Gender]

In [31]:
#Datasets for valuable features
ibm2=ibm[['JobLevel','EnvironmentSatisfaction','JobInvolvement','JobSatisfaction',
          'PerformanceRating','RelationshipSatisfaction','WorkLifeBalance','Attrition']]

In [27]:
ibm2.head()

Unnamed: 0,JobLevel,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,PerformanceRating,RelationshipSatisfaction,WorkLifeBalance,Attrition
0,2,2,3,4,3,1,1,1
1,2,3,2,2,4,4,3,0
2,1,4,2,3,3,2,3,1
3,1,4,3,3,3,3,3,0
4,1,1,3,2,3,4,3,0


In [10]:
# Create train and test splits
target_name = 'Attrition'
X = ibm2.drop('Attrition', axis=1)

y=ibm2[target_name]


X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=100, stratify=y)
print(len(X_train), "train +", len(X_test), "test")

1176 train + 294 test


## Train Data

In [None]:
#LogisticRegression Model
kfold = model_selection.KFold(n_splits=10)
model1 = LogisticRegression(class_weight = "balanced")
results = model_selection.cross_val_score(model1, X_train, y_train, cv=kfold)
print("LogisticRegression Accuracy")
print(results.mean())

#Gradient Bossting Model
gb = GradientBoostingClassifier()
results = model_selection.cross_val_score(gb, X_train, y_train, cv=kfold)
print("Gradient Accuracy")
print(results.mean())

# Using 10 fold Cross-Validation to train Random Forest Model
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
kfold = model_selection.KFold(n_splits=107)
model3 = RandomForestClassifier(n_estimators=1000,max_depth=None,min_samples_split=10,class_weight="balanced")
results = model_selection.cross_val_score(model3, X_train, y_train, cv=kfold)
print("RandomForest Accuracy")
print(results.mean())


LogisticRegression Accuracy
0.6427929885557003
Gradient Accuracy
0.8460162248297843


## Test Data

In [37]:
#LogisticRegression Model
kfold = model_selection.KFold(n_splits=10)
model1 = LogisticRegression(class_weight = "balanced")
results = model_selection.cross_val_score(model1, X_test, y_test, cv=kfold)
print("LogisticRegression Accuracy")
print(results.mean())

#Gradient Bossting Model
gb = GradientBoostingClassifier()
results = model_selection.cross_val_score(gb, X_test, y_test, cv=kfold)
print("Gradient Accuracy")
print(results.mean())


LogisticRegression Accuracy
0.6532183908045976
Gradient Accuracy
0.8335632183908046


In [35]:
def print_results(model, X_test, y_test):
    """
    print summrized results
    """
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    
    auc = metrics.roc_auc_score(y_test, y_pred_prob)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    conf_mat = metrics.confusion_matrix(y_test, y_pred)
    conf_mat_df = pd.DataFrame(conf_mat, 
                               index= ['true_no', 'true_yes'],
                              columns = ['predict_no', 'predict_yes'])
    
    model_name = str(model).split('(')[0]
    print(f'{"*" *10} {model_name} {"*" *10}')
    print(f'Accuracy: {acc:.3f}')
    print(f'AUC: {auc:.3f}')
    print(f'F1: {f1:.3f}\n', end='\n')
    print('Confusion Matrix:', end='\n')
    print(conf_mat_df, end='\n\n')
    print('Classification Report:', end='\n')
    print(metrics.classification_report(y_test, y_pred))


def params_tuning(X_train, y_train, X_test, y_test,
                  model=None, param_grid=None,
                 scoring=None, balance_weight=False):
    """
    Use the best params resulted from GridSearchCV to fit models
    """
    if balance_weight:
        sample_weight = y_train.map(dict(0.5 / y_train.value_counts())).values
    else:
        sample_weight = None
    
    # Grid Search for the best parameters
    clf = GridSearchCV(model, param_grid, cv=5, scoring=scoring)
    clf.fit(X_train, y_train, sample_weight=sample_weight)
    # train model with the selected best parameters
    model.set_params(**clf.best_params_)
    model.fit(X_train, y_train)
    # pring results
    print_results(model, X_test, y_test)
    return model

In [32]:
#Logistic Regression Model
lr_params = {
    'max_iter': 1000,
    'class_weight': 'balanced'}
lr_tuned_params = {'penalty': 'l2', 'C': .6}
modelL = LogisticRegression(**lr_params, **lr_tuned_params)
modelL.fit(X_train, y_train)
print ("\n\n ---Logistic Model---")
logit_roc_auc = roc_auc_score(y_test, modelL.predict(X_test))
print ("Logistic AUC = %2.2f" % logit_roc_auc)
print(classification_report(y_test, modelL.predict(X_test)))
print("Accuracy score: {}".format(accuracy_score(y_test, modelL.predict(X_test))))


#Random Forest Model
modelR=RandomForestClassifier( n_estimators=1000,max_depth=None,min_samples_split=10,class_weight="balanced")
modelR.fit(X_train,y_train)
print("\n\n --- Random Forest Model ----")
rforest_roc_auc=roc_auc_score(y_test,modelR.predict(X_test))
print("Random forest AUC = %2.2f" % rforest_roc_auc)
print(classification_report(y_test,modelR.predict(X_test)))
print("Accuracy score: {}".format(accuracy_score(y_test, modelR.predict(X_test))))


#GradientBoostingClassifier
gb_tuned_params = {'max_depth': 8, 'max_features': .3, 'learning_rate': 0.4}
gb_params = {
    'n_estimators': 1000,
    'min_samples_leaf': 2,
}
modelG = GradientBoostingClassifier(**gb_params, **gb_tuned_params)
modelG.fit(X_train, y_train)
print("\n\n --- Gradient Boosting Classifier ----")
rforest_roc_auc=roc_auc_score(y_test,modelG.predict(X_test))
print("Gradient Boosting AUC = %2.2f" % rforest_roc_auc)
print(classification_report(y_test,modelG.predict(X_test)))
print("Accuracy score: {}".format(accuracy_score(y_test, modelG.predict(X_test))))



 ---Logistic Model---
Logistic AUC = 0.62
              precision    recall  f1-score   support

           0       0.89      0.62      0.73       247
           1       0.23      0.62      0.34        47

    accuracy                           0.62       294
   macro avg       0.56      0.62      0.53       294
weighted avg       0.79      0.62      0.67       294

Accuracy score: 0.6156462585034014


 --- Random Forest Model ----
Random forest AUC = 0.64
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       247
           1       0.36      0.43      0.39        47

    accuracy                           0.79       294
   macro avg       0.62      0.64      0.63       294
weighted avg       0.80      0.79      0.79       294

Accuracy score: 0.7857142857142857


 --- Gradient Boosting Classifier ----
Gradient Boosting AUC = 0.53
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       