# HR Analytics - Hyper Parameter Tuning (HPT)

Objective: Create a logistic Reg Model to predict the attrition of employees by using the features available from the datasets given

Created By: Stephen Cole
Creation Date: 19/07/2022

##### Last Updated

In [1]:
from datetime import datetime, timedelta

input("Last Updated By: ")
print(f"Last Updated: {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")

Last Updated By: Stephen Cole
Last Updated: 12/08/2022 17:38:13


### Import Packages

In [7]:
# Importing all packages in one cell helps to keep the notebook clean and readable as having multiple imports scattered makes
# it hard to keep track of all the packages being used

from numpy import mean, std
import pandas as pd
import itertools
import seaborn as sns
import numpy as np
import time
import math
import tqdm
import os
import shap
import pickle


from datetime import timedelta, datetime
from scipy import interp
from itertools import cycle

from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold ,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error as MSE
from sklearn.base import clone
from sklearn.metrics import auc, roc_auc_score, roc_curve, make_scorer, f1_score, recall_score, precision_score, fbeta_score, classification_report,average_precision_score
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import label_binarize,scale

import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

path_to_data = 'C:\\Users\\coles2\\Documents\\Xander\\Data_Science_Project_HR_Analytics-main\\Datasets'

In [8]:
pd.set_option("display.max_columns", None, "display.max_rows", 500)

### Define Custom Functions

This is where I normally store any functions I use to keep notebooks clean

In [9]:
def save_obj(obj, name, file_path):
    """
    This function saves any object as a .pkl file so that it can be easily read in other notebooks
    
    Parameters
    -----
    obj : Object's variable name 
        The object that needs to be saved.
    name : String
        The name that you would like to save the object as.
    file_path: String
        Directory in which you are saving the object to.
    
    Returns
    -----
        A saved .pkl file in dir and name specified.
    """
    file_path = os.path.join(file_path, name)
    with open(file_path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def load_obj(name, file_path):
    """
    This function loads any .pkl file
    
    Parameters
    -----
    name : String
        The name of the file that needs to be loaded.
    file_path: String
        Directory in which you are loading the object from.
    
    Returns
    -----
        An object that was saved as a .pkl file in dir and name specified.
    """
    file_path = os.path.join(file_path, name)
    with open(file_path + '.pkl', 'rb') as f:
        return pickle.load(f)

def param_tuning(params_grid, dtrain, num_boost_round, nfold, stratified, metrics, early_stopping_rounds, seed):
    """
    Function completes hyperparameter tuning
    
    Parameters
    -----
    params_grid : Dictionary
        A grid which contains all the parameters needed for testing
    dtrain : XGBoost DMatrix
        A data structure the XGBoost developers created for memory efficiency and training speed
        with their machine learning library
    num_boost_round : Integer
        Number of boosting iterations
    nfold : Integer
        Number of folds for cross validation
    stratified : Boolean
        Perform stratified sampling
    metrics : Dictionary
        Evaluation metrics to be watched in CV
    early_stopping_rounds : Integer
        Early_stopping_rounds round(s) to continue training (needs to be tested separately after tuning the model)
    seed : Integer
        Seed used to generate the folds
    
    Returns
    -----
    max_roc_auc : Float64
        Max ROC AUC found for the model
    best_params : Dictionary
        The optimal dictionary of all specified parameters
    """
    
    start_time = time.time()
    print('start_time', time.asctime( time.localtime(time.time()) ))
    best_params = {}
    max_roc_auc = 0
    
    for param in params_grid:
        
        cv_results = xgb.cv(
            params=param,
            dtrain = dtrain,
            num_boost_round = num_boost_round,
            nfold = nfold,
            stratified = stratified,
            metrics = metrics,
            early_stopping_rounds = early_stopping_rounds,
            seed = seed
        )
        
        mean_roc_auc = cv_results['test-auc-mean'].max()
        _
        if mean_roc_auc > max_roc_auc:
            max_roc_auc = mean_roc_auc
            best_params = param
    
    end_time = time.time()
    total_time = end_time - start_time
    print('total time', str(timedelta(seconds=total_time)))
    
    return max_roc_auc, best_params 

### Load Objects

In [10]:
start_time = time.time()
print('start_time', time.asctime( time.localtime(time.time()) ))

X = load_obj('Modelling_dset', path_to_data)
y = load_obj('Target', path_to_data)
features = load_obj('features', path_to_data)

end_time = time.time()
total_time = end_time - start_time
print('total_time to read data', str(timedelta(seconds=total_time)))

start_time Thu Aug 11 12:46:58 2022
total_time to read data 0:00:00.008940


In [11]:
X = X[features]

In [12]:
X.shape

(4410, 27)

In [13]:
print('Features:', features, 'Number of features:', len(features), sep='\n')

Features:
['maritalstatus_Single', 'totalworkingyears', 'businesstravel_Travel_Frequently', 'environmentsatisfaction_Low', 'educationfield_Human Resources', 'yearswithcurrmanager', 'jobsatisfaction_Very High', 'environmentsatisfaction_Very High', 'yearssincelastpromotion', 'educationfield_Life Sciences', 'numcompaniesworked', 'gender_Female', 'jobsatisfaction_Low', 'worklifebalance_Good', 'jobrole_Research Director', 'department_Human Resources', 'education_Below College', 'worklifebalance_Bad', 'environmentsatisfaction_High', 'jobrole_Sales Representative', 'maritalstatus_Divorced', 'worklifebalance_Better', 'distancefromhome', 'trainingtimeslastyear', 'jobsatisfaction_High', 'education_College', 'education_Bachelor']
Number of features:
27


## XGBoost - HPT

### Train-Test Split

In [14]:
# Split dataset into train and test datasets for modelling and evaluation
# Set the random_state for the same split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [15]:
# Use XGBoost DMatrix for memory efficiency and model training speed

dmat_train = xgb.DMatrix(X_train, label=y_train)

In [24]:
# Define base parameters

params = {
    # Tree Parameters
    'max_depth': 3,
    'min_child_weight': 20,
    'scale_pos_weight':1,
    'max_delta_step': 2,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    # Regularisation Parameters
    'eta': 0.1,
    'gamma': 0,
    'alpha': 0,
    'lambda': 1,
    # Other Parameters
    'verbosity': 0,
    'nthread': os.cpu_count() - 1,
    'eval_metrics': 'logloss',
    'objective': 'binary:logistic',
    'tree_method': 'hist'
}

In [25]:
# Iterate through first batch of tree parameters
# Ideally use a full hyperspace grid however, it would result in too big of a gridsearch and only have 8 cores to use


changed_params_grid = [{'max_depth': max_depth,
                        'min_child_weight': min_child_weight,
                        'scale_pos_weight': scale_pos_weight}
                      for max_depth in [5, 10, 15]
                      for min_child_weight in [1, 5, 10]
                      for scale_pos_weight in [1,5,10,15,25,50,100]]

In [26]:
# Create gridsearch to be used in param_tuning function

gridsearch_params = []

for i in changed_params_grid:
    
    initial_params = params.copy()
    
    for key, value in i.items():
        initial_params[key] = value
    
    gridsearch_params.append(initial_params)

In [27]:
max_roc_auc, best_params = param_tuning(params_grid=gridsearch_params, 
                                        dtrain=dmat_train, 
                                        num_boost_round=100, 
                                        nfold=5, 
                                        stratified=True,
                                        metrics={'auc'}, 
                                        early_stopping_rounds=20, 
                                        seed=5)

start_time Mon Jul 25 14:20:04 2022
total time 0:01:46.087529


In [28]:
# Overwrite base parameters with the optimal params

params['max_depth'] = best_params['max_depth']
params['min_child_weight'] = best_params['min_child_weight']
params['scale_pos_weight'] = best_params['scale_pos_weight']

In [29]:
# Second batch of tree parameters

changed_params_grid = [{'max_delta_step': 2,
                        'subsample': 0.9,
                        'colsample_bytree': 0.9,}
                        for max_delta_step in [0, 2, 4, 6, 8, 10]
                        for subsample in [0.5, 0.75, 1]
                        for colsample_bytree in [0.5, 0.75, 1]]

In [30]:
gridsearch_params = []

for i in changed_params_grid:
    
    initial_params = params.copy()
    
    for key, value in i.items():
        initial_params[key] = value
    
    gridsearch_params.append(initial_params)

In [31]:
max_roc_auc, best_params = param_tuning(params_grid=gridsearch_params, 
                                        dtrain=dmat_train, 
                                        num_boost_round=100, 
                                        nfold=5, 
                                        stratified=True,
                                        metrics={'auc'}, 
                                        early_stopping_rounds=20, 
                                        seed=5)

start_time Mon Jul 25 14:21:50 2022
total time 0:01:57.510550


In [32]:
params['max_delta_step'] = best_params['max_delta_step']
params['subsample'] = best_params['subsample']
params['colsample_bytree'] = best_params['colsample_bytree']

In [33]:
# Regularisation Parameters

changed_params_grid = [{'eta': eta,
                        'gamma': gamma,
                        'alpha': alpha,
                        'lambda': lambd}
                        for eta in [0.1,0.5,0.9]
                        for gamma in [0,0.1,0.5,1,5,10]
                        for alpha in [0,0.5,1]
                        for lambd in [0.01, 0.1, 1, 1.5]]

In [34]:
gridsearch_params = []

for i in changed_params_grid:
    
    initial_params = params.copy()
    
    for key, value in i.items():
        initial_params[key] = value
    
    gridsearch_params.append(initial_params)

In [35]:
max_roc_auc, best_params = param_tuning(params_grid=gridsearch_params, 
                                        dtrain=dmat_train, 
                                        num_boost_round=100, 
                                        nfold=5, 
                                        stratified=True,
                                        metrics={'auc'}, 
                                        early_stopping_rounds=20, 
                                        seed=5)

start_time Mon Jul 25 14:23:48 2022
total time 0:04:58.057970


In [36]:
params['eta'] = best_params['eta']
params['gamma'] = best_params['gamma']
params['alpha'] = best_params['alpha']
params['lambda'] = best_params['lambda']

In [37]:
# Find best num_boost_round

dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)
evallist = [(dtrain, 'train'),(dtest, 'test')]

In [38]:
model = xgb.train(params, dtrain = dtrain, num_boost_round=1000, evals=evallist, early_stopping_rounds = 10)

[0]	train-logloss:0.42521	test-logloss:0.46867
[1]	train-logloss:0.28542	test-logloss:0.33712
[2]	train-logloss:0.20157	test-logloss:0.25499
[3]	train-logloss:0.14425	test-logloss:0.19978
[4]	train-logloss:0.11065	test-logloss:0.16852
[5]	train-logloss:0.08658	test-logloss:0.14475
[6]	train-logloss:0.06968	test-logloss:0.12581
[7]	train-logloss:0.05818	test-logloss:0.11681
[8]	train-logloss:0.04953	test-logloss:0.10922
[9]	train-logloss:0.04332	test-logloss:0.10488
[10]	train-logloss:0.03782	test-logloss:0.10130
[11]	train-logloss:0.03383	test-logloss:0.09679
[12]	train-logloss:0.03069	test-logloss:0.09559
[13]	train-logloss:0.02746	test-logloss:0.09415
[14]	train-logloss:0.02510	test-logloss:0.09202
[15]	train-logloss:0.02320	test-logloss:0.09160
[16]	train-logloss:0.02177	test-logloss:0.09119
[17]	train-logloss:0.01969	test-logloss:0.08808
[18]	train-logloss:0.01822	test-logloss:0.08775
[19]	train-logloss:0.01713	test-logloss:0.08591
[20]	train-logloss:0.01607	test-logloss:0.08497
[2

In [39]:
# 220 boosting rounds

model = xgb.train(params, dtrain = dtrain, num_boost_round=220, evals=evallist)

[0]	train-logloss:0.42521	test-logloss:0.46867
[1]	train-logloss:0.28542	test-logloss:0.33712
[2]	train-logloss:0.20157	test-logloss:0.25499
[3]	train-logloss:0.14425	test-logloss:0.19978
[4]	train-logloss:0.11065	test-logloss:0.16852
[5]	train-logloss:0.08658	test-logloss:0.14475
[6]	train-logloss:0.06968	test-logloss:0.12581
[7]	train-logloss:0.05818	test-logloss:0.11681
[8]	train-logloss:0.04953	test-logloss:0.10922
[9]	train-logloss:0.04332	test-logloss:0.10488
[10]	train-logloss:0.03782	test-logloss:0.10130
[11]	train-logloss:0.03383	test-logloss:0.09679
[12]	train-logloss:0.03069	test-logloss:0.09559
[13]	train-logloss:0.02746	test-logloss:0.09415
[14]	train-logloss:0.02510	test-logloss:0.09202
[15]	train-logloss:0.02320	test-logloss:0.09160
[16]	train-logloss:0.02177	test-logloss:0.09119
[17]	train-logloss:0.01969	test-logloss:0.08808
[18]	train-logloss:0.01822	test-logloss:0.08775
[19]	train-logloss:0.01713	test-logloss:0.08591
[20]	train-logloss:0.01607	test-logloss:0.08497
[2

[170]	train-logloss:0.00459	test-logloss:0.09419
[171]	train-logloss:0.00463	test-logloss:0.09418
[172]	train-logloss:0.00460	test-logloss:0.09419
[173]	train-logloss:0.00461	test-logloss:0.09418
[174]	train-logloss:0.00463	test-logloss:0.09418
[175]	train-logloss:0.00459	test-logloss:0.09419
[176]	train-logloss:0.00462	test-logloss:0.09418
[177]	train-logloss:0.00460	test-logloss:0.09440
[178]	train-logloss:0.00455	test-logloss:0.09441
[179]	train-logloss:0.00459	test-logloss:0.09440
[180]	train-logloss:0.00456	test-logloss:0.09441
[181]	train-logloss:0.00455	test-logloss:0.09441
[182]	train-logloss:0.00452	test-logloss:0.09474
[183]	train-logloss:0.00453	test-logloss:0.09473
[184]	train-logloss:0.00458	test-logloss:0.09473
[185]	train-logloss:0.00459	test-logloss:0.09472
[186]	train-logloss:0.00459	test-logloss:0.09457
[187]	train-logloss:0.00461	test-logloss:0.09457
[188]	train-logloss:0.00460	test-logloss:0.09457
[189]	train-logloss:0.00457	test-logloss:0.09457
[190]	train-logloss:

In [40]:
# Save the model for evaluation

save_obj(obj=model, 
         name='HR_Attrition_Model_202207', 
         file_path=path_to_data)