In [125]:
import sys; sys.path.append('src')
import ds_tools as kt
import pandas as pd
import numpy as np
import pickle
import scipy.stats 
from sklearn import tree
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import shap

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

In [71]:
import importlib
importlib.reload(kt)

<module 'ds_tools' from 'E:\\working\\pet-project\\interview\\krisztian_kapolnyi\\task\\ds-mar-24\\src\\ds_tools.py'>

Variables

In [72]:
path_to_dataset = 'dataset/use_case_employee-attrition.csv'

Read dataset

In [73]:
df = pd.read_csv(path_to_dataset)
df = kt.ktools.reduce_mem_usage(df)

Drop feature with no information and low explanatory power

In [74]:
df = df.drop(['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber', 'Gender'], axis=1)

In [75]:
df['Attrition'] = (df['Attrition'] == 'Yes').astype(int)

Feature engineering

In [76]:
one_hot_encoded = pd.get_dummies(df.select_dtypes(include=['object']))

In [77]:
target = df['Attrition']

In [78]:
num_df = df.select_dtypes(include=[np.number]).drop(['Attrition'], axis=1)

Remove highly correlated features

In [79]:
corr_threashold = .7

In [80]:
num_corrs = num_df.corr().abs()
num_corrs = ((num_corrs > corr_threashold) & (num_corrs != 1))
num_corrs = num_corrs.stack()[num_corrs.stack() == True]
num_corrs

JobLevel              MonthlyIncome           True
                      TotalWorkingYears       True
MonthlyIncome         JobLevel                True
                      TotalWorkingYears       True
PercentSalaryHike     PerformanceRating       True
PerformanceRating     PercentSalaryHike       True
TotalWorkingYears     JobLevel                True
                      MonthlyIncome           True
YearsAtCompany        YearsInCurrentRole      True
                      YearsWithCurrManager    True
YearsInCurrentRole    YearsAtCompany          True
                      YearsWithCurrManager    True
YearsWithCurrManager  YearsAtCompany          True
                      YearsInCurrentRole      True
dtype: bool

In [81]:
for col_a, col_b in num_corrs.index[:int(num_corrs.index.shape[0] / 2)]:
    col_a_corr = df.loc[:, ['Attrition', col_a]].corr().abs().iloc[0,-1]
    col_b_corr = df.loc[:, ['Attrition', col_b]].corr().abs().iloc[0,-1]
    try:
        col_to_rm = col_b if col_a_corr >= col_b_corr else col_a
        num_df = num_df.drop(col_to_rm, axis=1)
    except KeyError:
        print(col_to_rm, 'already removed')

MonthlyIncome already removed
MonthlyIncome already removed
PerformanceRating already removed
JobLevel already removed


Cleared Dataset

In [82]:
_X = pd.concat([one_hot_encoded, num_df], axis=1) # df.drop(['Attrition'],axis=1)
_y = df['Attrition']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(_X, _y, test_size=0.3, random_state=42, stratify=_y)

Baseline model feature importances

In [84]:
baseline_tree_model = tree.DecisionTreeClassifier()

In [85]:
baseline_tree_model.fit(X_train, y_train)

In [86]:
tree_importances = pd.DataFrame(
    baseline_tree_model.feature_importances_,
    index=baseline_tree_model.feature_names_in_,
    columns=['importance']
    )
tree_importances = tree_importances.sort_values("importance", ascending=False) 

In [87]:
features_to_remove_by_tree_feat_imp = tree_importances.importance[tree_importances.importance == 0].index

In [88]:
features_to_remove_by_tree_feat_imp

Index(['BusinessTravel_Travel_Frequently', 'EducationField_Human Resources',
       'Department_Research & Development', 'EducationField_Medical',
       'EducationField_Life Sciences', 'MaritalStatus_Married',
       'EducationField_Other', 'JobRole_Human Resources', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'BusinessTravel_Non-Travel'],
      dtype='object')

VIF features

In [90]:
vif_threshold = 10

In [91]:
X_train__ = X_train_.copy()

In [92]:
def calculate_vif(df):
    vif_data = df.copy() # vif_data = add_constant(df)
    vif_values = pd.Series([variance_inflation_factor(vif_data.values, i) 
                            for i in range(vif_data.shape[1])],
                           index=vif_data.columns)
    return vif_values

In [93]:
features_to_remove_by_vip = list()
while True:
    vif_values = calculate_vif(X_train__)
    max_vif_feature = vif_values.idxmax()
    max_vif = vif_values[max_vif_feature]
    if max_vif > vif_threshold:
        print(f"Removing {max_vif_feature} (VIF: {max_vif})")
        X_train__ = X_train__.drop(columns=max_vif_feature)
        features_to_remove_by_vip.append(max_vif_feature)
    else:
        break

Removing OverTime_No (VIF: 97.01599925043404)
Removing Department_Sales (VIF: 15.100830416256251)
Removing JobInvolvement (VIF: 14.609369677774701)
Removing WorkLifeBalance (VIF: 14.420354798842265)


Shap 0 features

In [94]:
explainer = shap.TreeExplainer(baseline_tree_model)
shap_values = explainer.shap_values(X_train)
abs_sv = np.abs(shap_values)
avg_feature_importance_per_class = np.mean(abs_sv, axis=(0,2))

In [95]:
features_to_remove_by_shap = list()
shap_imps = pd.DataFrame(
    avg_feature_importance_per_class,
    index=baseline_tree_model.feature_names_in_,
    columns=['importance']
    )
shap_imps = shap_imps.sort_values("importance", ascending=False) 

In [96]:
features_to_remove_by_shap = shap_imps.importance[shap_imps.importance == 0].index
features_to_remove_by_shap

Index(['BusinessTravel_Travel_Frequently', 'EducationField_Human Resources',
       'Department_Research & Development', 'EducationField_Other',
       'EducationField_Life Sciences', 'EducationField_Medical',
       'MaritalStatus_Married', 'JobRole_Human Resources', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'BusinessTravel_Non-Travel'],
      dtype='object')

In [114]:
features_to_remove_by_corr = list(features_to_remove_by_tree_feat_imp) + list(features_to_remove_by_vip) + list(features_to_remove_by_shap)

In [115]:
features_to_remove_by_corr

['BusinessTravel_Travel_Frequently',
 'EducationField_Human Resources',
 'Department_Research & Development',
 'EducationField_Medical',
 'EducationField_Life Sciences',
 'MaritalStatus_Married',
 'EducationField_Other',
 'JobRole_Human Resources',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'BusinessTravel_Non-Travel',
 'OverTime_No',
 'Department_Sales',
 'JobInvolvement',
 'WorkLifeBalance',
 'BusinessTravel_Travel_Frequently',
 'EducationField_Human Resources',
 'Department_Research & Development',
 'EducationField_Other',
 'EducationField_Life Sciences',
 'EducationField_Medical',
 'MaritalStatus_Married',
 'JobRole_Human Resources',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'BusinessTravel_Non-Travel']

Recursive feature elimination

In [104]:
from sklearn.feature_selection import RFE

rfe = RFE(baseline_tree_model)
rfe = rfe.fit(X_train,y_train)
print(rfe.support_)
print(rfe.ranking_)

[False False False False  True False False False False False False  True
 False False False False False False  True  True False False False  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True False False  True]
[24 22 21 15  1 13  4 11  8 10  9  1  7 17 19 18 12 16  1  1  2  6 20  1
 23  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 14  1  1  5  3  1]


In [108]:
features_to_remove_by_rfe = baseline_tree_model.feature_names_in_[~rfe.support_]

In [116]:
len(set(features_to_remove_by_corr))

16

In [117]:
len(set(features_to_remove_by_rfe))

23

In [119]:
features_to_remove = set(features_to_remove_by_corr).intersection(set(features_to_remove_by_rfe))
len(features_to_remove)

13

Filter out

In [120]:
X_train_ = X_train[[c for c in X_train.columns if c not in features_to_remove]]
X_test_ = X_test[[c for c in X_test.columns if c not in features_to_remove]]

In [126]:
with open('dataset/trainable.p', 'wb') as f:
    pickle.dump(
                {
                    'pd_version' : pd.__version__,
                    'X_train' : X_train_,
                    'X_test' : X_test_,
                    'y_train' : y_train,
                    'y_test' : y_test,
                }, f)