In [69]:
import sys; sys.path.append('src')
import ds_tools as kt
import pandas as pd
import numpy as np
import pickle
import scipy.stats 
from sklearn import tree
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.tools.tools import add_constant
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import shap

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

In [4]:
import importlib
importlib.reload(kt)

<module 'ds_tools' from 'E:\\working\\pet-project\\interview\\krisztian_kapolnyi\\task\\ds-mar-24\\src\\ds_tools.py'>

Variables

In [5]:
path_to_dataset = 'dataset/use_case_employee-attrition.csv'

Read dataset

In [6]:
df = pd.read_csv(path_to_dataset)
df = kt.ktools.reduce_mem_usage(df)

Drop feature with no information and low explanatory power

In [7]:
df = df.drop(['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'], axis=1)

In [8]:
df['Attrition'] = (df['Attrition'] == 'Yes').astype(int)

Catboost feature importances

In [55]:
X_train.select_dtypes('object').columns

Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'OverTime'],
      dtype='object')

In [73]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Attrition', axis=1), 
                                                    (df['Attrition'] == 1).astype(int), 
                                                    test_size=0.2, 
                                                    random_state=42)

label_encoders = {}
cat_features = X_train.select_dtypes('object').columns
for feature in cat_features:
    label_encoders[feature] = LabelEncoder()
    X_train[feature] = label_encoders[feature].fit_transform(X_train[feature])
    X_test[feature] = label_encoders[feature].transform(X_test[feature])

model = CatBoostClassifier(iterations=1000, 
                           learning_rate=0.1, 
                           depth=6, 
                           loss_function='Logloss',
                           verbose=100
                           )

model.fit(X_train, y_train, cat_features=[])

y_pred = model.predict(X_test)

0:	learn: 0.6173412	total: 5.03ms	remaining: 5.02s
100:	learn: 0.1130349	total: 148ms	remaining: 1.32s
200:	learn: 0.0393024	total: 293ms	remaining: 1.16s
300:	learn: 0.0194475	total: 435ms	remaining: 1.01s
400:	learn: 0.0120829	total: 580ms	remaining: 866ms
500:	learn: 0.0085805	total: 722ms	remaining: 719ms
600:	learn: 0.0064286	total: 866ms	remaining: 575ms
700:	learn: 0.0052320	total: 1.01s	remaining: 429ms
800:	learn: 0.0043437	total: 1.15s	remaining: 285ms
900:	learn: 0.0037647	total: 1.28s	remaining: 141ms
999:	learn: 0.0034505	total: 1.42s	remaining: 0us


In [66]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[252   3]
 [ 28  11]]


In [65]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       255
           1       0.79      0.28      0.42        39

    accuracy                           0.89       294
   macro avg       0.84      0.64      0.68       294
weighted avg       0.88      0.89      0.87       294



In [76]:
import pandas as pd
from catboost import CatBoostClassifier, Pool, cv

train_pool = Pool(data=X_train, label=y_train, cat_features=[])


params = {
    'iterations': 1000, 
    'learning_rate': 0.1,  
    'depth': 6, 
    'loss_function': 'Logloss', 
    'verbose': 100  
}


cv_results = cv(pool=train_pool,
                params=params,
                fold_count=5, 
                shuffle=True,
                partition_random_seed=42,  
                plot=True,  
                stratified=True  
                )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 0.6300225	test: 0.6361853	best: 0.6361853 (0)	total: 1.98ms	remaining: 1.97s
100:	learn: 0.0991939	test: 0.3436873	best: 0.3308114 (59)	total: 157ms	remaining: 1.4s
200:	learn: 0.0354917	test: 0.3698663	best: 0.3308114 (59)	total: 307ms	remaining: 1.22s
300:	learn: 0.0172575	test: 0.4015687	best: 0.3308114 (59)	total: 457ms	remaining: 1.06s
400:	learn: 0.0106836	test: 0.4197446	best: 0.3308114 (59)	total: 617ms	remaining: 921ms
500:	learn: 0.0074189	test: 0.4327765	best: 0.3308114 (59)	total: 784ms	remaining: 781ms
600:	learn: 0.0055524	test: 0.4497346	best: 0.3308114 (59)	total: 949ms	remaining: 630ms
700:	learn: 0.0045272	test: 0.4620458	best: 0.3308114 (59)	total: 1.12s	remaining: 479ms
800:	learn: 0.0037519	test: 0.4731163	best: 0.3308114 (59)	total: 1.27s	remaining: 317ms
900:	learn: 0.0031207	test: 0.4844748	best: 0.3308114 (59)	total: 1.45s	remaining: 160ms
999:	learn: 0.0026702	test: 0.4945141	best: 0.3308114 (59)	total: 1.6s	remaining: 0us

bes

In [81]:
# Get feature importances
feature_importances = model.get_feature_importance(type='FeatureImportance')

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [82]:
importance_df

Unnamed: 0,Feature,Importance
18,OverTime,7.963267
15,MonthlyIncome,6.567036
0,Age,5.533873
13,JobSatisfaction,4.804943
9,HourlyRate,4.754203
4,DistanceFromHome,4.428343
16,MonthlyRate,4.398859
12,JobRole,4.205455
2,DailyRate,4.205135
7,EnvironmentSatisfaction,4.204168


Feature engineering

In [9]:
one_hot_encoded = pd.get_dummies(df.select_dtypes(include=['object']))

In [10]:
target = df['Attrition']

In [11]:
num_df = df.select_dtypes(include=[np.number]).drop(['Attrition'], axis=1)

Remove highly correlated features

In [12]:
corr_threashold = .7

In [13]:
num_corrs = num_df.corr().abs()
num_corrs = ((num_corrs > corr_threashold) & (num_corrs != 1))
num_corrs = num_corrs.stack()[num_corrs.stack() == True]
num_corrs

JobLevel              MonthlyIncome           True
                      TotalWorkingYears       True
MonthlyIncome         JobLevel                True
                      TotalWorkingYears       True
PercentSalaryHike     PerformanceRating       True
PerformanceRating     PercentSalaryHike       True
TotalWorkingYears     JobLevel                True
                      MonthlyIncome           True
YearsAtCompany        YearsInCurrentRole      True
                      YearsWithCurrManager    True
YearsInCurrentRole    YearsAtCompany          True
                      YearsWithCurrManager    True
YearsWithCurrManager  YearsAtCompany          True
                      YearsInCurrentRole      True
dtype: bool

In [14]:
for col_a, col_b in num_corrs.index[:int(num_corrs.index.shape[0] / 2)]:
    col_a_corr = df.loc[:, ['Attrition', col_a]].corr().abs().iloc[0,-1]
    col_b_corr = df.loc[:, ['Attrition', col_b]].corr().abs().iloc[0,-1]
    try:
        col_to_rm = col_b if col_a_corr >= col_b_corr else col_a
        num_df = num_df.drop(col_to_rm, axis=1)
    except KeyError:
        print(col_to_rm, 'already removed')

MonthlyIncome already removed
MonthlyIncome already removed
PerformanceRating already removed
JobLevel already removed


Cleared Dataset

In [15]:
_X = pd.concat([one_hot_encoded, num_df], axis=1) # df.drop(['Attrition'],axis=1)
_y = df['Attrition']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(_X, _y, test_size=0.3, random_state=42, stratify=_y)

Baseline model feature importances

In [17]:
baseline_tree_model = tree.DecisionTreeClassifier()

In [18]:
baseline_tree_model.fit(X_train, y_train)

In [19]:
tree_importances = pd.DataFrame(
    baseline_tree_model.feature_importances_,
    index=baseline_tree_model.feature_names_in_,
    columns=['importance']
    )
tree_importances = tree_importances.sort_values("importance", ascending=False) 

In [20]:
features_to_remove_by_tree_feat_imp = tree_importances.importance[tree_importances.importance == 0].index

In [21]:
features_to_remove_by_tree_feat_imp

Index(['TrainingTimesLastYear', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'JobRole_Research Director',
       'JobRole_Manufacturing Director', 'JobRole_Manager',
       'JobRole_Laboratory Technician', 'JobRole_Human Resources',
       'EducationField_Other', 'EducationField_Medical',
       'EducationField_Life Sciences', 'EducationField_Human Resources',
       'BusinessTravel_Travel_Rarely', 'MaritalStatus_Married'],
      dtype='object')

VIF features

In [22]:
vif_threshold = 10

In [23]:
X_train__ = X_train.copy()

In [24]:
def calculate_vif(df):
    vif_data = df.copy() # vif_data = add_constant(df)
    vif_values = pd.Series([variance_inflation_factor(vif_data.values, i) 
                            for i in range(vif_data.shape[1])],
                           index=vif_data.columns)
    return vif_values

In [25]:
features_to_remove_by_vip = list()
while True:
    vif_values = calculate_vif(X_train__)
    max_vif_feature = vif_values.idxmax()
    max_vif = vif_values[max_vif_feature]
    if max_vif > vif_threshold:
        print(f"Removing {max_vif_feature} (VIF: {max_vif})")
        X_train__ = X_train__.drop(columns=max_vif_feature)
        features_to_remove_by_vip.append(max_vif_feature)
    else:
        break

Removing BusinessTravel_Non-Travel (VIF: inf)


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


Removing Department_Human Resources (VIF: inf)
Removing EducationField_Human Resources (VIF: inf)
Removing Gender_Female (VIF: inf)


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


Removing JobRole_Healthcare Representative (VIF: inf)
Removing MaritalStatus_Divorced (VIF: inf)
Removing OverTime_No (VIF: 257.5473615734532)


  vif = 1. / (1. - r_squared_i)


Removing Department_Research & Development (VIF: 91.34014046052862)
Removing EducationField_Life Sciences (VIF: 23.407197357446314)
Removing Department_Sales (VIF: 19.660474581046582)
Removing JobInvolvement (VIF: 15.18646314797693)
Removing WorkLifeBalance (VIF: 14.919347414275538)


Shap 0 features

In [26]:
explainer = shap.TreeExplainer(baseline_tree_model)
shap_values = explainer.shap_values(X_train)
abs_sv = np.abs(shap_values)
avg_feature_importance_per_class = np.mean(abs_sv, axis=(0,2))

In [27]:
features_to_remove_by_shap = list()
shap_imps = pd.DataFrame(
    avg_feature_importance_per_class,
    index=baseline_tree_model.feature_names_in_,
    columns=['importance']
    )
shap_imps = shap_imps.sort_values("importance", ascending=False) 

In [28]:
features_to_remove_by_shap = shap_imps.importance[shap_imps.importance == 0].index
features_to_remove_by_shap

Index(['TrainingTimesLastYear', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'JobRole_Research Director',
       'JobRole_Manufacturing Director', 'JobRole_Manager',
       'JobRole_Laboratory Technician', 'JobRole_Human Resources',
       'EducationField_Other', 'EducationField_Medical',
       'EducationField_Life Sciences', 'EducationField_Human Resources',
       'BusinessTravel_Travel_Rarely', 'MaritalStatus_Married'],
      dtype='object')

In [29]:
features_to_remove_by_corr = list(features_to_remove_by_tree_feat_imp) + list(features_to_remove_by_vip) + list(features_to_remove_by_shap)

In [30]:
features_to_remove_by_corr

['TrainingTimesLastYear',
 'BusinessTravel_Non-Travel',
 'BusinessTravel_Travel_Frequently',
 'JobRole_Research Director',
 'JobRole_Manufacturing Director',
 'JobRole_Manager',
 'JobRole_Laboratory Technician',
 'JobRole_Human Resources',
 'EducationField_Other',
 'EducationField_Medical',
 'EducationField_Life Sciences',
 'EducationField_Human Resources',
 'BusinessTravel_Travel_Rarely',
 'MaritalStatus_Married',
 'BusinessTravel_Non-Travel',
 'Department_Human Resources',
 'EducationField_Human Resources',
 'Gender_Female',
 'JobRole_Healthcare Representative',
 'MaritalStatus_Divorced',
 'OverTime_No',
 'Department_Research & Development',
 'EducationField_Life Sciences',
 'Department_Sales',
 'JobInvolvement',
 'WorkLifeBalance',
 'TrainingTimesLastYear',
 'BusinessTravel_Non-Travel',
 'BusinessTravel_Travel_Frequently',
 'JobRole_Research Director',
 'JobRole_Manufacturing Director',
 'JobRole_Manager',
 'JobRole_Laboratory Technician',
 'JobRole_Human Resources',
 'EducationFiel

Recursive feature elimination

In [31]:
from sklearn.feature_selection import RFE

rfe = RFE(baseline_tree_model)
rfe = rfe.fit(X_train,y_train)
print(rfe.support_)
print(rfe.ranking_)

[False False False False False  True False False False False False  True
 False False False  True False False False False  True  True False False
 False  True  True False  True  True  True False  True  True  True  True
  True  True  True  True  True  True False  True  True  True False  True]
[25 22 23 21 20  1 18 13  7 17 11  1 10  2  9  1 14 19 15 16  1  1  3  5
 24  1  1  8  1  1  1 12  1  1  1  1  1  1  1  1  1  1  6  1  1  1  4  1]


In [32]:
features_to_remove_by_rfe = baseline_tree_model.feature_names_in_[~rfe.support_]

In [33]:
len(set(features_to_remove_by_corr))

23

In [34]:
len(set(features_to_remove_by_rfe))

24

In [35]:
features_to_remove = set(features_to_remove_by_corr).intersection(set(features_to_remove_by_rfe))
len(features_to_remove)

18

In [83]:
features_to_remove

{'BusinessTravel_Non-Travel',
 'BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'Department_Human Resources',
 'Department_Research & Development',
 'EducationField_Human Resources',
 'EducationField_Life Sciences',
 'EducationField_Medical',
 'EducationField_Other',
 'Gender_Female',
 'JobRole_Healthcare Representative',
 'JobRole_Laboratory Technician',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married',
 'TrainingTimesLastYear'}

In [84]:
importance_df

Unnamed: 0,Feature,Importance
18,OverTime,7.963267
15,MonthlyIncome,6.567036
0,Age,5.533873
13,JobSatisfaction,4.804943
9,HourlyRate,4.754203
4,DistanceFromHome,4.428343
16,MonthlyRate,4.398859
12,JobRole,4.205455
2,DailyRate,4.205135
7,EnvironmentSatisfaction,4.204168


Filter out

In [36]:
X_train_ = X_train[[c for c in X_train.columns if c not in features_to_remove]]
X_test_ = X_test[[c for c in X_test.columns if c not in features_to_remove]]

In [37]:
with open('dataset/trainable.p', 'wb') as f:
    pickle.dump(
                {
                    'pd_version' : pd.__version__,
                    'X_train' : X_train_,
                    'X_test' : X_test_,
                    'y_train' : y_train,
                    'y_test' : y_test,
                }, f)