In [47]:
import sys; sys.path.append('src')
import ds_tools as kt
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import importlib
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import xgboost as xgb 
import lightgbm as lgb 
importlib.reload(kt)
%matplotlib inline 

In [9]:
path_to_dataset = 'dataset/trainable.p'

with open(path_to_dataset, 'rb') as f:
    dataset = pickle.load(f)
    
X_train, X_test, y_train, y_test = dataset['X_train'], dataset['X_test'], dataset['y_train'], dataset['y_test']

modeling

In [12]:
baseline_model = LogisticRegression()

In [14]:
baseline_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
y_pred_prob = baseline_model.predict_proba(X_test)

In [24]:
def find_optimal_threshold(y_true, y_prob):
    """
    Finds the optimal threshold for binary classification based on the F1 score.
    
    Parameters:
    y_true : array-like, shape (n_samples,)
        True binary labels.
    y_prob : array-like, shape (n_samples,)
        Predicted probabilities for the positive class.
        
    Returns:
    optimal_threshold : float
        Optimal threshold that maximizes the F1 score.
    max_f1_score : float
        Maximum F1 score achieved.
    """
    thresholds = np.linspace(0, 1, 1000) 
    max_f1_score = -np.inf
    optimal_threshold = None
    
    for threshold in thresholds:
        y_pred = (y_prob >= threshold).astype(int) 
        f1 = f1_score(y_true, y_pred) 
        
        if f1 > max_f1_score:
            max_f1_score = f1
            optimal_threshold = threshold
    
    return optimal_threshold, max_f1_score

In [29]:
threshold, max_f1 = find_optimal_threshold(y_test, y_pred_prob[:,1])
threshold, max_f1

(0.20020020020020018, 0.37209302325581395)

In [33]:
conf_matrix = confusion_matrix(y_test, y_pred_prob[:,1] > threshold)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[266 104]
 [ 31  40]]


In [34]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred_prob[:,1] > threshold))


Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.72      0.80       370
           1       0.28      0.56      0.37        71

    accuracy                           0.69       441
   macro avg       0.59      0.64      0.58       441
weighted avg       0.80      0.69      0.73       441



SOTA methods

In [38]:
kfcv = KFold(n_splits = 4)

In [39]:
def k_fold(model):
    mae_fold = []
    
    for train_ind, val_ind in kfcv.split(X):
        x_train, x_val = X[train_ind], X[val_ind]
        y_train, y_val = y[train_ind], y[val_ind]
        evals = [(x_val, y_val)]

        model.fit(x_train, y_train,
                eval_set = evals,
                verbose = False)

        pred = model.predict(x_val)
        mae = mean_absolute_error(y_val, pred)
        mae_fold.append(mae)
        
    return np.mean(mae_fold)

In [48]:
x_train, x_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
evals = [(x_train, y_train_), (x_val, y_val)]

In [56]:
model_dict = {
    'xgb': xgb.XGBClassifier(n_estimators=5000,
                             tree_method='gpu_hist', 
                             objective='binary:logistic'),  
    'lgb': lgb.LGBMClassifier(n_estimators=5000,
                               device='gpu', 
                               objective='binary',  
                               metric='binary_error')  
}

In [57]:
## XGBoost
model = model_dict['xgb'].fit(x_train, y_train_, eval_set = evals, verbose = 200)
results = model.evals_result()

[0]	validation_0-logloss:0.38358	validation_1-logloss:0.39518



    E.g. tree_method = "hist", device = "cuda"



[200]	validation_0-logloss:0.00810	validation_1-logloss:0.38838
[400]	validation_0-logloss:0.00604	validation_1-logloss:0.40606
[600]	validation_0-logloss:0.00538	validation_1-logloss:0.41221
[800]	validation_0-logloss:0.00501	validation_1-logloss:0.41217
[1000]	validation_0-logloss:0.00478	validation_1-logloss:0.41607
[1200]	validation_0-logloss:0.00464	validation_1-logloss:0.41784
[1400]	validation_0-logloss:0.00456	validation_1-logloss:0.42029
[1600]	validation_0-logloss:0.00451	validation_1-logloss:0.42188
[1800]	validation_0-logloss:0.00445	validation_1-logloss:0.42254
[2000]	validation_0-logloss:0.00441	validation_1-logloss:0.42474
[2200]	validation_0-logloss:0.00437	validation_1-logloss:0.42599
[2400]	validation_0-logloss:0.00435	validation_1-logloss:0.42713
[2600]	validation_0-logloss:0.00432	validation_1-logloss:0.42781
[2800]	validation_0-logloss:0.00430	validation_1-logloss:0.42877
[3000]	validation_0-logloss:0.00428	validation_1-logloss:0.42959
[3200]	validation_0-logloss:0

In [65]:
y_pred_prob = model.predict_proba(X_test)
y_pred = model.predict(X_test)

In [66]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[352  18]
 [ 52  19]]


In [67]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       370
           1       0.51      0.27      0.35        71

    accuracy                           0.84       441
   macro avg       0.69      0.61      0.63       441
weighted avg       0.81      0.84      0.82       441



In [69]:
y_test.value_counts()

0    370
1     71
Name: Attrition, dtype: int64

In [79]:
feature_dict = {'col': model.feature_names_in_, 'val': model.feature_importances_}
feature_df = pd.DataFrame(feature_dict)
feature_df.sort_values('val', ascending=False).head(10)

Unnamed: 0,col,val
7,JobRole_Research Scientist,0.193831
9,JobRole_Sales Representative,0.060378
26,TotalWorkingYears,0.057916
32,YearsWithCurrManager,0.055256
28,WorkLifeBalance,0.046363
12,OverTime_Yes,0.038327
8,JobRole_Sales Executive,0.037029
4,EducationField_Technical Degree,0.03683
1,Department_Human Resources,0.035115
25,StockOptionLevel,0.033045


In [82]:
## Lightgbm
model = model_dict['lgb'].fit(x_train, y_train_, eval_set = evals)
results = model.evals_result_

[LightGBM] [Info] Number of positive: 137, number of negative: 686
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 863
[LightGBM] [Info] Number of data points in the train set: 823, number of used features: 33
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1650, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 22 dense feature groups (0.02 MB) transferred to GPU in 0.002528 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166464 -> initscore=-1.610897
[LightGBM] [Info] Start training from score -1.610897




















In [83]:
y_pred_prob = model.predict_proba(X_test)
y_pred = model.predict(X_test)

In [84]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[353  17]
 [ 51  20]]
