In [1]:
import plotly.express as px
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib as mp
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import pearsonr
colors=['#BC5308', '#FFECD1', '#C5CAB8', '#FF7D00', '#8AA79F', '#FFB569', '#15616D', '#001524']
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("Hr_24.csv")

In [3]:
df.head(3)
df = df.drop(columns = ["Unnamed: 0"])

In [4]:
df1= pd.get_dummies(data=df, columns=["department", "salary_range", "difference_groups", "satisfaction_groups"])
df1.columns

Index(['satisfaction_level', 'last_evaluation_score', 'project_count',
       'average_monthly_hours', 'years_of_working', 'workplace_accident_count',
       'employment_status', 'promotion_last_5years', 'Difference_in_scores',
       'department_IT', 'department_RandD', 'department_accounting',
       'department_hr', 'department_management', 'department_marketing',
       'department_product_mng', 'department_sales', 'department_support',
       'department_technical', 'salary_range_high', 'salary_range_low',
       'salary_range_medium', 'difference_groups_-0.19 - 0',
       'difference_groups_-0.59 - -0.2', 'difference_groups_-1 - -0.6',
       'difference_groups_0 - 0.19', 'difference_groups_0.2 - 0.59',
       'difference_groups_0.6 - 1', 'satisfaction_groups_0-0.2',
       'satisfaction_groups_0.3-0.4', 'satisfaction_groups_0.5-0.6',
       'satisfaction_groups_0.7-0.8', 'satisfaction_groups_0.9-1'],
      dtype='object')

## Ensemble learning - Badding

In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X3 = df1.drop("employment_status", axis=1)  
Y3 = df1["employment_status"] 
smote = SMOTE(sampling_strategy='minority', random_state=1)
X_sm, Y_sm = smote.fit_resample(X3, Y3)

x4_train, x4_test, y4_train, y4_test = train_test_split(X_sm, Y_sm, test_size=0.3, random_state=15, stratify=Y_sm)

# Decision tree
tree = DecisionTreeClassifier(criterion='entropy', 
                              random_state=1, 
                              max_depth=None)
#Bagging classifier
bag = BaggingClassifier(estimator=tree,
                        n_estimators=10,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        random_state=1)


bag.fit(x4_train, y4_train)
y_pred_proba = bag.predict_proba(x4_test)[:, 1] 
y_pred = bag.predict(x4_test)  

#cofirming the accuracy
roc_auc = roc_auc_score(y4_test, y_pred_proba)
accuracy = accuracy_score(y4_test, y_pred)

# confusion matrix
conf_matrix = confusion_matrix(y4_test, y_pred)

# Generate classification report
class_report = classification_report(y4_test, y_pred)
print(f'ROC AUC Score: {roc_auc:.2f}')
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

ROC AUC Score: 1.00
Accuracy: 0.99
Confusion Matrix:
[[2775   11]
 [  71 2714]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2786
           1       1.00      0.97      0.99      2785

    accuracy                           0.99      5571
   macro avg       0.99      0.99      0.99      5571
weighted avg       0.99      0.99      0.99      5571



## Ensemble learning - Adaptive Boost (AdaBoost)

In [6]:
#split and balancing data

X3 = df1.drop("employment_status", axis=1)  
Y3 = df1["employment_status"] 
smote = SMOTE(sampling_strategy='minority', random_state=1)
X_sm, Y_sm = smote.fit_resample(X3, Y3)

x4_train, x4_test, y4_train, y4_test = train_test_split(X_sm, Y_sm, test_size=0.3, random_state=15, stratify=Y_sm)
from sklearn.ensemble import AdaBoostClassifier
tree = DecisionTreeClassifier(criterion='entropy', 
                              random_state=1, 
                              max_depth=1)  

# AdaBoost classifier
ada = AdaBoostClassifier(estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

# Fit the AdaBoost classifier
ada.fit(x4_train, y4_train)

# Make predictions
y_pred_proba = ada.predict_proba(x4_test)[:, 1] 
y_pred = ada.predict(x4_test)  

# Calculate metrics
roc_auc = roc_auc_score(y4_test, y_pred_proba)
accuracy = accuracy_score(y4_test, y_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y4_test, y_pred)

# Generate classification report
class_report = classification_report(y4_test, y_pred)

# Print results
print(f'ROC AUC Score: {roc_auc:.2f}')
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

ROC AUC Score: 0.99
Accuracy: 0.95
Confusion Matrix:
[[2682  104]
 [ 150 2635]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      2786
           1       0.96      0.95      0.95      2785

    accuracy                           0.95      5571
   macro avg       0.95      0.95      0.95      5571
weighted avg       0.95      0.95      0.95      5571



## Ensemble learning - Extreme gradient boosting - (XGBoost)

In [17]:
#!pip install XGBoost==1.5.0

In [21]:
!pip install --upgrade pandas 
!pip install --upgrade xgboost



In [24]:
import pandas as pd
import xgboost as xgb

print(f'pandas version: {pd.__version__}')
print(f'xgboost version: {xgb.__version__}')

pandas version: 2.1.4
xgboost version: 1.5.0


In [23]:

import xgboost as xgb
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

if not hasattr(pd, 'Int64Index'):
    pd.Int64Index = pd.Index

#split and balancing data

X3 = df1.drop("employment_status", axis=1)  
Y3 = df1["employment_status"] 
smote = SMOTE(sampling_strategy='minority', random_state=1)
X_sm, Y_sm = smote.fit_resample(X3, Y3)

x4_train, x4_test, y4_train, y4_test = train_test_split(X_sm, Y_sm, test_size=0.3, random_state=15, stratify=Y_sm)

# Initialize the model
model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,  
    max_depth=4,
    random_state=1,
    use_label_encoder=False)

# Train the model
x_model = model.fit(x4_train, y4_train)

# Predictions
y4_train_pred = x_model.predict(x4_train)
y4_test_pred = x_model.predict(x4_test)

# Calculate accuracy
x_model_train = accuracy_score(y4_train, y4_train_pred)
x_model_test = accuracy_score(y4_test, y4_test_pred)

# Print the results
print(f'XGBoost train/test accuracies: {x_model_train:.3f}/{x_model_test:.3f}')





XGBoost train/test accuracies: 0.984/0.982


In [26]:
# Calculate confusion matrix
conf_matrix = confusion_matrix(y4_test, y4_test_pred)
print (conf_matrix)
# Generate classification report
class_report = classification_report(y4_test, y4_test_pred)
print (class_report)

[[2767   19]
 [  83 2702]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2786
           1       0.99      0.97      0.98      2785

    accuracy                           0.98      5571
   macro avg       0.98      0.98      0.98      5571
weighted avg       0.98      0.98      0.98      5571

