In [2]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [4]:
# Location of the cleaned attrition CSV. Set the ATTRITION_DATA_PATH environment
# variable to run this notebook on another machine; the original local path is
# kept as the fallback so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "ATTRITION_DATA_PATH",
    r"C:\Users\shali\Downloads\Attrition_cleaned_data_for ML.csv",
)
data = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first rows to sanity-check the column names and encoded values.
data.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,23,2,624,2,0,1,1,1,0,64,...,0,0,8,0,0,6,4,0,5,1
1,31,1,113,1,7,0,1,2,1,31,...,3,1,10,3,2,10,7,1,7,0
2,19,2,805,1,1,1,4,3,1,62,...,1,0,7,3,2,0,0,0,0,1
3,15,1,820,1,2,3,1,3,0,26,...,2,0,8,3,2,8,7,3,0,0
4,9,2,312,1,1,0,3,0,1,10,...,3,1,6,3,2,2,2,2,2,0


In [6]:
# (rows, columns) of the loaded table.
data.shape

(2272, 31)

In [7]:
# Separate the predictors from the target.
# 'Unnamed: 0' (visible in the data.head() preview) looks like the row index that
# was written out together with the CSV. It identifies a row rather than
# describing an employee, so it must not be offered to the models as a feature.
# errors='ignore' keeps this cell working if the column is not present.
# NOTE: metrics recorded further down were produced with that column included;
# re-run the model cells to refresh them.
X = data.drop(columns='Attrition').drop(columns='Unnamed: 0', errors='ignore')
y = data['Attrition']

# 70/30 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then reuse its statistics for the
# test rows so no information from the test set reaches the scaler.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
# Full feature matrix expressed on the training-fitted scale.
X_std = scaler.transform(X)

In [8]:
def calculate_percentage(data):
    """Return the percentage of rows labelled 0 (stay) and 1 (leave).

    Parameters
    ----------
    data : pandas.Series
        Class labels; rows equal to 0 count as "stay" and rows equal to 1 as
        "leave".

    Returns
    -------
    tuple
        ``(stay_percentage, leave_percentage)``, each relative to the total
        number of rows in ``data``. A label that never occurs yields 0.0, and
        an empty input yields ``(0.0, 0.0)``.
    """
    total = data.shape[0]
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0, 0.0

    # Count once and look labels up with a default so that a split containing
    # only one class reports 0% for the other instead of raising KeyError.
    counts = data.value_counts()
    stay_percentage = (counts.get(0, 0) / total) * 100
    leave_percentage = (counts.get(1, 0) / total) * 100
    return stay_percentage, leave_percentage

# Report the class balance of each split as percentages.
train_stay, train_leave = calculate_percentage(y_train)
print("TRAIN", f"Stay: {train_stay:.2f}%", f"Leave: {train_leave:.2f}%", sep="\n")

test_stay, test_leave = calculate_percentage(y_test)
print("\nTEST", f"Stay: {test_stay:.2f}%", f"Leave: {test_leave:.2f}%", sep="\n")


TRAIN
Stay: 50.00%
Leave: 50.00%

TEST
Stay: 50.00%
Leave: 50.00%


In [9]:
def evaluate(model, X_train, X_test, y_train, y_test):
    """Print a fitted classifier's metrics on the training and the test split.

    For each split the confusion matrix, the accuracy (four decimals) and the
    classification report (rendered as a DataFrame) are printed, training
    first and testing second. Nothing is returned.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, X_test : feature matrices passed to ``model.predict``
    y_train, y_test : true labels matching the respective feature matrix
    """
    # Predict for the test split first, then the training split.
    predicted = {
        "TESTING": model.predict(X_test),
        "TRAINING": model.predict(X_train),
    }

    # (label, true labels, text printed ahead of the section header)
    sections = (
        ("TRAINING", y_train, ""),
        ("TESTING", y_test, "\n"),
    )
    for label, y_true, lead in sections:
        y_pred = predicted[label]

        print(f"{lead}{label} RESULTS:")
        matrix = confusion_matrix(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        report = classification_report(y_true, y_pred, output_dict=True)

        print(f"{label} CONFUSION MATRIX:\n{matrix}")
        print(f"{label} ACCURACY SCORE:\n{accuracy:.4f}")
        print(f"{label} CLASSIFICATION REPORT:\n{pd.DataFrame(report)}")


### LogisticRegression

In [10]:
# L1-penalised logistic regression on the standardised features.
# scikit-learn documents random_state as being used by the liblinear solver,
# so it is pinned (same value as the train/test split) to make reruns repeatable.
lr_clf = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
lr_clf.fit(X_train_std, y_train)

evaluate(lr_clf, X_train_std, X_test_std, y_train, y_test)

TRAINING RESULTS:
TRAINING CONFUSION MATRIX:
[[670 125]
 [123 672]]
TRAINING ACCURACY SCORE:
0.8440
TRAINING CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.844893    0.843162  0.844025     0.844027      0.844027
recall       0.842767    0.845283  0.844025     0.844025      0.844025
f1-score     0.843829    0.844221  0.844025     0.844025      0.844025
support    795.000000  795.000000  0.844025  1590.000000   1590.000000

TESTING RESULTS:
TESTING CONFUSION MATRIX:
[[290  51]
 [ 57 284]]
TESTING ACCURACY SCORE:
0.8416
TESTING CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.835735    0.847761  0.841642    0.841748      0.841748
recall       0.850440    0.832845  0.841642    0.841642      0.841642
f1-score     0.843023    0.840237  0.841642    0.841630      0.841630
support    341.000000  341.000000  0.841642  682.000000    682.000000


### RandomForest

In [11]:
# Random forest on the unscaled features (tree splits do not depend on scale).
# With bootstrap=False every tree is built from the whole training set, but the
# forest is still randomised, so the seed is pinned to make the reported
# metrics reproducible on rerun.
rf_clf = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=42)
rf_clf.fit(X_train, y_train)

evaluate(rf_clf, X_train, X_test, y_train, y_test)

TRAINING RESULTS:
TRAINING CONFUSION MATRIX:
[[795   0]
 [  0 795]]
TRAINING ACCURACY SCORE:
1.0000
TRAINING CLASSIFICATION REPORT:
               0      1  accuracy  macro avg  weighted avg
precision    1.0    1.0       1.0        1.0           1.0
recall       1.0    1.0       1.0        1.0           1.0
f1-score     1.0    1.0       1.0        1.0           1.0
support    795.0  795.0       1.0     1590.0        1590.0

TESTING RESULTS:
TESTING CONFUSION MATRIX:
[[321  20]
 [ 36 305]]
TESTING ACCURACY SCORE:
0.9179
TESTING CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.899160    0.938462  0.917889    0.918811      0.918811
recall       0.941349    0.894428  0.917889    0.917889      0.917889
f1-score     0.919771    0.915916  0.917889    0.917843      0.917843
support    341.000000  341.000000  0.917889  682.000000    682.000000


### DecisionTree

In [12]:
# Single decision tree, regularised through depth and minimum-sample limits.
# The tree builder is randomised internally, so the seed is pinned to keep the
# fitted tree (and the metrics below) identical across reruns.
dt_clf = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
)
dt_clf.fit(X_train, y_train)

evaluate(dt_clf, X_train, X_test, y_train, y_test)

TRAINING RESULTS:
TRAINING CONFUSION MATRIX:
[[679 116]
 [ 95 700]]
TRAINING ACCURACY SCORE:
0.8673
TRAINING CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.877261    0.857843  0.867296     0.867552      0.867552
recall       0.854088    0.880503  0.867296     0.867296      0.867296
f1-score     0.865519    0.869025  0.867296     0.867272      0.867272
support    795.000000  795.000000  0.867296  1590.000000   1590.000000

TESTING RESULTS:
TESTING CONFUSION MATRIX:
[[272  69]
 [ 79 262]]
TESTING ACCURACY SCORE:
0.7830
TESTING CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.774929    0.791541  0.782991    0.783235      0.783235
recall       0.797654    0.768328  0.782991    0.782991      0.782991
f1-score     0.786127    0.779762  0.782991    0.782945      0.782945
support    341.000000  341.000000  0.782991  682.000000    682.000000


### SVM

In [13]:
# Linear-kernel support vector classifier, trained and scored on the
# standardised features. fit() returns the estimator itself, so construction
# and training can be chained.
svm_clf = SVC(kernel='linear').fit(X_train_std, y_train)

evaluate(
    svm_clf,
    X_train_std,
    X_test_std,
    y_train,
    y_test,
)

TRAINING RESULTS:
TRAINING CONFUSION MATRIX:
[[675 120]
 [124 671]]
TRAINING ACCURACY SCORE:
0.8465
TRAINING CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.844806    0.848293  0.846541     0.846550      0.846550
recall       0.849057    0.844025  0.846541     0.846541      0.846541
f1-score     0.846926    0.846154  0.846541     0.846540      0.846540
support    795.000000  795.000000  0.846541  1590.000000   1590.000000

TESTING RESULTS:
TESTING CONFUSION MATRIX:
[[294  47]
 [ 58 283]]
TESTING ACCURACY SCORE:
0.8460
TESTING CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.835227    0.857576  0.846041    0.846402      0.846402
recall       0.862170    0.829912  0.846041    0.846041      0.846041
f1-score     0.848485    0.843517  0.846041    0.846001      0.846001
support    341.000000  341.000000  0.846041  682.000000    682.000000


###  AdaBoostClassifier

In [14]:
# AdaBoost with scikit-learn's default settings on the unscaled features.
# random_state is pinned so that the boosted ensemble, and therefore the
# metrics printed below, are reproducible on rerun.
ab_clf = AdaBoostClassifier(random_state=42)
ab_clf.fit(X_train, y_train)

evaluate(ab_clf, X_train, X_test, y_train, y_test)

TRAINING RESULTS:
TRAINING CONFUSION MATRIX:
[[707  88]
 [ 83 712]]
TRAINING ACCURACY SCORE:
0.8925
TRAINING CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.894937    0.890000  0.892453     0.892468      0.892468
recall       0.889308    0.895597  0.892453     0.892453      0.892453
f1-score     0.892114    0.892790  0.892453     0.892452      0.892452
support    795.000000  795.000000  0.892453  1590.000000   1590.000000

TESTING RESULTS:
TESTING CONFUSION MATRIX:
[[295  46]
 [ 46 295]]
TESTING ACCURACY SCORE:
0.8651
TESTING CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.865103    0.865103  0.865103    0.865103      0.865103
recall       0.865103    0.865103  0.865103    0.865103      0.865103
f1-score     0.865103    0.865103  0.865103    0.865103      0.865103
support    341.000000  341.000000  0.865103  682.000000    682.000000
