In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification

In [2]:
X, y = make_classification(
    n_samples = 10000,
    n_features = 10,
    n_informative = 1,
    n_clusters_per_class=1,
    random_state = 42
)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
lr =LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

# Evalute Data
print(f'Accuracy Score : {accuracy_score(y_test, y_pred)}')
print(f'Precision Score : {precision_score(y_test, y_pred)}')
print(f'Recall Score : {recall_score(y_test, y_pred)}')
print(f'F1 Score : {f1_score(y_test, y_pred)}')
print(f'Classification Report : \n{classification_report(y_test, y_pred)}')
cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1']
)
print(f'Confusion Matrix \n{cm}')

Accuracy Score : 0.9475
Precision Score : 0.96875
Recall Score : 0.9220512820512821
F1 Score : 0.9448239621650026
Classification Report : 
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1025
           1       0.97      0.92      0.94       975

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

Confusion Matrix 
          Predicted 0  Predicted 1
Actual 0          996           29
Actual 1           76          899


# Dataset Employee Turnover

In [4]:
df = pd.read_csv('employee_turnover.csv')

In [5]:
df.head()

Unnamed: 0,Job_Satisfaction,Performance_Rating,Years_At_Company,Work_Life_Balance,Distance_From_Home,Monthly_Income,Education_Level,Age,Num_Companies_Worked,Employee_Role,Annual_Bonus,Training_Hours,Department,Annual_Bonus_Squared,Annual_Bonus_Training_Hours_Interaction,Employee_Turnover
0,0.562326,0.141129,0.123989,0.347583,0.330353,0.328853,0.600933,0.31599,0.768736,0.090671,0.324786,0.669193,0.602932,0.105486,0.217344,0
1,0.017041,0.559047,0.511203,0.793908,0.42355,0.55345,0.742009,0.897146,0.380035,0.601633,0.694611,0.043271,0.800761,0.482484,0.030056,0
2,0.774699,0.604371,0.798174,0.2605,0.804034,0.1318,0.775178,0.830947,0.218726,0.972936,0.153476,0.701336,0.705275,0.023555,0.107638,1
3,0.628174,0.385249,0.230104,0.516809,0.272248,0.589249,0.482409,0.090507,0.402746,0.132842,0.305973,0.549688,0.600531,0.09362,0.16819,0
4,0.799183,0.199967,0.839029,0.247927,0.341934,0.076818,0.055356,0.68086,0.923341,0.493017,0.844094,0.793751,0.664679,0.712494,0.67,0


In [6]:
df.shape

(1350, 16)

# Without Pipeline

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Employee_Turnover'),
                                                    df['Employee_Turnover'],
                                                    test_size = 0.3,
                                                   random_state = 42)

LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

print(f'Accuracy Score : {accuracy_score(y_test, y_pred)}')
print(f'Precision Score : {precision_score(y_test, y_pred)}')
print(f'Recall Score : {recall_score(y_test, y_pred)}')
print(f'F1 Score : {f1_score(y_test, y_pred)}')
print(f'Classification Report : \n{classification_report(y_test, y_pred)}')
cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=['Actual 0', 'Actual 1'],
    columns = ['Predicted 0', 'Predicted 1']
)
print(f'Confusion Matrix \n{cm}')

Accuracy Score : 0.8518518518518519
Precision Score : 0.8609625668449198
Recall Score : 0.8256410256410256
F1 Score : 0.8429319371727748
Classification Report : 
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       210
           1       0.86      0.83      0.84       195

    accuracy                           0.85       405
   macro avg       0.85      0.85      0.85       405
weighted avg       0.85      0.85      0.85       405

Confusion Matrix 
          Predicted 0  Predicted 1
Actual 0          184           26
Actual 1           34          161


# Using Pipeline

In [8]:
# Pipeline
pipeline = Pipeline(steps=[('Standard Scaler', StandardScaler()),('Logistic Regression', LogisticRegression())])

# Train & Split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Employee_Turnover'),
                                                           df['Employee_Turnover'],
                                                           test_size = 0.2,
                                                           random_state = 42)

# Train Model
pipeline.fit(X_train, y_train)

# Prediction
y_pred = pipeline.predict(X_test)

# Evaluate model
print(f'Accuracy Score : {accuracy_score(y_test, y_pred)}')
print(f'Precision Score : {precision_score(y_test, y_pred)}')
print(f'Recall Score : {recall_score(y_test, y_pred)}')
print(f'F1 Score : {f1_score(y_test, y_pred)}')
print(f'Classification Report : \n{classification_report(y_test, y_pred)}')
cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index = ['Actual 0', 'Actual 1'],
    columns = ['Pridected 0', 'Predicted 1']
)
print(f'Confusion Matrix : \n{cm}')

Accuracy Score : 0.8629629629629629
Precision Score : 0.8666666666666667
Recall Score : 0.832
F1 Score : 0.8489795918367347
Classification Report : 
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       145
           1       0.87      0.83      0.85       125

    accuracy                           0.86       270
   macro avg       0.86      0.86      0.86       270
weighted avg       0.86      0.86      0.86       270

Confusion Matrix : 
          Pridected 0  Predicted 1
Actual 0          129           16
Actual 1           21          104
