In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model
from sklearn.linear_model import LogisticRegression

# Evaluation & splitting
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Saving the model
import joblib


In [3]:
# Read the cleaned HR table from the working directory and preview its
# first five rows.
df = pd.read_csv(filepath_or_buffer='hr_data_cleaned_for_pipeline')
df.head(n=5)

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobRole,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition_num
0,41,Travel_Rarely,Sales,1,2,Life Sciences,2,3,2,Sales Executive,...,1,0,8,0,1,6,4,0,5,1
1,49,Travel_Frequently,Research & Development,8,1,Life Sciences,3,2,2,Research Scientist,...,4,1,10,3,3,10,7,1,7,0
2,37,Travel_Rarely,Research & Development,2,2,Other,4,2,1,Laboratory Technician,...,2,0,7,3,3,0,0,0,0,1
3,33,Travel_Frequently,Research & Development,3,4,Life Sciences,4,3,1,Research Scientist,...,3,0,8,3,3,8,7,3,0,0
4,27,Travel_Rarely,Research & Development,2,1,Medical,1,3,1,Laboratory Technician,...,4,1,6,3,3,2,2,2,2,0


In [4]:
# The label is the 'Attrition_num' column; every other column is a feature.
y = df['Attrition_num']
X = df.drop(columns='Attrition_num')

# Hold out 20% of the rows for the final evaluation. The split is stratified
# on the label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Group feature names by dtype: integer/float columns are treated as numeric,
# object columns as categorical.
numerical_col = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_col = list(X.select_dtypes(include=['object']).columns)

print('Numerical :', numerical_col)
print('Categorical:', categorical_col)

Numerical : ['Age', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Categorical: ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus']


In [5]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept category
# values at predict time that were not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_col),
    ]
)

# Full model: preprocessing followed by logistic regression.
# random_state is pinned because this same pipeline is reused with solvers
# that shuffle the data (liblinear, saga); without a seed their results vary
# from run to run. It has no effect on the default lbfgs solver used here.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# Baseline: 5-fold cross-validated F1 on the training split only, so the
# held-out test rows stay untouched.
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')
print('f1 scores:', scores)
print('mean f1:', scores.mean())

f1 scores: [0.31578947 0.45283019 0.36363636 0.43137255 0.45614035]
mean f1: 0.40395378517932407


In [6]:
# Inverse regularisation strengths tried for every solver/penalty pairing.
c_values = [0.01, 0.1, 1, 10]

# Each solver is paired only with the penalties it is searched over here;
# lbfgs is limited to l2 because it cannot fit an l1 penalty.
solver_penalties = [
    ('liblinear', ['l1', 'l2']),
    ('lbfgs', ['l2']),
    ('saga', ['l1', 'l2']),
]

# One sub-grid per solver, in the order listed above (8 + 4 + 8 = 20 candidates).
param_grid = [
    {
        'classifier__solver': [solver],
        'classifier__penalty': penalties,
        'classifier__C': c_values,
    }
    for solver, penalties in solver_penalties
]

# Exhaustive search over the grid, scored by 5-fold F1, using every CPU core.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only, then report the winning
# hyper-parameters and their mean cross-validated F1.
grid.fit(X=X_train, y=y_train)

print('Best parameters:', grid.best_params_)
print('Best cross validation score:', grid.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits




Best parameters: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best cross validation score: 0.42075941008144396


In [8]:

# Evaluate the tuned pipeline on the held-out test split.
#
# best_model must be assigned before it is used, otherwise this cell raises
# NameError on a fresh kernel. GridSearchCV (default refit=True) has already
# refitted the winning configuration on all of X_train, so best_estimator_
# can predict immediately and no further .fit() call is needed.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# Headline metrics; precision, recall and F1 are computed for the positive
# class (label 1).
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Per-class breakdown and raw counts (rows = true label, columns = predicted).
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 0.8435374149659864
Precision: 0.5238095238095238
Recall: 0.23404255319148937
F1 Score: 0.3235294117647059

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       247
           1       0.52      0.23      0.32        47

    accuracy                           0.84       294
   macro avg       0.70      0.60      0.62       294
weighted avg       0.81      0.84      0.82       294


Confusion Matrix:
 [[237  10]
 [ 36  11]]


In [9]:

# Persist the fitted pipeline (preprocessing + classifier) to model.pkl so it
# can be reloaded later with joblib.load.
joblib.dump(value=best_model, filename="model.pkl")


['model.pkl']