In [13]:
# Day 13: Imports
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC


In [None]:
# Read the cleaned HR dataset from the Day_12 directory and preview the first rows.
source_path = '../Day_12/hr_data_cleaned_for_pipeline'
df = pd.read_csv(source_path)
df.head()

In [7]:
# Separate the target column from the predictor columns.
y = df['Attrition_num']
X = df.drop(columns='Attrition_num')

# Build the column lists used to route features to a transformer.
# 'number' matches every numeric dtype (int32, float32, ... as well as
# int64/float64), and the categorical list also accepts 'category' and 'bool'
# columns. A column that lands in neither list is not handed to any
# transformer downstream, so an overly narrow dtype filter would silently
# remove that feature from the model.
numerical_col = X.select_dtypes(include='number').columns.tolist()
categorical_col = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()


                              

In [8]:
# Preprocessing shared by both candidate models: standardize the numeric
# columns and one-hot encode the categorical ones. handle_unknown='ignore'
# lets the encoder accept categories at transform time that it did not see
# when it was fitted (they are encoded as all zeros).
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_col),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_col)
])

# Candidate 1: gradient boosting. random_state is pinned so that re-running
# the notebook reproduces the same scores; without it the tree building is
# seeded differently on every run.
gb_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# Candidate 2: support vector classifier with an RBF kernel.
# NOTE: both pipelines reference the same `preprocessor` object.
# cross_val_score fits clones of the estimator it is given, so the
# cross-validation runs below do not share fitted state.
svm_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', SVC(kernel='rbf'))
])

# 5-fold cross-validation on the full data, scored with macro-averaged F1
# (the unweighted mean of the per-class F1 scores).
gb_scores = cross_val_score(gb_pipeline, X, y, cv=5, scoring='f1_macro')
svm_scores = cross_val_score(svm_pipeline, X, y, cv=5, scoring='f1_macro')

# Report the mean macro-F1 across the five folds for each candidate.
print('Gradient Boosting F1:', gb_scores.mean())
print('SVM F1', svm_scores.mean())

Gradient Boosting F1: 0.623875842725919
SVM F1 0.5839307055910428


In [11]:
# Hold out 20% of the rows as a test set. stratify=y keeps the class
# proportions of y the same in both splits, and random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the gradient boosting pipeline (preprocessing + classifier) on the
# training portion only, so the test rows stay unseen until evaluation.
gb_pipeline.fit(X_train, y_train)

# Make predictions on test set
y_pred = gb_pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of the second class in
# classes_ order (label 1 when the labels are 0/1); roc_auc_score needs this
# continuous score rather than the hard predictions.
y_proba = gb_pipeline.predict_proba(X_test)[:, 1]  # for AUC

# Metrics
# precision/recall/F1 use their default binary averaging, i.e. they describe
# the positive class (label 1) only.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_proba))

# Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# Rows are the true labels and columns are the predicted labels.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.826530612244898
Precision: 0.3888888888888889
Recall: 0.14893617021276595
F1 Score: 0.2153846153846154
AUC Score: 0.7242656559565854

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.90       247
           1       0.39      0.15      0.22        47

    accuracy                           0.83       294
   macro avg       0.62      0.55      0.56       294
weighted avg       0.78      0.83      0.79       294

Confusion Matrix:
 [[236  11]
 [ 40   7]]


In [12]:
# Persist the gradient boosting pipeline (preprocessing + classifier) to disk.
# The dump call is left as the last expression so the cell still displays the
# list of written file names.
model_path = "gb_model.pkl"
joblib.dump(gb_pipeline, model_path)

['gb_model.pkl']