#######################################
# STEP 1 - Importing required librarie Pipeline
######################################

In [1]:
print("Step 1: Required librarie imported successfully")

import time
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, f1_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB


Step 1: Required librarie imported successfully


####################
# To ignore warning
####################

In [2]:
import warnings
warnings.filterwarnings("ignore")


################################################
# Step2- Loading HR Attrition dataset
################################################

In [3]:
print("Step 2: Created DataFrame successfully")

df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()


Step 2: Created DataFrame successfully


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


######################
# STEP3- Feature Engineering
######################

In [4]:
print("Step 3: Feature Engineering Done successfully")

df['Attrition'] = df['Attrition'].replace(('Yes','No'), (1,0))
df['OverTime'] = df['OverTime'].replace(('Yes','No'), (1,0))


Step 3: Feature Engineering Done successfully


#################################
# STEP4- Dropping unnecessary columns
#################################

In [5]:
print("Step 4: Dropped unnecessary columns successfully")

df = df.drop(columns=['EmployeeNumber','EmployeeCount','StandardHours'])


Step 4: Dropped unnecessary columns successfully


############################################
# STEP5- Applying Ordinal Encoding
############################################

In [6]:
print("Step 5: Applied ordinal encoding successfully")

cat_cols = df.select_dtypes(include=['object']).columns
encoder = OrdinalEncoder()
df[cat_cols] = encoder.fit_transform(df[cat_cols])


Step 5: Applied ordinal encoding successfully


#########################################
# STEP6- Checking correlation on Attrition
#########################################

In [7]:
print("Step 6: Checking correlation done successfully")

corr = df.corr()['Attrition'].sort_values(ascending=False)
corr


Step 6: Checking correlation done successfully


Attrition                   1.000000
OverTime                    0.246118
MaritalStatus               0.162070
DistanceFromHome            0.077924
JobRole                     0.067151
Department                  0.063991
NumCompaniesWorked          0.043494
Gender                      0.029453
EducationField              0.026846
MonthlyRate                 0.015170
PerformanceRating           0.002889
BusinessTravel              0.000074
HourlyRate                 -0.006846
PercentSalaryHike          -0.013478
Education                  -0.031373
YearsSinceLastPromotion    -0.033019
RelationshipSatisfaction   -0.045872
DailyRate                  -0.056652
TrainingTimesLastYear      -0.059478
WorkLifeBalance            -0.063939
EnvironmentSatisfaction    -0.103369
JobSatisfaction            -0.103481
JobInvolvement             -0.130016
YearsAtCompany             -0.134392
StockOptionLevel           -0.137145
YearsWithCurrManager       -0.156199
Age                        -0.159205
M

###########################################
# STEP7- Preparing Features as X and target as y
###########################################

In [8]:
print("Step 7: Preparing features as X and target as y")

X = df.drop('Attrition', axis=1)
y = df['Attrition']


Step 7: Preparing features as X and target as y


####################################
# STEP8- Prepairing Train and Test Dataset
####################################

In [9]:
print("Step 8: Splitting data X_train, X_test, y_train & y_test done successfully")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)


Step 8: Splitting data X_train, X_test, y_train & y_test done successfully


#################
# STEP9-  Model Pipeline
#################


In [10]:
print("Step 9: model_pipeline function created successfully")

def model_pipeline(X, model):

    n_c = X.select_dtypes(exclude=['object']).columns.tolist()
    c_c = X.select_dtypes(include=['object']).columns.tolist()

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant')),
        ('scaler', MinMaxScaler())
    ])

    categorical_pipeline = Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('numeric', numeric_pipeline, n_c),
        ('categorical', categorical_pipeline, c_c)
    ], remainder='passthrough')

    final_steps = [
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=1)),
        ('feature_selection', SelectKBest(score_func=chi2, k=8)),
        ('model', model)
    ]

    return IMBPipeline(steps=final_steps)


Step 9: model_pipeline function created successfully


##################
# STEP10- Model Selection#
##################

In [11]:
print("Step 10: select_model function created successfully")

def select_model(X, y):

    classifiers = {
        "RandomForestClassifier": RandomForestClassifier(),
        "DecisionTreeClassifier": DecisionTreeClassifier(),
        "KNeighborsClassifier": KNeighborsClassifier(),
        "RidgeClassifier": RidgeClassifier(),
        "BernoulliNB": BernoulliNB(),
        "SVC": SVC()
    }

    df_models = pd.DataFrame(columns=['model','run_time','roc_auc'])

    for key in classifiers:

        start_time = time.time()
        print("Running model:", key)

        pipeline = model_pipeline(X, classifiers[key])
        cv = cross_val_score(pipeline, X, y, cv=10, scoring='roc_auc')

        row = {
            'model': key,
            'run_time': round((time.time()-start_time)/60, 2),
            'roc_auc': cv.mean()
        }

        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)

    return df_models.sort_values(by='roc_auc', ascending=False)


Step 10: select_model function created successfully


#####################################
# STEP11- Access Model select_model function
#####################################

In [12]:


print("Step 11: Accessing select_model function done successfully")

models = select_model(X_train, y_train)
models


Step 11: Accessing select_model function done successfully
Running model: RandomForestClassifier
Running model: DecisionTreeClassifier
Running model: KNeighborsClassifier
Running model: RidgeClassifier
Running model: BernoulliNB
Running model: SVC


Unnamed: 0,model,run_time,roc_auc
3,RidgeClassifier,0.01,0.74024
4,BernoulliNB,0.01,0.736634
5,SVC,0.05,0.731261
0,RandomForestClassifier,0.14,0.71583
2,KNeighborsClassifier,0.01,0.678865
1,DecisionTreeClassifier,0.01,0.629408


#####################################
# STEP12- Accessing best model and training
#####################################

In [13]:
print("Step 12: Training best model")

best_model = SVC()
final_pipeline = model_pipeline(X_train, best_model)
final_pipeline.fit(X_train, y_train)


Step 12: Training best model


0,1,2
,steps,"[('preprocessor', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('categorical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,1
,k_neighbors,5

0,1,2
,score_func,<function chi...001DD3EE6C700>
,k,8

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


#####################################
# STEP13- Results predicted successfully
#####################################

In [14]:
print("Step 13: Results predicted successfully")

y_pred = final_pipeline.predict(X_test)


Step 13: Results predicted successfully


#####################
# STEP14- ROC and AUC score
#####################

In [15]:
print("Step 14: ROC and AUC scores")

print("ROC AUC:", roc_auc_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Step 14: ROC and AUC scores
ROC AUC: 0.7033692722371968
Accuracy: 0.764172335600907
F1 Score: 0.45263157894736844


#########################
# STEP15- Classification report
#########################


In [16]:
print("Step 15: Classification report generated successfully")

print(classification_report(y_test, y_pred))


Step 15: Classification report generated successfully
              precision    recall  f1-score   support

           0       0.92      0.79      0.85       371
           1       0.36      0.61      0.45        70

    accuracy                           0.76       441
   macro avg       0.64      0.70      0.65       441
weighted avg       0.83      0.76      0.79       441

