In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import joblib

import warnings
warnings.filterwarnings("ignore")

In [49]:
data_model=pd.read_excel("Processed_data.xlsx")

In [50]:
data_model=pd.DataFrame(data_model,columns=data_model.columns)
#data_model.isnull().sum()

In [51]:
data_model.head()

Unnamed: 0,Age,EducationBackground,EmpDepartment,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,NumCompaniesWorked,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,EmpJobRoleEncod,Gender,MaritalStatus,BusinessTravelFrequency,OverTime,Attrition,PerformanceRating
0,0.333333,0.6,1.0,0.321429,0.5,1.0,0.357143,0.666667,0.25,1.0,0.111111,0.071429,1.0,0.25,0.333333,0.333333,0.25,0.388889,0.0,0.470588,0.722222,1,1,3,1,1,3
1,0.690476,0.6,1.0,0.464286,0.75,1.0,0.171429,0.666667,0.25,0.0,0.222222,0.071429,1.0,0.5,0.333333,0.666667,0.175,0.388889,0.066667,0.411765,0.722222,1,1,3,1,1,3
2,0.52381,1.0,1.0,0.142857,0.75,1.0,0.257143,0.333333,0.5,0.0,0.555556,0.714286,0.666667,0.5,0.333333,0.666667,0.45,0.722222,0.066667,0.705882,0.722222,1,2,2,0,1,4
3,0.547619,0.0,0.4,0.321429,0.75,0.333333,0.614286,0.333333,1.0,1.0,0.333333,0.285714,0.333333,0.575,0.333333,0.333333,0.525,0.333333,0.8,0.352941,0.444444,1,0,3,1,1,3
4,1.0,0.6,1.0,0.535714,0.75,0.0,0.771429,0.666667,0.25,0.0,0.888889,0.214286,1.0,0.25,0.166667,0.666667,0.05,0.111111,0.133333,0.117647,0.722222,1,1,3,1,1,3


In [52]:
#Independent and dependent variable
x=data_model.drop("PerformanceRating",axis=1)
y=data_model["PerformanceRating"]

In [53]:
y.value_counts()

PerformanceRating
3    874
2    194
4    132
Name: count, dtype: int64

## Imbalanced Data

In [55]:
#Object creation
smote=SMOTE()

In [56]:
x_resample,y_resample=smote.fit_resample(x,y)
#print(x_resample.isna().sum())

In [57]:
print(y_resample.value_counts())

PerformanceRating
3    874
4    874
2    874
Name: count, dtype: int64


In [58]:
print(x_resample.shape)
print(y_resample.shape)

(2622, 26)
(2622,)


# Model Creation - SVM

In [60]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_resample,y_resample,random_state=42)

In [61]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1966, 26)
(1966,)
(656, 26)
(656,)


In [62]:
from sklearn.svm import SVC
svclassifier=SVC() #Object Creation
svclassifier.fit(x_train,y_train)

In [63]:
#Prediction
y_hat=svclassifier.predict(x_test)

# Classification Report

In [65]:
#Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           2       0.85      0.97      0.91       237
           3       0.91      0.71      0.80       207
           4       0.88      0.94      0.91       212

    accuracy                           0.88       656
   macro avg       0.88      0.87      0.87       656
weighted avg       0.88      0.88      0.87       656



In [66]:
#import functions_script
from functions_script import apply_mappings, encode_and_drop_columns, scale_features  # Import from the script where they are defined
import joblib
data_processing_pipeline=joblib.load('processed_pipeline.pkl')

In [67]:
#Dealing the NaN values during passing pipeline
imputer = SimpleImputer(strategy='most_frequent')
x_train_imputed = imputer.fit_transform(x_train)

#convert loaded pipeline to Dataframe
data_processing_pipeline.set_output(transform='pandas')

#Creating new pipeline
full_pipeline = Pipeline([
    ('data_processing', data_processing_pipeline),
    ('imputer',imputer),
    ('classifier', svclassifier)
])

#Again convert new pipeline to Dataframe
full_pipeline.set_output(transform='pandas')


In [68]:
#Converting the varable to Dataframe
x_train_imputed = pd.DataFrame(x_train_imputed, columns=data_model.drop("PerformanceRating", axis=1).columns)

In [69]:
full_pipeline.fit(x_train_imputed,y_train)

In [70]:
#Saving the pipeline with trained data
joblib.dump(full_pipeline,'trained_pipeline.pkl')

['trained_pipeline.pkl']