# Model: Performance Rating Classifier

## Importing Necessary Libraries

In [3]:
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from xgboost import XGBClassifier
import pickle

## Importing Processed Data

In [5]:
# Read the processed dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="processed_data.csv")
df.head(n=5)

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,NumCompaniesWorked,OverTime,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,1.0,2,2,5,13,2.0,10,3,4,55,3,2,4,1,0,12,4,10,2,2,10,7,0,8,0.0,1.0
1,47,1.0,2,2,5,13,2.0,14,4,4,42,3,2,1,2,0,12,4,20,2,3,7,7,1,7,0.0,1.0
2,40,1.0,1,1,5,13,1.0,5,4,4,48,2,3,1,5,1,21,3,20,2,3,18,13,1,12,0.0,2.0
3,41,1.0,0,0,3,8,2.0,10,4,2,73,2,5,4,3,0,15,2,23,2,2,21,6,12,6,0.0,1.0
4,60,1.0,2,2,5,13,2.0,16,4,1,84,3,2,1,8,0,14,4,10,1,3,2,2,2,2,0.0,1.0


## Splitting Data

In [7]:
# Separate the target column (y) from the remaining predictor columns (x).
y = df['PerformanceRating']
x = df.drop(columns='PerformanceRating')

In [8]:
# Stratified 80/20 split on the target; the fixed seed keeps the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=18,
)

In [9]:
# Shapes of the four partitions, in the order returned by the split.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((960, 26), (240, 26), (960,), (240,))

In [10]:
# Class distribution of the training labels, most frequent class first.
y_train.value_counts(sort=True, ascending=False)

PerformanceRating
1.0    699
0.0    155
2.0    106
Name: count, dtype: int64

In [11]:
# Class distribution of the test labels, most frequent class first.
y_test.value_counts(sort=True, ascending=False)

PerformanceRating
1.0    175
0.0     39
2.0     26
Name: count, dtype: int64

## Model Building

### Extreme Gradient Boosting Classifier (XGB Classifier)

In [14]:
# Hyperparameters gathered in one mapping so they can be inspected and tuned in one place.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 7,
    "min_child_weight": 1,
    "gamma": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 32,
}

# Build the classifier and fit it on the training partition.
xgb = XGBClassifier(**xgb_params)
xgb.fit(x_train, y_train)

# Predict on both partitions so train and test metrics can be compared below.
ypred_xgb_train = xgb.predict(x_train)
ypred_xgb = xgb.predict(x_test)

## Model Evaluation

### F1-Score

In [17]:
# Support-weighted F1 for each partition; computed first, then reported.
test_f1 = f1_score(y_test, ypred_xgb, average='weighted')
train_f1 = f1_score(y_train, ypred_xgb_train, average='weighted')

print(f" f1 score of test data = {test_f1}")
print(f" f1 score of train data = {train_f1}")

 f1 score of test data = 0.9617718898673954
 f1 score of train data = 0.9625318451824967


### Classification Report

#### Test Report

In [20]:
# Test-set classification report as a table: one row per class plus the aggregate rows.
test_report = classification_report(y_test, ypred_xgb, output_dict=True)
pd.DataFrame(test_report).T

Unnamed: 0,precision,recall,f1-score,support
0.0,0.971429,0.871795,0.918919,39.0
1.0,0.961326,0.994286,0.977528,175.0
2.0,0.958333,0.884615,0.92,26.0
accuracy,0.9625,0.9625,0.9625,0.9625
macro avg,0.963696,0.916899,0.938816,240.0
weighted avg,0.962643,0.9625,0.961772,240.0


#### Train Report

In [22]:
# Train-set classification report as a table: one row per class plus the aggregate rows.
train_report = classification_report(y_train, ypred_xgb_train, output_dict=True)
pd.DataFrame(train_report).T

Unnamed: 0,precision,recall,f1-score,support
0.0,0.90184,0.948387,0.924528,155.0
1.0,0.972896,0.97568,0.974286,699.0
2.0,0.989583,0.896226,0.940594,106.0
accuracy,0.9625,0.9625,0.9625,0.9625
macro avg,0.954773,0.940098,0.946469,960.0
weighted avg,0.963266,0.9625,0.962532,960.0


### Cross-Tabulation of Actual vs Predicted Categories

In [24]:
# Confusion table for the test split: rows are actual labels, columns are predicted labels.
pd.crosstab(index=y_test, columns=ypred_xgb)

col_0,0,1,2
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,34,5,0
1.0,0,174,1
2.0,1,2,23


In [25]:
# Confusion table for the train split: rows are actual labels, columns are predicted labels.
pd.crosstab(index=y_train, columns=ypred_xgb_train)

col_0,0,1,2
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,147,8,0
1.0,16,682,1
2.0,0,11,95


## Packing the Trained Model

In [27]:
# Serialize the fitted classifier to disk in binary mode.
with open("Trained_model.pkl", mode="wb") as model_file:
    pickle.dump(xgb, model_file)

## Saving Test Data

In [29]:
# Write the held-out labels to CSV without the index column.
y_test.to_csv(path_or_buf="y_test_data.csv", index=False)

In [30]:
# Write the held-out features to CSV without the index column.
x_test.to_csv(path_or_buf="x_test_data.csv", index=False)