# **INX FUTURE INC EMPLOYEE PERFORMANCE ANALYSIS**

### 1) Importing Libraries

In [12]:
#Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import joblib
import random
random.seed(42)
np.random.seed(42)
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from scipy.stats import uniform, randint


from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report,roc_auc_score,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

### 2) Loading the processed dataset


In [4]:
df = pd.read_csv(r"C:\Users\subas\Downloads\IABAC Project - SUBASH R\data\processed\final_processed_data.csv")
df

Unnamed: 0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,...,pca17,pca18,pca19,pca20,pca21,pca22,pca23,pca24,pca25,PerformanceRating
0,4.481294,-1.628501,1.047956,-0.933500,0.925134,1.029647,0.835034,1.463693,0.549426,-0.524361,...,-0.791007,-0.093407,-0.344439,-0.184039,-0.101912,-0.277742,-0.413157,0.144702,-0.349662,3
1,4.357608,-0.071098,2.015938,-1.513005,-0.396063,-0.576457,1.712992,-0.075228,0.975862,-0.890574,...,-0.492291,0.316399,0.964538,0.525252,-0.086952,-0.368111,-0.862820,-0.312112,-0.643581,3
2,4.241642,2.561754,4.362057,0.172691,1.748291,-0.363683,0.437324,-1.341480,0.328197,-1.879120,...,0.492510,-0.436926,0.206294,0.470151,0.949025,0.624923,-0.594870,0.232315,0.429156,4
3,-3.012253,0.723361,2.326396,-3.284738,-2.312531,1.905064,-1.074595,0.247842,-0.093157,0.178636,...,0.555283,-0.719977,-1.535159,-0.134353,-0.272576,0.083561,-0.541444,-0.118246,-0.469829,3
4,4.248157,5.963902,-0.246539,-0.786352,-2.094958,-1.716061,-0.746786,0.500047,0.791468,-1.199249,...,-0.630371,0.189137,1.546812,-0.740817,-0.017727,0.062501,-0.123671,0.434470,-0.215044,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,4.535073,-1.840887,-0.726599,0.309655,1.438946,1.587815,0.970934,-0.381956,-0.941543,-0.681329,...,-0.582378,1.370088,-0.367934,-0.028764,0.917904,0.623974,0.470642,-0.338655,0.184985,4
1196,-1.587384,0.110600,-2.704746,1.396380,0.759975,1.690240,0.026029,-1.206225,-0.419745,-0.244692,...,-0.798953,1.167526,0.485813,-0.212821,-0.204359,-0.152510,-0.256752,0.281957,-0.174679,3
1197,-1.640971,-1.735755,3.135629,-0.031794,1.688228,-0.569791,0.661484,0.103077,0.834267,-0.277073,...,-1.415700,0.997095,1.874264,1.260002,-0.091648,-0.507107,-0.088497,0.442107,1.100779,3
1198,-9.104494,-2.101126,1.482943,-0.562236,1.556079,-0.493476,0.296072,-1.195269,0.601387,0.739029,...,1.979594,0.032593,-0.230004,-0.649038,0.105152,-0.029245,0.648956,-0.165642,-0.250106,3


In [5]:
df.columns

Index(['pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7', 'pca8', 'pca9',
       'pca10', 'pca11', 'pca12', 'pca13', 'pca14', 'pca15', 'pca16', 'pca17',
       'pca18', 'pca19', 'pca20', 'pca21', 'pca22', 'pca23', 'pca24', 'pca25',
       'PerformanceRating'],
      dtype='object')

### 3) Splitting dataset into train and test

In [7]:
#splitting data into dependent and independent 
x = df.drop(columns = ['PerformanceRating'])
y = df['PerformanceRating']

In [8]:
#Splitting the data into train and test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 42, stratify = y)

In [9]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(960, 25)
(960,)
(240, 25)
(240,)


### 4) Balancing the imbalanced data

In [13]:
from collections import Counter
sm = SMOTE()
x_sm,y_sm = sm.fit_resample(x_train,y_train)
print(Counter(y_train))
print(Counter(y_sm))

Counter({3: 699, 2: 155, 4: 106})
Counter({3: 699, 2: 699, 4: 699})


In [14]:
#balanced data
print(x_sm.shape)
print(x_test.shape)  
print(y_sm.shape)
print(y_test.shape)

(2097, 25)
(240, 25)
(2097,)
(240,)


### 5) Model Building

#### 1. Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(multi_class = 'multinomial', solver ='lbfgs', max_iter = 1000, random_state = 42)
LR.fit(x_sm,y_sm)

In [17]:
LR_y_train = LR.predict(x_sm)
LR_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [18]:
LR_y_pred = LR.predict(x_test)
LR_y_pred

array([3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2,
       2, 3, 3, 3, 4, 3, 2, 2, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 4, 4, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 2, 2, 3,
       3, 3, 2, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 2, 3, 2, 4, 3, 2, 2, 3, 2, 3, 3, 4, 3, 3,
       3, 4, 3, 4, 2, 3, 2, 4, 3, 2, 2, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [19]:
LR_acc_train = accuracy_score(y_sm,LR_y_train)
LR_acc_train

0.9084406294706724

In [20]:
print(classification_report(y_sm,LR_y_train))

              precision    recall  f1-score   support

           2       0.90      0.95      0.93       699
           3       0.90      0.82      0.86       699
           4       0.92      0.95      0.94       699

    accuracy                           0.91      2097
   macro avg       0.91      0.91      0.91      2097
weighted avg       0.91      0.91      0.91      2097



In [21]:
LR_acc_test = accuracy_score(y_test,LR_y_pred)
print('ACCURACY SCORE:',LR_acc_test)
LR_pre = precision_score(y_test,LR_y_pred,average = 'weighted')
print('PRECISION SCORE:',LR_pre)
LR_recall = recall_score(y_test,LR_y_pred, average = 'weighted')
print('RECALL SCORE:',LR_recall)
LR_f1 = f1_score(y_test,LR_y_pred, average ='weighted')
print('F1 SCORE:',LR_f1)
print('-----')
print('LR CLASSIFICATION REPORT:')
print(classification_report(y_test,LR_y_pred))

ACCURACY SCORE: 0.8166666666666667
PRECISION SCORE: 0.8377830520393812
RECALL SCORE: 0.8166666666666667
F1 SCORE: 0.822807629057629
-----
LR CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.62      0.82      0.70        39
           3       0.92      0.83      0.87       175
           4       0.63      0.73      0.68        26

    accuracy                           0.82       240
   macro avg       0.72      0.79      0.75       240
weighted avg       0.84      0.82      0.82       240



In [23]:
pd.crosstab(y_test,LR_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,32,6,1
3,20,145,10
4,0,7,19


In [24]:
LR_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [LR_acc_test,LR_pre,LR_recall,LR_f1]})
LR_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.816667
1,Precision,0.837783
2,Recall,0.816667
3,F1 Score,0.822808


In [25]:
print("LOGISTIC REGRESSION METRICS:")
print(tabulate(LR_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

LOGISTIC REGRESSION METRICS:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.816667 |
+-----------+----------+
| Precision | 0.837783 |
+-----------+----------+
| Recall    | 0.816667 |
+-----------+----------+
| F1 Score  | 0.822808 |
+-----------+----------+


#### 1) A - HyperParameter Tuning in LR

In [27]:
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs', 'saga'],  # Optimization algorithms
    'max_iter': [100, 200],  # Maximum iterations
    'multi_class': ['ovr', 'multinomial'],  # Multi-class strategy
    'penalty': ['l2'],  # Regularization penalty
}

In [28]:
grid_search = GridSearchCV(estimator=LR, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

In [29]:
grid_search.fit(x_sm,y_sm)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [30]:
grid_search.best_params_

{'C': 10,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'penalty': 'l2',
 'solver': 'lbfgs'}

In [31]:
LR1 = LogisticRegression(C= 0.1,
 max_iter = 100,
 multi_class = 'multinomial',
 penalty = 'l2',
 solver = 'saga')

In [33]:
LR1.fit(x_sm,y_sm)

In [34]:
LR1_y_pred = LR1.predict(x_test)
LR1_y_pred

array([3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2,
       2, 3, 3, 3, 4, 3, 2, 2, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 4, 4, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 2, 2, 3,
       3, 3, 2, 2, 4, 3, 3, 3, 2, 3, 3, 3, 3, 3, 4, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 2, 3, 2, 4, 3, 2, 2, 3, 2, 3, 3, 4, 3, 3,
       3, 4, 3, 4, 2, 3, 2, 4, 3, 2, 2, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 2, 2, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [35]:
LR1_acc_test = accuracy_score(y_test,LR1_y_pred)
print('ACCURACY SCORE:',LR1_acc_test)
LR1_pre = precision_score(y_test,LR1_y_pred,average = 'weighted')
print('PRECISION SCORE:',LR1_pre)
LR1_recall = recall_score(y_test,LR1_y_pred, average = 'weighted')
print('RECALL SCORE:',LR1_recall)
LR1_f1 = f1_score(y_test,LR1_y_pred, average ='weighted')
print('F1 SCORE:',LR1_f1)
print('-----')
print('LR CLASSIFICATION REPORT:')
print(classification_report(y_test,LR1_y_pred))

ACCURACY SCORE: 0.8125
PRECISION SCORE: 0.8394827305311178
RECALL SCORE: 0.8125
F1 SCORE: 0.8199636946977373
-----
LR CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.58      0.82      0.68        39
           3       0.92      0.82      0.87       175
           4       0.67      0.77      0.71        26

    accuracy                           0.81       240
   macro avg       0.72      0.80      0.75       240
weighted avg       0.84      0.81      0.82       240



In [36]:
pd.crosstab(y_test,LR1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,32,6,1
3,23,143,9
4,0,6,20


In [37]:
LR1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [LR1_acc_test,LR1_pre,LR1_recall,LR1_f1]})
LR1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8125
1,Precision,0.839483
2,Recall,0.8125
3,F1 Score,0.819964


In [38]:
print("LOGISTIC REGRESSION METRICS AFTER HYPERPARAMATERTUNING:")
print(tabulate(LR1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

LOGISTIC REGRESSION METRICS AFTER HYPERPARAMATERTUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8125   |
+-----------+----------+
| Precision | 0.839483 |
+-----------+----------+
| Recall    | 0.8125   |
+-----------+----------+
| F1 Score  | 0.819964 |
+-----------+----------+


**Logistic Regression** showed decent performance before and after tuning, but it still falls short when compared to more complex models. Although hyperparameter tuning slightly improved precision and recall, it's still less effective in capturing the complexity of employee performance.

#### 2. SVM

In [40]:
from sklearn.svm import SVC
sv = SVC(kernel = 'linear', decision_function_shape='ovr',random_state=42)

In [41]:
sv.fit(x_sm,y_sm)

In [42]:
SV_y_train = sv.predict(x_sm)
SV_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [43]:
SV_y_pred = sv.predict(x_test)
SV_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2,
       2, 3, 3, 3, 4, 3, 2, 2, 3, 4, 4, 3, 3, 3, 3, 3, 4, 3, 3, 3, 4, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 4, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 2, 3, 2, 2, 3,
       3, 3, 2, 2, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 2, 3, 2, 4, 3, 2, 2, 3, 2, 3, 3, 4, 3, 3,
       3, 2, 3, 2, 2, 3, 2, 4, 3, 2, 2, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 2, 2, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [44]:
SV_acc_train = accuracy_score(y_sm,SV_y_train)
SV_acc_train

0.9141630901287554

In [45]:
SV_acc_test = accuracy_score(y_test,SV_y_pred)
print('ACCURACY SCORE:',SV_acc_test)
SV_pre = precision_score(y_test,SV_y_pred, average='weighted')
print('PRECISION SCORE:',SV_pre)
SV_recall = recall_score(y_test,SV_y_pred, average = 'weighted')
print('RECALL SCORE:',SV_recall)
SV_f1 = f1_score(y_test,SV_y_pred, average = 'weighted')
print('F1 SCORE:',SV_f1)
print('-----')
print('SVM CLASSIFICATION REPORT:')
print(classification_report(y_test,SV_y_pred))

ACCURACY SCORE: 0.8
PRECISION SCORE: 0.8307256890943332
RECALL SCORE: 0.8
F1 SCORE: 0.8080236811705814
-----
SVM CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.56      0.85      0.67        39
           3       0.92      0.81      0.86       175
           4       0.67      0.69      0.68        26

    accuracy                           0.80       240
   macro avg       0.71      0.78      0.74       240
weighted avg       0.83      0.80      0.81       240



In [46]:
pd.crosstab(y_test,SV_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,33,6,0
3,25,141,9
4,1,7,18


In [47]:
SV_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [SV_acc_test,SV_pre,SV_recall,SV_f1]})
SV_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8
1,Precision,0.830726
2,Recall,0.8
3,F1 Score,0.808024


In [48]:
print("SVM METRICS:")
print(tabulate(SV_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

SVM METRICS:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8      |
+-----------+----------+
| Precision | 0.830726 |
+-----------+----------+
| Recall    | 0.8      |
+-----------+----------+
| F1 Score  | 0.808024 |
+-----------+----------+


#### 2) A- HyperParamater Tuning In SVM:

In [49]:
param_dist = {
    'C': uniform(0.1, 10),  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [3, 4, 5],  # Degree of the polynomial kernel function (only for poly)
    'class_weight': ['balanced', None],  # Class weights for handling imbalance
    'probability': [True],  # Enable probability estimates
}

In [50]:
random_search = RandomizedSearchCV(estimator=sv, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, verbose=1)

In [51]:
random_search.fit(x_sm,y_sm)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [52]:
random_search.best_params_

{'C': 3.4134700710305075,
 'class_weight': 'balanced',
 'degree': 5,
 'gamma': 'auto',
 'kernel': 'rbf',
 'probability': True}

In [53]:
sv1 = SVC(C = 6.474299014982066,
class_weight = None,
 degree = 3,
 gamma = 'auto',
 kernel = 'rbf',
 probability = True)

In [54]:
sv1.fit(x_sm,y_sm)

In [55]:
SV1_y_pred = sv1.predict(x_test)
SV1_y_pred

array([3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       2, 3, 3, 3, 4, 3, 2, 3, 3, 4, 3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 2, 3, 2, 3,
       3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3,
       4, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3],
      dtype=int64)

In [56]:
SV1_acc_test = accuracy_score(y_test,SV1_y_pred)
print('ACCURACY SCORE:',SV1_acc_test)
SV1_pre = precision_score(y_test,SV1_y_pred, average='weighted')
print('PRECISION SCORE:',SV1_pre)
SV1_recall = recall_score(y_test,SV1_y_pred, average = 'weighted')
print('RECALL SCORE:',SV1_recall)
SV1_f1 = f1_score(y_test,SV1_y_pred, average = 'weighted')
print('F1 SCORE:',SV1_f1)
print('-----')
print('SVM CLASSIFICATION REPORT:')
print(classification_report(y_test,SV1_y_pred))

ACCURACY SCORE: 0.8416666666666667
PRECISION SCORE: 0.8420988475177306
RECALL SCORE: 0.8416666666666667
F1 SCORE: 0.8362248458612096
-----
SVM CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.67      0.62      0.64        39
           3       0.87      0.93      0.90       175
           4       0.94      0.58      0.71        26

    accuracy                           0.84       240
   macro avg       0.82      0.71      0.75       240
weighted avg       0.84      0.84      0.84       240



In [57]:
pd.crosstab(y_test,SV1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,24,15,0
3,11,163,1
4,1,10,15


In [58]:
SV1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [SV1_acc_test,SV1_pre,SV1_recall,SV1_f1]})
SV1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.841667
1,Precision,0.842099
2,Recall,0.841667
3,F1 Score,0.836225


In [59]:
print("SVM METRICS AFTER HYPERPARAMETER TUNING:")
print(tabulate(SV1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

SVM METRICS AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.841667 |
+-----------+----------+
| Precision | 0.842099 |
+-----------+----------+
| Recall    | 0.841667 |
+-----------+----------+
| F1 Score  | 0.836225 |
+-----------+----------+


Hyperparameter tuning improved the **SVM model's** accuracy, recall, and precision slightly. However, SVM still struggles to outperform ensemble methods, making it moderately useful for this task.

#### 3. Decision Tree

In [60]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)

In [61]:
dt.fit(x_sm,y_sm)

In [62]:
dt_y_train= dt.predict(x_sm)
dt_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [63]:
dt_y_pred = dt.predict(x_test)
dt_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 2, 3, 3, 4, 3, 3, 4, 3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 4, 2, 3, 3, 3, 4, 3, 3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 2, 2, 3, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 4,
       3, 3, 3, 2, 3, 3, 4, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 4, 4, 3, 4, 3, 3, 4, 2, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 2, 3, 4, 2, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 3, 3,
       3, 4, 4, 2, 4, 4, 3, 3, 3, 2, 4, 3, 3, 3, 4, 3, 2, 2, 4, 3, 3, 4,
       3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 2, 3, 3, 4, 3, 3,
       2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 3, 2, 3, 2, 2, 2, 2, 4, 4,
       4, 3, 3, 2, 3, 2, 2, 3, 3, 3, 2, 3, 2, 3, 3, 2, 3, 4, 3, 3],
      dtype=int64)

In [64]:
dt_acc_train = accuracy_score(y_sm,dt_y_train)
dt_acc_train

1.0

In [65]:
print(classification_report(y_sm,dt_y_train))

              precision    recall  f1-score   support

           2       1.00      1.00      1.00       699
           3       1.00      1.00      1.00       699
           4       1.00      1.00      1.00       699

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



In [66]:
dt_acc_test = accuracy_score(y_test,dt_y_pred)
print('ACCURACY SCORE:',dt_acc_test)
dt_pre = precision_score(y_test,dt_y_pred, average = 'weighted')
print('PRECISION SCORE:',dt_pre)
dt_recall = recall_score(y_test,dt_y_pred,average = 'weighted')
print('RECALL SCORE:',dt_recall)
dt_f1 = f1_score(y_test,dt_y_pred, average = 'weighted')
print('F1 SCORE:',dt_f1)
print('----')
print('DECISION TREE CLASSIFICATION REPORT:')
print(classification_report(y_test,dt_y_pred))

ACCURACY SCORE: 0.675
PRECISION SCORE: 0.7156467864923747
RECALL SCORE: 0.675
F1 SCORE: 0.6899971070510412
----
DECISION TREE CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.40      0.56      0.47        39
           3       0.84      0.73      0.78       175
           4       0.38      0.46      0.41        26

    accuracy                           0.68       240
   macro avg       0.54      0.59      0.55       240
weighted avg       0.72      0.68      0.69       240



In [67]:
pd.crosstab(y_test,dt_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,22,14,3
3,30,128,17
4,3,11,12


In [68]:
dt_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [dt_acc_test,dt_pre,dt_recall,dt_f1]})
dt_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.675
1,Precision,0.715647
2,Recall,0.675
3,F1 Score,0.689997


In [69]:
print("DECISION TREE METRICS:")
print(tabulate(dt_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

DECISION TREE METRICS:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.675    |
+-----------+----------+
| Precision | 0.715647 |
+-----------+----------+
| Recall    | 0.675    |
+-----------+----------+
| F1 Score  | 0.689997 |
+-----------+----------+


#### 3) A- Hyperparameter tuning in dt

In [70]:
from sklearn.model_selection import GridSearchCV
params = {
    "criterion":("gini", "entropy"),
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4],    
    "min_samples_leaf":list(range(1, 20))
}

grid = GridSearchCV(dt, params, scoring='accuracy', cv = 5, verbose = 3, n_jobs=-1)

grid.fit(x_sm, y_sm)

Fitting 5 folds for each of 4332 candidates, totalling 21660 fits


In [71]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 15,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

In [72]:
dt1 = DecisionTreeClassifier(criterion= 'entropy',
 max_depth= 12,
 min_samples_leaf= 1,
 min_samples_split= 4,
 splitter= 'best')

In [73]:
dt1.fit(x_sm,y_sm)

In [74]:
dt1_y_pred = dt1.predict(x_test)
dt1_y_pred

array([3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       2, 3, 2, 2, 3, 2, 2, 2, 3, 4, 3, 3, 2, 3, 3, 3, 4, 4, 3, 3, 3, 2,
       3, 3, 3, 4, 4, 2, 3, 3, 3, 4, 2, 3, 3, 2, 3, 2, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4,
       3, 4, 2, 3, 4, 4, 3, 3, 3, 3, 3, 2, 4, 2, 2, 3, 2, 3, 2, 2, 2, 3,
       3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 4, 2, 4, 4, 3, 2, 3, 3, 4, 3, 3,
       2, 2, 3, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 2, 3,
       3, 4, 3, 2, 4, 4, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 2, 2, 4,
       4, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 4, 3, 3],
      dtype=int64)

In [75]:
dt1_acc_test = accuracy_score(y_test,dt1_y_pred)
print('ACCURACY SCORE:',dt1_acc_test)
dt1_pre = precision_score(y_test,dt1_y_pred, average = 'weighted')
print('PRECISION SCORE:',dt1_pre)
dt1_recall = recall_score(y_test,dt1_y_pred,average = 'weighted')
print('RECALL SCORE:',dt1_recall)
dt1_f1 = f1_score(y_test,dt1_y_pred, average = 'weighted')
print('F1 SCORE:',dt1_f1)
print('----')
print('DECISION TREE CLASSIFICATION REPORT:')
print(classification_report(y_test,dt1_y_pred))

ACCURACY SCORE: 0.7041666666666667
PRECISION SCORE: 0.7406818181818182
RECALL SCORE: 0.7041666666666667
F1 SCORE: 0.7160272942187836
----
DECISION TREE CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.43      0.67      0.53        39
           3       0.85      0.75      0.80       175
           4       0.46      0.46      0.46        26

    accuracy                           0.70       240
   macro avg       0.58      0.63      0.59       240
weighted avg       0.74      0.70      0.72       240



In [76]:
pd.crosstab(y_test,dt1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,26,12,1
3,31,131,13
4,3,11,12


In [77]:
dt1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [dt1_acc_test,dt1_pre,dt1_recall,dt1_f1]})
dt1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.704167
1,Precision,0.740682
2,Recall,0.704167
3,F1 Score,0.716027


In [78]:
print("DECISION TREE METRICS AFTER HYPERPARAMETER TUNING:")
print(tabulate(dt_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

DECISION TREE METRICS AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.675    |
+-----------+----------+
| Precision | 0.715647 |
+-----------+----------+
| Recall    | 0.675    |
+-----------+----------+
| F1 Score  | 0.689997 |
+-----------+----------+


The **Decision Tree** performed consistently before and after hyperparameter tuning. While the results were stable, the model didn’t show significant improvement, but its interpretability and stable performance make it useful for predicting employee performance.

#### 4. Random Forest


In [79]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(x_sm,y_sm)

In [80]:
rf_y_train= rf.predict(x_sm)
rf_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [81]:
rf_y_pred = rf.predict(x_test)
rf_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 4, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 2, 3,
       3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 4, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 2, 2, 3, 2, 3,
       4, 3, 3, 4, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3],
      dtype=int64)

In [82]:
rf_acc_train = accuracy_score(y_sm,rf_y_train)
rf_acc_train

1.0

In [83]:
print(classification_report(y_sm,rf_y_train))

              precision    recall  f1-score   support

           2       1.00      1.00      1.00       699
           3       1.00      1.00      1.00       699
           4       1.00      1.00      1.00       699

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



In [84]:
rf_acc_test = accuracy_score(y_test,rf_y_pred)
print('ACCURACY SCORE:',rf_acc_test)
rf_pre = precision_score(y_test,rf_y_pred, average = 'weighted')
print('PRECISION SCORE:',rf_pre)
rf_recall = recall_score(y_test,rf_y_pred, average = 'weighted')
print('RECALL SCORE:',rf_recall)
rf_f1 = f1_score(y_test,rf_y_pred, average = 'weighted')
print('F1 SCORE:',rf_f1)
print('------')
print('RANDOM FOREST CLASSIFICATION REPORT:')
print(classification_report(y_test ,rf_y_pred))

ACCURACY SCORE: 0.8125
PRECISION SCORE: 0.8040082083969952
RECALL SCORE: 0.8125
F1 SCORE: 0.8058143707667421
------
RANDOM FOREST CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.66      0.54      0.59        39
           3       0.85      0.91      0.88       175
           4       0.71      0.58      0.64        26

    accuracy                           0.81       240
   macro avg       0.74      0.67      0.70       240
weighted avg       0.80      0.81      0.81       240



In [85]:
pd.crosstab(y_test,rf_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,21,17,1
3,11,159,5
4,0,11,15


In [86]:
rf_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [rf_acc_test,rf_pre,rf_recall,rf_f1]})
rf_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8125
1,Precision,0.804008
2,Recall,0.8125
3,F1 Score,0.805814


In [87]:
print("RANDOM FOREST METRICS")
print(tabulate(rf_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

RANDOM FOREST METRICS
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8125   |
+-----------+----------+
| Precision | 0.804008 |
+-----------+----------+
| Recall    | 0.8125   |
+-----------+----------+
| F1 Score  | 0.805814 |
+-----------+----------+


#### 4) A-Hyperparameter Tuning in Random Forest

In [95]:
n_estimators = [50,100,200]
max_features = ['sqrt', 'log2']
max_depth = [5,10,None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False] 


paramgrid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}

gridsearch = GridSearchCV(rf, paramgrid, scoring='accuracy', cv = 3, verbose = 3, n_jobs=-1)

gridsearch.fit(x_sm, y_sm)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [96]:
gridsearch.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [99]:
rf1 = RandomForestClassifier(bootstrap = False,
 max_depth = None,
 max_features = 'log2',
 min_samples_leaf = 1,
 min_samples_split = 2,
 n_estimators = 50)

In [100]:
rf1.fit(x_sm,y_sm)

In [101]:
rf1_y_pred = rf1.predict(x_test)
rf1_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 4, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 3, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 2, 3,
       4, 3, 3, 4, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
      dtype=int64)

In [102]:
rf1_acc_test = accuracy_score(y_test,rf1_y_pred)
print('ACCURACY SCORE:',rf1_acc_test)
rf1_pre = precision_score(y_test,rf1_y_pred, average = 'weighted')
print('PRECISION SCORE:',rf1_pre)
rf1_recall = recall_score(y_test,rf1_y_pred,average = 'weighted')
print('RECALL SCORE:',rf1_recall)
rf1_f1 = f1_score(y_test,rf1_y_pred, average = 'weighted')
print('F1 SCORE:',rf1_f1)
print('----')
print('DECISION TREE CLASSIFICATION REPORT:')
print(classification_report(y_test,rf1_y_pred))

ACCURACY SCORE: 0.8375
PRECISION SCORE: 0.8362106901266553
RECALL SCORE: 0.8375
F1 SCORE: 0.8285602477982067
----
DECISION TREE CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.69      0.56      0.62        39
           3       0.85      0.94      0.90       175
           4       0.93      0.54      0.68        26

    accuracy                           0.84       240
   macro avg       0.83      0.68      0.73       240
weighted avg       0.84      0.84      0.83       240



In [103]:
pd.crosstab(y_test,rf1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,22,17,0
3,9,165,1
4,1,11,14


In [104]:
rf1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [rf1_acc_test,rf1_pre,rf1_recall,rf1_f1]})
rf1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8375
1,Precision,0.836211
2,Recall,0.8375
3,F1 Score,0.82856


In [105]:
print("RANDOM FOREST METRICS AFTER HYPERPARAMETER TUNING")
print(tabulate(rf1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

RANDOM FOREST METRICS AFTER HYPERPARAMETER TUNING
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8375   |
+-----------+----------+
| Precision | 0.836211 |
+-----------+----------+
| Recall    | 0.8375   |
+-----------+----------+
| F1 Score  | 0.82856  |
+-----------+----------+


**Random Forest** showed excellent performance both before and after hyperparameter tuning, though the accuracy slightly decreased after tuning. Despite this, it remains one of the top performers for this task due to its strong generalization and ability to handle complex patterns. Highly recommended for employee performance prediction.

#### 6. Gradient Boosting Classifier

In [107]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(x_sm,y_sm)

In [108]:
gbc_y_train = gbc.predict(x_sm)
gbc_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [109]:
gbc_y_pred = gbc.predict(x_test)
gbc_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       2, 3, 3, 2, 4, 3, 2, 3, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 2, 3, 3, 3, 3, 2, 2, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 2],
      dtype=int64)

In [110]:
gbc_acc_train  = accuracy_score(y_sm,gbc_y_train)
gbc_acc_train

0.9938006676204101

In [111]:
print(classification_report(y_sm,gbc_y_train))

              precision    recall  f1-score   support

           2       0.99      1.00      0.99       699
           3       1.00      0.99      0.99       699
           4       1.00      1.00      1.00       699

    accuracy                           0.99      2097
   macro avg       0.99      0.99      0.99      2097
weighted avg       0.99      0.99      0.99      2097



In [112]:
gbc_acc_test = accuracy_score(y_test,gbc_y_pred)
print('ACCURACY SCORE:', gbc_acc_test)
gbc_pre = precision_score(y_test,gbc_y_pred, average = 'weighted')
print('PRECISION SCORE:',gbc_pre)
gbc_recall = recall_score(y_test,gbc_y_pred, average ='weighted')
print('RECALL SCORE:',gbc_recall)
gbc_f1 = f1_score(y_test,gbc_y_pred, average = 'weighted')
print('F1 SCORE:',gbc_f1)
print('------')
print('KNN CLASSIFICATION REPORT:')
print(classification_report(y_test,gbc_y_pred))

ACCURACY SCORE: 0.8625
PRECISION SCORE: 0.8689467592592593
RECALL SCORE: 0.8625
F1 SCORE: 0.8648031382364266
------
KNN CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.69      0.79      0.74        39
           3       0.92      0.89      0.90       175
           4       0.78      0.81      0.79        26

    accuracy                           0.86       240
   macro avg       0.80      0.83      0.81       240
weighted avg       0.87      0.86      0.86       240



In [113]:
pd.crosstab(y_test,gbc_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,31,8,0
3,14,155,6
4,0,5,21


In [114]:
gbc_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [gbc_acc_test,gbc_pre,gbc_recall,gbc_f1]})
gbc_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8625
1,Precision,0.868947
2,Recall,0.8625
3,F1 Score,0.864803


In [115]:
print("GBC EVALUATION METRIC:")
print(tabulate(gbc_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

GBC EVALUATION METRIC:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8625   |
+-----------+----------+
| Precision | 0.868947 |
+-----------+----------+
| Recall    | 0.8625   |
+-----------+----------+
| F1 Score  | 0.864803 |
+-----------+----------+


#### 5) A-HyperParameter in GradientBoosting

In [116]:
param_Grid = {
    'n_estimators' : [50,100,150],
    'learning_rate' : [0.01,0.1,0.2],
    'max_depth' : [3,5,7],
    'subsample' : [0.8,1.0]
}

grid_Search = GridSearchCV(GradientBoostingClassifier(random_state=42), param_Grid, cv = 3)

grid_Search.fit(x_sm,y_sm)

In [117]:
grid_Search.best_params_

{'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}

In [118]:
gbc1 = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 7,n_estimators = 150, subsample = 0.8)

In [119]:
gbc1.fit(x_sm,y_sm)

In [120]:
gbc1_y_pred = gbc1.predict(x_test)
gbc1_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       2, 3, 3, 2, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 2, 2, 3, 2, 3,
       4, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3],
      dtype=int64)

In [121]:
gbc1_acc_test = accuracy_score(y_test,gbc1_y_pred)
print('ACCURACY SCORE:', gbc1_acc_test)
gbc1_pre = precision_score(y_test,gbc1_y_pred, average = 'weighted')
print('PRECISION SCORE:',gbc1_pre)
gbc1_recall = recall_score(y_test,gbc1_y_pred, average ='weighted')
print('RECALL SCORE:',gbc1_recall)
gbc1_f1 = f1_score(y_test,gbc1_y_pred, average = 'weighted')
print('F1 SCORE:',gbc1_f1)
print('------')
print('GBC CLASSIFICATION REPORT:')
print(classification_report(y_test,gbc1_y_pred))

ACCURACY SCORE: 0.8541666666666666
PRECISION SCORE: 0.8535052910052909
RECALL SCORE: 0.8541666666666666
F1 SCORE: 0.8527557187094197
------
GBC CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.69      0.69      0.69        39
           3       0.89      0.91      0.90       175
           4       0.86      0.69      0.77        26

    accuracy                           0.85       240
   macro avg       0.81      0.77      0.79       240
weighted avg       0.85      0.85      0.85       240



In [122]:
pd.crosstab(y_test,gbc1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,27,12,0
3,12,160,3
4,0,8,18


In [123]:
gbc1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [gbc1_acc_test,gbc1_pre,gbc1_recall,gbc1_f1]})
gbc1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.854167
1,Precision,0.853505
2,Recall,0.854167
3,F1 Score,0.852756


In [124]:
print("GBC EVALUATION METRIC AFTER HYPERPARAMETER TUNING:")
print(tabulate(gbc1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

GBC EVALUATION METRIC AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.854167 |
+-----------+----------+
| Precision | 0.853505 |
+-----------+----------+
| Recall    | 0.854167 |
+-----------+----------+
| F1 Score  | 0.852756 |
+-----------+----------+


**Gradient Boosting** remained stable after tuning, providing strong performance. Its ability to capture complex patterns makes it highly useful for employee performance prediction, and it’s a top model to consider for your task.

**Conclusion**
* Random Forest : The top performer, offering strong generalization and accuracy.
* Gradient Boosting : A close contender with consistently high performance.

In [125]:
#Saving best trained model using joblib
joblib.dump(rf,"random_forest_model.pkl")

['random_forest_model.pkl']

In [126]:
joblib.dump(gbc1,"gradient_boosting_model.pkl")

['gradient_boosting_model.pkl']

In [None]:
#saving predictions


In [127]:
# Create a DataFrame to store the predictions of all models
predictions_data = pd.DataFrame({
    'True Labels': y_test,  # True labels for comparison
    'Random Forest' : rf_y_pred,
    'Gradient Boosting': gbc1_y_pred
})

In [128]:
predictions_data.to_csv('Predictions_data.csv',index = False)

In [129]:
joblib.dump(predictions_data,'Predictions.pkl')

['Predictions.pkl']