***
# Essential Imports and Library Initialization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys
import os
# Put the parent of the current working directory on sys.path (once), since
# the cells below import from the `src` package.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [3]:
from src.utils.load_data import DataLoader

# Name of the Excel workbook handed to the project's DataLoader.
DATASET_FILE = 'INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls'

loader = DataLoader()
df = loader.load_dataset(DATASET_FILE)

2025-07-25 23:06:11,704 - load_data - INFO - Loading Data from file...
2025-07-25 23:06:11,925 - load_data - INFO - Excel Dataset Loaded Successfully!
2025-07-25 23:06:11,929 - load_data - INFO -  Dataset Shape: 1200 rows and 28 columns.


In [4]:
# Remove the EmpNumber column before preprocessing.
# Rebinding instead of `inplace=True` leaves the originally loaded frame
# untouched, and `errors='ignore'` keeps the cell idempotent: re-running it
# after the column is already gone no longer raises KeyError.
df = df.drop(columns=['EmpNumber'], errors='ignore')

In [5]:
from src.Data_Processing.preprocess import Preprocessor

# Run the project's preprocessing pipeline on the loaded frame; it returns the
# train/test feature and target splits in this order.
preprocessor = Preprocessor(df)
(X_train, X_test,
 y_train, y_test) = preprocessor.preprocess_run()

2025-07-25 23:06:12,017 - preprocess - INFO - Initialized Preprocessor with input Dataframe.
2025-07-25 23:06:12,023 - preprocess - INFO - Starting Preprocessing Pipeline...
2025-07-25 23:06:12,023 - preprocess - INFO - Dropped Columns : ['EmpJobLevel', 'ExperienceYearsAtThisCompany']
2025-07-25 23:06:12,023 - utils - INFO - Train-Test Split Initialized ...
2025-07-25 23:06:12,031 - utils - INFO - Training set shape: (960, 24), Testing set shape: (240, 24)
2025-07-25 23:06:12,034 - utils - INFO - Target distribution (train):
{1: 699, 0: 155, 2: 106}
2025-07-25 23:06:12,036 - utils - INFO - Target distribution (test):
{1: 175, 0: 39, 2: 26}
2025-07-25 23:06:12,038 - preprocess - INFO - ColumnTransformer pipeline defined...
2025-07-25 23:06:12,048 - preprocess - INFO - Applying ordinal encoding.
2025-07-25 23:06:12,048 - preprocess - INFO - Encoded column: Gender
2025-07-25 23:06:12,056 - preprocess - INFO - Encoded column: MaritalStatus
2025-07-25 23:06:12,056 - preprocess - INFO - Enco

In [6]:
# Show the `models` directory located next to (not inside) the current working directory.
parent_dir = os.path.dirname(os.getcwd())
os.path.join(parent_dir, 'models')

'f:\\Machine_Learning\\Ml_Projects\\INX_Employee_Performance\\models'

In [7]:
from src.models.train_model import ModelTrainer
from src.models.evaluate_model import save_best_model

# Fit the candidate models, score them on the train and test splits, then pass
# both results to save_best_model, which returns the best model and the
# evaluation results.
trainer = ModelTrainer()
train_models = trainer.train_models(X_train, y_train)
evaluate_models = trainer.evaluate_models(
    X_train, y_train,
    X_test, y_test,
)
best_model, result = save_best_model(evaluate_models, train_models)

2025-07-25 23:06:12,393 - train_model - INFO - Model directory initialized at f:\Machine_Learning\Ml_Projects\INX_Employee_Performance\models
2025-07-25 23:06:12,393 - train_model - INFO - Model directory initialized at f:\Machine_Learning\Ml_Projects\INX_Employee_Performance\models
2025-07-25 23:06:12,393 - train_model - INFO - Training started for all models...
2025-07-25 23:06:12,400 - train_model - INFO - Training LogisticRegression
2025-07-25 23:06:14,761 - train_model - INFO - Training RidgeClassifier
2025-07-25 23:06:14,771 - train_model - INFO - Training SVM
2025-07-25 23:06:15,000 - train_model - INFO - Training DecisionTree
2025-07-25 23:06:15,013 - train_model - INFO - Training RandomForest
2025-07-25 23:06:15,256 - train_model - INFO - Training GradientBoosting
2025-07-25 23:06:16,273 - train_model - INFO - Training naive_bayes
2025-07-25 23:06:16,282 - train_model - INFO - Training AdaBoost
2025-07-25 23:06:16,454 - train_model - INFO - Training XGBoost
2025-07-25 23:06:16

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 362
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 44
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


2025-07-25 23:06:17,137 - train_model - INFO - Starting Model Evaluation....
2025-07-25 23:06:17,139 - train_model - INFO - Evaluating Model : LogisticRegression
2025-07-25 23:06:17,149 - train_model - INFO - LogisticRegression - Train Accuracy: 0.785 | Train F1: 0.799
2025-07-25 23:06:17,151 - train_model - INFO - LogisticRegression - Test Accuracy: 0.758 | Test F1: 0.772
2025-07-25 23:06:17,157 - train_model - INFO - LogisticRegression - ROC AUC: 0.873
2025-07-25 23:06:17,159 - train_model - INFO - Confusion Matrix for LogisticRegression:
[[ 29   7   3]
 [ 24 134  17]
 [  2   5  19]]
2025-07-25 23:06:17,161 - train_model - INFO - Evaluating Model : RidgeClassifier
2025-07-25 23:06:17,172 - train_model - INFO - RidgeClassifier - Train Accuracy: 0.740 | Train F1: 0.757
2025-07-25 23:06:17,172 - train_model - INFO - RidgeClassifier - Test Accuracy: 0.696 | Test F1: 0.719
2025-07-25 23:06:17,174 - train_model - INFO - RidgeClassifier does not support predict_proba; ROC AUC skipped.
2025-

In [14]:
# Wrap the evaluation results returned by save_best_model in a DataFrame and
# show it as the cell output.
result = pd.DataFrame(data=result)
result

Unnamed: 0,Model,train_accuracy,train_f1,test_accuracy,test_f1,ROC
0,LogisticRegression,0.785417,0.798514,0.758333,0.772374,0.872507
1,RidgeClassifier,0.739583,0.756566,0.695833,0.718538,
2,SVM,0.891667,0.896167,0.804167,0.812954,0.922161
3,DecisionTree,1.0,1.0,0.920833,0.920838,0.906196
4,RandomForest,1.0,1.0,0.925,0.922405,0.947113
5,GradientBoosting,0.994792,0.99481,0.929167,0.927791,0.978247
6,naive_bayes,0.245833,0.135234,0.2,0.113612,0.760008
7,AdaBoost,0.8625,0.855405,0.858333,0.850703,0.914544
8,XGBoost,1.0,1.0,0.920833,0.918403,0.968838
9,LightGBM,1.0,1.0,0.941667,0.941144,0.978923


In [8]:
from src.models.predict_model import InferencePipeline

# Build the artifact paths with os.path.join rather than hard-coded backslashes:
# '\L' and '\P' are invalid escape sequences in a regular string literal (Python
# warns about them) and a backslash separator only works on Windows.
# On Windows the resulting strings are identical to the previous literals.
# NOTE(review): the model file name is hard-coded; confirm it matches the model
# that save_best_model actually selected and saved.
pipeline = InferencePipeline(
    os.path.join('models', 'LightGBM_best_model.pkl'),
    os.path.join('models', 'Preprocessor.pkl'),
)
pipeline.load_artifacts()

2025-07-25 23:06:17,842 - predict_model - INFO - Loading Model and Preprocessor ...
2025-07-25 23:06:17,858 - predict_model - INFO - Model and Preprocessor are loaded successfully.


In [9]:
# Feature values for one employee record, passed as-is to the inference pipeline.
sample_employee = {
    'Age': 32,
    'Gender': 'Male',
    'EducationBackground': 'Marketing',
    'MaritalStatus': 'Single',
    'EmpDepartment': 'Sales',
    'EmpJobRole': 'Sales Executive',
    'BusinessTravelFrequency': 'Travel_Rarely',
    'DistanceFromHome': 10,
    'EmpEducationLevel': 3,
    'EmpEnvironmentSatisfaction': 4,
    'EmpHourlyRate': 55,
    'EmpJobInvolvement': 3,
    'EmpJobLevel': 2,
    'EmpJobSatisfaction': 4,
    'NumCompaniesWorked': 1,
    'OverTime': 'No',
    'EmpLastSalaryHikePercent': 12,
    'EmpRelationshipSatisfaction': 4,
    'TotalWorkExperienceInYears': 10,
    'TrainingTimesLastYear': 2,
    'EmpWorkLifeBalance': 2,
    'ExperienceYearsAtThisCompany': 10,
    'ExperienceYearsInCurrentRole': 7,
    'YearsSinceLastPromotion': 0,
    'YearsWithCurrManager': 8,
    'Attrition': 'No',
}

prediction = pipeline.predict(sample_employee)

2025-07-25 23:06:17,871 - predict_model - INFO - Started inference...
2025-07-25 23:06:17,880 - preprocess - INFO - Applying ordinal encoding.
2025-07-25 23:06:17,884 - preprocess - INFO - Encoded column: Gender
2025-07-25 23:06:17,889 - preprocess - INFO - Encoded column: MaritalStatus
2025-07-25 23:06:17,893 - preprocess - INFO - Encoded column: BusinessTravelFrequency
2025-07-25 23:06:17,897 - preprocess - INFO - Encoded column: OverTime
2025-07-25 23:06:17,900 - preprocess - INFO - Encoded column: Attrition
2025-07-25 23:06:17,902 - preprocess - INFO - Applying log transformation
2025-07-25 23:06:17,911 - predict_model - INFO - Inference completed succesfully.


In [10]:
# Display the value returned by pipeline.predict for the single input record.
prediction

0    3
dtype: int64

# INX Employee Performance Prediction - Project Summary

## 1. Data Loading

- Data loaded successfully from Excel file.
- Rows: 1200  
- Columns: 28  

## 2. Data Preprocessing

- Dropped Irrelevant Columns:  
  - EmpJobLevel  
  - ExperienceYearsAtThisCompany

- Train-Test Split:
  - Training Set Shape: (960, 24)  
  - Testing Set Shape: (240, 24)

- Target Distribution (Train):  
  {1: 699, 0: 155, 2: 106}

- Target Distribution (Test):  
  {1: 175, 0: 39, 2: 26}

- Ordinal Encoded Categorical Features:
  - Gender
  - MaritalStatus
  - BusinessTravelFrequency
  - OverTime
  - Attrition

- Applied log transformation to selected numerical columns.

- ColumnTransformer pipeline created and fitted on train data.  
- Preprocessor saved as: Preprocessor.pkl

## 3. Model Training

- Model directory created: models/

- Trained Models:
  - Logistic Regression
  - Ridge Classifier
  - Support Vector Machine (SVM)
  - Decision Tree
  - Random Forest
  - Gradient Boosting
  - Naive Bayes
  - AdaBoost
  - XGBoost
  - LightGBM

## 4. Model Evaluation

- Metrics computed for each model:
  - Accuracy
  - F1 Score
  - ROC-AUC Score (where applicable)
  - Confusion Matrix

### Best Performing Models:

#### LightGBM:
- Test Accuracy: 94.2%
- F1 Score: 0.941
- ROC-AUC Score: 0.979
- Confusion Matrix:
  [[33, 5, 1],  
   [4, 170, 1],  
   [0, 3, 23]]

#### XGBoost:
- Test Accuracy: 92.1%
- F1 Score: 0.918
- ROC-AUC Score: 0.969

#### Gradient Boosting:
- Test Accuracy: 92.9%
- F1 Score: 0.928
- ROC-AUC Score: 0.978

## 5. Best Model Selection

- Best Model based on highest F1 Score: LightGBM
- Saved Model: models/LightGBM_best_model.pkl

## 6. Prediction

- Loaded saved model and preprocessor
- Performed inference successfully on a single, manually specified employee record

## Final Remarks

- All steps from preprocessing to prediction were executed successfully.
- Project pipeline is complete.


##  Conclusion

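- This project assessed a range of classification algorithms to predict employee performance levels using a structured and preprocessed dataset. Key preprocessing steps—ordinal encoding of categorical features, log transformation of selected numerical columns, and a ColumnTransformer pipeline fitted on the training data—were applied uniformly to ensure fair and efficient model comparisons. Note: the preprocessing log in this notebook does not show SMOTE or PCA being applied; LightGBM reports 960 training rows, the same as the un-resampled training split.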

- Among the evaluated models, **LightGBM** demonstrated the best performance with **94% test accuracy and a 0.94 F1 score**, closely followed by **Gradient Boosting** with **93% test accuracy and a 0.93 F1 score**; both benefit from their ability to handle complex feature interactions and imbalanced data. **XGBoost**, **Random Forest**, and **Decision Tree** also performed strongly with test accuracies around **92%**, showcasing the reliability of tree-based methods.

- While models like **Logistic Regression** and **Ridge Classifier** offered simplicity and interpretability, they underperformed compared to ensemble and kernel-based approaches, achieving **76% and 70% test accuracy** respectively. **SVM** provided a good balance of complexity and performance, reaching **80% accuracy and a 0.81 F1 score**.

- The **Naive Bayes classifier** performed worst by a wide margin (**20% test accuracy, 0.11 F1 score**), most likely because its strong feature-independence assumption does not hold for this data.

- Overall, the results suggest that for high-dimensional, imbalanced datasets with mixed data types, **boosting and ensemble methods (LightGBM, Gradient Boosting, Random Forest)** are highly effective, especially when combined with robust preprocessing pipelines.
