In [1]:
# Import Libraries
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the feature-engineered employee dataset (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Data/Employee_Data_Engineered.csv")


In [3]:
# Split the frame into the prediction target and the feature matrix.
# 'PerformanceRating' is excluded from the features together with the target
# (presumably because the category is derived from it - TODO confirm).
target_col = 'PerformanceCategory'
y = df[target_col]
X = df.drop(columns=['PerformanceRating', target_col])


In [4]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Standardize features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits, so no information
# from the test set leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Oversample the minority classes with SMOTE. Only the (scaled) training
# split is resampled; the test split is left as-is.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_res, y_train_res = resampled


In [7]:
# Baseline 1: logistic regression fitted on the SMOTE-balanced training set.
# class_weight='balanced' is applied on top of the resampling.
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000)
log_reg.fit(X_train_res, y_train_res)

# Score on the scaled test split, which was never resampled.
y_pred_log_reg = log_reg.predict(X_test_scaled)
log_reg_acc = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)


In [8]:
# Baseline 2: 100-tree random forest fitted on the SMOTE-balanced training
# set, seeded for reproducibility and using balanced class weights.
rf_clf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
)
rf_clf.fit(X_train_res, y_train_res)

# Score on the scaled test split, which was never resampled.
y_pred_rf_clf = rf_clf.predict(X_test_scaled)
rf_clf_acc = accuracy_score(y_true=y_test, y_pred=y_pred_rf_clf)


In [9]:
# Report the held-out accuracy of both baseline models.
print("Initial model training complete with SMOTE balancing.")
for label, acc in (
    ("Logistic Regression Accuracy:", log_reg_acc),
    ("Random Forest Classifier Accuracy:", rf_clf_acc),
):
    print(label, acc)

Initial model training complete with SMOTE balancing.
Logistic Regression Accuracy: 0.7291666666666666
Random Forest Classifier Accuracy: 0.9458333333333333


In [10]:
# Hyperparameter tuning for the random forest: exhaustive 3-fold grid search
# over forest size, maximum tree depth and minimum samples per split.
param_grid = dict(
    n_estimators=[*range(50, 201, 20)],
    max_depth=[*range(1, 16)],
    min_samples_split=[*range(2, 16)],
)

# Same base configuration as the baseline forest (balanced weights, fixed seed).
rf_base = RandomForestClassifier(random_state=42, class_weight='balanced')
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,  # use all available cores
    verbose=2,
)

# NOTE(review): the search is fitted on the SMOTE-resampled training data, so
# each validation fold also contains synthetic samples; cross-validated scores
# may therefore be optimistic. Consider resampling inside each fold (e.g. an
# imblearn Pipeline) - TODO confirm.
grid_search.fit(X_train_res, y_train_res)

print("Best Parameters from GridSearchCV:", grid_search.best_params_)


Fitting 3 folds for each of 1680 candidates, totalling 5040 fits
Best Parameters from GridSearchCV: {'max_depth': 11, 'min_samples_split': 3, 'n_estimators': 150}


In [11]:
# Tuned random forest.
# GridSearchCV uses refit=True by default, so after the search it has already
# cloned the base estimator (class_weight='balanced', random_state=42), applied
# the best parameters and refit it on the full X_train_res / y_train_res.
# Reusing that estimator avoids a redundant second fit and stays correct if
# more hyperparameters are later added to the grid.
best_rf_clf = grid_search.best_estimator_

# Display the fitted estimator as the cell output.
best_rf_clf


In [12]:
# Persist the fitted artifacts (tuned random forest, logistic regression and
# the scaler) so they can be loaded later for evaluation.
# `os` and `joblib` are imported in the notebook's import cell.

# Target directory, relative to the notebook's working directory
save_path = r"Models/"

# joblib.dump does not create missing directories; make sure the target
# exists so the cell also works on a fresh checkout.
os.makedirs(save_path, exist_ok=True)

# Save models and scaler in the specified path
joblib.dump(best_rf_clf, os.path.join(save_path, 'best_rf_model.pkl'))
joblib.dump(log_reg, os.path.join(save_path, 'log_reg_model.pkl'))
joblib.dump(scaler, os.path.join(save_path, 'scaler.pkl'))

print("Models and scaler saved successfully in:", save_path)


Models and scaler saved successfully in: Models/


# Summary Section (Model Building Recap)


###  Objective:
This notebook trains classification models to predict employee performance categories. It covers feature scaling, handling class imbalance with SMOTE, baseline model building, and hyperparameter tuning.


###  Key Steps:
 - Loaded the engineered dataset and prepared features and target variable.
 - Performed train-test split with stratification.
 - Scaled the data using StandardScaler.
 - Balanced the training data using **SMOTE** to address class imbalance.
 - Trained two baseline models on the balanced data:
    - Logistic Regression (with `class_weight='balanced'` as an additional safeguard against imbalance)
    - Random Forest Classifier (with `class_weight='balanced'`)
 - Performed **GridSearchCV** hyperparameter tuning on Random Forest using balanced data.
 - Saved the best-performing model (best_rf_model.pkl), logistic regression model (log_reg_model.pkl), and scaler (scaler.pkl) for later evaluation.


###  Output:
 - `best_rf_model.pkl`
 - `log_reg_model.pkl`
 - `scaler.pkl`