# Predict performance for new employees
This notebook shows how to load saved artifacts (model, scaler, encoders, training columns),
take a few random rows from the dataset as example new inputs, apply the same preprocessing
as training, align columns, predict performance rating, and save results.

In [13]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import LabelEncoder, StandardScaler
# Load the persisted model artifact from the working directory.
best_model = joblib.load("best_model.joblib")
print("Model Loaded Successfully!")

# Read the raw dataset and draw a small, reproducible sample (fixed seed)
# to stand in for "new" employee records.
new_df = pd.read_excel("../../data/raw/INX_Future_Inc_Employee_Performance.xlsx")
n_samples = 3
df_sample = new_df.sample(n=n_samples, random_state=42).reset_index(drop=True)

# Drop the employee identifier and the target column, when present, so that
# only feature columns remain in the model input.
drop_cols = [
    name
    for name in ('EmpNumber', 'PerformanceRating')
    if name in df_sample.columns
]
df_input = df_sample.drop(columns=drop_cols, errors='ignore')

print("Sample input shape:", df_input.shape)
df_input.head()

Model Loaded Successfully!
Sample input shape: (3, 26)


Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,22,Male,Medical,Married,Development,Developer,Travel_Rarely,6,1,1,...,20,4,3,3,3,2,2,2,2,No
1,31,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Rarely,6,4,2,...,18,2,12,3,2,7,7,7,7,Yes
2,39,Female,Life Sciences,Married,Human Resources,Human Resources,Travel_Rarely,2,3,4,...,11,3,13,2,3,5,4,0,4,No


In [14]:
# Debug aid: list the feature names and their dtypes before preprocessing.
print("Columns in input:", list(df_input.columns))
print("Dtypes:\n", df_input.dtypes)


Columns in input: ['Age', 'Gender', 'EducationBackground', 'MaritalStatus', 'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency', 'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction', 'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel', 'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime', 'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction', 'TotalWorkExperienceInYears', 'TrainingTimesLastYear', 'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany', 'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition']
Dtypes:
 Age                              int64
Gender                          object
EducationBackground             object
MaritalStatus                   object
EmpDepartment                   object
EmpJobRole                      object
BusinessTravelFrequency         object
DistanceFromHome                 int64
EmpEducationLevel                int64
EmpEnvironmentSatisfaction       int64
EmpHourlyRate          

In [15]:
# Work on a copy so the displayed `df_input` stays untouched.
df_pred = df_input.copy()

# Placeholders kept from the original cell: `model` and `train_columns` are
# assigned by later cells; `le` is not referenced in the visible cells.
model = None
le = None
train_columns = None

# Try to load the fitted scaler instead of hard-coding it to None (which made
# the scaling branch below unreachable). If no artifact is present we fall
# back to None, which reproduces the previous "No scaler applied" behaviour.
# TODO confirm: the file name is assumed from the naming of the other
# artifacts ("best_model.joblib", "training_columns.joblib").
SCALER_PATH = "scaler.joblib"
try:
    scaler = joblib.load(SCALER_PATH)
except FileNotFoundError:
    scaler = None

# --- 1) Binary mapping (example mapping used during training) ---
binary_map = {'Male': 1, 'Female': 0, 'Yes': 1, 'No': 0}
binary_cols = ['Gender', 'OverTime', 'Attrition']

for col in binary_cols:
    if col in df_pred.columns:
        # Values missing from binary_map keep their original value.
        df_pred[col] = df_pred[col].map(binary_map).fillna(df_pred[col])

# --- 2) One-hot encode the multi-category columns that are present ---
multi_cat_cols = ['EducationBackground', 'MaritalStatus', 'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency']

to_dummify = [c for c in multi_cat_cols if c in df_pred.columns]
if to_dummify:
    df_pred = pd.get_dummies(df_pred, columns=to_dummify)

# --- 3) Scale numeric columns (binary-mapped columns are excluded) ---
num_cols = df_pred.select_dtypes(include=['int64','float64']).columns.tolist()
num_cols = [c for c in num_cols if c not in binary_cols]

if scaler is not None and num_cols:
    try:
        df_pred[num_cols] = scaler.transform(df_pred[num_cols])
        print("Applied saved scaler to numeric columns.")
    except Exception as e:
        # Surface a hint, then re-raise so a mismatch is never silently ignored.
        print("Scaler transform failed:", e)
        print("Ensure scaler was trained on the same numeric columns in the same order.")
        raise
else:
    print("No scaler applied (scaler not found or no numeric columns).")

df_pred.shape


No scaler applied (scaler not found or no numeric columns).


(3, 31)

In [18]:
# Align the prediction frame with the feature columns saved at training time.
# joblib.load itself raises FileNotFoundError when the file is missing, so the
# guard below only covers an artifact that was saved holding None.
train_columns = joblib.load("training_columns.joblib")
if train_columns is None:
    raise FileNotFoundError("training_columns.joblib not found. Please save it from training notebook or set train_columns variable.")

# A single reindex replaces the per-column insert loop plus reselect:
#   - columns missing from df_pred are created and filled with 0,
#   - columns not in train_columns are dropped,
#   - the result follows the order of train_columns.
df_pred = df_pred.reindex(columns=train_columns, fill_value=0)

print("Aligned df_pred shape:", df_pred.shape)


Aligned df_pred shape: (3, 53)


In [19]:
# Load the model artifact and score the aligned feature frame.
model = joblib.load('best_model.joblib')
preds_enc = model.predict(df_pred)

# Attach the predictions to a fresh copy of the sampled rows so the original
# columns (including EmpNumber and the actual PerformanceRating, when present)
# sit next to the predicted value.
results = (
    df_sample
    .reset_index(drop=True)
    .assign(Predicted_PerformanceRating=preds_enc)
)
results

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating,Predicted_PerformanceRating
0,E100953,22,Male,Medical,Married,Development,Developer,Travel_Rarely,6,1,...,3,3,3,2,2,2,2,No,3,3
1,E100441,31,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Rarely,6,4,...,12,3,2,7,7,7,7,Yes,2,4
2,E1001187,39,Female,Life Sciences,Married,Human Resources,Human Resources,Travel_Rarely,2,3,...,13,2,3,5,4,0,4,No,3,4


In [None]:
# The predictions are saved as an Excel file under the data/external folder.
import os

out_dir = "../../data/external"
out_file = os.path.join(out_dir, "predictions_sample.xlsx")

# Create the target folder on first run; a no-op when it already exists.
os.makedirs(out_dir, exist_ok=True)
results.to_excel(out_file, index=False)
print("Saved predictions to:", out_file)

Saved predictions to: ../../data/external\predictions_sample.xlsx


This prediction notebook loads the saved model artifacts, applies the preprocessing used during training, and outputs predicted performance ratings for new employees.
It serves as the deployable inference pipeline for real-world HR decision-making.