In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the employee performance workbook into the working DataFrame
# (the path is relative to the current working directory).
df = pd.read_excel(io="Data/INX_Employee_Performance.xlsx")

## --- FEATURE ENGINEERING ---


In [4]:
# ExperienceGap: total career experience minus years spent at the current company.
df['ExperienceGap'] = df['TotalWorkExperienceInYears'].sub(df['ExperienceYearsAtThisCompany'])


In [5]:
# AvgExperiencePerCompany: total experience divided by the number of companies worked.
# A NumCompaniesWorked of 0 is swapped for 1 first so the division never hits zero.
df['AvgExperiencePerCompany'] = df['TotalWorkExperienceInYears'].div(
    df['NumCompaniesWorked'].replace(to_replace=0, value=1)
)


In [6]:
# Age Grouping
# pd.cut builds right-closed intervals and, by default, leaves the first edge
# open, so an Age of exactly 18 fell outside (18, 25] and became NaN.
# include_lowest=True closes that first interval: [18, 25], (25, 35], (35, 45], (45, inf).
# The last edge is open-ended so the '46+' label also covers ages above 60.
# Ages below 18 are still outside every bin and remain NaN.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[18, 25, 35, 45, np.inf],
    labels=['18-25', '26-35', '36-45', '46+'],
    include_lowest=True,
)


In [7]:
# Categorical Performance Rating
def _performance_category(rating):
    """Map a numeric rating to 'High' (>= 4), 'Low' (<= 2) or 'Medium' (anything else)."""
    if rating >= 4:
        return 'High'
    if rating <= 2:
        return 'Low'
    return 'Medium'


# NOTE(review): this column is derived directly from PerformanceRating; if the
# rating is used as a prediction target later, exclude this column from the features.
df['PerformanceCategory'] = df['PerformanceRating'].map(_performance_category)


In [8]:
# Tenure Binning
# pd.cut builds right-closed intervals and, by default, leaves the first edge
# open, so employees with 0 years at the company fell outside (0, 2] and became NaN.
# include_lowest=True closes the first interval: [0, 2], (2, 5], (5, 10], (10, inf).
# The last edge is open-ended so "10yrs+" also covers tenures above 40 years.
# NOTE(review): a tenure of exactly 2 years still lands in "<2yrs" (unchanged
# boundary behaviour); switch to right=False if the labels should be read literally.
df['TenureCategory'] = pd.cut(
    df['ExperienceYearsAtThisCompany'],
    bins=[0, 2, 5, 10, np.inf],
    labels=["<2yrs", "2-5yrs", "5-10yrs", "10yrs+"],
    include_lowest=True,
)


In [9]:
# Snapshot the engineered frame before the encoding and scaling cells overwrite
# its columns, so a human-readable version is still available afterwards.
df_raw = df.copy(deep=True)

In [10]:
# Encode Categorical Variables
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# One fitted encoder is kept per column. Re-fitting a single shared encoder
# discards every mapping except the last one, which makes it impossible to
# translate codes back to labels (inverse_transform) for the other columns.
label_encoders = {}
for col in cat_cols:
    col_dtype = df[col].dtype
    if isinstance(col_dtype, pd.CategoricalDtype) and col_dtype.ordered:
        # Binned columns produced by pd.cut carry an explicit category order.
        # LabelEncoder sorts classes lexicographically, which scrambles that
        # order (e.g. "10yrs+" sorts before "2-5yrs" and "<2yrs" sorts last),
        # so use the categorical codes instead. Missing values become -1.
        # Cast to int64 so the numeric-scaling step (which selects
        # int64/float64 columns) still picks these columns up.
        df[col] = df[col].cat.codes.astype('int64')
        continue

    # Unordered text columns: plain label encoding, encoder retained for lookup.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [11]:
# Scale Numeric Columns
# Every int64/float64 column is rescaled with a MinMaxScaler built with default settings.
# NOTE(review): this selection also covers any columns that were converted to
# integer codes earlier and any numeric target/identifier columns - confirm intended.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])


In [12]:
# Write the encoded and scaled frame to CSV (row index omitted), then confirm.
df.to_csv(path_or_buf="Data/Employee_Data_Engineered.csv", index=False)
print(" Feature engineering complete. File saved as 'Employee_Data_Engineered.csv'.")

 Feature engineering complete. File saved as 'Employee_Data_Engineered.csv'.


In [13]:
# Write the pre-encoding snapshot to CSV (row index omitted), then confirm.
df_raw.to_csv(path_or_buf="Data/Employee_Data_Before_Encoding.csv", index=False)
print(" Feature engineering complete. File saved as 'Employee_Data_Before_Encoding.csv'.")

 Feature engineering complete. File saved as 'Employee_Data_Before_Encoding.csv'.


## Feature Engineering Summary


- Created ExperienceGap = TotalWorkExperienceInYears - ExperienceYearsAtThisCompany

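- Created AvgExperiencePerCompany = TotalWorkExperienceInYears / NumCompaniesWorked (a count of 0 is treated as 1 to avoid division by zero) to understand job-hopping tendencies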

- Grouped Age into ranges: 18–25, 26–35, 36–45, 46+

- Derived PerformanceCategory from PerformanceRating: High (rating ≥ 4), Low (rating ≤ 2), and Medium (everything else)

- Created TenureCategory from ExperienceYearsAtThisCompany (<2yrs, 2-5yrs, 5-10yrs, 10yrs+) to segment employees by company loyalty

- Applied Label Encoding to all categorical columns

- Scaled all numeric columns using MinMaxScaler

- Saved two versions of the dataset:

    - Employee_Data_Engineered.csv: Engineered + Encoded + Scaled

    - Employee_Data_Before_Encoding.csv: Engineered but raw (for dashboards or insights)