---
## 0. Setup Environment

In [None]:
# Ignore every FutureWarning raised from this point on.
# NOTE(review): this cell runs ahead of the library imports, presumably so that
# FutureWarnings emitted while importing them are hidden as well — confirm.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

---
## A. Project Description


The objective of this project is to develop a predictive model that estimates employee salaries based on a variety of personal, professional, and organizational attributes. By leveraging features such as years of experience, education level, job role, department, performance ratings, and other relevant factors, the model aims to provide accurate salary predictions for employees. This will assist HR professionals, managers, and organizations in making informed decisions regarding compensation planning, talent management, and recruitment strategies. The project involves comprehensive data exploration, feature engineering, and model evaluation to ensure robust and reliable predictions that reflect real-world compensation dynamics.

---
## C. Data Understanding

### C.1   Load Datasets



In [None]:
# Load the raw training data into a DataFrame.
# NOTE(review): the path carries no file extension — confirm the file on disk
# is not actually named "...Productivity_Data.csv", otherwise this read fails.
training_df = pd.read_csv("../data/raw/Extended_Employee_Performance_and_Productivity_Data")

### C.2 Explore Training Set


In [None]:
# Preview the first rows of the raw training data.
training_df.head()

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
training_df.info()


### C.3 Explore Target Variable




In [None]:
# Name of the column to predict; reused later when separating features from target.
target_name = 'Monthly_Salary'
# Peek at the first few target values.
training_df[target_name].head()

### C.4 Explore Categorical Features


In [None]:
# Summary statistics restricted to the object-dtype columns.
training_df.describe(include= 'object')


In [None]:
# Labels of all object-dtype columns in the raw training data.
categorical_cols = training_df.select_dtypes(include='object').columns

### C.5 Explore Numerical Features


In [None]:
# Summary statistics restricted to the numeric columns.
training_df.describe(include= 'number')

In [None]:
# Labels of all numeric columns in the raw training data.
# NOTE(review): if the target column is numeric it is part of this list too —
# exclude it before using the list as a feature set.
numerical_cols = training_df.select_dtypes(include='number').columns

---
## D. Feature Selection


### D.1 Approach 1

### D.z Final Selection of Features


In [None]:
# Placeholder for the final list of selected feature names.
# NOTE(review): still empty, and no cell visible in this notebook reads it —
# populate it (and apply it downstream) or confirm it is intentionally unused.
features_list = []

---
## E. Data Cleaning

### E.1 Copy Datasets



In [None]:
# Work on a copy so that cleaning steps leave the raw training_df untouched.
training_df_clean=training_df.copy()

### E.2 Fixing "Missing data"




### E.3 Fixing "Outliers in the final list of features"



---
## F. Feature Engineering

### F.1 Copy Datasets



In [None]:
# Copy the cleaned data so that feature-engineering steps in this section do
# not modify training_df_clean.

training_df_eng = training_df_clean.copy()


### F.2 New Feature ""






---
## G. Data Transformation

### G.1 Copy Datasets



In [None]:
# Copy the feature-engineered data so that transformations in this section do
# not modify training_df_eng.

training_df_trans = training_df_eng.copy()


### G.2 Data Transformation: Encoding the Categorical Features



---
## H. Data Preparation for Modeling

### H.1 Split Datasets



In [None]:


# Split into train (70%), temp (30%).
# The split starts from `training_df_trans`, the output of the Data
# Transformation section, so that any encoding/scaling applied there reaches
# the modelling sets. Splitting `training_df_eng` would silently bypass that
# stage.
train_df, temp_df = train_test_split(training_df_trans, test_size=0.3, random_state=42)

# Split temp into validation (15%) and test (15%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Sanity check: the three subsets together must account for every input row.
assert len(train_df) + len(val_df) + len(test_df) == len(training_df_trans)

print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")

### H.2 Split Features and Target Variables

In [None]:

def split_features_target(frame, target):
    """Separate a DataFrame into its feature columns and its target column.

    Parameters
    ----------
    frame : DataFrame holding both the features and the target column.
    target : label of the target column.

    Returns
    -------
    tuple ``(features, target_series)`` where ``features`` is ``frame``
    without the ``target`` column and ``target_series`` is that column alone.
    """
    return frame.drop(columns=[target]), frame[target]


# Apply the same feature/target separation to each of the three subsets.
X_train, y_train = split_features_target(train_df, target_name)
X_val, y_val = split_features_target(val_df, target_name)
X_test, y_test = split_features_target(test_df, target_name)

---
## I. Save Datasets

> Do not change this code

In [None]:

# Persist the training features and target as CSV files, without the index column.
# Only the training split is written by this cell; the validation and test
# splits are not saved here.
X_train.to_csv('../data/processed/X_train.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)

---
## J. Assess Baseline Model

### J.1 Generate Predictions with Baseline Model

In [None]:


# Baseline: a DummyRegressor that always predicts the mean of the training
# target values. fit() returns the estimator itself, so construction and
# fitting are chained.
dummy_regressor = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Baseline predictions for the validation features; the trailing expression
# displays them as the cell output.
y_pred = dummy_regressor.predict(X_val)
y_pred



array([574.44604064, 574.44604064, 574.44604064, ..., 574.44604064,
       574.44604064, 574.44604064], shape=(1320,))

### J.2 Selection of Performance Metrics




In [None]:

# Root mean squared error of y_pred against the validation target.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE: {rmse}")


RMSE: 71.41122641110427


In [None]:
# Mean absolute error of y_pred against the validation target.
mae = mean_absolute_error(y_val, y_pred)
print(f"MAE: {mae}")

MAE: 28.384357945254933


### J.3 Baseline Model Performance




In [None]:

# Score the baseline on the test set.
# Note: y_pred and rmse are reassigned here, so the validation predictions and
# validation RMSE computed earlier are no longer available after this cell.
y_pred = dummy_regressor.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 86.92117365503061


In [None]:
# Percentage of test rows whose prediction lies within +/- rmse of the actual
# value (bounds inclusive), using the rmse currently in scope.
allowed_range_lower, allowed_range_upper = y_test - rmse, y_test + rmse

within_range = (y_pred >= allowed_range_lower) & (y_pred <= allowed_range_upper)
accuracy = within_range.sum() / len(y_test) * 100

print(f"Accuracy within RMSE range: {accuracy:.2f}%")


Accuracy within RMSE range: 87.83%


In [None]:
# Mean absolute error of y_pred against the test target.
# Note: this overwrites the mae value computed on the validation set.
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")

MAE: 39.73232608655444


In [None]:
# Percentage of test rows whose prediction lies within +/- mae of the actual
# value (bounds inclusive), using the mae currently in scope.
allowed_range_lower, allowed_range_upper = y_test - mae, y_test + mae

within_range = (y_pred >= allowed_range_lower) & (y_pred <= allowed_range_upper)
accuracy = within_range.sum() / len(y_test) * 100

print(f"Accuracy within mae range: {accuracy:.2f}%")


Accuracy within mae range: 76.61%
