In [3]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from joblib import dump, load

# Project root. Defaults to the original author's checkout so existing runs are
# unchanged, but can be redirected without editing the notebook by setting the
# SALARY_PROJECT_DIR environment variable before starting the kernel.
PROJECT_DIR = Path(
    os.environ.get("SALARY_PROJECT_DIR", "/Users/s1km4/Desktop/Salary-Prediction")
).expanduser()

# Input CSV and output model artifact, both resolved under the project root.
data_path = PROJECT_DIR / "data" / "Salary_Data.csv"
model_path = PROJECT_DIR / "models" / "model_salary.joblib"

# Column roles: the regression target, the categorical and numeric predictors,
# and the combined predictor list (categorical columns first).
target_col = 'Salary'
cat_cols = ['Education Level', 'Job Title']
num_cols = ['Years of Experience']
features = cat_cols + num_cols

In [2]:
# Load data: read the CSV, keep only the predictor and target columns, drop
# rows whose target is missing, and renumber the index from 0.
wanted_cols = [*features, target_col]
df = (
    pd.read_csv(data_path)
    .loc[:, wanted_cols]
    .dropna(subset=[target_col])
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,Education Level,Job Title,Years of Experience,Salary
0,Bachelor's,Software Engineer,5.0,90000.0
1,Master's,Data Analyst,3.0,65000.0
2,PhD,Senior Manager,15.0,150000.0
3,Bachelor's,Sales Associate,7.0,60000.0
4,Master's,Director,20.0,200000.0


In [5]:
# Split: separate predictors from the target, then hold out 20% of the rows
# as a test set using a fixed seed so the partition is repeatable.
X, y = df[features], df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape

((5359, 3), (1340, 3))

In [6]:
# Preprocessing
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill gaps with the median, then standardize.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group to its branch; any other column is discarded.
preprocessor = ColumnTransformer(
    [('cat', cat_pipe, cat_cols), ('num', num_pipe, num_cols)],
    remainder='drop',
)

# Model: random forest regressor with 400 trees, a fixed seed, and all
# available cores (n_jobs=-1).
model = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)

# Full pipeline: preprocessing followed by the regressor, so both are fitted
# and applied together.
pipe = Pipeline([
    ('pre', preprocessor),
    ('model', model),
])

In [7]:
# Train the full pipeline on the training split.
pipe.fit(X_train, y_train)

# Evaluate on the held-out split: RMSE is the square root of the mean squared
# error, reported alongside the R2 score.
y_pred = pipe.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:,.2f}')
print(f'R2: {r2:.4f}')

RMSE: 8,884.25
R2: 0.9722


In [8]:
# Save model (pipeline)
# dump() writes to the given path but does not create missing folders, so make
# sure the destination directory exists first; otherwise a fresh checkout
# raises FileNotFoundError only after training has already finished.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(pipe, model_path)
print(f'Saved to: {model_path}')

Saved to: /Users/s1km4/Desktop/Salary-Prediction/models/model_salary.joblib
