In [1]:
pip install pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the dataset
df = pd.read_csv("Salary_Data.csv")  # Save your Excel file as CSV

In [4]:
# Drop rows with missing target values (Salary)
df = df.dropna(subset=["Salary"])

# Features and Target
X = df.drop("Salary", axis=1)
y = df["Salary"]

In [5]:
# Define categorical and numerical columns
categorical_cols = ["Gender", "Education", "Job Title"]
numerical_cols = ["Age", "YearsExperience"]

In [6]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)


In [7]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Train model
model.fit(X_train, y_train)


In [10]:
y_pred = model.predict(X_test)



In [11]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [12]:
print("Model Evaluation Metrics:")
print(f" R² Score: {r2:.4f}")
print(f" Mean Absolute Error (MAE): {mae:.2f}")
print(f" Mean Squared Error (MSE): {mse:.2f}")
print(f" Root Mean Squared Error (RMSE): {rmse:.2f}")


Model Evaluation Metrics:
 R² Score: 0.9841
 Mean Absolute Error (MAE): 2688.18
 Mean Squared Error (MSE): 45141395.66
 Root Mean Squared Error (RMSE): 6718.73


In [13]:
with open('salary_model.pkl', 'wb') as file:
    pickle.dump(model, file)
