In [1]:
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import TransformedTargetRegressor

In [2]:
df = pd.read_csv("../../data/Salary_Data.csv")
print("Initial rows:", len(df))
print(df.head())

Initial rows: 6704
    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  


In [3]:
def clean_text(text):
    if isinstance(text, str):
        text = text.strip().lower()
        text = text.replace("’", "'")
        text = text.replace(" degree", "")
        text = text.replace("bachelors", "bachelor's")
        text = text.replace("masters", "master's")
        text = text.replace("phd", "phd")
    return text



df.dropna(inplace=True)
print("Rows after removing NaN:", len(df))


df["Education Level"] = df["Education Level"].apply(clean_text)
print("Unique cleaned education values:", df["Education Level"].unique())


df.drop_duplicates(inplace=True)
print("Rows after dropping duplicates:", len(df))

Rows after removing NaN: 6698
Unique cleaned education values: ["bachelor's" "master's" 'phd' 'high school']
Rows after dropping duplicates: 1786


In [4]:
def remove_outliers_iqr(df, cols):
    for col in cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

df_cleaned = remove_outliers_iqr(df, ['Salary'])
df_cleaned = df_cleaned[df_cleaned['Salary'] > 1000] 
df_cleaned = df_cleaned.reset_index(drop=True)
print(f"Original rows: {len(df)}, Cleaned rows: {len(df_cleaned)}")
df.duplicated().sum()

Original rows: 1786, Cleaned rows: 1782


np.int64(0)

In [5]:
X = df[["Age", "Gender", "Education Level", "Job Title", "Years of Experience"]]
y = df["Salary"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [6]:
numeric_features = ["Age", "Years of Experience"]
categorical_features = ["Gender", "Education Level", "Job Title"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        (
            "cat",
            OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
            categorical_features,
        ),
    ]
)

svr = TransformedTargetRegressor(
    regressor=SVR(),
    transformer=StandardScaler()
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", svr)
])


In [7]:
param_grid = {
    "regressor__regressor__kernel": ["rbf"],
    "regressor__regressor__C": [0.5, 1, 5, 10],
    "regressor__regressor__gamma": ["scale", "auto"],
    "regressor__regressor__epsilon": [0.1, 0.3, 0.5, 1.0]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits



Best Parameters: {'regressor__regressor__C': 5, 'regressor__regressor__epsilon': 0.1, 'regressor__regressor__gamma': 'scale', 'regressor__regressor__kernel': 'rbf'}


In [8]:
print("\nTEST METRICS")

y_pred = best_model.predict(X_test)
print("R² Score:", round(r2_score(y_test, y_pred), 4))
print("MAE     :", round(mean_absolute_error(y_test, y_pred), 2))
print("RMSE    :", round(np.sqrt(mean_squared_error(y_test, y_pred)), 2))


y_train_pred = best_model.predict(X_train)
print("\nTRAIN METRICS")
print("Train R²:", round(r2_score(y_train, y_train_pred), 4))
print("Train MAE:", round(mean_absolute_error(y_train, y_train_pred), 2))
print("Train RMSE:", round(np.sqrt(mean_squared_error(y_train, y_train_pred)), 2))


cv_scores = cross_val_score(best_model, X, y, cv=3, scoring="r2")
print("\nCross-val R² scores:", cv_scores)
print("Mean CV R²:", round(cv_scores.mean(), 3))


TEST METRICS
R² Score: 0.9196
MAE     : 9826.44
RMSE    : 15131.12

TRAIN METRICS
Train R²: 0.9556
Train MAE: 6837.08
Train RMSE: 10773.11

Cross-val R² scores: [0.56439927 0.80130659 0.74833194]
Mean CV R²: 0.705


In [9]:
def clean_input(df):
    df["Education Level"] = df["Education Level"].apply(clean_text)
    return df

new_raw = pd.DataFrame({
    "Years of Experience": [5, 10, 12],
    "Age": [30, 40, 28],
    "Gender": ["Male", "Female", "Male"],
    "Education Level": ["Bachelor's", "Master's", "High School"],
    "Job Title": ["Data Scientist", "Software Engineer", "Analyst"]
})

new_raw = clean_input(new_raw)
predicted_salary = best_model.predict(new_raw)
new_raw["Predicted Salary (LKR)"] = predicted_salary.round(2)

print("\n Predicted Salaries:")
print(new_raw)


 Predicted Salaries:
   Years of Experience  Age  Gender Education Level          Job Title  \
0                    5   30    Male      bachelor's     Data Scientist   
1                   10   40  Female        master's  Software Engineer   
2                   12   28    Male     high school            Analyst   

   Predicted Salary (LKR)  
0               158506.09  
1               142844.62  
2               136861.03  


In [10]:
joblib.dump(best_model, "salary_prediction_model.pkl")
joblib.dump(best_model, "/workspaces/CS_Group_09_Salary_prediction/models/salary_prediction_model.pkl")
print("SVR_Model.pkl")


Model saved as salary_prediction_model.pkl
