In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import TransformedTargetRegressor

In [2]:
df = pd.read_csv("../../data/Salary_Data.csv")

conversion_rate_inr_to_lkr = 4.962
df['Salary'] = df['Salary'] * conversion_rate_inr_to_lkr
print(df.head())



    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0  446580.0  
1  322530.0  
2  744300.0  
3  297720.0  
4  992400.0  


In [3]:
print("\nMissing Values Before Cleaning:")
print(df.isnull().sum())


Missing Values Before Cleaning:
Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64


In [4]:
df = df.dropna()   
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())




Missing Values After Cleaning:
Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64


In [5]:
Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['Salary'] >= lower_bound) & (df['Salary'] <= upper_bound)]


In [6]:
X = df[['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience']]
y = df['Salary']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:

numeric_features = ['Age', 'Years of Experience']
categorical_features = ['Gender', 'Education Level', 'Job Title']




In [9]:
svr_pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
            ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
             make_column_selector(dtype_include=object))
        ]
    )),
    ('regressor', TransformedTargetRegressor(
        regressor=SVR(),
        transformer=StandardScaler()
    ))
])


In [10]:
param_grid = {
    'regressor__regressor__kernel': ['rbf'],
    'regressor__regressor__C': [0.5, 1, 5, 10],       
    'regressor__regressor__gamma': ['scale'],
    'regressor__regressor__epsilon': [0.1, 0.5, 1.0]  
}

grid_search = GridSearchCV(
    svr_pipeline,
    param_grid,
    cv=5,           
    scoring='r2',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)


best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits





Best Parameters: {'regressor__regressor__C': 5, 'regressor__regressor__epsilon': 0.1, 'regressor__regressor__gamma': 'scale', 'regressor__regressor__kernel': 'rbf'}


In [11]:
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nTest Metrics:")
print(f"RÂ² Score       : {r2:.4f}")
print(f"Mean Abs Error : {mae:.2f}")
print(f"Root MSE       : {rmse:.2f}")





Test Metrics:
RÂ² Score       : 0.9660
Mean Abs Error : 30152.92
Root MSE       : 48871.61


In [12]:
y_train_pred = best_model.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("\nðŸ”§ Training Metrics:")
print(f"Train RÂ²       : {r2_train:.4f}")
print(f"Train MAE      : {mae_train:.2f}")
print(f"Train RMSE     : {rmse_train:.2f}")



ðŸ”§ Training Metrics:
Train RÂ²       : 0.9798
Train MAE      : 25190.09
Train RMSE     : 37137.96


In [13]:
cv_scores = cross_val_score(best_model, X, y, cv=3, scoring='r2')
print(f"\nCross-val RÂ² scores: {cv_scores}")
print(f"Mean CV RÂ²: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")





Cross-val RÂ² scores: [0.23490324 0.83525896 0.75729383]
Mean CV RÂ²: 0.609 (+/- 0.533)


In [14]:

new_raw = pd.DataFrame({
    'Years of Experience': [5, 10, 12],
    'Age': [30, 40, 28],
    'Gender': ['Male', 'Female', 'Male'],
    'Education Level': ["Bachelor's", "Master's", "High School"],
    'Job Title': ['Data Scientist', 'Software Engineer', 'Analyst']
})


predicted_salary = best_model.predict(new_raw)


new_raw['Predicted Salary (LKR)'] = predicted_salary.round(2)

print("\nðŸ’° Predicted Salaries for Sample Inputs:")
print(new_raw.to_string(index=False))



ðŸ’° Predicted Salaries for Sample Inputs:
 Years of Experience  Age Gender Education Level         Job Title  Predicted Salary (LKR)
                   5   30   Male      Bachelor's    Data Scientist               848897.54
                  10   40 Female        Master's Software Engineer               826977.61
                  12   28   Male     High School           Analyst               579127.61


