In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import TransformedTargetRegressor

In [None]:
# Read the raw salary table (path is relative to the notebook's working directory).
df = pd.read_csv("../../data/Salary_Data.csv")

# Rescale 'Salary' by a fixed factor. The constant's name suggests an INR -> LKR
# exchange rate; the currency of the CSV is not visible here -- TODO confirm.
conversion_rate_inr_to_lkr = 4.962
salary_rescaled = df['Salary'] * conversion_rate_inr_to_lkr
df['Salary'] = salary_rescaled

# Peek at the first rows after rescaling.
print(df.head())



In [None]:
# Per-column count of missing entries, before any cleaning.
print("\nMissing Values Before Cleaning:")
null_counts_before = df.isnull().sum()
print(null_counts_before)

In [None]:
# Discard every row that contains at least one missing value
# (axis=0 / how='any' are the dropna defaults, spelled out for the reader).
df = df.dropna(axis=0, how='any')

# Re-run the per-column missing count to confirm nothing is left.
print("\nMissing Values After Cleaning:")
null_counts_after = df.isnull().sum()
print(null_counts_after)



In [None]:
# IQR-based outlier filter on 'Salary': keep rows whose salary lies within
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends inclusive.
salary_col = df['Salary']
Q1, Q3 = salary_col.quantile(0.25), salary_col.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Series.between is inclusive on both sides by default, matching >= / <=.
within_fences = salary_col.between(lower_bound, upper_bound)
df = df[within_fences]


In [None]:
# Predictor columns and regression target.
feature_columns = [
    'Age',
    'Gender',
    'Education Level',
    'Job Title',
    'Years of Experience',
]
X = df[feature_columns]
y = df['Salary']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Column names grouped by kind: numeric inputs first, then categorical inputs.
numeric_features = [
    'Age',
    'Years of Experience',
]
categorical_features = [
    'Gender',
    'Education Level',
    'Job Title',
]




In [None]:
# Preprocessing + model pipeline.
# - 'num': columns with a numeric dtype are standardized.
# - 'cat': object-dtype columns are one-hot encoded. No level is dropped:
#   with handle_unknown='ignore' an unseen category becomes an all-zero row,
#   and drop='first' would make that row identical to the dropped (first)
#   category, so unknown values would silently be treated as that category.
# - The SVR is wrapped in TransformedTargetRegressor so it is fitted on a
#   standardized target; predictions are mapped back to the original scale.
svr_pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
            ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
             make_column_selector(dtype_include=object))
        ]
    )),
    ('regressor', TransformedTargetRegressor(
        regressor=SVR(),
        transformer=StandardScaler()
    ))
])


In [None]:
# Search space for the SVR nested inside the TransformedTargetRegressor step;
# the doubled prefix addresses pipeline step 'regressor' -> its inner 'regressor'.
param_grid = {
    f'regressor__regressor__{svr_param}': candidates
    for svr_param, candidates in {
        'kernel': ['rbf'],
        'C': [0.5, 1, 5, 10],
        'gamma': ['scale'],
        'epsilon': [0.1, 0.5, 1.0],
    }.items()
}

# Exhaustive 5-fold search scored by R^2, using all available cores.
grid_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Pipeline refitted on the training split with the best-scoring parameters.
best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)


In [None]:
# Score the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)

# Labels are padded so the colons line up; "R²" replaces the mis-encoded "RÂ²".
print("\nTest Metrics:")
print(f"R² Score       : {r2:.4f}")
print(f"Mean Abs Error : {mae:.2f}")
print(f"Root MSE       : {rmse:.2f}")


In [None]:
# Same metrics on the training split, for comparison against the test scores.
y_train_pred = best_model.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))  # RMSE = sqrt(MSE)

# The emoji and "R²" were mis-encoded (UTF-8 bytes read as cp1252); restored here.
print("\n🔧 Training Metrics:")
print(f"Train R²       : {r2_train:.4f}")
print(f"Train MAE      : {mae_train:.2f}")
print(f"Train RMSE     : {rmse_train:.2f}")


In [None]:
# 3-fold cross-validated R² of the tuned pipeline over the full X / y
# (note: this includes the rows that were held out as the test split).
cv_scores = cross_val_score(best_model, X, y, cv=3, scoring='r2')

# "R²" replaces the mis-encoded "RÂ²"; the +/- figure is two standard deviations.
print(f"\nCross-val R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")


In [None]:

# Three hand-written example profiles using the same column names as the
# training features; the fitted pipeline does its own preprocessing.
new_raw = pd.DataFrame({
    'Years of Experience': [5, 10, 12],
    'Age': [30, 40, 28],
    'Gender': ['Male', 'Female', 'Male'],
    'Education Level': ["Bachelor's", "Master's", "High School"],
    'Job Title': ['Data Scientist', 'Software Engineer', 'Analyst']
})

# Predict before adding the result column so the model only sees feature columns.
predicted_salary = best_model.predict(new_raw)

# Attach predictions rounded to two decimals.
new_raw['Predicted Salary (LKR)'] = predicted_salary.round(2)

# The emoji was mis-encoded (UTF-8 bytes read as cp1252); restored here.
print("\n💰 Predicted Salaries for Sample Inputs:")
print(new_raw.to_string(index=False))
