In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

# Function to remove outliers using IQR method
def remove_outliers_iqr(df, columns):
    """Return a copy of ``df`` without rows outside the 1.5 * IQR fences.

    The columns are handled one after another: for each column the 25th and
    75th percentiles are taken from the rows that survived the previous
    columns, and only rows whose value lies within
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive) are kept.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to filter on, in order.

    Returns:
        The filtered DataFrame, keeping the original index labels.
    """
    kept = df.copy()
    for col_name in columns:
        values = kept[col_name]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        # Distance from each quartile to its fence
        whisker = 1.5 * (q3 - q1)
        in_range = (values >= q1 - whisker) & (values <= q3 + whisker)
        kept = kept[in_range]
    return kept

# Load the salary data and discard every row that has a missing value.
# dropna() is chained rather than applied in place so the frame is never
# mutated after it has been created.
df = pd.read_csv("../../data/Salary_Data.csv").dropna()

# Remove IQR outliers from the numeric columns, target included.
# This list has its own name so that `numerical_cols` further down only ever
# means "numeric model features".
outlier_cols = ['Age', 'Years of Experience', 'Salary']
df_cleaned = remove_outliers_iqr(df, outlier_cols)

# Separate features and target, then split before any data-dependent
# preprocessing so nothing is learned from the test rows.
X = df_cleaned.drop('Salary', axis=1)
y = df_cleaned['Salary']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=X['Gender']
)

# Work on explicit copies: the column assignments below would otherwise be
# made on objects derived from X, which can trigger SettingWithCopyWarning
# depending on the pandas / scikit-learn versions in use.
X_train = X_train.copy()
X_test = X_test.copy()

# Group rare categories under 'Other'. The set of frequent categories is
# learned from the training split only and then applied to both splits.
threshold = 10
for col in ['Job Title', 'Education Level']:
    top_categories = X_train[col].value_counts().loc[lambda x: x >= threshold].index
    X_train[col] = X_train[col].where(X_train[col].isin(top_categories), 'Other')
    X_test[col] = X_test[col].where(X_test[col].isin(top_categories), 'Other')

# Define preprocessing
numerical_cols = ['Age', 'Years of Experience']
categorical_cols = ['Gender', 'Education Level', 'Job Title']

# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category not seen during fit is presumably encoded as all zeros, i.e. the
# same as the dropped first category - confirm this is acceptable.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

# Preprocessing and regressor live in one Pipeline so the scaler and encoder
# are re-fitted on the training part of every cross-validation fold.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=4,
        max_features='sqrt',
        random_state=42
    ))
])

# 5-fold cross-validation on the training split, scored with R².
# NOTE(review): the rare-category grouping above was derived from all of
# X_train, so it is not re-learned inside each fold.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')

# Fit the final model on the full training split
model.fit(X_train, y_train)

# Predictions for both splits, used for the evaluation below
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

def evaluate_detailed(y_true, y_pred, dataset_name):
    """Print RMSE, R² and MAE for a set of predictions and return them.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        dataset_name: Label used in the printed heading (e.g. "Training").

    Returns:
        Tuple ``(rmse, r2, mae)``.
    """
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    summary = (
        f"\n{dataset_name} Metrics:",
        f"RMSE: ${rmse:,.2f}",
        f"R²: {r2:.3f}",
        f"MAE: ${mae:,.2f}",
    )
    print(*summary, sep="\n")

    return rmse, r2, mae

# Report metrics for the training split, then the test split, and finish
# with the per-fold and mean cross-validation R².
train_metrics, test_metrics = (
    evaluate_detailed(actual, predicted, label)
    for actual, predicted, label in (
        (y_train, y_train_pred, "Training"),
        (y_test, y_test_pred, "Testing"),
    )
)
print(f"\nCross-validation R² scores: {cv_scores}")
# The +/- figure is two standard deviations of the fold scores
print(f"Mean CV R²: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")




Training Metrics:
RMSE: $17,386.69
R²: 0.889
MAE: $13,724.17

Testing Metrics:
RMSE: $18,343.87
R²: 0.873
MAE: $14,297.27

Cross-validation R² scores: [0.88583871 0.88201007 0.88249632 0.8834263  0.87487258]
Mean CV R²: 0.882 (+/- 0.007)
