In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.compose import TransformedTargetRegressor
import joblib
import warnings
warnings.filterwarnings("ignore")

# Load data
df = pd.read_csv("../../data/Salary_Data.csv")
df = df.dropna().reset_index(drop=True)

# Basic winsorization (cap extremes at 1st and 99th percentiles) to reduce effect of extreme outliers
for col in ['Age', 'Years of Experience', 'Salary']:
    lower, upper = df[col].quantile([0.01, 0.99])
    df[col] = df[col].clip(lower=lower, upper=upper)

# Features / target
X = df.drop(columns=['Salary'])
y = df['Salary']

# Train/test split (stratify on Gender if present)
strat_col = None
if 'Gender' in X.columns and X['Gender'].nunique() > 1:
    strat_col = X['Gender']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=strat_col
)

# Create some new features using only training data (avoid leakage)
# 1) Experience to age ratio
X_train = X_train.copy()
X_test = X_test.copy()
X_train['exp_age_ratio'] = X_train['Years of Experience'] / (X_train['Age'] + 1)
X_test['exp_age_ratio'] = X_test['Years of Experience'] / (X_test['Age'] + 1)

# 2) Target-encoding for Job Title (map training target means)
if 'Job Title' in X_train.columns:
    df_te = X_train.join(y_train.rename('Salary'))
    job_title_te = df_te.groupby('Job Title')['Salary'].mean()
    X_train['job_title_te'] = X_train['Job Title'].map(job_title_te).fillna(job_title_te.mean())
    X_test['job_title_te'] = X_test['Job Title'].map(job_title_te).fillna(job_title_te.mean())
else:
    X_train['job_title_te'] = 0
    X_test['job_title_te'] = 0

# 3) Ordinal encoding for Education Level based on training-target ordering
if 'Education Level' in X_train.columns:
    edu_order = df_te.groupby('Education Level')['Salary'].mean().sort_values().index.tolist()
    edu_map = {k: i for i, k in enumerate(edu_order)}
    X_train['education_ord'] = X_train['Education Level'].map(edu_map).fillna(-1)
    X_test['education_ord'] = X_test['Education Level'].map(edu_map).fillna(-1)
else:
    X_train['education_ord'] = 0
    X_test['education_ord'] = 0

# Define columns for preprocessing
num_cols = ['Age', 'Years of Experience', 'exp_age_ratio', 'job_title_te', 'education_ord']
cat_cols = []
if 'Gender' in X_train.columns:
    cat_cols.append('Gender')

# Preprocessor: KBins for Age (quantile bins) + scaling; OneHot for Gender
preprocessor = ColumnTransformer(transformers=[
    ('kb_age', KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='quantile'), ['Age']),
    ('num', StandardScaler(), ['Years of Experience', 'exp_age_ratio', 'job_title_te', 'education_ord']),
    ('gender', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), cat_cols)
], remainder='drop')

# Base estimator
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Wrap with log-target transform to stabilize variance
ttr = TransformedTargetRegressor(regressor=rf, func=np.log1p, inverse_func=np.expm1)

# Full pipeline
pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('rf_ttr', ttr)
])

# Hyperparameter space for randomized search
param_dist = {
    'rf_ttr__regressor__n_estimators': [100, 200, 300],
    'rf_ttr__regressor__max_depth': [6, 10, 15, None],
    'rf_ttr__regressor__max_features': ['sqrt', 'log2', 0.5],
    'rf_ttr__regressor__min_samples_leaf': [1, 2, 4],
    'rf_ttr__regressor__min_samples_split': [2, 5, 10]
}

# Cross-validation and randomized search (light)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    pipeline, param_distributions=param_dist, n_iter=12, cv=cv,
    scoring='r2', random_state=42, n_jobs=-1, verbose=0
)

# Fit
search.fit(X_train, y_train)

# Best pipeline
best_pipe = search.best_estimator_

# Predictions and evaluation
y_train_pred = best_pipe.predict(X_train)
y_test_pred = best_pipe.predict(X_test)

def evaluate(y_true, y_pred, name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{name} -> RMSE: {rmse:,.2f} | R2: {r2:.4f} | MAE: {mae:,.2f}")
    return {'rmse': rmse, 'r2': r2, 'mae': mae}

train_metrics = evaluate(y_train, y_train_pred, "Train")
test_metrics = evaluate(y_test, y_test_pred, "Test")

# Cross-val on training set with best pipeline
cv_scores = cross_val_score(best_pipe, X_train, y_train, cv=cv, scoring='r2', n_jobs=-1)
print(f"CV R2 mean: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

# Save model
import os
os.makedirs("../../models", exist_ok=True)

joblib.dump(best_pipe, "../../models/rf_alt_model.joblib")

# Print selected features / notes
print("\nBest params:", search.best_params_)
print("Model saved to ../../models/rf_alt_model.joblib")

Train -> RMSE: 4,278.28 | R2: 0.9934 | MAE: 2,097.76
Test -> RMSE: 8,072.71 | R2: 0.9758 | MAE: 3,558.40
CV R2 mean: 0.9810 (+/- 0.0066)

Best params: {'rf_ttr__regressor__n_estimators': 100, 'rf_ttr__regressor__min_samples_split': 5, 'rf_ttr__regressor__min_samples_leaf': 1, 'rf_ttr__regressor__max_features': 0.5, 'rf_ttr__regressor__max_depth': 15}
Model saved to ../../models/rf_alt_model.joblib
