In [33]:
import sys
from pathlib import Path

# Ensure the project root is on the path so `src` imports work
sys.path.insert(0, str(Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()))

from tqdm import tqdm
import pandas as pd

from src.preprocessing import (
    split_features_target,
    split_data,
    SplitConfig
)
from src.data import DataLoader
from src.model import build_model_pipeline, save_model
from src.eval import (
    evaluate_cv,
    evaluate_holdout,
    run_grid_search,
    run_randomized_search,
    print_scores
)

In [34]:
# Candidate hyperparameter values per model, keyed by model name.
# Parameter names are stored with a "model__" prefix; the tuning cell adds a
# further "regressor__" prefix (via prefix_param_grid) before searching.
PARAM_GRID = dict(
    elasticnet=dict(
        model__alpha=[0.001, 0.01, 0.1, 1.0, 10.0],
        model__l1_ratio=[0.1, 0.5, 0.9],
        model__max_iter=[1000, 2000, 5000, 10000],
    ),
    random_forest=dict(
        model__n_estimators=[100, 200, 300, 400, 500],
        model__max_depth=[1, 3, 5, 7],
        model__min_samples_split=[2, 5, 10, 20],
    ),
    xgboost=dict(
        model__n_estimators=[100, 300, 500, 700],
        model__learning_rate=[0.01, 0.05, 0.1, 0.3],
        model__max_depth=[1, 3, 5, 7],
        model__subsample=[0.2, 0.6, 0.8, 1.0],
        model__colsample_bytree=[0.2, 0.6, 0.8, 1.0],
    ),
    catboost=dict(
        model__iterations=[100, 300, 500, 700],
        model__learning_rate=[0.01, 0.05, 0.1, 0.3],
        model__depth=[1, 3, 5, 7],
        model__l2_leaf_reg=[1, 3, 5, 7],
    ),
)

In [35]:
# Helper functions
def prefix_param_grid(grid: dict, prefix: str) -> dict:
    """Build a new param grid whose keys are ``prefix`` followed by the old key.

    Used to re-address a grid when the estimator is nested one level deeper
    (e.g. inside a TransformedTargetRegressor).

    Args:
        grid: Mapping of parameter name to its candidate values.
        prefix: Text prepended verbatim to every key (include any ``__``
            separator yourself).

    Returns:
        A new dict in the same key order; the value objects are reused, not
        copied, and ``grid`` itself is left unmodified.
    """
    prefixed = {}
    for name, candidates in grid.items():
        prefixed[f"{prefix}{name}"] = candidates
    return prefixed

In [40]:
# Configuration: dataset, target column, split settings and validation mode
dataset = "house_prices"  # OpenML dataset name
target_col = "SalePrice"  # column separated from the features as the target
# Settings passed to split_data (presumably fractions of the full dataset - confirm in SplitConfig)
split_cfg = SplitConfig(val_size=0.15, test_size=0.15, shuffle=True, random_state=42)
# Number of CV folds. The evaluation cells branch on `cv > 1`, so set this to 1
# to score on the holdout validation split instead.
# NOTE(review): prefer 1 over None unless those cells explicitly guard against None.
cv = 5

In [41]:
# Keyword arguments forwarded to build_model_pipeline; each flag switches one
# optional pipeline feature on (True) or off (False). All are enabled here.
pipeline_kwargs = dict(
    fill_informative_missing=True,
    use_ordinal_encoding=True,
    feature_engineering=True,
    correct_skewness=True,
    log_target=True,
)

In [42]:
# 1. model and data loading
# Names of the models to train; each one is also a key of PARAM_GRID.
model_names = ["elasticnet", "random_forest", "xgboost", "catboost"]
# Load the dataset named in the configuration cell from OpenML
# (assumed to return a DataFrame containing `target_col` - verify in DataLoader).
df = DataLoader.load_data_from_openml(dataset_name=dataset)

In [43]:
# 2. data splitting
X, y = split_features_target(df, target_col=target_col)
# Each split is indexed below as [0] = features, [1] = target
train, val, test = split_data(X, y, cfg=split_cfg)

# Pool train and val (features first, then target) for cross-val evaluation
X_train_cv, y_train_cv = (
    pd.concat([train[part], val[part]]) for part in (0, 1)
)

In [None]:
# 3.1 evaluate models
# Baseline run (no hyperparameter search): each model is built, fitted on the
# training split, scored for validation (k-fold CV over train + val, or the
# holdout validation split), and finally scored on the test split.
scores_val = {}
scores_eval = {}

# `cv` may be None to request holdout validation. `None > 1` raises TypeError
# in Python 3, so rule None out before the numeric comparison.
use_cv = cv is not None and cv > 1

for model_name in tqdm(model_names, desc="Training models"):
    pipeline = build_model_pipeline(
        model_name, train[0], **pipeline_kwargs
    )

    # Fit to training set
    pipeline.fit(train[0], train[1])

    if use_cv:
        # cross-validation
        model_scores = evaluate_cv(
            pipeline, X_train_cv, y_train_cv, cv=cv
        )
    else:
        # validation set
        model_scores = evaluate_holdout(
            pipeline, val[0], val[1]
        )
    scores_val[model_name] = model_scores

    # Final test evaluation
    scores_eval[model_name] = evaluate_holdout(
        pipeline, test[0], test[1]
    )

print_scores("Validation Scores:", scores_val)
print_scores("Final Evaluation Scores:", scores_eval)

Training models:  25%|██▌       | 1/4 [00:00<00:00,  3.18it/s]

elasticnet val/CV RMSE: 48168.70


Training models:  50%|█████     | 2/4 [00:01<00:01,  1.34it/s]

random_forest val/CV RMSE: 22426.53


Training models:  75%|███████▌  | 3/4 [00:02<00:01,  1.11s/it]

xgboost val/CV RMSE: 23710.23


Training models: 100%|██████████| 4/4 [00:06<00:00,  1.59s/it]

catboost val/CV RMSE: 19752.28

[1mFinal Evaluation Scores:[0m
  [96mELASTICNET     [0m: [92mRMSE=[0m[93m  57356.08[0m  [92mMAE=[0m[93m  28747.34[0m  [92mMAPE=[0m[93m  14.86%[0m  [92mR^2=[0m[93m  0.5677[0m
  [96mRANDOM_FOREST  [0m: [92mRMSE=[0m[93m  28246.08[0m  [92mMAE=[0m[93m  15826.13[0m  [92mMAPE=[0m[93m   8.73%[0m  [92mR^2=[0m[93m  0.8952[0m
  [96mXGBOOST        [0m: [92mRMSE=[0m[93m  24080.89[0m  [92mMAE=[0m[93m  14121.29[0m  [92mMAPE=[0m[93m   8.08%[0m  [92mR^2=[0m[93m  0.9238[0m
  [96mCATBOOST       [0m: [92mRMSE=[0m[93m  27002.36[0m  [92mMAE=[0m[93m  13976.84[0m  [92mMAPE=[0m[93m   7.62%[0m  [92mR^2=[0m[93m  0.9042[0m






In [None]:
### ---------- Optimized Run (w/ GridSearch) ---------- ###
# For every model: search hyperparameters on the training split, persist the
# best estimator, then score it for validation and on the test split.
#
# Optimized results are kept in their own dictionaries. Writing them into the
# baseline `scores_val` would silently replace the baseline validation scores
# and make this cell depend on the baseline cell having been run first.
scores_val_optimized = {}
scores_eval_optimized = {}

# `cv` may be None to request holdout validation. `None > 1` raises TypeError
# in Python 3, so rule None out before the numeric comparison.
use_cv = cv is not None and cv > 1

for model_name in tqdm(
    model_names, desc="Optimizing models"
):
    estimator = build_model_pipeline(
        model_name, train[0], **pipeline_kwargs
    )
    param_grid = PARAM_GRID.get(model_name)
    if param_grid:
        # Re-address the grid one nesting level deeper (under "regressor__")
        param_grid = prefix_param_grid(
            param_grid, "regressor__"
        )

    # The two boosted models get a randomized search (150 sampled candidates);
    # the other models get an exhaustive grid search.
    # Both searches use 5 folds, independent of the `cv` setting above.
    if model_name in {"xgboost", "catboost"}:
        search = run_randomized_search(
            estimator,
            train[0],
            train[1],
            param_distributions=param_grid,
            n_iter=150,
            cv=5,
        )
    else:
        search = run_grid_search(
            estimator,
            train[0],
            train[1],
            param_grid=param_grid,
            cv=5,
        )

    best = search.best_estimator_

    # Save best model from search
    save_path = save_model(best, f"{model_name}_optimized")
    print(f"Saved optimized {model_name} to {save_path}")

    # Validation of the optimized model: cross-validation or holdout split
    if use_cv:
        val_scores = evaluate_cv(
            best, X_train_cv, y_train_cv, cv=cv
        )
    else:
        val_scores = evaluate_holdout(
            best, val[0], val[1]
        )
    scores_val_optimized[model_name] = val_scores

    # Final test evaluation of optimized model
    scores_eval_optimized[model_name] = evaluate_holdout(
        best, test[0], test[1]
    )

print_scores("Validation Scores (Optimized): ", scores_val_optimized)
print_scores("Final Evaluation Scores (Optimized): ", scores_eval_optimized)


Optimizing models:  25%|██▌       | 1/4 [00:11<00:34, 11.41s/it]

Saved optimized elasticnet to model\elasticnet_optimized.joblib
elasticnet val RMSE: 24173.63
Saved optimized random_forest to model\random_forest_optimized.joblib
random_forest val RMSE: 22651.58


Optimizing models:  75%|███████▌  | 3/4 [30:57<13:26, 806.98s/it]

Saved optimized xgboost to model\xgboost_optimized.joblib
xgboost val RMSE: 20162.61
Saved optimized catboost to model\catboost_optimized.joblib
catboost val RMSE: 19821.63


Optimizing models: 100%|██████████| 4/4 [2:17:44<00:00, 2066.21s/it]


[1mFinal Evaluation Scores (Optimized):[0m
  [96mELASTICNET     [0m: [92mRMSE=[0m[93m  34434.72[0m  [92mMAE=[0m[93m  18502.22[0m  [92mMAPE=[0m[93m   9.67%[0m  [92mR^2=[0m[93m  0.8442[0m
  [96mRANDOM_FOREST  [0m: [92mRMSE=[0m[93m  28654.07[0m  [92mMAE=[0m[93m  15841.18[0m  [92mMAPE=[0m[93m   8.73%[0m  [92mR^2=[0m[93m  0.8921[0m
  [96mXGBOOST        [0m: [92mRMSE=[0m[93m  26569.87[0m  [92mMAE=[0m[93m  14032.94[0m  [92mMAPE=[0m[93m   7.78%[0m  [92mR^2=[0m[93m  0.9072[0m
  [96mCATBOOST       [0m: [92mRMSE=[0m[93m  27950.20[0m  [92mMAE=[0m[93m  14273.28[0m  [92mMAPE=[0m[93m   7.66%[0m  [92mR^2=[0m[93m  0.8973[0m






In [22]:
import joblib

# Reload the tuned estimators from the model/ directory, in the same order
# the models were optimized.
best_elasticnet, best_random_forest, best_xgboost, best_catboost = (
    joblib.load(artifact)
    for artifact in (
        "model/elasticnet_optimized.joblib",
        "model/random_forest_optimized.joblib",
        "model/xgboost_optimized.joblib",
        "model/catboost_optimized.joblib",
    )
)

# Display label -> reloaded estimator, used by the evaluation cell below
models = dict(
    ElasticNet=best_elasticnet,
    RandomForest=best_random_forest,
    XGBoost=best_xgboost,
    CatBoost=best_catboost,
)

In [30]:
# Score every reloaded estimator on the validation split ...
val_scores = {
    label: evaluate_holdout(estimator, val[0], val[1])
    for label, estimator in models.items()
}
print_scores("Validation Scores for Optimized Models:", val_scores)

# ... and then on the test split
test_scores = {
    label: evaluate_holdout(estimator, test[0], test[1])
    for label, estimator in models.items()
}
print_scores("Test Scores for Optimized Models:", test_scores)


[1mValidation Scores for Optimized Models:[0m
  [96mELASTICNET     [0m: [92mRMSE=[0m[93m  24173.63[0m  [92mMAE=[0m[93m  16958.76[0m  [92mMAPE=[0m[93m  10.29%[0m  [92mR^2=[0m[93m  0.9070[0m
  [96mRANDOMFOREST   [0m: [92mRMSE=[0m[93m  22651.58[0m  [92mMAE=[0m[93m  14834.84[0m  [92mMAPE=[0m[93m   9.42%[0m  [92mR^2=[0m[93m  0.9183[0m
  [96mXGBOOST        [0m: [92mRMSE=[0m[93m  20162.61[0m  [92mMAE=[0m[93m  13331.24[0m  [92mMAPE=[0m[93m   8.37%[0m  [92mR^2=[0m[93m  0.9353[0m
  [96mCATBOOST       [0m: [92mRMSE=[0m[93m  19821.63[0m  [92mMAE=[0m[93m  13554.38[0m  [92mMAPE=[0m[93m   8.60%[0m  [92mR^2=[0m[93m  0.9374[0m


[1mTest Scores for Optimized Models:[0m
  [96mELASTICNET     [0m: [92mRMSE=[0m[93m  34434.72[0m  [92mMAE=[0m[93m  18502.22[0m  [92mMAPE=[0m[93m   9.67%[0m  [92mR^2=[0m[93m  0.8442[0m
  [96mRANDOMFOREST   [0m: [92mRMSE=[0m[93m  28654.07[0m  [92mMAE=[0m[93m  15841.18[0m  [92m