In [None]:
import sys
from pathlib import Path

# Ensure the project root is on the path so `src` imports work
sys.path.insert(0, str(Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()))

from tqdm import tqdm

from src.preprocessing import (
    split_features_target,
    split_data,
    SplitConfig
)
from src.data import DataLoader
from src.model import build_model_pipeline
from src.eval import (
    validate_model,
    evaluate_holdout,
    run_grid_search,
    run_randomized_search,
    print_scores
)

In [2]:
# Hyperparameter search spaces, one per model name.
# Keys use the "model__<param>" form; values are the candidate settings that
# GridSearchCV / RandomizedSearchCV choose from.
_ELASTICNET_GRID = {
    "model__alpha": [0.001, 0.01, 0.1, 1.0, 10.0],
    "model__l1_ratio": [0.1, 0.5, 0.9],
    "model__max_iter": [1000, 2000, 5000, 10000],
}

_RANDOM_FOREST_GRID = {
    "model__n_estimators": [100, 200, 300, 400, 500],
    "model__max_depth": [None, 10, 20, 30, 40],
    "model__min_samples_split": [2, 5, 10, 20],
}

_XGBOOST_GRID = {
    "model__n_estimators": [100, 300, 500, 1000],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.5, 1.0],
    "model__max_depth": [3, 6, 9, 12],
    "model__subsample": [0.2, 0.6, 0.8, 1.0],
    "model__colsample_bytree": [0.2, 0.6, 0.8, 1.0],
}

# Lookup table: model name -> search space.
PARAM_GRID = {
    "elasticnet": _ELASTICNET_GRID,
    "random_forest": _RANDOM_FOREST_GRID,
    "xgboost": _XGBOOST_GRID,
}

In [3]:
# Helper functions
def prefix_param_grid(grid: dict, prefix: str) -> dict:
    """Return a copy of ``grid`` whose keys all start with ``prefix``.

    Used to address parameters nested one level deeper, e.g. behind the
    ``regressor__`` step of a TransformedTargetRegressor.

    Args:
        grid: Mapping of parameter name -> candidate values.
        prefix: Text prepended to every key.

    Returns:
        A new dict with renamed keys; the value objects are reused as-is and
        the input mapping is not modified.
    """
    prefixed = {}
    for param_name, candidates in grid.items():
        prefixed["{}{}".format(prefix, param_name)] = candidates
    return prefixed

def print_scores(label, scores):
    """Print one formatted line of RMSE / MAE / R^2 per model, framed by rules.

    Args:
        label: Heading printed above the per-model lines.
        scores: Mapping of model name -> dict with the keys ``"rmse"``,
            ``"mae"`` and ``"r2"`` (numeric values).

    NOTE(review): a ``print_scores`` is also imported from ``src.eval`` at the
    top of the notebook; this later definition is the one in effect.
    """
    rule = "=" * 50

    print("\n" + rule)
    print(label)
    for model, metrics in scores.items():
        print(f"  {model}: RMSE={metrics['rmse']:.2f}  MAE={metrics['mae']:.2f}  R^2={metrics['r2']:.4f}")
    print(rule + "\n")

In [4]:
# Run configuration: which dataset to load, which column to predict, and how
# the data is partitioned.
dataset = "house_prices"  # dataset name passed to the OpenML loader
target_col = "SalePrice"  # column separated out as the prediction target

# 20% validation, 20% test; the seed is fixed at 42.
split_cfg = SplitConfig(
    val_size=0.2,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature switches forwarded as keyword arguments to build_model_pipeline.
# Every switch is currently on; flip one to False to disable that step.
pipeline_kwargs = dict(
    fill_informative_missing=True,
    use_ordinal_encoding=True,
    feature_engineering=True,
    correct_skewness=True,
    log_target=True,
)

In [6]:
# 1. Candidate models and raw data
# The names are the keys handed to build_model_pipeline and looked up in
# PARAM_GRID by the cells below.
model_names = [
    "elasticnet",
    "random_forest",
    "xgboost",
]

# Load the configured dataset through the project's OpenML loader.
df = DataLoader.load_data_from_openml(
    dataset_name=dataset,
)

In [7]:
# 2. Data splitting
# First separate the target column from the features ...
features_and_target = split_features_target(df, target_col=target_col)
X, y = features_and_target

# ... then partition into train / validation / test according to split_cfg.
# Each partition is indexed downstream as [0] = features, [1] = target.
partitions = split_data(X, y, cfg=split_cfg)
train, val, test = partitions

In [8]:
# 3.1 Baseline run: fit every model once (no hyperparameter search in this
# cell) and collect its scores.
scores_basic = {}
for model_name in tqdm(model_names, desc="Training models"):
    # The pipeline is constructed from the training features and then fitted
    # on the training split only.
    pipeline = build_model_pipeline(
        model_name, train[0], **pipeline_kwargs
    )
    pipeline.fit(train[0], train[1])

    # Evaluate on validation set (holdout, no re-fitting).
    # validate_model's result is formatted as a float and reported as RMSE.
    val_scores = validate_model(pipeline, val[0], val[1])
    # tqdm.write (not print) so the message does not collide with the
    # progress bar that is still open; the text still goes to stdout.
    tqdm.write(
        f"{model_name} val RMSE: {val_scores:.2f}"
    )

    # Final test evaluation: a dict with "rmse" / "mae" / "r2", as consumed
    # by print_scores below.
    scores_basic[model_name] = evaluate_holdout(
        pipeline, test[0], test[1]
    )

print_scores("Final Evaluation Scores:", scores_basic)

Training models:  33%|███▎      | 1/3 [00:04<00:09,  4.58s/it]

elasticnet val RMSE: 61299.73
random_forest val RMSE: 37264.86


Training models: 100%|██████████| 3/3 [00:17<00:00,  5.82s/it]

xgboost val RMSE: 35039.62

Final Evaluation Scores:
  elasticnet: RMSE=46543.24  MAE=30411.80  R^2=0.5571
  random_forest: RMSE=21475.46  MAE=14602.49  R^2=0.9057
  xgboost: RMSE=20118.40  MAE=13527.27  R^2=0.9172






In [9]:
### ---------- Optimized Run (grid search / randomized search) ---------- ###
# For every model: build the pipeline, search its hyperparameter space with
# 5-fold CV on the training split, then score the best estimator on the
# validation and test splits.
scores_optim = {}

for model_name in tqdm(
    model_names, desc="Optimizing models"
):
    estimator = build_model_pipeline(
        model_name, train[0], **pipeline_kwargs
    )

    # PARAM_GRID keys start at "model__"; they are re-addressed one level
    # deeper under "regressor__" (prefix_param_grid documents this as being
    # for TransformedTargetRegressor).
    # NOTE(review): presumably build_model_pipeline wraps the pipeline that
    # way when log_target is enabled -- confirm before turning log_target off.
    param_grid = PARAM_GRID.get(model_name)
    if param_grid:
        param_grid = prefix_param_grid(
            param_grid, "regressor__"
        )

    if model_name == "xgboost":
        # xgboost has the largest search space, so 150 candidates are sampled
        # instead of trying every combination.
        # NOTE(review): no seed is passed here; check whether
        # run_randomized_search fixes one internally for reproducibility.
        search = run_randomized_search(
            estimator,
            train[0],
            train[1],
            param_distributions=param_grid,
            n_iter=150,
            cv=5,
        )
    else:
        # Exhaustive search over the (smaller) grids of the other models.
        search = run_grid_search(
            estimator,
            train[0],
            train[1],
            param_grid=param_grid,
            cv=5,
        )

    # Best candidate found by the search (presumably refit on the whole
    # training split -- depends on the refit setting inside run_*_search).
    best = search.best_estimator_

    # Evaluate on validation set (holdout)
    val_scores = evaluate_holdout(
        best, val[0], val[1]
    )
    # tqdm.write (not print) so the message does not collide with the
    # progress bar that is still open; the text still goes to stdout.
    tqdm.write(
        f"{model_name} val RMSE: "
        f"{val_scores['rmse']:.2f}"
    )

    # Test-set metrics for the final comparison table.
    scores_optim[model_name] = evaluate_holdout(
        best, test[0], test[1]
    )

print_scores(
    "Final Evaluation Scores (Optimized):", scores_optim
)

Optimizing models:   0%|          | 0/3 [00:10<?, ?it/s]


KeyboardInterrupt: 