In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np


def tune_ridge_over_feature_sets(
    X_train_scaled,
    y_train,
    feature_sets,
    cv,
    scoring,
    alpha_grid=None,
    random_state=42
):
    """
    Tune Ridge regression over multiple feature sets using GridSearchCV.

    For every feature set a grid search over ``alpha`` and ``fit_intercept``
    is run, and the cross-validated scores of the configuration selected by
    ``refit="r2"`` are collected into one summary row.

    Parameters
    ----------
    X_train_scaled : pd.DataFrame
        Scaled training features (all features). Indexed with each list of
        column names from ``feature_sets``.
    y_train : pd.Series or np.ndarray
        Target variable for training.
    feature_sets : dict
        Dictionary mapping feature set names to lists of column names.
        Must contain at least one entry.
    cv : cross-validation splitter
        e.g., KFold instance. Passed straight to GridSearchCV.
    scoring : dict
        Scoring dictionary for GridSearchCV. It must define the keys
        "r2", "rmse" and "mae", because the results are read from
        ``mean_test_r2``, ``mean_test_rmse`` and ``mean_test_mae``.
    alpha_grid : list or None
        List of alpha values to test. Defaults to [0.1, 1.0, 10.0, 100.0].
    random_state : int
        Passed to ``Ridge(random_state=...)``.

    Returns
    -------
    results_df : pd.DataFrame
        One row per feature set with the columns "Feature Set", "Alpha",
        "r2", "rmse", "mae" and "fit_intercept".
    best_row : pd.Series
        Row corresponding to the best-performing Ridge model (by R^2).
    best_model_name : str
        Human-readable name of the best Ridge model.

    Raises
    ------
    ValueError
        If ``feature_sets`` is empty, since there would be nothing to rank.
    """

    # Fail early with a clear message; otherwise the empty results frame
    # only surfaces later as an opaque KeyError on the "r2" column.
    if not feature_sets:
        raise ValueError("feature_sets must contain at least one feature set")

    if alpha_grid is None:
        alpha_grid = [0.1, 1.0, 10.0, 100.0]

    ridge_param_grid = {
        "alpha": alpha_grid,
        "fit_intercept": [True, False]
    }

    ridge_results = []

    for set_name, features in feature_sets.items():
        X_loop = X_train_scaled[features]

        grid_ridge = GridSearchCV(
            estimator=Ridge(random_state=random_state),
            param_grid=ridge_param_grid,
            cv=cv,
            scoring=scoring,
            refit="r2"
        )

        grid_ridge.fit(X_loop, y_train)
        # With refit="r2", best_index_ points at the candidate with the best r2
        best_idx = grid_ridge.best_index_
        cv_results = grid_ridge.cv_results_

        ridge_results.append({
            "Feature Set": set_name,
            "Alpha": grid_ridge.best_params_["alpha"],
            "r2": cv_results["mean_test_r2"][best_idx],
            # abs() reports the error metrics as positive magnitudes
            # (presumably the scorers are negated error scorers - confirm)
            "rmse": abs(cv_results["mean_test_rmse"][best_idx]),
            "mae": abs(cv_results["mean_test_mae"][best_idx]),
            # fit_intercept is part of the search grid, so keep the winning
            # value; without it the selected model cannot be rebuilt
            "fit_intercept": grid_ridge.best_params_["fit_intercept"]
        })

    results_df = pd.DataFrame(ridge_results)
    best_row = results_df.sort_values("r2", ascending=False).iloc[0]

    best_model_name = f'Ridge (alpha={best_row["Alpha"]})'

    return results_df, best_row, best_model_name


In [None]:
def tune_ridge(
    X_train_scaled,
    y_train,
    feature_set=feature_set,
    cv=kf,
    scoring=scoring,
    alpha_grid=None,
    random_state=42
):
    """
    Tune Ridge regression over feature sets with GridSearchCV.

    Note: the defaults for ``feature_set``, ``cv`` and ``scoring`` are bound
    to the notebook-level ``feature_set``, ``kf`` and ``scoring`` objects at
    the moment this cell is executed, so those must already exist.

    Parameters
    ----------
    X_train_scaled : pd.DataFrame
        Training features; indexed with each list of column names.
    y_train : pd.Series or np.ndarray
        Training target.
    feature_set : dict
        Maps feature set names to lists of column names. Must be non-empty.
    cv : cross-validation splitter
        Passed straight to GridSearchCV.
    scoring : dict
        Scoring dictionary for GridSearchCV; must define "r2", "rmse" and
        "mae" because those ``mean_test_*`` entries are read back.
    alpha_grid : list or None
        Alpha values to search. Defaults to [0.1, 1.0, 10.0, 100.0].
    random_state : int
        Passed to ``Ridge(random_state=...)``.

    Returns
    -------
    ridge_feat_df : pd.DataFrame
        One row per feature set with the columns "Feature Set", "Alpha",
        "r2", "rmse", "mae" and "fit_intercept".
    best_row : pd.Series
        Row with the highest "r2".
    best_model_name : str
        Label of the form ``Ridge (alpha=<best alpha>)``.

    Raises
    ------
    ValueError
        If ``feature_set`` is empty.
    """
    # Fail early with a clear message instead of a KeyError on "r2" later
    if not feature_set:
        raise ValueError("feature_set must contain at least one feature set")

    # setup parameter grid for Ridge
    if alpha_grid is None:
        alpha_grid = [0.1, 1.0, 10.0, 100.0]

    # Built unconditionally: previously the grid only existed when alpha_grid
    # was None, so passing a custom alpha_grid raised UnboundLocalError.
    ridge_param_grid = {
        "alpha": alpha_grid,
        "fit_intercept": [True, False]
    }

    ridge_feat_results = []

    for set_name, features in feature_set.items():
        loop_X = X_train_scaled[features]

        grid_ridge = GridSearchCV(
            Ridge(random_state=random_state),
            param_grid=ridge_param_grid,
            cv=cv,
            scoring=scoring,
            refit='r2'
        )

        grid_ridge.fit(loop_X, y_train)
        # With refit='r2', best_index_ points at the candidate with the best r2
        best_index = grid_ridge.best_index_

        ridge_feat_results.append({
            "Feature Set": set_name,
            "Alpha": grid_ridge.best_params_['alpha'],
            "r2": grid_ridge.cv_results_['mean_test_r2'][best_index],
            # abs() reports the error metrics as positive magnitudes
            # (presumably the scorers are negated error scorers - confirm)
            "rmse": abs(grid_ridge.cv_results_['mean_test_rmse'][best_index]),
            "mae": abs(grid_ridge.cv_results_['mean_test_mae'][best_index]),
            # keep the winning fit_intercept so the model can be rebuilt
            "fit_intercept": grid_ridge.best_params_['fit_intercept'],
        })

    ridge_feat_df = pd.DataFrame(ridge_feat_results)
    best_row = ridge_feat_df.sort_values("r2", ascending=False).iloc[0]

    best_model_name = f'Ridge (alpha={best_row["Alpha"]})'

    return ridge_feat_df, best_row, best_model_name

In [None]:
# Run the Ridge grid search over every candidate feature set.
# The keyword must be `feature_set` (singular) to match tune_ridge's
# signature; `feature_sets=` raises TypeError (unexpected keyword argument).
ridge_feat_df, best_ridge_row, best_ridge_name = tune_ridge(
    X_train_scaled=X_train_opt_scaled,
    y_train=y_train_opt,
    feature_set=feature_set,
    cv=kf,
    scoring=scoring
)

ridge_feat_df

In [None]:
# Select the Ridge row with the highest r2 and add it to the running list of
# per-model best scores.
# NOTE: this appends on every execution, so re-running the cell adds a duplicate.
best_ridge_row = ridge_feat_df.sort_values(by="r2", ascending=False).iloc[0]

best_scores_opt.append(
    dict(
        Model=best_ridge_name,
        Feature_set=best_ridge_row["Feature Set"],
        **{metric: best_ridge_row[metric] for metric in ("r2", "rmse", "mae")}
    )
)

In [None]:
def tune_simple_linear(
    X_train_scaled,
    y_train,
    cv=5,
    scoring=scoring
):
    """
    Cross-validate a one-feature LinearRegression for every column.

    Note: the default for ``scoring`` is bound to the notebook-level
    ``scoring`` object when this cell is executed.

    Parameters
    ----------
    X_train_scaled : pd.DataFrame
        Training features; each column is evaluated on its own.
    y_train : pd.Series or np.ndarray
        Training target.
    cv : int or cross-validation splitter
        Passed straight to ``cross_validate``. Defaults to 5.
    scoring : dict
        Scoring dictionary; must define "r2", "rmse" and "mae" because the
        ``test_r2``, ``test_rmse`` and ``test_mae`` entries are read back.

    Returns
    -------
    results_df : pd.DataFrame
        One row per column with "Feature", "r2", "rmse" and "mae"
        (fold means; rmse and mae as absolute values).
    best_row : pd.Series
        Row with the highest "r2".
    best_model_name : str
        Label of the form ``Simple Linear (<feature>)``.
    """

    def _score_single_feature(feature_name):
        # Double brackets keep the input as a one-column DataFrame
        single_column = X_train_scaled[[feature_name]]
        fold_scores = cross_validate(
            LinearRegression(),
            single_column,
            y_train,
            cv=cv,
            scoring=scoring
        )
        return {
            "Feature": feature_name,
            "r2": fold_scores["test_r2"].mean(),
            # abs() reports the error metrics as positive magnitudes
            "rmse": abs(fold_scores["test_rmse"].mean()),
            "mae": abs(fold_scores["test_mae"].mean())
        }

    results_df = pd.DataFrame(
        [_score_single_feature(name) for name in X_train_scaled.columns]
    )

    ranked = results_df.sort_values(by="r2", ascending=False)
    best_row = ranked.iloc[0]

    return results_df, best_row, f"Simple Linear ({best_row['Feature']})"

In [None]:
# find the Simple Linear entry (first tracked record whose model name
# starts with "Simple Linear")
best_simple_entry = next(
    (
        item for item in best_scores_opt
        if item["Model"].startswith("Simple Linear")
    ),
    None
)

# A bare next() would raise StopIteration with no message here; fail with an
# explanation instead when the simple linear result has not been tracked yet.
if best_simple_entry is None:
    raise ValueError(
        "No 'Simple Linear' entry found in best_scores_opt; "
        "run the simple linear tuning cells before this one."
    )

# presumably "Feature_set" holds a single column name for this entry - it is
# wrapped in a list below to select a one-column frame (verify against the
# cell that appended it)
best_feature = best_simple_entry["Feature_set"]

# initialise best simple regression model
model_simple_best = LinearRegression(fit_intercept=True)
model_simple_best.fit(
    X_train_opt_scaled[[best_feature]],
    y_train_opt
)

# add to model dictionary
best_models_dict["Simple Linear Regression"] = {
    "model_obj": model_simple_best,
    "features": [best_feature]
}

In [None]:
# Show the registered models side by side: one column per model name, with the
# inner dict keys (e.g. "model_obj", "features") as the rows.
pd.DataFrame.from_dict(best_models_dict, orient="columns")

In [None]:
def tune_multi_linear(
    X_train_scaled,
    y_train,
    feature_set=feature_set,
    cv=kf,
    scoring=scoring
):
    """
    Grid-search LinearRegression (``fit_intercept`` on/off) per feature set.

    Note: the defaults for ``feature_set``, ``cv`` and ``scoring`` are bound
    to the notebook-level ``feature_set``, ``kf`` and ``scoring`` objects
    when this cell is executed.

    Parameters
    ----------
    X_train_scaled : pd.DataFrame
        Training features; indexed with each list of column names.
    y_train : pd.Series or np.ndarray
        Training target.
    feature_set : dict
        Maps feature set names to lists of column names.
    cv : cross-validation splitter
        Passed straight to GridSearchCV.
    scoring : dict
        Scoring dictionary; must define "r2", "rmse" and "mae" because those
        ``mean_test_*`` entries are read back.

    Returns
    -------
    results_df : pd.DataFrame
        One row per feature set with "Feature Set", "Num Features", "r2",
        "rmse", "mae" and "fit_intercept".
    best_row : pd.Series
        Row with the highest "r2".
    best_model_name : str
        Always "Multiple Linear Regression".
    """

    intercept_grid = {"fit_intercept": [True, False]}

    def _search_feature_set(name, columns):
        subset = X_train_scaled[columns]

        search = GridSearchCV(
            LinearRegression(),
            intercept_grid,
            cv=cv,
            scoring=scoring,
            refit="r2"
        )
        search.fit(subset, y_train)

        # With refit="r2", best_index_ points at the candidate with the best r2
        winner = search.best_index_
        cv_table = search.cv_results_

        return {
            "Feature Set": name,
            "Num Features": len(columns),
            "r2": cv_table["mean_test_r2"][winner],
            # abs() reports the error metrics as positive magnitudes
            "rmse": abs(cv_table["mean_test_rmse"][winner]),
            "mae": abs(cv_table["mean_test_mae"][winner]),
            "fit_intercept": search.best_params_["fit_intercept"]
        }

    results_df = pd.DataFrame(
        [_search_feature_set(name, columns) for name, columns in feature_set.items()]
    )

    ranked = results_df.sort_values(by="r2", ascending=False)
    best_row = ranked.iloc[0]

    return results_df, best_row, "Multiple Linear Regression"