In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from pathlib import Path
from typing import Dict, Any, Tuple, Optional

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import (
    max_error,
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

import xgboost as xgb

In [None]:
# ===================== BASIC CONFIG (EDIT THIS ONLY) =====================
# Prefix shared by the input CSV files; main() reads "<prefix>-Xtrain.csv",
# "<prefix>-Ytrain.csv", "<prefix>-Xtest.csv", "<prefix>-Ytest.csv" and the
# "<prefix>-Xtrain_.csv" / "<prefix>-Ytrain_.csv" variants.
DATA_PREFIX = "Event_48"  # <<< change to your file prefix (no .csv)
# Directory that receives the saved models, CV result tables and parity plots.
OUT_DIR = Path("regression_outputs")
# Seed used for the shuffled KFold splitter and for the seeded estimator grids.
RANDOM_STATE = 42
# Default number of folds for the cross-validated grid search.
CV_SPLITS = 10
# Default scoring string handed to GridSearchCV.
SCORING = "neg_mean_squared_error"  # primary CV objective
# ========================================================================

In [None]:
# -------------------- Utilities -------------------- #
def safe_index_col(path: str) -> pd.DataFrame:
    """
    Load a CSV file, promoting its first column to the DataFrame index.

    This keeps a previously saved index (commonly written out under the
    header 'Unnamed: 0') from reappearing as an ordinary data column.
    """
    frame = pd.read_csv(path, index_col=0)
    return frame


def can_compute_msle(y_true: pd.Series, y_pred: np.ndarray) -> bool:
    """
    Tell whether MSLE can be computed for the given targets and predictions.

    MSLE requires non-negative targets and predictions.

    Parameters
    ----------
    y_true : array-like of numbers
        Ground-truth target values.
    y_pred : array-like of numbers
        Predicted values.

    Returns
    -------
    bool
        True only when both inputs are non-empty and every value is >= 0.
        Empty inputs and inputs containing NaN yield False instead of raising
        or slipping through the check.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    # np.min() raises on empty arrays; there is nothing to score in that case.
    if true_arr.size == 0 or pred_arr.size == 0:
        return False

    # Element-wise comparison treats NaN as "not >= 0", so NaN disables MSLE
    # (a pandas min() would silently skip NaN and let it through).
    # bool(...) converts numpy's bool_ into the annotated built-in bool.
    return bool(np.all(true_arr >= 0) and np.all(pred_arr >= 0))


def evaluate_split(y_true: pd.Series, y_pred: np.ndarray) -> Dict[str, Any]:
    """
    Score one data split with the standard regression metrics.

    Returns a dict keyed by metric name (max_error, mean_absolute_error,
    mean_squared_error, r2_score, mean_squared_log_error). The MSLE entry is
    None whenever it is not applicable or its computation fails.
    """
    # Metrics that are always computed, in the order they appear in the report.
    always_on = (
        ("max_error", max_error),
        ("mean_absolute_error", mean_absolute_error),
        ("mean_squared_error", mean_squared_error),
        ("r2_score", r2_score),
    )
    report: Dict[str, Any] = {
        label: float(scorer(y_true, y_pred)) for label, scorer in always_on
    }
    report["mean_squared_log_error"] = None

    # MSLE is optional: skip it outright when the inputs rule it out.
    if not can_compute_msle(y_true, y_pred):
        return report

    try:
        report["mean_squared_log_error"] = float(mean_squared_log_error(y_true, y_pred))
    except Exception:
        # Best effort: on failure the entry simply stays None.
        report["mean_squared_log_error"] = None
    return report


def parity_plot(y_true_tr, y_pred_tr, y_true_te, y_pred_te, title: str, save_path: Path):
    """
    Create side-by-side parity (y_pred vs y_true) scatter plots for train and test.

    Each panel plots predictions on the x axis against measured values on the
    y axis, draws a dotted identity line spanning the combined value range, and
    shows the split's MAE in the legend.

    Parameters
    ----------
    y_true_tr, y_pred_tr : array-like
        Measured and predicted values for the training split.
    y_true_te, y_pred_te : array-like
        Measured and predicted values for the test split.
    title : str
        Figure-level title.
    save_path : Path
        Destination handed to ``fig.savefig`` (written at dpi=180).

    The figure is always closed, even when a metric computation or the save
    fails, so a failed call does not leave an open figure behind in pyplot.
    """
    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(11, 5))
    try:
        # (axis, measured, predicted, marker, panel title) for each split.
        panels = (
            (ax_train, y_true_tr, y_pred_tr, "*", "Training set"),
            (ax_test, y_true_te, y_pred_te, "o", "Test set"),
        )
        for ax, y_true, y_pred, marker, panel_title in panels:
            mae = mean_absolute_error(y_true, y_pred)
            ax.scatter(y_pred, y_true, s=30, alpha=0.35, marker=marker, label=f"MAE={mae:.3f}")

            # Identity line across the full range covered by either series.
            lo = min(np.min(y_true), np.min(y_pred))
            hi = max(np.max(y_true), np.max(y_pred))
            ax.plot([lo, hi], [lo, hi], "k:", linewidth=1)

            ax.set_title(panel_title, fontsize=13)
            ax.set_xlabel("Predicted", fontsize=12)
            ax.legend()

        # Only the left-hand (training) panel carries the y-axis label.
        ax_train.set_ylabel("Measured", fontsize=12)

        fig.suptitle(title, fontsize=16)
        fig.tight_layout()
        fig.savefig(save_path, dpi=180)
    finally:
        # Release the figure regardless of success to avoid leaking it.
        plt.close(fig)


def run_model(
    name: str,
    estimator,
    param_grid: Dict[str, Any],
    Xtrain: pd.DataFrame,
    Ytrain: pd.Series,
    Xtest: pd.DataFrame,
    Ytest: pd.Series,
    save_dir: Path,
    scoring: str = SCORING,
    cv_splits: int = CV_SPLITS,
    n_jobs: int = -1,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Grid-search a regressor on (Xtrain, Ytrain), fit best params, evaluate on both train and test,
    save model, CV table and parity plots.

    Parameters
    ----------
    name : str
        Label used in log messages and in the output file names
        (``cv_results_<name>.csv``, ``model_<name>.pkl``, ``parity_<name>.png``).
    estimator
        Unfitted regressor handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid searched by ``GridSearchCV``.
    Xtrain, Ytrain : training features / targets.
    Xtest, Ytest : held-out features / targets.
    save_dir : Path or str
        Directory for all artifacts; created if it does not exist yet.
    scoring : str
        Scoring string passed to ``GridSearchCV``.
    cv_splits : int
        Number of folds of the shuffled ``KFold`` (seeded with RANDOM_STATE).
    n_jobs : int
        Parallelism passed to ``GridSearchCV``.

    Returns
    -------
    (metrics_train, metrics_test) : tuple of dict
        Metric dictionaries produced by ``evaluate_split`` for each split.
    """
    # Prepare the output directory *before* the expensive search: otherwise a
    # missing directory is only discovered when the first artifact is written,
    # after all the fitting work has already been spent.
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n===== Begin Train: {name} =====")
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_STATE)
    gs = GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=n_jobs, cv=kf, refit=True)
    # Flatten the targets to 1-D for the estimators.
    gs.fit(Xtrain, np.array(Ytrain).ravel())

    print(f"Best {scoring}: {gs.best_score_:.6f} using {gs.best_params_}")
    cv_df = pd.DataFrame(gs.cv_results_)
    cv_path = save_dir / f"cv_results_{name}.csv"
    cv_df.to_csv(cv_path, index=False)
    print(f"Saved CV results to: {cv_path}")

    # refit=True means best_estimator_ is already fitted on the full training set.
    best = gs.best_estimator_

    # Save model
    model_path = save_dir / f"model_{name}.pkl"
    joblib.dump(best, model_path)
    print(f"Saved best model to: {model_path}")

    # Predict
    pred_tr = best.predict(Xtrain)
    pred_te = best.predict(Xtest)

    # Evaluate
    metrics_tr = evaluate_split(Ytrain, pred_tr)
    metrics_te = evaluate_split(Ytest, pred_te)
    print(f"[{name}] Training metrics: {metrics_tr}")
    print(f"[{name}] Test metrics: {metrics_te}")

    # Parity plots
    plot_path = save_dir / f"parity_{name}.png"
    parity_plot(Ytrain, pred_tr, Ytest, pred_te, title=name, save_path=plot_path)
    print(f"Saved parity plot to: {plot_path}")

    return metrics_tr, metrics_te

In [None]:
# -------------------- Main -------------------- #
def main():
    """
    Run the whole regression benchmark.

    Steps:
      1. Create OUT_DIR and load the "<DATA_PREFIX>-*.csv" train/test files.
      2. Grid-search each regressor (random forest, decision tree, XGBoost,
         k-NN, SVR) via ``run_model``, which also saves the per-model artifacts.
      3. Collect the test-set metrics of every model into one table, print it
         and write it to "predictive performance (regression).csv" in the
         current working directory.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load datasets (produced by your preprocessing pipeline)
    Xtrain = safe_index_col(f"{DATA_PREFIX}-Xtrain.csv")
    Xtrain_ = safe_index_col(f"{DATA_PREFIX}-Xtrain_.csv")  # available but unused for regression
    Xtest = safe_index_col(f"{DATA_PREFIX}-Xtest.csv")

    Ytrain = safe_index_col(f"{DATA_PREFIX}-Ytrain.csv").iloc[:, 0]
    # Ytrain_ exists from your pipeline but is not used for regression (only its
    # shape is reported). The file is read without an index column, so take the
    # LAST column: that is the only column of a single-column file, and
    # presumably the target column when a saved index column precedes it
    # (TODO confirm against the preprocessing output). The earlier if/else
    # selected column 0 in both branches, i.e. the index for two-column files.
    Ytrain_ = pd.read_csv(f"{DATA_PREFIX}-Ytrain_.csv").iloc[:, -1]
    Ytest = safe_index_col(f"{DATA_PREFIX}-Ytest.csv").iloc[:, 0]

    print("Shapes:",
          "Xtrain", Xtrain.shape, "| Ytrain", Ytrain.shape,
          "| Xtrain_", Xtrain_.shape, "| Ytrain_", Ytrain_.shape,
          "| Xtest", Xtest.shape, "| Ytest", Ytest.shape)

    # ---------------- Hyper-parameter grids ---------------- #
    rf_params = dict(
        n_estimators=[200, 300, 500],
        criterion=["squared_error", "absolute_error", "friedman_mse"],  # 'poisson' only if targets >= 0
        max_depth=[None, 10, 20, 50, 100],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        random_state=[RANDOM_STATE],
        n_jobs=[-1],
    )
    dt_params = dict(
        criterion=["squared_error", "absolute_error", "friedman_mse"],  # 'poisson' only if targets >= 0
        splitter=["best", "random"],
        max_depth=[None, 10, 20, 50, 100, 200],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        random_state=[RANDOM_STATE],
    )
    xgb_params = dict(
        learning_rate=[0.01, 0.05, 0.1],
        n_estimators=[300, 500, 700, 900],
        max_depth=[3, 5, 7, 9],
        subsample=[0.7, 0.9, 1.0],
        colsample_bytree=[0.7, 0.9, 1.0],
        reg_lambda=[1.0, 5.0, 10.0],
        random_state=[RANDOM_STATE],
        tree_method=["hist"],  # fast and robust
    )
    knn_params = dict(
        n_neighbors=[3, 5, 7, 11, 15, 21],
        algorithm=["auto", "ball_tree", "kd_tree", "brute"],
        weights=["uniform", "distance"],
        n_jobs=[-1],
    )
    svr_params = dict(
        kernel=["linear", "rbf", "poly", "sigmoid"],
        C=[0.1, 0.5, 1.0, 2.0, 5.0],
        gamma=["scale", "auto"],
        degree=[2, 3],  # used only for 'poly'
        epsilon=[0.1, 0.2, 0.5],
        cache_size=[5000],
        max_iter=[-1],
    )

    # (name, unfitted estimator, grid) in the order the models are trained and reported.
    model_specs = [
        ("RandomForestRegressor", RandomForestRegressor(), rf_params),
        ("DecisionTreeRegressor", DecisionTreeRegressor(), dt_params),
        ("XGBRegressor", xgb.XGBRegressor(), xgb_params),
        ("KNeighborsRegressor", KNeighborsRegressor(), knn_params),
        ("SVR", SVR(), svr_params),
    ]

    # Collect test metrics to aggregate at the end (keep your original column names)
    rows = []
    for model_name, estimator, param_grid in model_specs:
        _, test_metrics = run_model(
            name=model_name,
            estimator=estimator,
            param_grid=param_grid,
            Xtrain=Xtrain, Ytrain=Ytrain,
            Xtest=Xtest, Ytest=Ytest,
            save_dir=OUT_DIR,
        )
        rows.append((model_name, test_metrics))

    # ---------------- Aggregate & Save ---------------- #
    columns = ["max_error", "mean_absolute_error", "mean_squared_error", "mean_squared_log_error", "r2_score"]
    index_names = [name for name, _ in rows]
    # One row per model, values ordered like `columns`.
    data = [[metrics[col] for col in columns] for _, metrics in rows]

    result_df = pd.DataFrame(data, index=index_names, columns=columns)
    print("\n===== Test Set Summary (Regression) =====")
    print(result_df)

    out_path = Path("predictive performance (regression).csv")
    result_df.to_csv(out_path)
    print(f"\nSaved summary to: {out_path.resolve()}")

    # Every configured model produced a metrics row.
    if len(rows) == len(model_specs):
        print("success!")

In [None]:
# Entry point: run the full pipeline only when this code is executed as the
# main program, not when it is imported as a module.
if __name__ == "__main__":
    main()