## Tune Value Factor (Extensible for Multiple Models)

This notebook:
1. Fetches and transforms the value-factor dataset  
2. Prepares features `X` and target `y`  
3. Defines hyperparameter grids for each model  
4. Runs a HalvingGridSearchCV for each (`linear`, `random_forest`, `xgboost`, `gbr`)  
5. Plots the results and stores them per model in `all_results`  

In [None]:
import sys
import os

# Put the parent of the current working directory on the import path (once)
# so that project-local packages can be imported from this notebook.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
from joblib import parallel_backend

from fetchers.FetchFactory import FetchFactory
from factor_pipeline.registry.TrainerFactory import get_trainer
from tuning.Tuner import Tuner

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, ParameterGrid, TimeSeriesSplit

### CONFIG & DATA FETCH 

In [2]:
# Fetcher configuration handed to FetchFactory; only the "value" fetcher is
# configured here.
CONFIG = {
    "fetchers": {
        "value": {
            "clean_missing": True,
            "frequency": "M",
            "default_start_date": "2020-01-01",
            "default_end_date": "2026-01-31",
        },
    },
}

# The fetch window is read back from CONFIG so the dates live in one place.
START_DATE, END_DATE = (
    CONFIG["fetchers"]["value"][key]
    for key in ("default_start_date", "default_end_date")
)

In [3]:
# Build the fetchers from an empty config and pick the "sp500" one; the result
# of its fetch() is passed to the value fetcher below as `tickers`.
# NOTE(review): this factory receives `{}` while the value fetcher is built
# with CONFIG — confirm the sp500 fetcher really needs no configuration.
sp_fetcher = FetchFactory({}).build_fetchers()["sp500"]
tickers = sp_fetcher.fetch()

In [4]:
# Feature columns handed to trainer.preprocess_data in the tuning loop.
# The "z_" prefix presumably marks z-scored ratios — TODO confirm against the
# trainer's preprocessing.
TRAINING_COLS = [
    "z_price_to_earnings_ratio",
    "z_price_to_book_ratio",
    "z_price_to_sales_ratio",
    "z_price_to_free_cash_flow_ratio",
    "z_free_cash_flow_yield",
    "z_earnings_yield",
    "z_graham_number",
    "z_return_on_equity",
    "z_return_on_assets"
]
# Target column passed alongside TRAINING_COLS to trainer.preprocess_data.
TARGET_COL = "next_return"

In [5]:
# Hyperparameter search spaces, keyed by the model_type passed to get_trainer /
# Tuner. Each value is either one grid dict or a list of grid dicts (both forms
# are accepted by scikit-learn's ParameterGrid / HalvingGridSearchCV).
#
# Key order matters: plot_tuning_results uses the FIRST TWO keys of a grid
# (of the first grid, for list-valued entries) as its plot axes.

# Random-forest settings shared by the bootstrap and non-bootstrap variants.
_RF_SHARED_GRID = {
    "n_estimators":      [50, 100, 200, 500],
    "criterion":         ["squared_error", "absolute_error", "friedman_mse"],
    "max_depth":         [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf":  [1, 2, 4],
    # 1.0 is intentionally absent: for a regressor forest it means "use all
    # features", exactly like None, so listing both only duplicates candidates.
    # (Assumes the `random_forest` model is sklearn's RandomForestRegressor.)
    "max_features":      [None, "sqrt", "log2", 0.5],
}


def _rf_grid(**overrides):
    """Return a fresh random-forest grid: the shared settings plus `overrides`.

    The value lists are copied so the two variants never share list objects.
    """
    grid = {name: list(values) for name, values in _RF_SHARED_GRID.items()}
    grid.update(overrides)
    return grid


PARAM_GRIDS = {
    "linear": {
        # Only options that change the fitted model are searched. `copy_X` and
        # `n_jobs` are execution switches, not hyperparameters: searching them
        # multiplied the grid by 4 without changing any score, and n_jobs=-1
        # nested a second level of parallelism inside the parallel search.
        # (Assumes the `linear` model is sklearn's LinearRegression.)
        "fit_intercept": [True, False],
        "positive":      [True, False]
    },
    "random_forest": [
        # ── combos WITH bootstrap=True and max_samples options
        _rf_grid(bootstrap=[True], max_samples=[None, 0.5, 0.75]),
        # ── combos WITH bootstrap=False (must drop max_samples)
        _rf_grid(bootstrap=[False])
    ],
    "xgboost": {
        "n_estimators":      [50, 100, 200, 500],
        "max_depth":         [3, 6, 10, 15],
        "learning_rate":     [0.01, 0.05, 0.1, 0.2],
        "subsample":         [0.6, 0.8, 1.0],
        "colsample_bytree":  [0.6, 0.8, 1.0],
        "gamma":             [0, 1, 5],
        "reg_alpha":         [0, 0.1, 1],
        "reg_lambda":        [1, 5, 10],
        "min_child_weight":  [1, 3, 5]
    },
    "gbr": {
        "loss":              ["squared_error", "absolute_error"],
        "learning_rate":     [0.01, 0.05, 0.1, 0.2],
        "n_estimators":      [50, 100, 200, 500],
        "max_depth":         [3, 6, 10, 15],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf":  [1, 2, 4],
        "subsample":         [0.6, 0.8, 1.0],
        "max_features":      [None, "sqrt", "log2"]
    }
}

---
#####  FETCH / PREPROCESS / TUNE / VISUALIZE
Load raw data and let the trainer handle all winsorizing, z-scoring, forward returns, etc.

In [6]:
# Build the "value" fetcher from CONFIG and fetch data for `tickers` over
# START_DATE..END_DATE. `df` is the frame the tuning loop later passes
# (after sort_index) to trainer.preprocess_data.
value_fetcher = FetchFactory(config=CONFIG).build_fetchers()["value"]
df = value_fetcher.fetch(tickers, START_DATE, END_DATE)

In [7]:
def compute_total_fits(n_initial, factor, n_splits):
    """
    Estimate the total number of fits of a successive-halving search.

    Follows scikit-learn's candidate schedule: the search runs
    ``1 + floor(log_factor(n_initial))`` iterations and keeps
    ``ceil(n_i / factor)`` candidates after each one, so the total is
    ``sum_i (n_i * n_splits)``. Example: 60 candidates with factor 3 are
    evaluated as 60 -> 20 -> 7 -> 3.

    scikit-learn may run fewer iterations when there are too few samples to
    keep multiplying the resource budget; in that case this is an
    overestimate, which is acceptable for a progress-bar total.

    Parameters
    ----------
    n_initial : int
        Number of candidates in the first iteration.
    factor : int or float
        Halving factor; must be greater than 1.
    n_splits : int
        Number of CV splits fitted per candidate.

    Returns
    -------
    int
        Estimated number of fits; 0 when there are no candidates or splits.

    Raises
    ------
    ValueError
        If ``factor`` is not greater than 1 (the schedule would never shrink).
    """
    # Local import: `math` is not among the notebook's top-level imports.
    from math import ceil, floor, log

    if factor <= 1:
        raise ValueError(f"factor must be greater than 1, got {factor!r}")
    if n_initial < 1 or n_splits < 1:
        return 0

    # Same expression scikit-learn uses for the required iteration count, so
    # the estimate follows the library's float rounding as well.
    n_iterations = 1 + floor(log(n_initial, factor))

    total = 0
    n = n_initial
    for _ in range(n_iterations):
        total += n * n_splits
        n = ceil(n / factor)  # survivors carried into the next iteration
    return total

In [8]:
def plot_tuning_results(df_cv, model_type, grid):
    """
    Plot mean CV MSE against the first two hyperparameters of ``grid``.

    Draws two panels side by side and shows the figure:
      * left  - a line chart with one line per value of the second key,
      * right - a heatmap of the same (second key x first key) table.

    Parameters
    ----------
    df_cv : pandas.DataFrame
        Search results containing ``mean_test_score`` and a ``param_<name>``
        column for each of the two plotted hyperparameters.
    model_type : str
        Label used in the panel titles.
    grid : dict
        Parameter grid; its first two keys choose the plotted axes.
    """
    scores = df_cv.copy()
    # The score is negated to obtain an MSE (assumes a negated-MSE scorer).
    scores["mean_mse"] = -scores["mean_test_score"]

    # First key -> x axis / heatmap columns, second key -> lines / heatmap rows.
    x_param, series_param = list(grid.keys())[:2]
    x_col = f"param_{x_param}"
    series_col = f"param_{series_param}"

    # Collapse to one mean MSE per (series value, x value) pair ...
    mean_by_pair = scores.groupby([series_col, x_col])["mean_mse"].mean()
    tidy = mean_by_pair.reset_index().sort_values([series_col, x_col])
    # ... and reshape into a rows=series, columns=x table used by both panels.
    pivot = tidy.pivot(index=series_col, columns=x_col, values="mean_mse")

    x_labels = pivot.columns.astype(str)
    n_rows, n_cols = pivot.shape

    fig, (line_ax, heat_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: one line per value of the second hyperparameter.
    for series_value in pivot.index:
        line_ax.plot(
            x_labels,
            pivot.loc[series_value].values,
            marker="o",
            label=f"{series_param}={series_value}",
        )
    line_ax.set_title(f"{model_type} tuning: MSE vs {x_param}")
    line_ax.set_xlabel(x_param)
    line_ax.set_ylabel("Mean CV MSE")
    line_ax.legend()
    line_ax.grid(True, linestyle="--", alpha=0.5)

    # Right panel: the same table rendered as a heatmap.
    image = heat_ax.imshow(pivot.values, aspect="auto")
    heat_ax.set_xticks(range(n_cols))
    heat_ax.set_xticklabels(x_labels, rotation=45, ha="right")
    heat_ax.set_yticks(range(n_rows))
    heat_ax.set_yticklabels(pivot.index.astype(str))
    heat_ax.set_title(f"{model_type} heatmap: {series_param} x {x_param}")
    heat_ax.set_xlabel(x_param)
    heat_ax.set_ylabel(series_param)
    fig.colorbar(image, ax=heat_ax, label="Mean CV MSE")

    plt.tight_layout()
    plt.show()

In [None]:
# Search settings shared by every model, defined once instead of being
# repeated (and possibly drifting apart) inside the loop.
N_SPLITS     = 4
SCORING      = "neg_mean_squared_error"
RANDOM_STATE = 42   # fixed seed for the search so reruns are comparable;
                    # randomness inside the base estimators is not set here
factor       = 2
tscv         = TimeSeriesSplit(n_splits=N_SPLITS)

# model_type -> {"best_params", "estimator", "cv_results"}
all_results = {}
for model_type, grid in PARAM_GRIDS.items():
    # A fresh trainer per model type performs the preprocessing of the raw frame.
    trainer = get_trainer("value", model_path=None, model_type=model_type, model_params={})
    X, y = trainer.preprocess_data(df.sort_index(), TRAINING_COLS, TARGET_COL)

    # Tuner is only used here to construct the untuned estimator for model_type.
    base_est = Tuner(model_type=model_type, param_grid=grid, cv=N_SPLITS, scoring=SCORING)._make_base_estimator()

    search = HalvingGridSearchCV(
        estimator=base_est,
        param_grid=grid,
        cv=tscv,
        scoring=SCORING,
        factor=factor,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=0,          # or >=1 to see per-iteration stats
        error_score=np.nan  # silence invalid combos into NaN rather than crash
    )

    # ParameterGrid supports len() directly, so the candidates need not be
    # materialised just to count them.
    total_fits = compute_total_fits(len(ParameterGrid(grid)), factor, tscv.get_n_splits())
    with parallel_backend("loky"):
        with tqdm_joblib(tqdm(desc=f"Tuning {model_type}", total=total_fits, unit="fit")):
            search.fit(X, y)

    best_params = search.best_params_
    df_cv       = pd.DataFrame(search.cv_results_)
    all_results[model_type] = {
        "best_params": best_params,
        "estimator":   search.best_estimator_,
        "cv_results":  df_cv
    }

    print(f"→ Best for {model_type}: {best_params}")
    # List-valued grids (e.g. random_forest) are plotted using their first grid's keys.
    plot_grid = grid[0] if isinstance(grid, list) else grid
    plot_tuning_results(df_cv, model_type, plot_grid)
