In [1]:
import pandas as pd
import time

from src.data_utils.loader import DataLoader
from src.models.spec import ModelSpec
from src.models.trainer import AssetPricingTrainer
from src.models.tuners import OptunaTuner, NoTuner
from src.models.validator import RollingWindowValidator

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Model specifications to run through the trainer in the experiment loop.
# At least one entry must stay enabled: with an empty list the loop produces no
# results and the leaderboard cell fails when selecting its columns.
# Only PCR is enabled here, matching the run recorded in this notebook's outputs;
# uncomment further specs to add them to the leaderboard.
experiments = [
    # ModelSpec.OLS_Huber(name="OLS+H", feature_set="all"),
    # ModelSpec.OLS_Huber(name="OLS-3+H", feature_set="ff3"),
    # ModelSpec.ElasticNet_Huber(name="ENet+H", feature_set="all"),
    ModelSpec.PCR(name="PCR", feature_set="all"),
    # ModelSpec.RandomForest(name="RF", feature_set="ff3"),
    # ModelSpec.XGBoost(name="XGB+H", feature_set="ff3"),
    # ModelSpec.MLP(name="NN3", feature_set="all", n_hidden_layers=3),
]

In [3]:
# Load the processed panel from disk.
gold_panel_path = "data/processed/gold_panel"
loader = DataLoader(data_path=gold_panel_path)
df = loader.load_panel_data()

# Trim the extreme tails of the target: keep rows whose excess return lies
# between the 0.05th and 99.95th percentiles (both bounds inclusive).
# NOTE(review): the bounds are computed on the full sample before the trainer
# splits it, so any later test period also shapes the cut-offs -- confirm this
# is intended.
excess_ret = df["target_ret_excess"]
lower_bound = excess_ret.quantile(0.0005)
upper_bound = excess_ret.quantile(0.9995)

within_bounds = excess_ret.between(lower_bound, upper_bound)
filtered_df = df.loc[within_bounds]

trainer = AssetPricingTrainer(filtered_df)

[2026-02-12 10:49:44] [INFO] [ParquetLoader] REQ: Loading ALL columns...
[2026-02-12 10:49:44] [INFO] [ParquetLoader] SOURCE: data/processed/gold_panel
[2026-02-12 10:50:15] [INFO] [ParquetLoader] LOAD COMPLETE. Shape: (4065278, 100) | RAM: 1.65 GB


In [4]:
# Tuner configured with a SQLite storage URL under data/tuning/.
# NOTE(review): the experiment loop in this notebook passes NoTuner(), so this
# object is only used if it is swapped in there.
optuna_storage_url = "sqlite:///data/tuning/optuna.db"
opt_tuner = OptunaTuner(storage=optuna_storage_url)

In [5]:
# Validator settings gathered in one place so they are easy to tweak:
# which column is the prediction target, which column carries the dates,
# and how many splits to use.
validator_settings = {
    "target_col": "target_ret_excess",
    "date_col": "date",
    "n_splits": 5,
}
validator = RollingWindowValidator(**validator_settings)

In [6]:
# Run every configured experiment and collect one metrics record per model.
# `results` is reset here so re-running the cell does not duplicate rows.
results = []

# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments during a long run.
start = time.perf_counter()

for spec in experiments:
    # Show the parameters the spec starts from before training begins.
    print(spec.base_params)
    # NoTuner is passed instead of the Optuna tuner -- presumably this skips the
    # hyperparameter search and trains with the spec's base parameters; confirm
    # against the tuner implementation.
    metrics = trainer.run_experiment(
        spec=spec,
        tuner=NoTuner(),
        validator=validator,
    )
    # Tag the metrics with the model name so rows can be told apart later.
    metrics["model"] = spec.name
    results.append(metrics)

end = time.perf_counter()
print(f"Runtime: {(end - start)/60:.2f} minutes")

{'n_components': 3}
[2026-02-12 10:50:17] [INFO] [Trainer] --- STARTING EXPERIMENT: PCR ---
[2026-02-12 10:50:17] [INFO] [Trainer] Time Split: 2008-12-31 | Train: 3169929 | Test: 891283
df_dev rows: 3169929
Number of unique dates: 779
[2026-02-12 10:50:18] [INFO] [Trainer] [+] FINDING OPTIMAL HYPERPARAMETERS (n_trials=30)
[2026-02-12 10:50:18] [INFO] [Trainer] [+] LOADING MODEL
[2026-02-12 10:50:18] [INFO] [Trainer]    > Training Final Model (PCR)...
[2026-02-12 10:50:21] [INFO] [Trainer]    > Saved trained model to src/models/trained\PCR_20260212_105021.joblib
[2026-02-12 10:50:22] [INFO] [Trainer] RESULT PCR: R2_OOS=0.00399 | RMSE=0.14816
Runtime: 0.08 minutes


In [7]:
# Turn the per-model metric records into a table and print the headline columns.
res_df = pd.DataFrame(results)

leaderboard_columns = ["model", "r2_oos", "rmse"]
print("BASELINE LEADERBOARD")
print(res_df.loc[:, leaderboard_columns])

BASELINE LEADERBOARD
  model    r2_oos      rmse
0   PCR  0.003992  0.148157


In [8]:
# Out-of-sample R^2 scaled by 100 so it reads as a percentage.
res_df["r2_oos"].mul(100)

0    0.399244
Name: r2_oos, dtype: float32