In [1]:
import pandas as pd
import sqlite3

from src.models.config import ModelConfig
from src.models.trainer import AssetPricingTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_gold_panel(db_path):
    """Read the modelling columns of the ``gold_panel`` table into a DataFrame.

    Parameters
    ----------
    db_path : str or path-like
        Location of the SQLite database, handed straight to
        ``sqlite3.connect``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``gold_panel`` ordered by ``date_fmt``, holding the columns
        named in the query plus a ``date`` column that is ``date_fmt`` run
        through ``pd.to_datetime``.

    Raises
    ------
    Exception
        Any error raised by ``sqlite3.connect``, ``pd.read_sql`` (e.g. the
        table is missing) or ``pd.to_datetime`` is propagated unchanged; the
        connection is closed before the error leaves this function.
    """
    print("[INFO] Loading Gold Panel...")

    # Load everything (we select subsets in the Trainer)
    query = (
        "SELECT date_fmt, permno, target_ret_excess, "
        "mvel1, bm, mom12m, mom1m, retvol "
        "FROM gold_panel ORDER BY date_fmt"
    )

    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql(query, conn)
    finally:
        # Release the handle even when the query fails; otherwise a failed
        # load leaves the database file open for the life of the kernel.
        conn.close()

    # Date parsing needs no database access, so it runs after the close.
    df['date'] = pd.to_datetime(df['date_fmt'])
    return df

In [3]:
experiments = [
    # Baseline 1 -- "The Kitchen Sink": Huber-loss OLS over the full feature set.
    ModelConfig(
        name="OLS+H",
        feature_set="all",
        model_type="huber",
        use_optuna=False,
        params={'epsilon': 1.35},  # default Huber epsilon, left untuned
    ),

    # Baseline 2 -- "The Minimalist": the same Huber-loss OLS, restricted to
    # the 3-factor feature set.
    ModelConfig(
        name="OLS-3+H",
        feature_set="ff3",
        model_type="huber",
        use_optuna=False,
        params={'epsilon': 1.35},
    ),

    # Baseline 3 -- "The Regularized Minimalist": ElasticNet penalty with
    # Huber loss on the 3-factor feature set. SGDRegressor is used because it
    # can combine the two.
    ModelConfig(
        name="ENET-3+H",
        feature_set="ff3",
        model_type="sgd_huber",
        use_optuna=True,   # hyperparameters are tuned via Optuna ...
        optuna_trials=10,  # ... for this many trials
        params={'alpha': 0.0001, 'l1_ratio': 0.15},  # initial guess only
    ),
]

In [4]:
# SQLite file that holds the gold panel (relative to the working directory).
DB_PATH = "data/processed/lakehouse.sqlite"

# Pull the panel into memory once, then hand it to the trainer that every
# experiment below goes through.
df = load_gold_panel(db_path=DB_PATH)
trainer = AssetPricingTrainer(df)

[INFO] Loading Gold Panel...


In [None]:
# Run every configured experiment through the shared trainer and keep one
# metrics record per model, tagged with the name of the config behind it.
results = []
for config in experiments:
    outcome = trainer.run_experiment(config)
    outcome['model'] = config.name
    results.append(outcome)


STARTING EXPERIMENT: OLS+H
FINAL TEST RESULTS (OLS+H):
R2_OOS: -0.00988
RMSE:   0.19118

STARTING EXPERIMENT: OLS-3+H


[32m[I 2026-02-04 11:58:12,279][0m A new study created in memory with name: no-name-570819dd-6256-40ea-a1d3-32be37264f78[0m


FINAL TEST RESULTS (OLS-3+H):
R2_OOS: -0.00751
RMSE:   0.19096

STARTING EXPERIMENT: ENET-3+H
   ... Tuning Hyperparameters


[32m[I 2026-02-04 11:58:17,409][0m Trial 0 finished with value: -0.0022466419481803612 and parameters: {}. Best is trial 0 with value: -0.0022466419481803612.[0m
[32m[I 2026-02-04 11:58:22,473][0m Trial 1 finished with value: -0.0019250185400105184 and parameters: {}. Best is trial 1 with value: -0.0019250185400105184.[0m
[32m[I 2026-02-04 11:58:27,796][0m Trial 2 finished with value: -0.0022129873550405335 and parameters: {}. Best is trial 1 with value: -0.0019250185400105184.[0m
[32m[I 2026-02-04 11:58:33,207][0m Trial 3 finished with value: -0.0023132140083568973 and parameters: {}. Best is trial 1 with value: -0.0019250185400105184.[0m
[32m[I 2026-02-04 11:58:38,460][0m Trial 4 finished with value: -0.00239717350294281 and parameters: {}. Best is trial 1 with value: -0.0019250185400105184.[0m
[32m[I 2026-02-04 11:58:43,561][0m Trial 5 finished with value: -0.0019440058220131136 and parameters: {}. Best is trial 1 with value: -0.0019250185400105184.[0m
[32m[I 2026

   ... Best Params: {'alpha': 0.0001, 'l1_ratio': 0.15}
FINAL TEST RESULTS (ENET-3+H):
R2_OOS: -0.00778
RMSE:   0.19099
BASELINE LEADERBOARD
      model    r2_oos      rmse
0     OLS+H -0.009882  0.191184
1   OLS-3+H -0.007514  0.190960
2  ENET-3+H -0.007784  0.190985


In [8]:
# Collect the per-model metric records into one table and print the columns
# that make up the leaderboard.
leaderboard_columns = ['model', 'r2_oos', 'rmse', 'r2_sklearn']
res_df = pd.DataFrame(results)

print("BASELINE LEADERBOARD")
print(res_df.loc[:, leaderboard_columns])

BASELINE LEADERBOARD
      model    r2_oos      rmse  r2_sklearn
0     OLS+H -0.009882  0.191184   -0.012937
1   OLS-3+H -0.007514  0.190960   -0.010562
2  ENET-3+H -0.007784  0.190985   -0.010832
