# ML Model 04: Linear Regression (Time-Based Split)\n
\n
This notebook trains linear regression models on the exported feature dataset and evaluates them using a **time-based** train/validation/test split.\n
\n
## Split policy (time-based)\n
- **7 years** for training\n
- **1.5 years** for validation\n
- **1.5 years** for testing\n
\n
We align boundary timestamps to the nearest available trading dates in the dataset index.\n
\n
## Models\n
Three baseline linear models (separate cells):\n
- `LinearRegression` (OLS-like, no regularization)\n
- `Ridge` (L2)\n
- `Lasso` (L1)\n
\n
## Target\n
Predict **next-day return** using `ret_1d` shifted forward by 1 day per asset: `y_ret_1d_fwd`.\n
\n
## Data\n
- Feature store: `dataset/features/all_features.parquet` (preferred) or `dataset/features/all_features.csv`\n

In [1]:
from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Reproducibility
SEED = 42
rng = np.random.default_rng(SEED)

# Resolve project root regardless of where notebook is launched from
CWD = Path.cwd()
PROJECT_ROOT = CWD.parent if CWD.name == 'notebooks' else CWD

FEATURES_PARQUET_PATH = PROJECT_ROOT / 'dataset' / 'features' / 'all_features.parquet'
FEATURES_CSV_PATH = PROJECT_ROOT / 'dataset' / 'features' / 'all_features.csv'

TARGET_COL = 'ret_1d'
TARGET_FWD_COL = 'y_ret_1d_fwd'

# Split horizon: 7y train, 18m val, 18m test
TRAIN_YEARS = 7
VAL_MONTHS = 18
TEST_MONTHS = 18

# Alpha grids
ALPHA_GRID = [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]

OUTPUT_DIR = PROJECT_ROOT / 'dataset' / 'model_outputs' / 'linear_models_04_timesplit'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


In [3]:
# Load dataset
if FEATURES_PARQUET_PATH.exists():
    df = pd.read_parquet(FEATURES_PARQUET_PATH)
    # Support both formats: Date as index (older) or Date as a column (CSV->Parquet conversion).
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.set_index('Date')
elif FEATURES_CSV_PATH.exists():
    df = pd.read_csv(FEATURES_CSV_PATH, parse_dates=['Date']).set_index('Date')
else:
    raise FileNotFoundError('Feature dataset not found under dataset/features/.')

df = df.sort_index()

required_cols = {'Asset_ID', TARGET_COL}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f'Missing required columns in feature dataset: {sorted(missing)}')

print('shape:', df.shape)
print('date range:', df.index.min(), '->', df.index.max())
print('assets:', df['Asset_ID'].nunique())
display(df.head(3))


shape: (251100, 122)
date range: 2016-01-25 00:00:00 -> 2026-01-16 00:00:00
assets: 100


Unnamed: 0_level_0,ret_1d,logret_1d,excess_ret_1d,logret_lag_1,logret_lag_5,ret_lag_1,ret_lag_5,ret_5d,ret_21d,logret_5d,...,filt_logret_spectral_0p1,filt_close_lms_mu1e-4_taps5,filt_resid_lms_mu1e-4_taps5,filt_logret_lms_mu1e-4_taps5,filt_close_lattice_demo,filt_resid_lattice_demo,filt_logret_lattice_demo,close,volume,Asset_ID
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-25,,,,,,,,,,,...,,28.580592,0.0,,28.580592,0.0,,28.580592,249449990.0,Asset_001
2016-01-25,,,,,,,,,,,...,,8.489112,0.0,,8.489112,0.0,,8.489112,148653644.0,Asset_028
2016-01-25,,,,,,,,,,,...,,23.826383,0.0,,23.826383,0.0,,23.826383,65904403.0,Asset_029


In [4]:
# Create forward-looking label (next-day return) per asset
df = df.copy()
df[TARGET_FWD_COL] = df.groupby('Asset_ID', sort=False)[TARGET_COL].shift(-1)
df = df.dropna(subset=[TARGET_FWD_COL])

print('shape after label:', df.shape)
display(df[[TARGET_COL, TARGET_FWD_COL, 'Asset_ID']].head(5))


shape after label: (251000, 123)


Unnamed: 0_level_0,ret_1d,y_ret_1d_fwd,Asset_ID
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-25,,0.005531,Asset_001
2016-01-25,,0.027007,Asset_028
2016-01-25,,0.023737,Asset_029
2016-01-25,,0.010319,Asset_030
2016-01-25,,-0.004676,Asset_031


## Time-based split\n
We split by time across **all assets**.\n
\n
- Test = most recent 18 months\n
- Validation = 18 months immediately before test\n
- Train = 7 years immediately before validation\n

In [5]:
def align_to_trading_date(index: pd.DatetimeIndex, ts: pd.Timestamp) -> pd.Timestamp:
    # Return the first index date >= ts
    pos = int(index.searchsorted(ts, side='left'))
    if pos >= len(index):
        return pd.Timestamp(index[-1])
    return pd.Timestamp(index[pos])

idx = pd.DatetimeIndex(df.index.unique()).sort_values()
end = pd.Timestamp(idx[-1])

raw_test_start = end - pd.DateOffset(months=TEST_MONTHS)
raw_val_start = raw_test_start - pd.DateOffset(months=VAL_MONTHS)
raw_train_start = raw_val_start - pd.DateOffset(years=TRAIN_YEARS)

test_start = align_to_trading_date(idx, pd.Timestamp(raw_test_start))
val_start = align_to_trading_date(idx, pd.Timestamp(raw_val_start))
train_start = align_to_trading_date(idx, pd.Timestamp(raw_train_start))

print('aligned boundaries:')
print('  train_start:', train_start)
print('  val_start  :', val_start)
print('  test_start :', test_start)
print('  end        :', end)

df_train = df.loc[(df.index >= train_start) & (df.index < val_start)].copy()
df_val = df.loc[(df.index >= val_start) & (df.index < test_start)].copy()
df_test = df.loc[(df.index >= test_start) & (df.index <= end)].copy()

print('rows train/val/test:', df_train.shape[0], df_val.shape[0], df_test.shape[0])
print('days train/val/test:', df_train.index.nunique(), df_val.index.nunique(), df_test.index.nunique())
print('assets train/val/test:', df_train['Asset_ID'].nunique(), df_val['Asset_ID'].nunique(), df_test['Asset_ID'].nunique())


aligned boundaries:
  train_start: 2016-01-25 00:00:00
  val_start  : 2023-01-17 00:00:00
  test_start : 2024-07-15 00:00:00
  end        : 2026-01-15 00:00:00
rows train/val/test: 175700 37400 37900
days train/val/test: 1757 374 379
assets train/val/test: 100 100 100


In [6]:
# Feature matrix definition: use all numeric exported features
exclude_cols = {'Asset_ID', TARGET_FWD_COL}
feature_cols = [c for c in df.columns if c not in exclude_cols and pd.api.types.is_numeric_dtype(df[c])]

print('n_features:', len(feature_cols))
print('example features:', feature_cols[:10])

def to_xy(d: pd.DataFrame):
    X = d.loc[:, feature_cols].replace([np.inf, -np.inf], np.nan)
    y = d.loc[:, TARGET_FWD_COL]
    return X, y

X_train, y_train = to_xy(df_train)
X_val, y_val = to_xy(df_val)
X_test, y_test = to_xy(df_test)

print('X_train:', X_train.shape, 'X_val:', X_val.shape, 'X_test:', X_test.shape)


n_features: 121
example features: ['ret_1d', 'logret_1d', 'excess_ret_1d', 'logret_lag_1', 'logret_lag_5', 'ret_lag_1', 'ret_lag_5', 'ret_5d', 'ret_21d', 'logret_5d']
X_train: (175700, 121) X_val: (37400, 121) X_test: (37900, 121)


## Feature diagnostics: Mutual Information (train only)\n
We compute MI on the training split only (no feature selection yet).\n

In [7]:
mi_sample_size = min(50_000, X_train.shape[0])
sample_idx = rng.choice(X_train.shape[0], size=mi_sample_size, replace=False)

X_mi = X_train.iloc[sample_idx]
y_mi = y_train.iloc[sample_idx]

X_mi_imp = SimpleImputer(strategy='median').fit_transform(X_mi)
mi = mutual_info_regression(X_mi_imp, y_mi.to_numpy(), random_state=SEED)
mi_s = pd.Series(mi, index=feature_cols).sort_values(ascending=False)
display(mi_s.head(30).to_frame('mutual_information'))
mi_s.head(200).to_csv(OUTPUT_DIR / 'mutual_information_top200.csv')
print('wrote:', OUTPUT_DIR / 'mutual_information_top200.csv')


Unnamed: 0,mutual_information
realized_vol_20,0.084553
logret_roll_std_20,0.084476
logret_roll_std_10,0.083881
logret_roll_var_10,0.080572
logret_roll_var_20,0.080525
logret_roll_std_5,0.075885
logret_roll_var_60,0.073049
logret_roll_var_5,0.072603
logret_roll_std_60,0.072247
logret_roll_min_20,0.068256


wrote: /home/anivarth/college/quant-task/dataset/model_outputs/linear_models_04_timesplit/mutual_information_top200.csv


## Metrics\n
We report: RMSE, MAE, R2, and Spearman IC (rank correlation).\n

In [8]:
@dataclass(frozen=True)
class RegressionMetrics:
    rmse: float
    mae: float
    r2: float
    spearman_ic: float


def compute_metrics(y_true: pd.Series, y_pred: np.ndarray) -> RegressionMetrics:
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(mean_absolute_error(y_true, y_pred))
    r2 = float(r2_score(y_true, y_pred))
    ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)
    return RegressionMetrics(rmse=rmse, mae=mae, r2=r2, spearman_ic=ic)


## Model 1/3: LinearRegression (no regularization)\n

In [9]:
ols_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('model', LinearRegression()),
    ]
)

ols_pipe.fit(X_train, y_train)

m_train = compute_metrics(y_train, ols_pipe.predict(X_train))
m_val = compute_metrics(y_val, ols_pipe.predict(X_val))
m_test = compute_metrics(y_test, ols_pipe.predict(X_test))

print('OLS train:', m_train)
print('OLS val  :', m_val)
print('OLS test :', m_test)


OLS train: RegressionMetrics(rmse=0.01907226404250904, mae=0.012579070648091497, r2=0.03318561547292842, spearman_ic=0.04632422548025987)
OLS val  : RegressionMetrics(rmse=0.01603139960624759, mae=0.010985094540947428, r2=-0.018258385086227902, spearman_ic=0.006754101930406864)
OLS test : RegressionMetrics(rmse=0.019380922604850102, mae=0.012791780710181177, r2=-0.011721526968687002, spearman_ic=0.03043559303911317)


## Model 2/3: Ridge regression (L2)\n
We tune `alpha` using the **validation** split (time-based, out-of-sample).\n

In [10]:
def tune_alpha_on_val_ridge(alpha_grid: list[float]) -> tuple[float, pd.DataFrame]:
    rows = []
    for a in alpha_grid:
        pipe = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('model', Ridge(alpha=a, random_state=SEED)),
            ]
        )
        pipe.fit(X_train, y_train)
        pred_val = pipe.predict(X_val)
        m = compute_metrics(y_val, pred_val)
        rows.append({'alpha': a, 'val_rmse': m.rmse, 'val_mae': m.mae, 'val_r2': m.r2, 'val_ic': m.spearman_ic})
    res = pd.DataFrame(rows).sort_values('val_rmse', ascending=True).reset_index(drop=True)
    best = float(res.iloc[0]['alpha'])
    return best, res

best_alpha_ridge, ridge_val_grid = tune_alpha_on_val_ridge(ALPHA_GRID)
display(ridge_val_grid)
print('best_alpha_ridge:', best_alpha_ridge)
ridge_val_grid.to_csv(OUTPUT_DIR / 'ridge_alpha_valgrid.csv', index=False)

ridge_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=best_alpha_ridge, random_state=SEED)),
    ]
)
ridge_pipe.fit(X_train, y_train)

print('Ridge train:', compute_metrics(y_train, ridge_pipe.predict(X_train)))
print('Ridge val  :', compute_metrics(y_val, ridge_pipe.predict(X_val)))
print('Ridge test :', compute_metrics(y_test, ridge_pipe.predict(X_test)))


Unnamed: 0,alpha,val_rmse,val_mae,val_r2,val_ic
0,100.0,0.016022,0.010976,-0.01711,0.007936
1,10.0,0.016028,0.010981,-0.017777,0.006871
2,1.0,0.016031,0.010985,-0.018169,0.006243
3,0.001,0.016031,0.010985,-0.018197,0.006616
4,0.01,0.016031,0.010985,-0.018215,0.006283
5,0.1,0.016031,0.010985,-0.018238,0.006151
6,0.0001,0.016031,0.010985,-0.018243,0.006738


best_alpha_ridge: 100.0
Ridge train: RegressionMetrics(rmse=0.01908874263688693, mae=0.012583221700943847, r2=0.03151422243595259, spearman_ic=0.04074340310229642)
Ridge val  : RegressionMetrics(rmse=0.01602235468941008, mae=0.010976450895541116, r2=-0.017109706308404027, spearman_ic=0.007936364131111876)
Ridge test : RegressionMetrics(rmse=0.019374230928916214, mae=0.012773423671727993, r2=-0.011023010807293954, spearman_ic=0.030208438606241984)


## Model 3/3: Lasso regression (L1)\n
We tune `alpha` using the **validation** split (time-based, out-of-sample).\n

In [11]:
def tune_alpha_on_val_lasso(alpha_grid: list[float]) -> tuple[float, pd.DataFrame]:
    rows = []
    for a in alpha_grid:
        pipe = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('model', Lasso(alpha=a, random_state=SEED, max_iter=20_000)),
            ]
        )
        pipe.fit(X_train, y_train)
        pred_val = pipe.predict(X_val)
        m = compute_metrics(y_val, pred_val)
        rows.append({'alpha': a, 'val_rmse': m.rmse, 'val_mae': m.mae, 'val_r2': m.r2, 'val_ic': m.spearman_ic})
    res = pd.DataFrame(rows).sort_values('val_rmse', ascending=True).reset_index(drop=True)
    best = float(res.iloc[0]['alpha'])
    return best, res

best_alpha_lasso, lasso_val_grid = tune_alpha_on_val_lasso(ALPHA_GRID)
display(lasso_val_grid)
print('best_alpha_lasso:', best_alpha_lasso)
lasso_val_grid.to_csv(OUTPUT_DIR / 'lasso_alpha_valgrid.csv', index=False)

lasso_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', Lasso(alpha=best_alpha_lasso, random_state=SEED, max_iter=20_000)),
    ]
)
lasso_pipe.fit(X_train, y_train)

print('Lasso train:', compute_metrics(y_train, lasso_pipe.predict(X_train)))
print('Lasso val  :', compute_metrics(y_val, lasso_pipe.predict(X_val)))
print('Lasso test :', compute_metrics(y_test, lasso_pipe.predict(X_test)))


  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)
  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)
  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)
  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)
  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)


Unnamed: 0,alpha,val_rmse,val_mae,val_r2,val_ic
0,0.1,0.015887,0.01086,-7e-06,
1,0.01,0.015887,0.01086,-7e-06,
2,10.0,0.015887,0.01086,-7e-06,
3,1.0,0.015887,0.01086,-7e-06,
4,100.0,0.015887,0.01086,-7e-06,
5,0.001,0.015901,0.010872,-0.001734,-0.003537
6,0.0001,0.015965,0.01093,-0.009873,0.016811


best_alpha_lasso: 0.1
Lasso train: RegressionMetrics(rmse=0.019396827294670512, mae=0.012586788696372204, r2=0.0, spearman_ic=nan)
Lasso val  : RegressionMetrics(rmse=0.015887076756732486, mae=0.010859805912974939, r2=-7.145644700612408e-06, spearman_ic=nan)
Lasso test : RegressionMetrics(rmse=0.01926878581542823, mae=0.012649241290022425, r2=-4.788321152626729e-05, spearman_ic=nan)


  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)
  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)
  ic = float(spearmanr(y_true.to_numpy(), y_pred, nan_policy='omit').correlation)


## Export predictions (for later backtesting)\n
We export `Date`, `Asset_ID`, `y_true`, `y_pred` for validation and test splits.\n

In [12]:
def export_preds(frame: pd.DataFrame, pipe: Pipeline, name: str) -> Path:
    X, y = to_xy(frame)
    out = pd.DataFrame({
        'Date': frame.index,
        'Asset_ID': frame['Asset_ID'].to_numpy(),
        'y_true': y.to_numpy(),
        'y_pred': pipe.predict(X),
    })
    out_path = OUTPUT_DIR / f'preds_{name}.parquet'
    out.to_parquet(out_path, index=False)
    return out_path

paths = {
    'ols_val': export_preds(df_val, ols_pipe, 'ols_val'),
    'ols_test': export_preds(df_test, ols_pipe, 'ols_test'),
    'ridge_val': export_preds(df_val, ridge_pipe, 'ridge_val'),
    'ridge_test': export_preds(df_test, ridge_pipe, 'ridge_test'),
    'lasso_val': export_preds(df_val, lasso_pipe, 'lasso_val'),
    'lasso_test': export_preds(df_test, lasso_pipe, 'lasso_test'),
}
paths


{'ols_val': PosixPath('/home/anivarth/college/quant-task/dataset/model_outputs/linear_models_04_timesplit/preds_ols_val.parquet'),
 'ols_test': PosixPath('/home/anivarth/college/quant-task/dataset/model_outputs/linear_models_04_timesplit/preds_ols_test.parquet'),
 'ridge_val': PosixPath('/home/anivarth/college/quant-task/dataset/model_outputs/linear_models_04_timesplit/preds_ridge_val.parquet'),
 'ridge_test': PosixPath('/home/anivarth/college/quant-task/dataset/model_outputs/linear_models_04_timesplit/preds_ridge_test.parquet'),
 'lasso_val': PosixPath('/home/anivarth/college/quant-task/dataset/model_outputs/linear_models_04_timesplit/preds_lasso_val.parquet'),
 'lasso_test': PosixPath('/home/anivarth/college/quant-task/dataset/model_outputs/linear_models_04_timesplit/preds_lasso_test.parquet')}

## Optional: Backtesting on the test window\n
This section mirrors the workflow used in the other linear-model notebooks:\n
- Run **personal engine (Original Style 1/N)** for all 3 models (cheap) and pick the best by total return\n
- Show the built-in detailed report + Bokeh dashboard for the best model (1/N and MPT)\n
- Run the **actor-based backtester** for the best model using the MPT weights\n
\n
Note: actor-based backtesting can take a few minutes depending on the window/universe.\n

In [None]:
from IPython.display import display
from bokeh.io import output_notebook, show
from sqlalchemy import create_engine, StaticPool

import sys

# Ensure `src/` is on sys.path so `backtester` is importable
src_dir = PROJECT_ROOT / 'src'
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))

from backtester.data import load_cleaned_assets, align_close_prices
from backtester.engine import BacktestConfig, run_backtest
from backtester.report import compute_backtest_report
from backtester.bokeh_plots import build_interactive_portfolio_layout
from backtester.portfolio import equal_weight, optimize_mpt

# trade-engine imports
engine_root = (PROJECT_ROOT / 'another_testing_engine' / 'trade-engine' / 'trade-engine').resolve()
if str(engine_root) not in sys.path:
    sys.path.append(str(engine_root))
from tradeengine.actors.memory import MemPortfolioActor
from tradeengine.actors.sql import SQLOrderbookActor
from tradeengine.backtest import BacktestStrategy

output_notebook()

# Build prediction matrices for the test window only
pipes = {'ols': ols_pipe, 'ridge': ridge_pipe, 'lasso': lasso_pipe}

def pred_matrix_for_pipe(pipe: Pipeline) -> pd.DataFrame:
    X_bt, _y_bt = to_xy(df_test)
    pred = pipe.predict(X_bt)
    long = pd.DataFrame({'Date': df_test.index, 'Asset_ID': df_test['Asset_ID'].to_numpy(), 'y_pred': pred})
    return long.pivot_table(index='Date', columns='Asset_ID', values='y_pred', aggfunc='mean').sort_index()

pred_mats = {k: pred_matrix_for_pipe(p) for k, p in pipes.items()}
bt_assets = sorted(next(iter(pred_mats.values())).columns.tolist())

assets_ohlcv = load_cleaned_assets(symbols=bt_assets, cleaned_dir=str(PROJECT_ROOT / 'dataset' / 'cleaned'))
close_prices = align_close_prices(assets_ohlcv)

# Restrict prices to the test window
close_prices = close_prices.loc[test_start:end]
returns_matrix = close_prices.pct_change().fillna(0.0)

market_df = pd.DataFrame({
    'Open': pd.concat([df['Open'] for df in assets_ohlcv.values()], axis=1).mean(axis=1),
    'High': pd.concat([df['High'] for df in assets_ohlcv.values()], axis=1).mean(axis=1),
    'Low': pd.concat([df['Low'] for df in assets_ohlcv.values()], axis=1).mean(axis=1),
    'Close': pd.concat([df['Close'] for df in assets_ohlcv.values()], axis=1).mean(axis=1),
    'Volume': pd.concat([df['Volume'] for df in assets_ohlcv.values()], axis=1).sum(axis=1),
}).sort_index().loc[test_start:end]

# Align prediction matrices to the available test-window calendar
pred_mats = {k: v.reindex(close_prices.index) for k, v in pred_mats.items()}

REBALANCE_FREQ = 'W'
TOP_K = min(20, len(bt_assets))
LOOKBACK_DAYS = 126

def build_weights_from_predictions(pred_matrix: pd.DataFrame, *, pm_style: str) -> pd.DataFrame:
    rebal_dates = set(pd.Series(pred_matrix.index, index=pred_matrix.index).resample(REBALANCE_FREQ).last().dropna().tolist())
    w_last = pd.Series(0.0, index=bt_assets)
    w_rows = []
    for dt in pred_matrix.index:
        if dt in rebal_dates:
            row = pred_matrix.loc[dt].dropna().sort_values(ascending=False)
            top = row.head(TOP_K)
            candidates = [a for a, v in top.items() if np.isfinite(v) and v > 0]
            if len(candidates) == 0:
                w_last = pd.Series(0.0, index=bt_assets)
            else:
                if pm_style == '1N':
                    w_dict = equal_weight(candidates)
                elif pm_style == 'MPT':
                    w_dict = optimize_mpt(returns_matrix, candidates, dt, lookback_days=LOOKBACK_DAYS)
                else:
                    raise ValueError(f'Unknown pm_style: {pm_style!r}')
                w_last = pd.Series(0.0, index=bt_assets)
                for a, w in w_dict.items():
                    if a in w_last.index:
                        w_last[a] = float(w)
        w_rows.append(w_last)
    return pd.DataFrame(w_rows, index=pred_matrix.index, columns=bt_assets).fillna(0.0)

cfg_vec = BacktestConfig(initial_equity=1_000_000.0, transaction_cost_bps=5.0, mode='vectorized')

rows = []
best_model = None
best_ret = -np.inf

for model_name, pm in pred_mats.items():
    w_1n = build_weights_from_predictions(pm, pm_style='1N')
    res = run_backtest(close_prices, w_1n, config=cfg_vec)
    rpt = compute_backtest_report(result=res, close_prices=close_prices)
    tr = float(rpt['Total Return [%]'])
    rows.append({'model': model_name, 'Total Return [%]': tr, 'Sharpe': float(rpt['Sharpe']), 'Max Drawdown [%]': float(rpt['Max Drawdown [%]'])})
    if tr > best_ret:
        best_ret = tr
        best_model = str(model_name)

compare = pd.DataFrame(rows).sort_values('Total Return [%]', ascending=False).reset_index(drop=True)
display(compare)
print('best_model_by_1N_total_return:', best_model, 'Total Return [%]:', best_ret)

best_pred = pred_mats[best_model]
w_1n_best = build_weights_from_predictions(best_pred, pm_style='1N')
w_mpt_best = build_weights_from_predictions(best_pred, pm_style='MPT')

def run_and_show(weights: pd.DataFrame, title: str):
    res = run_backtest(close_prices, weights, config=cfg_vec)
    report = compute_backtest_report(result=res, close_prices=close_prices)
    display(report.to_frame(title))
    layout = build_interactive_portfolio_layout(
        market_ohlcv=market_df,
        equity=res.equity,
        returns=res.returns,
        weights=res.weights,
        turnover=res.turnover,
        costs=res.costs,
        close_prices=close_prices,
        title=title,
    )
    show(layout)
    return res, report

_res_1n, _rpt_1n = run_and_show(w_1n_best, f'TimeSplit Linear ({best_model}) - Original Style 1N (Test Window)')
_res_mpt, _rpt_mpt = run_and_show(w_mpt_best, f'TimeSplit Linear ({best_model}) - Original Style MPT (Test Window)')

# Actor engine signals (MPT weights)
def weights_to_formatted_signals(w: pd.DataFrame) -> dict[str, pd.Series]:
    rebal_dates = pd.Series(w.index, index=w.index).resample(REBALANCE_FREQ).last().dropna().tolist()
    active: set[str] = set()
    sig: dict[str, dict[pd.Timestamp, dict]] = {a: {} for a in w.columns}
    for dt in rebal_dates:
        if dt not in w.index:
            continue
        row = w.loc[dt]
        for a in list(active):
            if float(row.get(a, 0.0)) <= 0.0:
                sig[a][dt] = {'CloseOrder': {}}
                active.remove(a)
        for a, weight in row.items():
            if float(weight) > 0.0:
                sig[a][dt] = {'TargetWeightOrder': {'size': float(weight)}}
                active.add(str(a))
    formatted: dict[str, pd.Series] = {}
    for a, d in sig.items():
        formatted[a] = (pd.Series(d).sort_index() if d else pd.Series(dtype=object))
    return formatted

formatted_signals = weights_to_formatted_signals(w_mpt_best)
quote_frames = {a: assets_ohlcv[a][['Open', 'High', 'Low', 'Close']].loc[test_start:end] for a in bt_assets}

fund_value = 1_000_000.0
portfolio_actor = MemPortfolioActor.start(funding=fund_value)
db_engine = create_engine('sqlite://', echo=False, connect_args={'check_same_thread': False}, poolclass=StaticPool)
orderbook_actor = SQLOrderbookActor.start(portfolio_actor, db_engine, strategy_id=str(uuid.uuid4()))

print('Starting actor-based backtest (best model, MPT weights, test window)...')
bt_strategy = BacktestStrategy(orderbook_actor, portfolio_actor, quote_frames)
actor_result = bt_strategy.run_backtest(formatted_signals)
print('Actor backtest finished.')

perf = actor_result.porfolio_performance
perf['performance'].plot(figsize=(12, 6), title='Actor-Based Backtest Performance (Test Window)')
print('Actor Final Portfolio Value:', float(perf['value'].iloc[-1]))
print('Actor Total Return [%]:', float((perf['performance'].iloc[-1] - 1) * 100.0))


Unnamed: 0,model,Total Return [%],Sharpe,Max Drawdown [%]
0,ridge,54.010106,1.503062,-22.607705
1,ols,48.653071,1.358176,-22.183641
2,lasso,41.831591,1.396587,-18.490968


best_model_by_1N_total_return: ridge Total Return [%]: 54.01010646111774


Unnamed: 0,TimeSplit Linear (ridge) - Original Style 1N (Test Window)
Start,2024-07-15 00:00:00
End,2026-01-15 00:00:00
Duration,549 days 00:00:00
Initial Equity,1000000.0
Final Equity,1540101.064611
Equity Peak,1546724.966447
Total Return [%],54.010106
CAGR [%],33.362219
Volatility (ann) [%],20.499908
Sharpe,1.503062


Unnamed: 0,TimeSplit Linear (ridge) - Original Style MPT (Test Window)
Start,2024-07-15 00:00:00
End,2026-01-15 00:00:00
Duration,549 days 00:00:00
Initial Equity,1000000.0
Final Equity,1388734.344548
Equity Peak,1392858.934806
Total Return [%],38.873434
CAGR [%],24.474231
Volatility (ann) [%],20.496856
Sharpe,1.168396


NameError: name 'uuid' is not defined

: 