# Bayesian Logistic Regression (MCMC: Metropolis) on Feature Dataset

This is the Chapter-10-style Bayesian logistic regression, but trained on the
exported feature store `dataset/features/all_features.parquet`.

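Because random-walk Metropolis does not scale well to high-dimensional
parameter spaces, we restrict the design matrix to a small set of K features:
the top-K by mutual information when a ranked feature list from the linear-models
outputs is available, otherwise the first K numeric columns of the feature store.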


# Bayesian Models on Feature Dataset: Bayesian Logistic Regression (MCMC, Feature Store)

These notebooks train on the exported feature store: `dataset/features/all_features.parquet`.

Backtesting is identical to the `ML_Linear_Models_*` notebooks: we build a daily
prediction matrix (Date x Asset_ID), convert it to weekly-rebalanced portfolios
(1/N and MPT), and run the vectorized backtest engine.

In [None]:
from __future__ import annotations

from pathlib import Path
import sys

import numpy as np
import pandas as pd

# Single seeded generator shared by the notebook; it drives the training-row
# subsample and every random draw of the Metropolis chain further down.
SEED = 42
rng = np.random.default_rng(SEED)

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from *start*.

    A directory qualifies as the root when it contains both a ``src`` and a
    ``dataset`` entry. The resolved *start* and its ancestors are inspected,
    ten directories at most.

    Raises:
        RuntimeError: if none of the inspected directories qualifies.
    """
    origin = start.resolve()
    # `origin` itself followed by its ancestors, capped at ten directories.
    for candidate in (origin, *origin.parents)[:10]:
        has_src = (candidate / 'src').exists()
        if has_src and (candidate / 'dataset').exists():
            return candidate
    raise RuntimeError(f'Could not find project root from: {start!s}')

# Resolve the repository root from the notebook's working directory and derive
# the feature-store location from it.
PROJECT_ROOT = find_project_root(Path.cwd())
FEATURES_PARQUET = PROJECT_ROOT / 'dataset' / 'features' / 'all_features.parquet'

# Make modules under `src/` importable (the `backtester` imports below rely on
# this); the membership test keeps re-running the cell from adding duplicates.
src_dir = PROJECT_ROOT / 'src'
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))


In [None]:
import math
from scipy.special import expit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Source column read from the feature store, and the name of the derived
# forward (next-row) version of it that serves as the prediction target.
TARGET_COL = 'ret_1d'
TARGET_FWD_COL = 'y_ret_1d_fwd'

df = pd.read_parquet(FEATURES_PARQUET)
# Index by date when `Date` is stored as a regular column; otherwise the
# existing index is used as-is (presumably already date-like -- TODO confirm).
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')
df = df.sort_index().copy()
# Forward target: within each Asset_ID, the value of TARGET_COL on the next row.
df[TARGET_FWD_COL] = df.groupby('Asset_ID', sort=False)[TARGET_COL].shift(-1)
# Rows without a forward value (e.g. the last row of every asset) are dropped.
df = df.dropna(subset=[TARGET_FWD_COL])
# Binary label: 1 when the forward value is strictly positive, else 0.
df['y_up_fwd'] = (df[TARGET_FWD_COL] > 0).astype(int)

# Time split
# Window lengths, measured backwards from the last date in the data.
TRAIN_YEARS = 7
VAL_MONTHS = 18
TEST_MONTHS = 18

def align_to_trading_date(index: pd.DatetimeIndex, ts: pd.Timestamp) -> pd.Timestamp:
    """Snap *ts* to a date that is present in *index*.

    Returns the first entry of *index* that is on or after *ts* (left-sided
    ``searchsorted``); when *ts* lies beyond the last entry, the last entry is
    returned instead.
    """
    # Clamp the insertion point so a timestamp past the end maps to the last date.
    slot = min(int(index.searchsorted(ts, side='left')), len(index) - 1)
    return pd.Timestamp(index[slot])

# Unique, sorted dates present in the feature store.
idx = pd.DatetimeIndex(df.index.unique()).sort_values()
end = pd.Timestamp(idx[-1])
# Walk backwards from the last date: test window, then validation window, then
# training window. Each raw boundary is snapped to a date that exists in `idx`.
raw_test_start = end - pd.DateOffset(months=TEST_MONTHS)
raw_val_start = raw_test_start - pd.DateOffset(months=VAL_MONTHS)
raw_train_start = raw_val_start - pd.DateOffset(years=TRAIN_YEARS)
test_start = align_to_trading_date(idx, pd.Timestamp(raw_test_start))
val_start = align_to_trading_date(idx, pd.Timestamp(raw_val_start))
train_start = align_to_trading_date(idx, pd.Timestamp(raw_train_start))

# Train covers [train_start, val_start); test covers [test_start, end].
# NOTE(review): no frame is built for [val_start, test_start) in this cell, so
# the validation window only acts as a gap between train and test.
df_train = df.loc[(df.index >= train_start) & (df.index < val_start)].copy()
df_test = df.loc[(df.index >= test_start) & (df.index <= end)].copy()

# Candidate features: every numeric column except the asset key and the two
# target columns derived above.
exclude_cols = {'Asset_ID', TARGET_FWD_COL, 'y_up_fwd'}
all_num = [c for c in df.columns if c not in exclude_cols and pd.api.types.is_numeric_dtype(df[c])]

# Prefer MI-ranked feature list from linear models outputs (if present)
mi_path = PROJECT_ROOT / 'notebooks' / 'dataset' / 'model_outputs' / 'linear_models_01' / 'mutual_information_top200.csv'
TOP_K = 25
if mi_path.exists():
    mi = pd.read_csv(mi_path)
    # Expected columns: `feature` and (optionally) MI score.
    col_feature = 'feature' if 'feature' in mi.columns else mi.columns[0]
    # Keep the file's row order (presumably best-first -- TODO confirm),
    # restricted to names that are numeric candidate columns here.
    candidates = [f for f in mi[col_feature].astype(str).tolist() if f in all_num]
    # Fall back to the first TOP_K numeric columns when nothing in the file matches.
    feature_cols = candidates[:TOP_K] if candidates else all_num[:TOP_K]
else:
    feature_cols = all_num[:TOP_K]

print('n_features_used:', len(feature_cols))

# Infinite values are turned into NaN so the imputer below fills them.
X_train = df_train[feature_cols].replace([np.inf, -np.inf], np.nan)
y_train = df_train['y_up_fwd'].astype(int).to_numpy()
X_test = df_test[feature_cols].replace([np.inf, -np.inf], np.nan)

# Median imputation and standardisation are fitted on the training rows only
# and then applied unchanged to the test rows.
imp = SimpleImputer(strategy='median')
scaler = StandardScaler()
Xtr = scaler.fit_transform(imp.fit_transform(X_train))
Xte = scaler.transform(imp.transform(X_test))
# Prepend a constant column of ones; the matching coefficient is the intercept.
Xtr = np.hstack([np.ones((Xtr.shape[0], 1)), Xtr])
Xte = np.hstack([np.ones((Xte.shape[0], 1)), Xte])

# MCMC subsampling for speed
# At most N_TRAIN_SAMPLES training rows are used to evaluate the posterior.
N_TRAIN_SAMPLES = 20_000
if Xtr.shape[0] > N_TRAIN_SAMPLES:
    # Random subset of row positions, drawn without replacement.
    idx_s = rng.choice(Xtr.shape[0], size=N_TRAIN_SAMPLES, replace=False)
    Xtr_s = Xtr[idx_s]
    y_s = y_train[idx_s]
else:
    Xtr_s = Xtr
    y_s = y_train
# Variance of the zero-mean Gaussian prior placed on the slope coefficients
# (i.e. a prior standard deviation of 10).
TAU2 = 10.0**2


def log_post(beta: np.ndarray) -> float:
    """Return the unnormalised log-posterior of the logistic model at *beta*.

    The value is the Bernoulli log-likelihood of the labels ``y_s`` given the
    design matrix ``Xtr_s`` (both module-level), plus the log-density (up to a
    constant) of a zero-mean Gaussian prior with variance ``TAU2`` on every
    coefficient except ``beta[0]``, which is left with a flat prior.

    Args:
        beta: coefficient vector with one entry per column of ``Xtr_s``.

    Returns:
        Log-likelihood plus log-prior as a Python float.
    """
    z = Xtr_s @ beta
    # With p = sigmoid(z):  y*log(p) + (1-y)*log(1-p) = y*z - log(1 + exp(z)).
    # logaddexp evaluates log(1 + exp(z)) without overflow and without the
    # additive epsilon that would clamp the likelihood once sigmoid saturates.
    ll = float(np.sum(y_s * z - np.logaddexp(0.0, z)))
    # Gaussian prior on the slopes only; the first coefficient is skipped.
    lp = float(-0.5 * np.sum(beta[1:] ** 2) / TAU2)
    return ll + lp

# Random-walk Metropolis settings: total iterations, iterations discarded as
# burn-in, and the standard deviation of the isotropic Gaussian proposal.
N_STEPS = 4000
BURN = 1000
STEP_SCALE = 0.05

# Start the chain at the origin.
beta = np.zeros(Xtr_s.shape[1])
lp = log_post(beta)
samples = []
accept = 0
for t in range(N_STEPS):
    # Symmetric proposal, so the acceptance ratio is just the posterior ratio.
    prop = beta + rng.normal(0.0, STEP_SCALE, size=beta.shape)
    lp_prop = log_post(prop)
    # rng.random() samples the half-open interval [0, 1); an exact 0.0 would
    # make math.log raise ValueError, so it is mapped to -inf instead.
    u = rng.random()
    log_u = math.log(u) if u > 0.0 else -math.inf
    if log_u < (lp_prop - lp):
        beta, lp = prop, lp_prop
        accept += 1
    # The current state is recorded after burn-in whether or not the proposal
    # was accepted.
    if t >= BURN:
        samples.append(beta.copy())
print('accept_rate:', accept / N_STEPS)

# One row per retained draw.
B = np.stack(samples, axis=0)
# Predictive probability of every test row under every retained draw, averaged
# over draws. NOTE: `p_samps` holds (test rows x retained draws) floats, which
# can be large for a big test set.
p_samps = expit(Xte @ B.T)
p_mean = p_samps.mean(axis=1)

# Signal is the posterior-mean probability centred at zero, so a positive value
# means the model favours an up move.
pred_long = pd.DataFrame({'Date': df_test.index, 'Asset_ID': df_test['Asset_ID'].to_numpy(), 'signal': p_mean - 0.5})
# Date x Asset_ID matrix; missing (date, asset) combinations get a neutral 0.0.
pred_matrix = pred_long.pivot_table(index='Date', columns='Asset_ID', values='signal', aggfunc='mean').sort_index().fillna(0.0)


In [None]:
from IPython.display import display
from bokeh.io import output_notebook, show

from backtester.data import load_cleaned_assets, align_close_prices
from backtester.engine import BacktestConfig, run_backtest
from backtester.report import compute_backtest_report
from backtester.bokeh_plots import build_interactive_portfolio_layout
from backtester.portfolio import equal_weight, optimize_mpt

output_notebook()

if 'pred_matrix' not in globals():
    raise RuntimeError('Expected `pred_matrix` (index=date, columns=Asset_ID) to exist')

# The backtest window is the date span covered by the predictions.
pred_range = pd.DatetimeIndex(pred_matrix.index).sort_values()
if pred_range.empty:
    raise RuntimeError('pred_matrix has empty index')
bt_start = pd.Timestamp(pred_range[0])
bt_end = pd.Timestamp(pred_range[-1])

# Asset labels as strings, in sorted order; used as the weight-matrix columns.
bt_assets = sorted([str(c) for c in pred_matrix.columns.tolist()])
CLEANED_DIR = PROJECT_ROOT / 'dataset' / 'cleaned'
assets_ohlcv = load_cleaned_assets(symbols=bt_assets, cleaned_dir=str(CLEANED_DIR))
close_prices = align_close_prices(assets_ohlcv)

# Daily returns are taken from the full price history up to the end of the
# backtest, *before* the prices are cut to the backtest window. Computing them
# after the cut would leave the MPT optimiser without any lookback history at
# the first rebalance dates.
# NOTE(review): assumes optimize_mpt only reads rows up to the date it is
# given -- confirm against backtester.portfolio.
returns_matrix = close_prices.loc[:bt_end].pct_change().fillna(0.0)

# Align predictions to the price calendar, then restrict both to the window.
pred_matrix = pred_matrix.reindex(close_prices.index)
close_prices = close_prices.loc[bt_start:bt_end]
pred_matrix = pred_matrix.loc[bt_start:bt_end]


def _cross_asset_panel(field: str) -> pd.DataFrame:
    """Return one column per loaded asset holding that asset's *field* series."""
    return pd.concat([frame[field] for frame in assets_ohlcv.values()], axis=1)


# Synthetic "market" series for plotting: cross-asset mean of each price field
# and cross-asset sum of volume, restricted to the backtest window.
market_df = pd.DataFrame({
    **{field: _cross_asset_panel(field).mean(axis=1) for field in ('Open', 'High', 'Low', 'Close')},
    'Volume': _cross_asset_panel('Volume').sum(axis=1),
}).sort_index().loc[bt_start:bt_end]

REBALANCE_FREQ = 'W'
# Number of top-ranked assets considered at each rebalance. This rebinds the
# TOP_K name that the feature-selection cell uses for the feature count.
TOP_K = min(20, len(bt_assets))
# Lookback length passed to the MPT optimiser.
LOOKBACK_DAYS = 126

def build_weights_from_predictions(pred_matrix: pd.DataFrame, *, pm_style: str) -> pd.DataFrame:
    """Turn a prediction matrix into a daily portfolio-weight matrix.

    Weights are recomputed on the last available date of every
    ``REBALANCE_FREQ`` period and carried forward unchanged in between. On a
    rebalance date the ``TOP_K`` highest signals are taken and only the
    strictly positive, finite ones are kept; if none remain the portfolio is
    all zeros until the next rebalance.

    Args:
        pred_matrix: signals indexed by date with one column per asset.
        pm_style: ``'1N'`` for equal weights or ``'MPT'`` for the optimiser.

    Returns:
        DataFrame indexed like *pred_matrix* with columns ``bt_assets``.

    Raises:
        ValueError: if *pm_style* is unknown; only checked once a rebalance
            date with at least one selected asset is reached.
    """
    dates_by_date = pd.Series(pred_matrix.index, index=pred_matrix.index)
    rebalance_days = set(dates_by_date.resample(REBALANCE_FREQ).last().dropna().tolist())

    def _all_cash() -> pd.Series:
        # Fresh zero-weight vector over the full asset universe.
        return pd.Series(0.0, index=bt_assets)

    current = _all_cash()
    history = []
    for day in pred_matrix.index:
        if day not in rebalance_days:
            # Between rebalances the previous weights are held.
            history.append(current)
            continue

        ranked = pred_matrix.loc[day].dropna().sort_values(ascending=False).head(TOP_K)
        picks = [asset for asset, score in ranked.items() if np.isfinite(score) and float(score) > 0.0]

        if picks:
            if pm_style == '1N':
                allocation = equal_weight(picks)
            elif pm_style == 'MPT':
                allocation = optimize_mpt(returns_matrix, picks, day, lookback_days=LOOKBACK_DAYS)
            else:
                raise ValueError(f'Unknown pm_style: {pm_style!r}')
            current = _all_cash()
            for asset, weight in allocation.items():
                current[str(asset)] = float(weight)
        else:
            current = _all_cash()
        history.append(current)

    weights = pd.DataFrame(history, index=pred_matrix.index, columns=bt_assets)
    return weights.fillna(0.0)

# Shared engine configuration for both portfolio styles.
cfg = BacktestConfig(initial_equity=1_000_000.0, transaction_cost_bps=5.0, mode='vectorized')

# Run one backtest per portfolio style; keep the full artefacts in `results`
# and a few headline numbers per style in `compare_rows`.
compare_rows = []
results = {}
for pm_style in ['1N', 'MPT']:
    w = build_weights_from_predictions(pred_matrix, pm_style=pm_style)
    res = run_backtest(close_prices, w, config=cfg)
    rpt = compute_backtest_report(result=res, close_prices=close_prices)
    results[pm_style] = (w, res, rpt)
    summary = {'style': pm_style}
    for metric in ('Total Return [%]', 'CAGR [%]', 'Sharpe', 'Max Drawdown [%]'):
        summary[metric] = float(rpt[metric])
    compare_rows.append(summary)

# Side-by-side comparison, best total return first.
compare = pd.DataFrame(compare_rows)
compare = compare.sort_values('Total Return [%]', ascending=False).reset_index(drop=True)
display(compare)

# Full report and interactive dashboard for each style, in run order.
BASE_TITLE = 'Bayes Logistic (MH, Feature Store)'
for pm_style, (w, res, rpt) in results.items():
    title = f'{BASE_TITLE} - {pm_style}'
    display(rpt.to_frame(title))
    layout = build_interactive_portfolio_layout(
        market_ohlcv=market_df,
        equity=res.equity,
        returns=res.returns,
        weights=res.weights,
        turnover=res.turnover,
        costs=res.costs,
        close_prices=close_prices,
        title=title,
    )
    show(layout)
