# Bayesian Rolling Regression (Pairs Trading Hedge Ratio)

We estimate a time-varying hedge ratio between two assets using Bayesian
linear regression in a rolling window. The conjugate Normal-Inverse-Gamma
posterior provides both a point estimate and uncertainty for the slope.

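Trading rule (simple):
- compute the spread $s_t = y_t - \hat{\beta}_t x_t$, where $y_t$ and $x_t$ are log prices and $\hat{\beta}_t$ is the posterior-mean slope estimated from the rolling window ending just before $t$
- z-score the spread over a rolling window of the same length
- when $z_t$ exceeds the entry threshold, short $y$ and go long $x$; when $z_t$ falls below the negative threshold, go long $y$ and short $x$; otherwise stay flat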


# Chapter 10 Bayesian ML (Predictive): Bayesian Rolling Regression

These notebooks mirror the *methods* highlighted in
`ml_finance_thoery/machine-learning-for-trading/10_bayesian_machine_learning/README.md`
and apply them to the local `dataset/cleaned/` asset universe to produce
out-of-sample predictions and a backtest using the same **vectorized** engine
used by the `notebooks/ML_Linear_Models_*` notebooks.


In [1]:
from __future__ import annotations

import math
from pathlib import Path
import sys

import numpy as np
import pandas as pd

# Fixed seed and NumPy generator so that any random draws are reproducible.
# NOTE(review): `rng` is not referenced in the cells visible here — confirm it
# is used elsewhere before relying on it.
SEED = 42
rng = np.random.default_rng(SEED)

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upward from ``start``.

    A directory counts as the root when it contains both a ``src`` and a
    ``dataset`` entry. The resolved ``start`` and up to nine of its ancestors
    (ten candidates in total) are inspected, nearest first.

    Args:
        start: Path at which the upward search begins.

    Returns:
        The first candidate directory containing both markers.

    Raises:
        RuntimeError: If none of the ten candidates contains both markers.
    """
    candidate = start.resolve()
    attempts_left = 10
    while attempts_left > 0:
        has_src = (candidate / 'src').exists()
        if has_src and (candidate / 'dataset').exists():
            return candidate
        # Step one level up; at the filesystem root `.parent` is the root itself.
        candidate = candidate.parent
        attempts_left -= 1
    raise RuntimeError(f'Could not find project root from: {start!s}')

# Resolve the repository layout relative to the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())
CLEANED_DIR = PROJECT_ROOT.joinpath('dataset', 'cleaned')

# `backtester` lives under `src/`; register that directory on sys.path once so
# re-running the cell does not add duplicate entries.
src_dir = PROJECT_ROOT.joinpath('src')
src_entry = str(src_dir)
if src_entry not in sys.path:
    sys.path.append(src_entry)


In [2]:
from backtester.data import load_cleaned_assets, align_close_prices

# Hard-coded pair so that every run works on the same two assets.
PAIR = ('Asset_001', 'Asset_002')
assets_ohlcv = load_cleaned_assets(symbols=list(PAIR), cleaned_dir=str(CLEANED_DIR))
close_prices = align_close_prices(assets_ohlcv).sort_index()

# Log close prices for (y, x) = (PAIR[0], PAIR[1]). Zero closes are turned
# into NaN before the log and the resulting gaps are dropped.
y, x = (
    np.log(close_prices[symbol].replace(0.0, np.nan)).dropna()
    for symbol in PAIR
)

# Restrict both series to the dates on which each has a valid log price.
idx = y.index.intersection(x.index)
y, x = (series.loc[idx].astype(float) for series in (y, x))

# Time split: segment lengths, counted backwards from the last common date
# (test is the most recent segment, preceded by validation, then training).
# NOTE(review): in the cells visible here only the test boundary is consumed;
# the train/validation lengths feed values that are not used afterwards.
TRAIN_YEARS = 7
VAL_MONTHS = 18
TEST_MONTHS = 18

def align_to_trading_date(index: pd.DatetimeIndex, ts: pd.Timestamp) -> pd.Timestamp:
    """Snap ``ts`` to a date that is present in ``index``.

    Args:
        index: Dates to snap onto; looked up with ``searchsorted``, so it is
            expected to be sorted ascending.
        ts: Target timestamp.

    Returns:
        The first entry of ``index`` that is on or after ``ts``; when ``ts``
        lies beyond the last entry, the last entry is returned instead.
    """
    insertion_point = int(index.searchsorted(ts, side='left'))
    # Clamp to the final position so a target past the end maps to the last date.
    last_position = len(index) - 1
    return pd.Timestamp(index[min(insertion_point, last_position)])

# Split boundaries are derived by stepping backwards from the last common date.
# The offsets are applied one after another (test, then validation, then train).
end = pd.Timestamp(idx[-1])
raw_test_start = end - pd.DateOffset(months=TEST_MONTHS)
raw_val_start = raw_test_start - pd.DateOffset(months=VAL_MONTHS)
raw_train_start = raw_val_start - pd.DateOffset(years=TRAIN_YEARS)
# NOTE(review): raw_val_start and raw_train_start are not used in the cells
# visible here; only the test boundary below is consumed.
# Snap the calendar boundary to the first date in `idx` on or after it.
test_start = align_to_trading_date(idx, pd.Timestamp(raw_test_start))

# Conjugate prior for slope-only regression y = beta x + eps (no intercept for simplicity).
# beta0 / V0 parameterise the Normal prior on the slope; alpha0 / beta_ig0 are
# the starting values of the Inverse-Gamma parameters updated in the rolling loop.
beta0 = 1.0  # prior mean of the slope
V0 = 1.0  # prior variance on beta
# NOTE(review): the posterior update combines 1/V0 directly with sum(x^2), so V0
# acts as a variance scale relative to the noise variance — confirm this is intended.
alpha0 = 2.0  # Inverse-Gamma shape before seeing data
beta_ig0 = 1.0  # Inverse-Gamma scale before seeing data

WINDOW = 126  # number of observations in the regression window and in the z-score window
Z_ENTRY = 1.5  # |z| level beyond which a position is opened

# Rolling conjugate (Normal-Inverse-Gamma) update for y = beta * x + eps.
#
# For every date t the model is fitted on the WINDOW observations strictly
# before t (the observation at t is excluded from the fit), and the resulting
# posterior-mean slope is used to form the spread at t.
#
# Outputs (all indexed by `idx`):
#   beta_hat : posterior mean of the slope
#   beta_sd  : posterior standard deviation of the slope
#   spread   : y_t - beta_hat_t * x_t
# Rows with fewer than MIN_OBS observations in the window are NaN.
MIN_OBS = 20  # minimum window size before an estimate is produced

x_values = x.to_numpy(dtype=float)
y_values = y.to_numpy(dtype=float)
n_obs = len(idx)

# Fill plain arrays inside the loop and build the Series once afterwards;
# per-element `.loc` assignment on a Series is far slower.
beta_path = np.full(n_obs, np.nan)
beta_sd_path = np.full(n_obs, np.nan)

for t in range(n_obs):
    start = max(0, t - WINDOW)
    xs = x_values[start:t]
    ys = y_values[start:t]
    if xs.size < MIN_OBS:
        continue

    XTX = float(np.sum(xs * xs))
    XTy = float(np.sum(xs * ys))

    # Normal part of the posterior: precision adds up, mean is precision-weighted.
    Vn = 1.0 / (1.0 / V0 + XTX)
    beta_n = Vn * ((beta0 / V0) + XTy)

    # Inverse-Gamma part. The scale update needs the residual sum of squares
    # at the posterior mean PLUS the penalty for moving away from the prior
    # mean; together they equal y'y + beta0^2/V0 - beta_n^2/Vn.
    rss = float(np.sum((ys - beta_n * xs) ** 2))
    alpha_n = alpha0 + 0.5 * xs.size
    beta_ig_n = beta_ig0 + 0.5 * (rss + (beta_n - beta0) ** 2 / V0)

    beta_path[t] = beta_n
    # Marginally the slope is Student-t with 2 * alpha_n degrees of freedom;
    # its variance is beta_ig_n / (alpha_n - 1) * Vn, defined for alpha_n > 1.
    if alpha_n > 1.0:
        beta_sd_path[t] = float(np.sqrt(beta_ig_n / (alpha_n - 1.0) * Vn))

beta_hat = pd.Series(beta_path, index=idx, dtype=float)
beta_sd = pd.Series(beta_sd_path, index=idx, dtype=float)
spread = pd.Series(y_values - beta_path * x_values, index=idx, dtype=float)

# Z-score of the spread against its own trailing mean and standard deviation
# over WINDOW observations (the current observation is part of the window).
spread_window = spread.rolling(WINDOW)
z = (spread - spread_window.mean()) / spread_window.std()

# Build +/-1 leg signals on the test period only:
#   z >  Z_ENTRY -> short y (PAIR[0]), long x (PAIR[1])
#   z < -Z_ENTRY -> long y, short x
#   otherwise, or when z is NaN/inf -> flat (0.0 on both legs)
# NOTE(review): z at a date uses that date's close; confirm that the backtest
# engine applies a row of weights to the following bar, otherwise this looks ahead.
test_index = idx[idx >= test_start]
z_test = z.loc[test_index].to_numpy(dtype=float)
with np.errstate(invalid='ignore'):
    tradable = np.isfinite(z_test)
    short_spread = tradable & (z_test > Z_ENTRY)
    long_spread = tradable & (z_test < -Z_ENTRY)

pred_matrix = pd.DataFrame(0.0, index=test_index, columns=list(PAIR), dtype=float)
# The long-spread rows are written first so that, should both masks ever be
# true for a row, the short-spread branch takes precedence.
pred_matrix.loc[long_spread, PAIR[0]] = +1.0
pred_matrix.loc[long_spread, PAIR[1]] = -1.0
pred_matrix.loc[short_spread, PAIR[0]] = -1.0
pred_matrix.loc[short_spread, PAIR[1]] = +1.0

# `pred_matrix` holds signed leg signals (+1 long leg, -1 short leg), not a ranking.
# The shared weight builder would keep only the positive entries and drop the
# short leg, so to preserve the long-short structure the backtest is run
# directly on these signals in the next cell.


In [3]:
# Backtest the long-short directly using the engine (skip top-K selection).
from IPython.display import display
from bokeh.io import output_notebook, show

from backtester.engine import BacktestConfig, run_backtest
from backtester.report import compute_backtest_report
from backtester.bokeh_plots import build_interactive_portfolio_layout

output_notebook()

cfg = BacktestConfig(
    initial_equity=1_000_000.0,
    transaction_cost_bps=5.0,
    mode='vectorized',
    allow_leverage=False,
)

# Scale every row so that the absolute weights sum to 1.0. Flat rows have zero
# gross exposure; their divisor is set to NaN and the result is reset to 0.0.
w = pred_matrix.fillna(0.0)
gross = w.abs().sum(axis=1).replace(0.0, np.nan)
w = w.div(gross, axis=0).fillna(0.0)

# Price panel restricted to the weight dates and the two traded symbols.
close_bt = close_prices.loc[w.index, list(PAIR)]

res = run_backtest(close_bt, w, config=cfg)
rpt = compute_backtest_report(result=res, close_prices=close_bt)
report_frame = rpt.to_frame('Bayes Rolling Regression (Pairs) - Long/Short')
display(report_frame)

def _combine_assets(field: str) -> pd.DataFrame:
    """Place one OHLCV field from every loaded asset side by side, one column per asset."""
    return pd.concat([frame[field] for frame in assets_ohlcv.values()], axis=1)


# Synthetic "market" bars for the plot: price fields are the cross-asset mean,
# volume is the cross-asset sum; rows are limited to the backtest dates.
market_columns = {
    field: _combine_assets(field).mean(axis=1)
    for field in ('Open', 'High', 'Low', 'Close')
}
market_columns['Volume'] = _combine_assets('Volume').sum(axis=1)
market_df = pd.DataFrame(market_columns).sort_index().loc[w.index]

plot_title = 'Bayes Rolling Regression (Pairs) - Long/Short'
layout = build_interactive_portfolio_layout(
    market_ohlcv=market_df,
    equity=res.equity,
    returns=res.returns,
    weights=res.weights,
    turnover=res.turnover,
    costs=res.costs,
    close_prices=close_bt,
    title=plot_title,
)
show(layout)


Unnamed: 0,Bayes Rolling Regression (Pairs) - Long/Short
Start,2024-07-16 00:00:00
End,2026-01-16 00:00:00
Duration,549 days 00:00:00
Initial Equity,1000000.0
Final Equity,855721.956584
Equity Peak,1016202.008924
Total Return [%],-14.427804
CAGR [%],-9.86604
Volatility (ann) [%],9.108308
Sharpe,-1.091691
