# Pairs Trading EDA

Exploratory data analysis for statistical arbitrage pairs trading.

**Universe:** KO, PEP, XOM, CVX, GLD, SLV  
**Period:** 2018-01-01 to 2025-01-01

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

from sarb.data.ingest import load_yfinance_prices
from sarb.features.spread import fit_hedge_ratio, compute_spread, rolling_zscore
from sarb.stats.cointegration import engle_granger_adf_pvalue, estimate_half_life
from sarb.backtest.walkforward import walkforward_pairs_backtest
from sarb.metrics.performance import sharpe, max_drawdown, cagr
from sarb.viz.charts import (
    plot_equity_curve, plot_drawdown, plot_spread,
    plot_correlation_heatmap, plot_rolling_beta
)

## 1. Load Price Data

In [None]:
# Load the "Adj Close" field for the six-ticker universe over the study period.
tickers = ["KO", "PEP", "XOM", "CVX", "GLD", "SLV"]
raw_prices = load_yfinance_prices(tickers, "2018-01-01", "2025-01-01", field="Adj Close")

# Keep only columns with a complete history. A single missing value removes
# the whole column, so report what was removed instead of losing it silently:
# the spread-analysis cell indexes "KO" and "PEP" by name.
prices = raw_prices.dropna(axis=1, how="any")
dropped = [col for col in raw_prices.columns if col not in prices.columns]
if dropped:
    print(f"Dropped tickers with missing data: {dropped}")

print(f"Shape: {prices.shape}")
prices.tail()

## 2. Normalized Price Series

In [None]:
# Rebase every column to 100 at the first row so the series share one scale.
first_row = prices.iloc[0]
normalized = prices.div(first_row).mul(100)

# Draw all rebased series on a single axis.
ax = normalized.plot(title="Normalized Prices (base=100)", figsize=(12, 5))
ax.grid(True, alpha=0.3)
ax.set_ylabel("Price")
# Kept as the final expression so the cell output is unchanged.
ax.legend(loc="upper left")

## 3. Return Correlation Heatmap

In [None]:
# Simple period-over-period returns; rows containing any NaN (including the
# first row, which has no prior price) are discarded.
pct_moves = prices.pct_change()
returns = pct_moves.dropna()

# Heatmap of the return correlations; the figure is the cell's output.
fig = plot_correlation_heatmap(returns, title="Daily Return Correlations")
fig

## 4. Spread Analysis (KO/PEP)

In [None]:
# Pair under study; the printed relation below expresses y_tkr in terms of x_tkr.
y_tkr, x_tkr = "KO", "PEP"

# Training window = the first 70% of rows; the remainder is held out.
n_train = int(0.7 * len(prices))
train_px = prices.head(n_train)

# Estimate the hedge-ratio coefficients on the training window only.
y_train, x_train = train_px[y_tkr], train_px[x_tkr]
alpha, beta = fit_hedge_ratio(y_train, x_train)
print(f"Hedge ratio: {y_tkr} = {alpha:.4f} + {beta:.4f} * {x_tkr}")

# Apply the training-window coefficients to the full price history.
spread = compute_spread(prices[y_tkr], prices[x_tkr], alpha, beta)
# rolling_zscore is called with 60 (presumably the window length; confirm in sarb.features.spread).
z = rolling_zscore(spread, 60)

# Plot the spread and its z-score with the entry/exit levels passed through.
spread_title = f"{y_tkr}/{x_tkr} Spread & Z-Score"
fig = plot_spread(spread, z, entry_z=2.0, exit_z=0.5, title=spread_title)
fig

## 5. Cointegration Diagnostics

In [None]:
import itertools

# Return correlations are measured on the training window so that every
# column of the table comes from the same sample as the hedge ratio, ADF
# p-value and half-life. Using full-sample returns would mix held-out data
# into the diagnostics. Computed once here rather than per pair.
train_corr = train_px.pct_change().dropna().corr()

# One row of diagnostics per unordered pair of columns in `prices`.
results = []
for y, x in itertools.combinations(prices.columns, 2):
    # Coefficients and spread are both estimated on the training window.
    a, b = fit_hedge_ratio(train_px[y], train_px[x])
    sp = compute_spread(train_px[y], train_px[x], a, b)
    adf_p = engle_granger_adf_pvalue(sp)
    hl = estimate_half_life(sp)
    corr = train_corr.loc[y, x]
    results.append({"Y": y, "X": x, "beta": b, "corr": corr,
                    "ADF_p": adf_p, "half_life": hl})

# Smallest ADF p-value first; the styled table is the cell's output.
diag = pd.DataFrame(results).sort_values("ADF_p")
diag.style.format({"beta": "{:.3f}", "corr": "{:.3f}",
                    "ADF_p": "{:.4f}", "half_life": "{:.1f}"})

## 6. Walk-Forward Backtest

In [None]:
# Run the walk-forward backtest on the two legs of the pair only.
pair_px = prices[[y_tkr, x_tkr]].dropna()
bt = walkforward_pairs_backtest(
    prices=pair_px,
    y=y_tkr,
    x=x_tkr,
    train_lookback=504,
    z_lookback=60,
    entry_z=2.0,
    exit_z=0.5,
    fee_bps=1.0,
    slippage_bps=0.5,
)

# Test window = rows from position n_train onward.
# NOTE(review): this positional split assumes `bt` is row-aligned with `prices`; confirm.
test_bt = bt.iloc[n_train:]
r, eq = test_bt["ret_net"], test_bt["equity"]

# Report each metric as soon as it is computed.
print(f"=== {y_tkr}/{x_tkr} Test Window ===")
test_sharpe = sharpe(r)
print(f"Sharpe: {test_sharpe:.2f}")
test_mdd = max_drawdown(eq)
print(f"Max Drawdown: {test_mdd:.2%}")
test_cagr = cagr(eq)
print(f"CAGR: {test_cagr:.2%}")

In [None]:
# Equity curve restricted to the test-window rows of the backtest.
equity_title = f"{y_tkr}/{x_tkr} Equity (Test)"
fig = plot_equity_curve(test_bt["equity"], title=equity_title)
fig

In [None]:
# Drawdown chart built from the same test-window equity series.
drawdown_title = f"{y_tkr}/{x_tkr} Drawdown (Test)"
fig = plot_drawdown(test_bt["equity"], title=drawdown_title)
fig

In [None]:
# Hedge ratio over the whole backtest, with rows lacking a beta value removed.
beta_path = bt["beta"].dropna()
fig = plot_rolling_beta(beta_path, title=f"{y_tkr}/{x_tkr} Rolling Hedge Ratio")
fig

## Summary

This notebook demonstrated the core pipeline:
1. **Data loading** from Yahoo Finance
2. **Correlation analysis** to identify potential pairs
3. **Spread modeling** with OLS hedge ratio and z-score normalization
4. **Cointegration testing** (ADF p-values + half-life estimation)
5. **Walk-forward backtesting** with daily parameter refit

For production research, use `scripts/scan_pairs.py` to scan a larger universe with FDR-controlled selection,
and `scripts/run_walkforward_portfolio.py` for multi-pair portfolio construction.