# Setup & Data
This notebook downloads data, aligns calendars, saves cleaned files, and writes a lightweight helper module that other notebooks will import.


## Step 1 — Settings & Folders 

In [21]:
from pathlib import Path
import pandas as pd
import numpy as np

# --- Project layout: the notebook is expected to run from <root>/notebooks ---
NB_DIR = Path.cwd()
ROOT = NB_DIR.parent
DATA = ROOT.joinpath("data")
RAW = DATA.joinpath("raw")          # untouched downloads
INTERIM = DATA.joinpath("interim")  # cleaned / aligned files

# Create the data folders on first run; re-running is harmless.
for d in (RAW, INTERIM):
    d.mkdir(parents=True, exist_ok=True)

# --- Universe & dates (edit as you like) ---
# liquid ETFs/stocks
UNIVERSE = [
    "AAPL", "PG", "XLE",
    "XLK", "XLP", "MSFT",
    "NVDA", "GOOGL", "META",
    "JPM", "GS", "AMZN",
    "TSLA", "KO", "PEP",
]
START_DATE = "2016-01-01"
END_DATE = None  # None = up to the latest available date
PRICE_FIELD = "Adj Close"

# --- Pair selection defaults (later notebooks may override) ---
ADF_ALPHA = 0.05
ROLL_WINDOW_Z = 60
MIN_OVERLAP_DAYS = 500
MAX_PAIRS = 20

# --- Signal thresholds (in z-score units) and time stop ---
Z_OPEN = 2.0
Z_CLOSE = 0.5
Z_STOP = 3.5
MAX_HOLD_DAYS = 20

# --- Risk sizing ---
TARGET_ANNUAL_VOL = 0.10
EWMA_SPAN_VOL = 20
MAX_PAIR_WEIGHT = 0.10

# --- Trading frictions, in basis points ---
COMMISSION_BPS = 1.0
SLIPPAGE_BPS = 1.0


## Step 2 — Download & Align

In [22]:
import yfinance as yf

def download_prices(tickers, start, end=None, field="Adj Close"):
    """Download data with ``yf.download`` and return a single price field.

    Parameters
    ----------
    tickers : forwarded unchanged to ``yf.download``.
    start, end : date bounds forwarded to ``yf.download`` (``end=None`` is passed through).
    field : key of the column group to extract from the download.

    Returns
    -------
    A copy of ``data[field]`` with rows removed where every value is NaN.

    Raises
    ------
    ValueError
        If ``field`` is not among the downloaded columns.
    """
    raw = yf.download(
        tickers,
        start=start,
        end=end,
        # NOTE(review): presumably keeps the separate "Adj Close" column - confirm
        # against the installed yfinance version.
        auto_adjust=False,
        progress=False,
    )
    if field in raw:
        return raw[field].copy().dropna(how="all")
    raise ValueError(f"{field} not in Yahoo columns: {list(raw.columns)}")

def align_calendar(prices: pd.DataFrame, min_non_nan: int = 0) -> pd.DataFrame:
    """Sort the price panel by index and drop thin columns and empty rows.

    Parameters
    ----------
    prices : one column per ticker.
    min_non_nan : when positive, keep only columns with at least this many
        non-NaN values; ``0`` (default) keeps every column.

    Returns
    -------
    The sorted frame, restricted to the kept columns, without rows in which
    every remaining column is NaN.
    """
    ordered = prices.sort_index()
    if min_non_nan > 0:
        counts = ordered.notna().sum()
        enough_history = counts.loc[lambda c: c >= min_non_nan].index
        ordered = ordered[enough_history]
    # Dropping columns can leave rows that are now entirely NaN, so do this last.
    return ordered.dropna(how="all")

# Fetch the configured universe and keep an untouched copy of the download on disk.
prices_raw = download_prices(tickers=UNIVERSE, start=START_DATE, end=END_DATE, field=PRICE_FIELD)
prices_raw.to_parquet(RAW.joinpath("prices_raw.parquet"))

# Clean the panel (sorted index, tickers with >= 500 observations, no empty rows)
# and store it for the downstream notebooks.
prices = align_calendar(prices=prices_raw, min_non_nan=500)
prices.to_parquet(INTERIM.joinpath("prices_interim.parquet"))

# Last expression of the cell: rendered as the cell output.
prices.tail()


Ticker,AAPL,AMZN,GOOGL,GS,JPM,KO,META,MSFT,NVDA,PEP,PG,TSLA,XLE,XLK,XLP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2025-10-29,269.700012,230.300003,274.570007,783.059998,305.51001,68.349998,751.669983,541.549988,207.039993,146.160004,148.770004,461.51001,88.010002,304.130005,76.580002
2025-10-30,271.399994,222.860001,281.480011,790.159973,309.440002,68.980003,666.469971,525.76001,202.889999,147.550003,149.580002,440.100006,87.489998,300.390015,76.5
2025-10-31,270.369995,244.220001,281.190002,789.369995,311.119995,68.900002,648.349976,517.809998,202.490005,146.089996,150.369995,456.559998,88.129997,300.679993,76.279999
2025-11-03,269.049988,254.0,283.720001,785.52002,309.350006,67.970001,637.710022,517.030029,206.880005,143.600006,148.020004,468.369995,88.139999,301.910004,75.68
2025-11-05,270.035004,249.100006,283.36499,792.98999,312.049988,68.324997,639.70282,508.007812,201.437195,142.320007,146.389999,465.394989,87.714996,297.299988,76.0215


## Step 3 — Write lightweight helpers.py

In [23]:
HELPERS = NB_DIR / "helpers.py"
HELPERS.write_text(r'''
from __future__ import annotations
import numpy as np, pandas as pd
from dataclasses import dataclass
from statsmodels.tsa.stattools import adfuller

# ---------- Stats ----------
def ols_beta(y: pd.Series, x: pd.Series):
    """Fit ``y = a + b * x`` by ordinary least squares.

    The two series are joined on their common index labels and rows with a
    NaN in either series are discarded before fitting.

    Returns
    -------
    (a, b) : intercept and slope as plain floats.
    """
    paired = pd.concat([y, x], axis=1, join="inner").dropna()
    target = paired.iloc[:, 0].values
    regressor = paired.iloc[:, 1].values
    # Design matrix: a column of ones (intercept) next to the regressor.
    design = np.column_stack([np.ones_like(regressor), regressor])
    coeffs, *_ = np.linalg.lstsq(design, target, rcond=None)
    intercept, slope = coeffs
    return float(intercept), float(slope)

def spread(y: pd.Series, x: pd.Series, beta: float, alpha: float=0.0):
    """Return the residual ``y - (alpha + beta * x)`` with NaN entries removed."""
    fitted = alpha + beta * x
    residual = y - fitted
    return residual.dropna()

def adf_test(series: pd.Series):
    """Run ``adfuller`` (lag order chosen with autolag="AIC") on a series.

    NaN values are dropped and the data cast to float before the test.

    Returns
    -------
    (stat, pval) : the first two items of the ``adfuller`` result, as floats.
    """
    clean = series.dropna().astype(float)
    result = adfuller(clean, autolag="AIC")
    return float(result[0]), float(result[1])

def engle_granger(y: pd.Series, x: pd.Series):
    """Two-step residual test: OLS fit of y on x, then ADF on the residuals.

    Returns
    -------
    (alpha, beta, adf_stat, p_value, residuals)

    NOTE(review): the p-value is whatever ``adf_test`` reports for the residual
    series; it is not adjusted for alpha/beta having been estimated first.
    statsmodels also provides ``coint`` for cointegration testing - worth
    confirming which p-value the pair-selection step should rely on.
    """
    intercept, slope = ols_beta(y, x)
    residuals = spread(y, x, beta=slope, alpha=intercept)
    adf_stat, p_value = adf_test(residuals)
    return intercept, slope, adf_stat, p_value, residuals

def half_life(series: pd.Series):
    """Estimate the mean-reversion half-life of a series, in observations.

    Fits ``diff(s)[t] = c + phi * s[t-1]`` by least squares and converts the
    slope with ``kappa = -log(1 + phi)``, ``half_life = log(2) / kappa``.

    Returns
    -------
    float, or None when no finite positive half-life can be derived:
      * fewer than 30 non-NaN observations,
      * ``phi >= 0`` (no pull back toward the mean, so kappa <= 0),
      * ``phi <= -1`` or a non-finite ``phi`` (the logarithm is undefined or
        infinite there; NumPy returns nan/inf instead of raising, so this
        must be checked explicitly rather than caught as an exception).
    """
    s = series.dropna().astype(float)
    if len(s) < 30:
        return None

    lagged = s.shift(1)
    delta = s.diff()
    # Keep only rows where both the lagged level and the change are defined
    # (this drops the first observation).
    usable = lagged.notna() & delta.notna()
    lagged, delta = lagged[usable], delta[usable]

    design = np.column_stack([np.ones(len(lagged)), lagged.values])
    phi = float(np.linalg.lstsq(design, delta.values, rcond=None)[0][1])

    if not np.isfinite(phi) or phi <= -1.0 or phi >= 0.0:
        return None
    kappa = -np.log1p(phi)
    return float(np.log(2.0) / kappa)

# ---------- Signals & Risk ----------
def rolling_zscore(spread: pd.Series, window: int):
    """Z-score of each value against the rolling mean and sample standard
    deviation (ddof=1) of the trailing ``window`` observations.

    The first ``window - 1`` entries are NaN because the window is not yet full.
    """
    roller = spread.rolling(window)
    centered = spread - roller.mean()
    return centered / roller.std(ddof=1)

def ewma_vol(series: pd.Series, span: int):
    """Exponentially weighted standard deviation of first differences,
    scaled by sqrt(252).

    Note that the input is differenced (``series.diff()``), not converted to
    percentage returns, so the result is in the units of ``series``.
    """
    annualizer = np.sqrt(252.0)
    changes = series.diff()
    per_step_sd = changes.ewm(span=span, adjust=False).std(bias=False)
    return per_step_sd * annualizer

def target_size(spread_vol: pd.Series, target_ann_vol: float, max_weight: float):
    """Volatility-targeted weight: ``target_ann_vol / spread_vol`` capped at
    ``max_weight``.

    Zero volatility is treated as missing, and every missing value (including
    NaN volatility) yields a weight of 0.0.
    """
    usable_vol = spread_vol.replace(0.0, np.nan)  # avoid dividing by zero
    raw_weight = target_ann_vol / usable_vol
    capped = raw_weight.clip(upper=max_weight)
    return capped.fillna(0.0)

def apply_costs(turnover: pd.Series, commission_bps: float, slippage_bps: float):
    """Linear trading cost: (commission + slippage) in basis points applied to
    absolute turnover. The result is non-positive (a drag on PnL)."""
    cost_rate = (commission_bps + slippage_bps) / 10000.0  # bps -> fraction
    return turnover.abs().mul(-cost_rate)

# ---------- Backtester ----------
@dataclass
class PairParams:
    """Parameter bundle consumed by ``backtest_pair``.

    Field order is the positional constructor order; do not reorder.
    """
    # --- signal thresholds, in rolling z-score units ---
    z_open: float           # enter when z >= z_open (short) or z <= -z_open (long)
    z_close: float          # exit when |z| <= z_close
    z_stop: float           # exit when |z| >= z_stop
    max_hold_days: int      # exit once the calendar-day age of the trade reaches this
    roll_window_z: int      # window passed to rolling_zscore
    # --- risk sizing ---
    ewma_span_vol: int      # span passed to ewma_vol
    target_ann_vol: float   # numerator of the volatility-target weight
    max_pair_weight: float  # upper cap on the weight
    # --- trading frictions, in basis points ---
    commission_bps: float
    slippage_bps: float

def backtest_pair(y: pd.Series, x: pd.Series, alpha: float, beta: float, p: PairParams):
    """Simulate z-score mean-reversion trading of the spread y - (alpha + beta*x).

    Parameters
    ----------
    y, x : price series; rows where either is missing are dropped. The index
        must support ``(later - earlier).days`` (e.g. a DatetimeIndex) because
        the time stop is measured in calendar days.
    alpha, beta : hedge relation used to build the spread.
    p : thresholds, sizing and cost parameters.

    Rules (evaluated bar by bar; the position decided at bar t earns the
    spread change of bar t+1, so there is no look-ahead):
      * flat  -> short (-1) if z >= z_open, long (+1) if z <= -z_open,
                 but only while |z| < z_stop;
      * open  -> flat if |z| <= z_close, |z| >= z_stop, or the trade is at
                 least max_hold_days calendar days old; otherwise hold.

    Entries are refused while |z| >= z_stop: without that guard a trade that
    was just stopped out would be re-opened on the next bar (z is still beyond
    z_open) and stopped again the bar after, flipping the position and paying
    costs for as long as the spread stays beyond the stop.

    Returns
    -------
    DataFrame indexed like the spread with columns
    spread, z, pos, size, pnl, equity, turnover, costs.
    """
    df = pd.concat({"y": y, "x": x}, axis=1).dropna()
    S = df["y"] - (alpha + beta * df["x"])
    Z = rolling_zscore(S, p.roll_window_z)

    # Work on plain arrays inside the loop; per-element Series assignment is slow.
    z_vals = Z.to_numpy()
    dates = S.index
    held = np.zeros(len(S))
    entry_date = None
    for t in range(1, len(S)):
        date = dates[t]
        z_t = z_vals[t]  # NaN during the warm-up window: every comparison is False
        prev = held[t - 1]

        if prev != 0:
            exit_sig = (abs(z_t) <= p.z_close) or (abs(z_t) >= p.z_stop)
            time_stop = (
                entry_date is not None
                and (date - entry_date).days >= p.max_hold_days
            )
            if exit_sig or time_stop:
                entry_date = None      # held[t] stays 0.0 -> flat
            else:
                held[t] = prev         # carry the open trade
            continue

        # Flat: do not open a trade that would already be beyond the stop level.
        if abs(z_t) >= p.z_stop:
            continue
        if z_t >= p.z_open:
            held[t] = -1.0             # spread rich -> short the spread
            entry_date = date
        elif z_t <= -p.z_open:
            held[t] = +1.0             # spread cheap -> long the spread
            entry_date = date

    pos = pd.Series(held, index=S.index)

    vol = ewma_vol(S, p.ewma_span_vol)
    size = target_size(vol, p.target_ann_vol, p.max_pair_weight)
    dS = S.diff().fillna(0.0)

    # Yesterday's position and size earn today's spread change.
    gross = (pos.shift(1).fillna(0.0) * size.shift(1).fillna(0.0)) * dS
    turnover = pos.diff().abs().fillna(0.0) * size.shift(1).fillna(0.0)
    costs = apply_costs(turnover, p.commission_bps, p.slippage_bps)
    pnl = gross + costs
    equity = (1.0 + pnl).cumprod()

    return pd.DataFrame({"spread":S, "z":Z, "pos":pos, "size":size,
                         "pnl":pnl, "equity":equity, "turnover":turnover, "costs":costs})

def perf_stats(pnl: pd.Series):
    """Summarise a series of periodic returns (252 periods per year assumed).

    Returns
    -------
    dict with float values:
      ann_return   : mean * 252
      ann_vol      : sample std (ddof=1) * sqrt(252)
      sharpe       : ann_return / ann_vol, NaN when ann_vol is not positive
      max_drawdown : most negative value of equity / running peak - 1, where
                     equity is the compounded curve starting from 1.0

    The running peak is floored at the starting capital (1.0). Without the
    floor a strategy that loses from the first period would report a smaller
    drawdown than it actually suffered, because the first equity value would
    be taken as the initial peak.
    """
    r = pnl.dropna()
    ann_ret = r.mean() * 252.0
    ann_vol = r.std(ddof=1) * np.sqrt(252.0)
    sharpe = ann_ret / ann_vol if ann_vol > 0 else np.nan

    eq = (1.0 + r).cumprod()
    peak = eq.cummax().clip(lower=1.0)  # starting capital counts as a peak
    dd = (eq / peak - 1.0).min()
    return {"ann_return":float(ann_ret),"ann_vol":float(ann_vol),"sharpe":float(sharpe),"max_drawdown":float(dd)}
''')

print(f"Wrote helpers.py at: {HELPERS}")


Wrote helpers.py at: C:\Users\Ruben\Desktop\Projects\PairsTrading\notebooks\helpers.py


# Appendix — `helpers.py` (Theory, Math, and Full Explanation)

> This section documents every function inside `helpers.py` — what it does, the mathematical foundations behind it, and how they connect to form a full market-neutral pairs-trading pipeline.

---

## 0) Overview and Notation

We work with two price series (e.g., ETFs or stocks): $Y_t$ and $X_t$.  
We assume a long-term equilibrium relationship of the form:

$$
Y_t = \alpha + \beta X_t + \varepsilon_t
$$

Here:
- $\alpha$ — constant intercept (offset),
- $\beta$ — slope coefficient (sensitivity of $Y$ to $X$),
- $\varepsilon_t$ — residual, or **spread**.

If the spread $S_t = \varepsilon_t$ is **stationary** (i.e., mean-reverting), then $Y_t$ and $X_t$ are said to be **cointegrated**.  
Pairs trading strategies rely on this mean-reverting property to open and close positions.

---

## 1) Statistical Functions (Estimation & Cointegration)

### 1.1 `ols_beta(y, x)`

**Purpose:**  
Estimate $\alpha$ and $\beta$ using Ordinary Least Squares (OLS) regression.

**Mathematics:**  
We solve the minimization problem:

$$
\min_{\alpha,\beta} \sum_t \left(Y_t - (\alpha + \beta X_t)\right)^2
$$

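The analytical solution, where $X$ in the formula below denotes the design matrix (a column of ones next to the $X_t$ values) and $Y$ the vector of $Y_t$ values, is: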

$$
\begin{bmatrix}
\hat{\alpha} \\[4pt]
\hat{\beta}
\end{bmatrix}
=
(X'X)^{-1}X'Y
$$

**Implementation logic:**
1. Align `y` and `x` by date (`join="inner"`) and drop `NaN`s.  
2. Build a design matrix with a constant column:  
   `X_ = np.column_stack([np.ones_like(X), X])`  
3. Use NumPy’s least squares solver:  
   `a, b = np.linalg.lstsq(X_, Y, rcond=None)[0]`.

**Interpretation:**  
- $\beta$ ≈ how much $Y$ changes when $X$ changes by 1 unit.  
- $\alpha$ is the average offset between $Y$ and $\beta X$.

---

### 1.2 `spread(y, x, beta, alpha=0.0)`

**Purpose:**  
Compute the **spread** between $Y$ and $X$:

$$
S_t = Y_t - (\alpha + \beta X_t)
$$

**Interpretation:**  
This residual series represents the deviation of $Y$ from its equilibrium level predicted by $X$.  
It is the **core object** we test and trade.

---

### 1.3 `adf_test(series)`

**Purpose:**  
Perform the **Augmented Dickey-Fuller (ADF)** test to check whether a time series has a **unit root** (non-stationary).

**Hypotheses:**
- $H_0$: The series has a unit root (non-stationary).  
- $H_1$: The series is stationary (mean-reverting).

**Decision rule:**
- If `p ≤ 0.05` → reject $H_0$ → the spread is stationary.  
- If `p > 0.05` → insufficient evidence of stationarity.

**Output:**  
ADF statistic and p-value `(stat, pval)`.

---

### 1.4 `engle_granger(y, x)`

**Purpose:**  
Implement the **two-step Engle-Granger cointegration test**.

1. Estimate $\alpha$ and $\beta$ with OLS.
2. Compute residuals (spread).
3. Apply ADF to residuals.

**Output:**  
`(alpha, beta, adf_stat, p_value, residual_series)`

**Interpretation:**  
If the residuals are stationary, the pair $(Y, X)$ is **cointegrated** — meaning they move together in the long run.

---

### 1.5 `half_life(series)`

**Purpose:**  
Estimate how quickly deviations of the spread revert to their mean (mean-reversion speed).

**Mathematics:**
Assume the spread follows an AR(1) model (the code fits it with an intercept $c$):

$$
\Delta S_t = c + \phi S_{t-1} + \varepsilon_t
$$

Approximate the continuous-time **Ornstein-Uhlenbeck (OU)** process:

$$
dS_t = \kappa(\mu - S_t)dt + \sigma dW_t
$$

We then estimate:

$$
\kappa \approx -\ln(1 + \phi), \qquad
t_{1/2} = \frac{\ln 2}{\kappa}
$$

**Interpretation:**  
$t_{1/2}$ = expected time for a deviation to decay by half.  
- Small $t_{1/2}$ → fast mean reversion (good).  
- Large or undefined $t_{1/2}$ → slow or nonexistent reversion.

---

## 2) Signal and Risk Functions

### 2.1 `rolling_zscore(spread, window)`

**Purpose:**  
Normalize the spread using rolling statistics:

$$
Z_t = \frac{S_t - \mu_t}{\sigma_t}
$$

where:
- $\mu_t$ = rolling mean over `window`,
- $\sigma_t$ = rolling standard deviation.

**Trading logic:**
- Enter trade when $|Z_t| \ge z_{\text{open}}$.  
- Exit when $|Z_t| \le z_{\text{close}}$ or $|Z_t| \ge z_{\text{stop}}$.

**Interpretation:**  
The Z-score tells us **how many standard deviations** the spread currently deviates from its recent mean.

---

### 2.2 `ewma_vol(series, span)`

**Purpose:**  
Estimate **dynamic (time-varying)** volatility of the spread using Exponentially Weighted Moving Average (EWMA).

**Steps:**
1. Compute daily changes: $r_t = S_t - S_{t-1}$.  
2. Calculate EWMA standard deviation of $r_t$.  
3. Annualize volatility: multiply by $\sqrt{252}$.

**Output:**  
Series of annualized volatility values over time.

**Intuition:**  
- Low volatility → stable regime → larger position allowed.  
- High volatility → turbulent regime → reduce size.

---

### 2.3 `target_size(spread_vol, target_ann_vol, max_weight)`

**Purpose:**  
Compute daily position weights based on volatility targeting.

**Formula:**

$$
w_t = \min\!\left(\frac{\text{target\_ann\_vol}}{\sigma_t^{\text{ann}}}, \text{max\_weight}\right)
$$

**Logic:**
- Inverse relationship between risk and position size.  
- Caps exposure at `max_weight`.

**Example:**  
If `target_ann_vol=0.10` (10%) and current vol = 20%, then $w_t = 0.1 / 0.2 = 0.5$.  
If max_weight = 0.3 → actual $w_t = 0.3$.

---

### 2.4 `apply_costs(turnover, commission_bps, slippage_bps)`

**Purpose:**  
Model simple linear transaction costs.

**Formula:**

$$
\text{cost}_t = -\frac{(\text{commission} + \text{slippage})}{10{,}000} \times |\text{turnover}_t|
$$

**Where:**
- 1 basis point (bp) = 0.01%.  
- `turnover` ≈ fraction of capital traded that day.

**Output:**  
Series of daily cost values (negative).

---

## 3) Backtesting Framework

### 3.1 `PairParams` Dataclass

Holds all strategy parameters in a structured object:

| Category | Parameter | Description |
|-----------|------------|--------------|
| **Signals** | `z_open`, `z_close`, `z_stop`, `roll_window_z` | Thresholds & lookback for entry/exit |
| **Risk** | `ewma_span_vol`, `target_ann_vol`, `max_pair_weight` | Volatility control |
| **Execution** | `commission_bps`, `slippage_bps` | Transaction costs |
| **Time** | `max_hold_days` | Maximum trade duration |


---

### 3.2 `backtest_pair(y, x, alpha, beta, p)`

**Goal:**  
Simulate trading the spread $S_t = Y_t - (\alpha + \beta X_t)$ under z-score mean reversion rules.

**Workflow:**
1. Compute `S` (spread) and `Z` (z-score).  
2. Iterate through time, maintaining position state `pos ∈ {-1, 0, +1}`.  
3. Apply entry/exit/stop rules.  
4. Scale exposure using `ewma_vol` and `target_size`.  
5. Apply transaction costs and compute PnL and equity curve.

**Signal logic:**
- **Enter Short:** $Z_t ≥ z_{open}$  
- **Enter Long:** $Z_t ≤ -z_{open}$  
- **Exit:** when $|Z_t| \le z_{close}$, $|Z_t| \ge z_{stop}$, or the position has been open for at least `max_hold_days` calendar days.

**PnL Calculation:**
```python
gross = (pos.shift(1)*size.shift(1))*dS
turnover = pos.diff().abs()*size.shift(1)
costs = apply_costs(turnover, commission_bps, slippage_bps)
pnl = gross + costs
equity = (1 + pnl).cumprod()
```

**Interpretation:**
- **gross** = profit from spread movement.
- **costs** = trading friction.
- **pnl** = daily return stream.
- **equity** = compounded equity curve.

---

### 3.3 `perf_stats(pnl)`

**Purpose:**  
Compute key summary metrics from daily PnL.

| Metric | Formula | Meaning |
|---------|----------|---------|
| **Annualized Return** | $\mu \times 252$ | Expected yearly return |
| **Annualized Volatility** | $\sigma \times \sqrt{252}$ | Yearly risk |
| **Sharpe Ratio** | $\frac{\text{ann\_ret}}{\text{ann\_vol}}$ | Risk-adjusted performance |
| **Max Drawdown** | $\min_t \left(\frac{Eq_t}{Eq_{\max}} - 1\right)$ | Peak-to-trough loss |


---

## 4) How Everything Connects

| Stage | Function(s) | Purpose |
|--------|--------------|----------|
| **1. Statistical Layer** | `ols_beta`, `spread`, `adf_test`, `engle_granger`, `half_life` | Detect cointegration and mean reversion strength |
| **2. Signal Layer** | `rolling_zscore`, `ewma_vol`, `target_size` | Build normalized signals and risk scaling |
| **3. Execution Layer** | `apply_costs`, `backtest_pair` | Simulate trading with realistic costs |
| **4. Evaluation Layer** | `perf_stats` | Summarize strategy performance |


---

## 5) Practical Notes and Recommendations

- **Multiple testing:** Adjust p-values (FDR) when scanning many pairs.  
- **Rolling re-estimation:** Cointegration can break; re-fit periodically (walk-forward).  
- **Window choice:** `roll_window_z ≈ 60`, `ewma_span_vol ≈ 20–40` are good starting points.  
- **Transaction costs:** Always include; sensitivity test ±50%.  
- **Risk control:** `max_pair_weight` ensures portfolio diversification.  
- **Time stop:** Avoid holding decaying trades forever.  
- **No look-ahead bias:** Always use `.shift(1)` when computing returns or signals.


---

## 6) Quick Example

```python
alpha, beta, stat, p, res = engle_granger(Y, X)

if p <= 0.05:
    params = PairParams(
        z_open=2.0, z_close=0.5, z_stop=3.0,
        max_hold_days=20,
        roll_window_z=60, ewma_span_vol=20,
        target_ann_vol=0.10, max_pair_weight=0.10,
        commission_bps=1.0, slippage_bps=1.0
    )
    result = backtest_pair(Y, X, alpha, beta, params)
    perf = perf_stats(result["pnl"])
```

---

## 7) Conceptual Summary

The `helpers` module forms the backbone of the project:

1. **Identify stable relationships** (Engle–Granger test).  
2. **Measure deviations** (Z-score of the spread).  
3. **Trade mean reversion** with disciplined entry/exit logic.  
4. **Size dynamically** to keep risk constant.  
5. **Include costs** for realistic results.  
6. **Evaluate** via annualized metrics and drawdowns.

Together, these components create a clean, professional, and transparent **quantitative pairs-trading framework**.


## Step 4 — Smoke test

In [24]:
# Make the notebook folder importable so the helpers.py written above can be found.
import importlib
import sys

if str(NB_DIR) not in sys.path:
    sys.path.insert(0, str(NB_DIR))

# helpers.py is (re)written by this notebook while the kernel is running, so:
#  - refresh the import system's caches so a newly created file is visible, and
#  - forget any copy imported earlier in this kernel session; otherwise the
#    smoke test would pass against the stale module instead of the file on disk.
importlib.invalidate_caches()
sys.modules.pop("helpers", None)

from helpers import engle_granger, half_life, PairParams, backtest_pair, perf_stats

print("helpers imported OK")
print("Cleaned prices file:", INTERIM / "prices_interim.parquet")


helpers imported OK
Cleaned prices file: C:\Users\Ruben\Desktop\Projects\PairsTrading\data\interim\prices_interim.parquet
