In [12]:
# Colab-only step: mount Google Drive at /content/drive, the location that
# PROJECT_ROOT below points into.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
from pathlib import Path

import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller
from statsmodels.stats.multitest import multipletests

# Project layout on the mounted Drive; every other path is derived from PROJECT_ROOT.
PROJECT_ROOT = Path("/content/drive/MyDrive/pairs_trading_project")
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"  # input: prices_aligned.parquet is read from here
RAW_DIR = PROJECT_ROOT / "data" / "raw"  # input: ticker_metadata.csv is read from here
STATS_DIR = PROJECT_ROOT / "results" / "statistics"  # output: CSV result tables
FIG_DIR = PROJECT_ROOT / "results" / "figures"
MODELS_DIR = PROJECT_ROOT / "models" / "saved_parameters"  # output: JSON parameter file

# Only these three directories are created (idempotently); the two input
# directories are not created here.
for d in [STATS_DIR, FIG_DIR, MODELS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("Paths OK:", PROJECT_ROOT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Paths OK: /content/drive/MyDrive/pairs_trading_project


In [13]:
# Price panel: one row per date, one column per ticker. The index is coerced
# to datetimes and sorted chronologically in a single chained step.
prices = pd.read_parquet(PROCESSED_DIR / "prices_aligned.parquet")
prices = prices.set_axis(pd.to_datetime(prices.index), axis=0).sort_index()

# Ticker metadata supplies the bucket / subcluster labels used for grouping.
meta = pd.read_csv(RAW_DIR / "ticker_metadata.csv")

# Every regression and test further down runs on log prices.
logp = np.log(prices)

print("Prices shape:", prices.shape)
print("Buckets:", meta["bucket"].unique())
meta.head()

Prices shape: (2548, 56)
Buckets: ['financials' 'energy' 'tech' 'commodities']


Unnamed: 0,ticker,bucket,subcluster,exchange,notes,adv_usd_60d
0,JPM,financials,banks,US,,3285358000.0
1,BAC,financials,banks,US,,2076703000.0
2,WFC,financials,banks,US,,1342933000.0
3,C,financials,banks,US,,1602639000.0
4,GS,financials,banks,US,,2084273000.0


In [14]:
# Fraction of rows (earliest first) assigned to the in-sample / train segment;
# the remaining rows form the out-of-sample (OOS) segment.
TRAIN_FRAC = 0.60

# Benjamini-Hochberg FDR level, applied separately within each
# (bucket, subcluster) group of in-sample Engle-Granger p-values.
P_FDR = 0.05

# OOS validation gate: a pair passes when the ADF p-value of its OOS
# rolling-beta spread is <= this value.
P_OOS_ADF = 0.10

# Rolling hedge ratio window (inside OOS), in rows. The first ROLL_BETA rows of
# the OOS segment have no rolling estimate and so drop out of the ADF test.
ROLL_BETA = 252  # ~1 trading year

# Minimum number of jointly non-missing rows a pair needs in each segment;
# MIN_TEST_OBS is also the minimum spread length accepted by adf_pvalue.
MIN_TRAIN_OBS = 756  # ~3 years
MIN_TEST_OBS  = 252  # ~1 year

print("TRAIN_FRAC:", TRAIN_FRAC)
print("FDR:", P_FDR, "| OOS ADF:", P_OOS_ADF, "| ROLL_BETA:", ROLL_BETA)

TRAIN_FRAC: 0.6
FDR: 0.05 | OOS ADF: 0.1 | ROLL_BETA: 252


In [15]:
# Enumerate every unordered ticker pair inside each (bucket, subcluster) group;
# a pair never spans two groups.
pairs = []
for (bucket, subcluster), members in meta.groupby(["bucket", "subcluster"]):
    names = members["ticker"].tolist()
    # Groups with fewer than two tickers yield nothing from the generator below.
    pairs.extend(
        {"a": first, "b": second, "bucket": bucket, "subcluster": subcluster}
        for pos, first in enumerate(names)
        for second in names[pos + 1:]
    )

candidates = pd.DataFrame(pairs)

# Persist the full candidate list before any statistical filtering.
cand_path = STATS_DIR / "candidate_pairs_all_within_subcluster.csv"
candidates.to_csv(cand_path, index=False)

print("Total candidate pairs:", len(candidates))
print("Saved:", cand_path)
candidates.groupby(["bucket","subcluster"]).size()

Total candidate pairs: 117
Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/candidate_pairs_all_within_subcluster.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,0
bucket,subcluster,Unnamed: 2_level_1
commodities,chemicals,3
commodities,fertilizers,1
commodities,miners,15
commodities,steel,1
energy,e&p,15
energy,integrated,1
energy,midstream,1
energy,services,1
financials,banks,36
financials,payments,6


In [16]:
# Chronological split: the first TRAIN_FRAC of the rows is the train segment,
# everything after it is the test (OOS) segment. No shuffling.
n_total = len(logp)
split_idx = int(n_total * TRAIN_FRAC)
train_idx, test_idx = logp.index[:split_idx], logp.index[split_idx:]

logp_train, logp_test = logp.loc[train_idx], logp.loc[test_idx]

# One summary line per segment: first date, last date, row count.
for label, idx in (("Train:", train_idx), ("Test :", test_idx)):
    print(label, idx.min().date(), "->", idx.max().date(), "| n=", len(idx))

Train: 2016-01-04 -> 2022-01-26 | n= 1528
Test : 2022-01-27 -> 2026-02-20 | n= 1020


In [17]:
def fit_beta_ols(y: pd.Series, x: pd.Series):
    """Fit ``y = alpha + beta * x`` by ordinary least squares.

    Both series are handed to statsmodels as raw arrays, so observations are
    matched by position rather than by index label.

    Returns:
        ``(alpha, beta, r_squared)`` as plain Python floats.
    """
    design = sm.add_constant(x.values)
    fitted = sm.OLS(y.values, design).fit()
    coef = fitted.params
    return float(coef[0]), float(coef[1]), float(fitted.rsquared)

def engle_granger(y: pd.Series, x: pd.Series):
    """Run statsmodels' ``coint`` test on ``y`` against ``x``.

    The series are passed as raw arrays (positional alignment).

    Returns:
        ``(t_stat, p_value, critical_values)`` where the first two are floats
        and ``critical_values`` is a tuple of floats.
    """
    result = coint(y.values, x.values)
    stat, p_value, critical = result[0], result[1], result[2]
    return float(stat), float(p_value), tuple(float(level) for level in critical)

def adf_pvalue(series: pd.Series):
    """Return the ADF p-value of ``series`` after dropping missing values.

    The lag length is chosen by ``autolag="AIC"``. If fewer than
    ``MIN_TEST_OBS`` non-missing observations remain, the test is not run and
    NaN is returned instead.
    """
    clean = series.dropna()
    if len(clean) >= MIN_TEST_OBS:
        adf_result = adfuller(clean.values, autolag="AIC")
        return float(adf_result[1])
    return np.nan

def rolling_alpha_beta(y: pd.Series, x: pd.Series, window: int):
    """Rolling OLS intercept and slope of ``y`` on ``x`` using only past rows.

    For every position ``t >= window`` the line ``y = alpha + beta * x`` is
    fitted on the ``window`` observations at positions ``t-window .. t-1``;
    observation ``t`` is never part of its own fit. The estimates are the
    closed-form simple-regression solution (``beta = Sxy / Sxx``,
    ``alpha = mean(y) - beta * mean(x)``), computed for all windows at once
    instead of running one regression per row.

    Args:
        y: Dependent series.
        x: Regressor, matched to ``y`` by position; must have the same length.
        window: Number of trailing observations per fit (at least 2).

    Returns:
        ``(alphas, betas)``: two float Series indexed like ``y``. Entries are
        NaN for the first ``window`` positions, for windows in which ``x`` is
        constant (slope not identified), and for windows containing NaN.

    Raises:
        ValueError: If ``window < 2`` or the two series differ in length.
    """
    if window < 2:
        raise ValueError("window must be at least 2 to fit an intercept and a slope")

    yv = np.asarray(y, dtype=float)
    xv = np.asarray(x, dtype=float)
    if yv.shape != xv.shape:
        raise ValueError("y and x must have the same length")

    n_obs = len(yv)
    alphas = pd.Series(np.nan, index=y.index, dtype=float)
    betas = pd.Series(np.nan, index=y.index, dtype=float)
    if n_obs <= window:
        # Not a single position has a full window of history behind it.
        return alphas, betas

    # Row k of each view holds positions k .. k+window-1 and feeds the estimate
    # stored at position k+window. The last row ends on the final observation
    # and has no later position to be stored at, so it is dropped.
    xw = np.lib.stride_tricks.sliding_window_view(xv, window)[:-1]
    yw = np.lib.stride_tricks.sliding_window_view(yv, window)[:-1]

    x_mean = xw.mean(axis=1)
    y_mean = yw.mean(axis=1)
    x_centered = xw - x_mean[:, None]
    sxx = (x_centered * x_centered).sum(axis=1)
    sxy = (x_centered * (yw - y_mean[:, None])).sum(axis=1)

    # A window with a constant regressor cannot identify the slope; report NaN
    # there instead of failing or dividing by (near-)zero.
    flat = xw.max(axis=1) == xw.min(axis=1)
    slope = np.full(len(sxx), np.nan)
    np.divide(sxy, sxx, out=slope, where=~flat)
    intercept = y_mean - slope * x_mean

    betas.iloc[window:] = slope
    alphas.iloc[window:] = intercept
    return alphas, betas

In [18]:
# Column order of the in-sample results table. Declared explicitly so the frame
# keeps these columns (and the sort below keeps working) even when every pair
# is skipped by the MIN_TRAIN_OBS filter and `rows` stays empty.
TRAIN_RESULT_COLUMNS = [
    "a", "b",
    "bucket", "subcluster",
    "train_nobs",
    "alpha_train", "beta_train", "r2_train",
    "eg_tstat_train", "eg_pval_train",
    "crit_1%", "crit_5%", "crit_10%",
]

rows = []

for _, r in candidates.iterrows():
    a, b = r["a"], r["b"]
    bucket, subcluster = r["bucket"], r["subcluster"]

    # Keep only train dates on which both legs have a price.
    df = pd.concat([logp_train[a], logp_train[b]], axis=1, join="inner").dropna()
    if df.shape[0] < MIN_TRAIN_OBS:
        continue

    # Leg `a` is the dependent variable and leg `b` the regressor.
    # NOTE(review): only this ordering is tested; the reverse regression
    # (b on a) is not run.
    y = df.iloc[:, 0]
    x = df.iloc[:, 1]

    alpha, beta, r2 = fit_beta_ols(y, x)
    tstat, pval, crit = engle_granger(y, x)

    rows.append({
        "a": a, "b": b,
        "bucket": bucket, "subcluster": subcluster,
        "train_nobs": int(df.shape[0]),
        "alpha_train": alpha,
        "beta_train": beta,
        "r2_train": r2,
        "eg_tstat_train": tstat,
        "eg_pval_train": pval,
        "crit_1%": crit[0],
        "crit_5%": crit[1],
        "crit_10%": crit[2],
    })

cointeg_train = (
    pd.DataFrame(rows, columns=TRAIN_RESULT_COLUMNS)
    .sort_values(["bucket","subcluster","eg_pval_train"])
)
raw_path = STATS_DIR / "cointegration_insample_raw_v2.csv"
cointeg_train.to_csv(raw_path, index=False)

print("Computed train tests:", len(cointeg_train))
print("Saved:", raw_path)
cointeg_train.head(15)

Computed train tests: 117
Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/cointegration_insample_raw_v2.csv


Unnamed: 0,a,b,bucket,subcluster,train_nobs,alpha_train,beta_train,r2_train,eg_tstat_train,eg_pval_train,crit_1%,crit_5%,crit_10%
2,APD,ECL,commodities,chemicals,1528,-1.177905,1.250264,0.942781,-3.013494,0.107033,-3.903627,-3.340134,-3.047229
0,LIN,APD,commodities,chemicals,1528,-0.37269,1.069422,0.950467,-2.445231,0.303743,-3.903627,-3.340134,-3.047229
1,LIN,ECL,commodities,chemicals,1528,-1.740915,1.358744,0.925383,-1.71297,0.670953,-3.903627,-3.340134,-3.047229
3,CF,MOS,commodities,fertilizers,1528,1.830039,0.527392,0.251683,-1.065352,0.889735,-3.903627,-3.340134,-3.047229
5,FCX,GOLD,commodities,miners,1528,1.270579,0.693344,0.631212,-4.48627,0.00129,-3.903627,-3.340134,-3.047229
9,NEM,GOLD,commodities,miners,1528,2.810664,0.367519,0.428675,-3.862596,0.011163,-3.903627,-3.340134,-3.047229
16,RIO,BHP,commodities,miners,1528,-0.262723,1.112101,0.980475,-3.428164,0.039317,-3.903627,-3.340134,-3.047229
10,NEM,RIO,commodities,miners,1528,1.444121,0.616466,0.668084,-2.519084,0.270718,-3.903627,-3.340134,-3.047229
18,BHP,VALE,commodities,miners,1528,2.009089,0.70076,0.875847,-2.512171,0.273728,-3.903627,-3.340134,-3.047229
11,NEM,BHP,commodities,miners,1528,1.354346,0.663573,0.613677,-2.442676,0.304918,-3.903627,-3.340134,-3.047229


In [19]:
def fdr_group(df: pd.DataFrame):
    """Benjamini-Hochberg adjust the ``eg_pval_train`` column of one group.

    All rows of ``df`` are treated as a single family of tests at level
    ``P_FDR``.

    Returns:
        A copy of ``df`` with two added columns: ``pval_fdr`` (adjusted
        p-values) and ``reject_fdr`` (rejection flags at ``P_FDR``).
    """
    rejected, adjusted = multipletests(
        df["eg_pval_train"].values, alpha=P_FDR, method="fdr_bh"
    )[:2]
    return df.assign(pval_fdr=adjusted, reject_fdr=rejected)

# Apply the FDR correction one (bucket, subcluster) group at a time.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# apply() operating on the grouping columns is deprecated in pandas, and once
# those columns are excluded the "bucket"/"subcluster" columns needed below
# would be missing from the result.
fdr_parts = [
    fdr_group(part)
    for _, part in cointeg_train.groupby(["bucket","subcluster"])
]

if fdr_parts:
    cointeg_fdr = pd.concat(fdr_parts)
else:
    # No in-sample results at all: keep the schema so the steps below still run.
    cointeg_fdr = cointeg_train.assign(
        pval_fdr=pd.Series(dtype=float),
        reject_fdr=pd.Series(dtype=bool),
    )

cointeg_fdr = cointeg_fdr.sort_values(["bucket","subcluster","pval_fdr","eg_pval_train"])

# Only pairs whose null is rejected after the FDR correction move on.
ranked = cointeg_fdr[cointeg_fdr["reject_fdr"]].copy()
rank_path = STATS_DIR / "top_pairs_ranked_insample_v2.csv"
ranked.to_csv(rank_path, index=False)

print("In-sample pairs passing FDR:", len(ranked))
print("Saved:", rank_path)
ranked.head(20)

In-sample pairs passing FDR: 6
Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/top_pairs_ranked_insample_v2.csv


  .apply(fdr_group)


Unnamed: 0,a,b,bucket,subcluster,train_nobs,alpha_train,beta_train,r2_train,eg_tstat_train,eg_pval_train,crit_1%,crit_5%,crit_10%,pval_fdr,reject_fdr
5,FCX,GOLD,commodities,miners,1528,1.270579,0.693344,0.631212,-4.48627,0.00129,-3.903627,-3.340134,-3.047229,0.019348,True
27,EOG,FANG,energy,e&p,1528,1.002734,0.741821,0.948866,-4.646451,0.000697,-3.903627,-3.340134,-3.047229,0.010448,True
74,V,MA,financials,payments,1528,0.715801,0.791928,0.995732,-4.207373,0.003555,-3.903627,-3.340134,-3.047229,0.021331,True
115,CRM,ADBE,tech,software,1528,1.038846,0.704357,0.976054,-4.355157,0.002097,-3.903627,-3.340134,-3.047229,0.005784,True
116,NOW,ADBE,tech,software,1528,-2.933021,1.210256,0.984721,-4.296952,0.002589,-3.903627,-3.340134,-3.047229,0.005784,True
114,CRM,NOW,tech,software,1528,2.766473,0.576451,0.97242,-4.265937,0.002892,-3.903627,-3.340134,-3.047229,0.005784,True


In [20]:
# Column order of the OOS validation table. Declared explicitly so the frame
# keeps its schema (and sorting / filtering keep working) when no pair reaches
# this stage or every pair is skipped by the MIN_TEST_OBS filter.
OOS_COLUMNS = [
    "a", "b",
    "bucket", "subcluster",
    "train_nobs",
    "pval_fdr",
    "eg_pval_train",
    "beta_train",
    "oos_nobs",
    "adf_pval_spread_oos_rollingbeta",
    "pass_oos",
]

val_rows = []

for _, r in ranked.iterrows():
    a, b = r["a"], r["b"]

    # Keep only OOS dates on which both legs have a price.
    df = pd.concat([logp_test[a], logp_test[b]], axis=1, join="inner").dropna()
    if df.shape[0] < MIN_TEST_OBS:
        continue

    y = df.iloc[:, 0]
    x = df.iloc[:, 1]

    # Rolling alpha/beta computed using ONLY OOS data: the estimate applied on
    # a given day is fitted on the ROLL_BETA rows before it (no lookahead).
    alpha_roll, beta_roll = rolling_alpha_beta(y, x, window=ROLL_BETA)

    # NaN for the first ROLL_BETA rows (no estimate yet); adf_pvalue drops
    # those rows, so the ADF sample is shorter than oos_nobs.
    spread_roll = y - (alpha_roll + beta_roll * x)

    p_oos = adf_pvalue(spread_roll)

    val_rows.append({
        "a": a, "b": b,
        "bucket": r["bucket"], "subcluster": r["subcluster"],
        "train_nobs": r["train_nobs"],
        "pval_fdr": r["pval_fdr"],
        "eg_pval_train": r["eg_pval_train"],
        "beta_train": r["beta_train"],
        "oos_nobs": int(df.shape[0]),
        "adf_pval_spread_oos_rollingbeta": p_oos,
        # A NaN p-value (spread too short to test) never passes.
        "pass_oos": bool(not np.isnan(p_oos) and p_oos <= P_OOS_ADF),
    })

oos = (
    pd.DataFrame(val_rows, columns=OOS_COLUMNS)
    .sort_values(["bucket","subcluster","adf_pval_spread_oos_rollingbeta"])
)
oos_path = STATS_DIR / "cointegration_oos_validation_rollingbeta_v2.csv"
oos.to_csv(oos_path, index=False)

# The cast keeps the mask boolean when `oos` is empty (an empty object column
# would not be treated as a row mask).
passed = (
    oos[oos["pass_oos"].astype(bool)]
    .copy()
    .sort_values(["bucket","subcluster","pval_fdr","adf_pval_spread_oos_rollingbeta"])
)
passed_path = STATS_DIR / "pairs_passed_phase35_rollingbeta_v2.csv"
passed.to_csv(passed_path, index=False)

print("Saved:", oos_path)
print("Saved:", passed_path)
print("Passed OOS (rolling beta):", len(passed))
passed.head(25)

Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/cointegration_oos_validation_rollingbeta_v2.csv
Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/pairs_passed_phase35_rollingbeta_v2.csv
Passed OOS (rolling beta): 3


Unnamed: 0,a,b,bucket,subcluster,train_nobs,pval_fdr,eg_pval_train,beta_train,oos_nobs,adf_pval_spread_oos_rollingbeta,pass_oos
0,FCX,GOLD,commodities,miners,1528,0.019348,0.00129,0.693344,1020,0.006401,True
1,EOG,FANG,energy,e&p,1528,0.010448,0.000697,0.741821,1020,0.019373,True
2,V,MA,financials,payments,1528,0.021331,0.003555,0.791928,1020,0.042548,True


In [21]:
import json

# Fields that are the same for every pair: split boundaries and gate settings.
shared_fields = {
    "train_start": str(train_idx.min().date()),
    "train_end": str(train_idx.max().date()),
    "test_start": str(test_idx.min().date()),
    "test_end": str(test_idx.max().date()),
    "roll_beta_window": int(ROLL_BETA),
    "oos_adf_gate": float(P_OOS_ADF),
}

# One entry per pair that passed the OOS gate, keyed "<a>_<b>".
params = {}
for _, row in passed.iterrows():
    pair_fields = {
        "a": row["a"],
        "b": row["b"],
        "bucket": row["bucket"],
        "subcluster": row["subcluster"],
        "train_pval_fdr": float(row["pval_fdr"]),
        "train_pval": float(row["eg_pval_train"]),
        "oos_adf_pval_spread_rollingbeta": float(row["adf_pval_spread_oos_rollingbeta"]),
    }
    # Pair-specific fields first, shared fields after, as in the saved layout.
    params[f"{row['a']}_{row['b']}"] = {**pair_fields, **shared_fields}

json_path = MODELS_DIR / "pairs_passed_params_phase35_rollingbeta_v2.json"
with json_path.open("w") as handle:
    json.dump(params, handle, indent=2)

print("Saved:", json_path)
print("Param entries:", len(params))

Saved: /content/drive/MyDrive/pairs_trading_project/models/saved_parameters/pairs_passed_params_phase35_rollingbeta_v2.json
Param entries: 3


In [22]:
# Final recap: how many pairs survived each stage, and where the survivors sit.
print("In-sample passed (FDR):", len(ranked))
print("OOS passed (rolling beta):", len(passed))

if len(passed) == 0:
    print("No pairs passed OOS even with rolling beta. Next step is: expand universe per subcluster.")
else:
    display(passed.head(20))
    print("\nCounts by (bucket, subcluster):")
    group_counts = passed.groupby(["bucket","subcluster"]).size()
    display(group_counts.sort_values(ascending=False))

In-sample passed (FDR): 6
OOS passed (rolling beta): 3


Unnamed: 0,a,b,bucket,subcluster,train_nobs,pval_fdr,eg_pval_train,beta_train,oos_nobs,adf_pval_spread_oos_rollingbeta,pass_oos
0,FCX,GOLD,commodities,miners,1528,0.019348,0.00129,0.693344,1020,0.006401,True
1,EOG,FANG,energy,e&p,1528,0.010448,0.000697,0.741821,1020,0.019373,True
2,V,MA,financials,payments,1528,0.021331,0.003555,0.791928,1020,0.042548,True



Counts by (bucket, subcluster):


Unnamed: 0_level_0,Unnamed: 1_level_0,0
bucket,subcluster,Unnamed: 2_level_1
commodities,miners,1
energy,e&p,1
financials,payments,1
