In [4]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
from pathlib import Path
PROJECT_ROOT = Path("/content/drive/MyDrive/pairs_trading_project")

STATS_DIR = PROJECT_ROOT / "results" / "statistics"
FEATURES_DIR = PROJECT_ROOT / "data" / "features"
SIGNALS_DIR = FEATURES_DIR / "signals"

for d in [STATS_DIR, FEATURES_DIR, SIGNALS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("STATS_DIR:", STATS_DIR)
print("SIGNALS_DIR:", SIGNALS_DIR)

STATS_DIR: /content/drive/MyDrive/pairs_trading_project/results/statistics
SIGNALS_DIR: /content/drive/MyDrive/pairs_trading_project/data/features/signals


In [15]:
import pandas as pd

regime_outputs = {}
files = sorted(STATS_DIR.glob("regime_v3_*.csv"))
if not files:
    raise FileNotFoundError(f"No regime_v3_*.csv found in {STATS_DIR}")

for fp in files:
    pair = fp.stem.replace("regime_v3_", "")
    df = pd.read_csv(fp, index_col=0, parse_dates=True)
    if "mr_regime_final" not in df.columns:
        raise ValueError(f"{fp.name} missing mr_regime_final")
    regime_outputs[pair] = df

pairs = sorted(regime_outputs.keys())
pairs

['EOG_FANG', 'FCX_GOLD', 'V_MA']

In [16]:
ENTRY_Z = 2.0
EXIT_Z  = 0.5
ZS_WINDOW = 60

HL_FALLBACK = 20
HL_MIN_HOLD, HL_MAX_HOLD = 5, 60

In [17]:
import numpy as np

def compute_signals(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    s = out["spread"].astype(float)

    m  = s.rolling(ZS_WINDOW).mean()
    sd = s.rolling(ZS_WINDOW).std(ddof=0)
    out["z"] = (s - m) / sd

    mr = out["mr_regime_final"].fillna(0).astype(int)

    hl = out["half_life_roll"].fillna(HL_FALLBACK).clip(HL_MIN_HOLD, HL_MAX_HOLD)
    out["max_hold_days"] = np.ceil(hl).astype(int)

    pos = np.zeros(len(out), dtype=int)
    entry = np.zeros(len(out), dtype=int)
    exit_ = np.zeros(len(out), dtype=int)
    hold = np.zeros(len(out), dtype=int)

    cur = 0
    age = 0

    z = out["z"].values
    max_hold = out["max_hold_days"].values
    mr_v = mr.values

    for t in range(1, len(out)):
        if cur != 0:
            age += 1

        # force exit if regime flips off
        if cur != 0 and mr_v[t] == 0:
            cur = 0; age = 0; exit_[t] = 1

        # time exit
        elif cur != 0 and age >= max_hold[t]:
            cur = 0; age = 0; exit_[t] = 1

        # exit on mean reversion (near 0)
        elif cur != 0 and np.isfinite(z[t]) and abs(z[t]) <= EXIT_Z:
            cur = 0; age = 0; exit_[t] = 1

        # entry only when MR final is on
        if cur == 0 and mr_v[t] == 1 and np.isfinite(z[t]):
            if z[t] >= ENTRY_Z:
                cur = -1; entry[t] = 1; age = 0
            elif z[t] <= -ENTRY_Z:
                cur = +1; entry[t] = 1; age = 0

        pos[t] = cur
        hold[t] = age if cur != 0 else 0

    out["pos"] = pos
    out["entry"] = entry
    out["exit"] = exit_
    out["hold_days"] = hold

    out["spread_chg"] = out["spread"].diff()
    out["pnl_spread_units"] = out["pos"].shift(1) * out["spread_chg"]

    return out

for pair in pairs:
    sig = compute_signals(regime_outputs[pair])
    sig.to_csv(SIGNALS_DIR / f"signals_{pair}.csv", index=True)

print("Saved signals:", [f"signals_{p}.csv" for p in pairs])

Saved signals: ['signals_EOG_FANG.csv', 'signals_FCX_GOLD.csv', 'signals_V_MA.csv']


In [18]:
import pandas as pd

rows = []
for pair in pairs:
    df = pd.read_csv(SIGNALS_DIR / f"signals_{pair}.csv", index_col=0, parse_dates=True)

    active = df[df["pos"] != 0].copy()
    if active.empty:
        rows.append({"pair": pair, "mr_regime": 1, "n_active_days": 0,
                     "avg_daily_pnl_spread_units": None, "pnl_std_spread_units": None, "hit_rate": None})
        continue

    reg = df.loc[active.index, "mr_regime_final"].fillna(0).astype(int)
    for r, sub in active.groupby(reg):
        pnl = sub["pnl_spread_units"].dropna()
        if pnl.empty:
            continue
        rows.append({
            "pair": pair,
            "mr_regime": int(r),
            "n_active_days": int(len(pnl)),
            "avg_daily_pnl_spread_units": float(pnl.mean()),
            "pnl_std_spread_units": float(pnl.std(ddof=1)),
            "hit_rate": float((pnl > 0).mean()),
        })

quality = pd.DataFrame(rows).sort_values(["pair","mr_regime"])
out_path = STATS_DIR / "signal_quality_by_regime.csv"
quality.to_csv(out_path, index=False)

print("Saved:", out_path)
quality

Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/signal_quality_by_regime.csv


Unnamed: 0,pair,mr_regime,n_active_days,avg_daily_pnl_spread_units,pnl_std_spread_units,hit_rate
0,EOG_FANG,1,48,0.004672,0.024673,0.5625
1,FCX_GOLD,1,28,-0.000814,0.018964,0.464286
2,V_MA,1,39,-0.000178,0.00496,0.435897


In [19]:
import numpy as np
import pandas as pd

rows = []
for pair in pairs:
    df = pd.read_csv(SIGNALS_DIR / f"signals_{pair}.csv", index_col=0, parse_dates=True)

    z = df["z"]
    spread_chg = df["spread_chg"]
    pnl_proxy = (-np.sign(z.shift(1))) * spread_chg  # short when z>0, long when z<0

    reg = df["mr_regime_final"].fillna(0).astype(int)
    for r in [0,1]:
        pnl = pnl_proxy[reg == r].dropna()
        if pnl.empty:
            continue
        rows.append({
            "pair": pair,
            "mr_regime": r,
            "n_days": int(len(pnl)),
            "avg_daily_pnl_proxy": float(pnl.mean()),
            "hit_rate_proxy": float((pnl > 0).mean()),
        })

counter = pd.DataFrame(rows).sort_values(["pair","mr_regime"])
out_path = STATS_DIR / "signal_quality_counterfactual_by_regime.csv"
counter.to_csv(out_path, index=False)

print("Saved:", out_path)
counter

Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/signal_quality_counterfactual_by_regime.csv


Unnamed: 0,pair,mr_regime,n_days,avg_daily_pnl_proxy,hit_rate_proxy
0,EOG_FANG,0,1866,0.000562,0.507503
1,EOG_FANG,1,371,0.001444,0.571429
2,FCX_GOLD,0,2009,0.000294,0.509706
3,FCX_GOLD,1,228,0.002849,0.552632
4,V_MA,0,1906,0.000654,0.520462
5,V_MA,1,331,0.000517,0.546828


In [20]:
import numpy as np
import pandas as pd

ENTRY_Z = 2.0
EXIT_Z  = 0.5
STOP_Z  = 3.0
ZS_WINDOW = 60

COOLDOWN_DAYS = 5

HL_FALLBACK = 20
HL_MIN_HOLD, HL_MAX_HOLD = 5, 60

def compute_signals_v2(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    s = out["spread"].astype(float)

    m  = s.rolling(ZS_WINDOW).mean()
    sd = s.rolling(ZS_WINDOW).std(ddof=0)
    out["z"] = (s - m) / sd

    mr = out["mr_regime_final"].fillna(0).astype(int)

    hl = out["half_life_roll"].fillna(HL_FALLBACK).clip(HL_MIN_HOLD, HL_MAX_HOLD)
    out["max_hold_days"] = np.ceil(hl).astype(int)

    pos = np.zeros(len(out), dtype=int)
    entry = np.zeros(len(out), dtype=int)
    exit_ = np.zeros(len(out), dtype=int)
    hold = np.zeros(len(out), dtype=int)

    cur = 0
    age = 0
    cooldown = 0

    z = out["z"].values
    max_hold = out["max_hold_days"].values
    mr_v = mr.values

    for t in range(1, len(out)):
        if cooldown > 0:
            cooldown -= 1

        if cur != 0:
            age += 1

        # forced exit if regime turns off
        if cur != 0 and mr_v[t] == 0:
            cur = 0; age = 0; exit_[t] = 1; cooldown = COOLDOWN_DAYS

        # stop-loss
        elif cur != 0 and np.isfinite(z[t]) and abs(z[t]) >= STOP_Z:
            cur = 0; age = 0; exit_[t] = 1; cooldown = COOLDOWN_DAYS

        # time exit
        elif cur != 0 and age >= max_hold[t]:
            cur = 0; age = 0; exit_[t] = 1; cooldown = COOLDOWN_DAYS

        # normal exit near 0
        elif cur != 0 and np.isfinite(z[t]) and abs(z[t]) <= EXIT_Z:
            cur = 0; age = 0; exit_[t] = 1; cooldown = COOLDOWN_DAYS

        # entry (only if flat, MR on, not cooling down)
        if cur == 0 and cooldown == 0 and mr_v[t] == 1 and np.isfinite(z[t]):
            if z[t] >= ENTRY_Z:
                cur = -1; entry[t] = 1; age = 0
            elif z[t] <= -ENTRY_Z:
                cur = +1; entry[t] = 1; age = 0

        pos[t] = cur
        hold[t] = age if cur != 0 else 0

    out["pos"] = pos
    out["entry"] = entry
    out["exit"] = exit_
    out["hold_days"] = hold
    out["cooldown"] = 0
    out.loc[out.index, "cooldown"] = np.nan  # keep column existence stable

    out["spread_chg"] = out["spread"].diff()
    out["pnl_spread_units"] = out["pos"].shift(1) * out["spread_chg"]

    return out

# rebuild and overwrite signals files
for pair in pairs:
    sig = compute_signals_v2(regime_outputs[pair])
    sig.to_csv(SIGNALS_DIR / f"signals_{pair}.csv", index=True)

print("Rebuilt signals with stop-loss + cooldown.")

Rebuilt signals with stop-loss + cooldown.


In [21]:
import pandas as pd

rows = []
for pair in pairs:
    df = pd.read_csv(SIGNALS_DIR / f"signals_{pair}.csv", index_col=0, parse_dates=True)
    active = df[df["pos"] != 0].copy()
    if active.empty:
        continue

    reg = df.loc[active.index, "mr_regime_final"].fillna(0).astype(int)
    for r, sub in active.groupby(reg):
        pnl = sub["pnl_spread_units"].dropna()
        if pnl.empty:
            continue
        rows.append({
            "pair": pair,
            "mr_regime": int(r),
            "n_active_days": int(len(pnl)),
            "avg_daily_pnl_spread_units": float(pnl.mean()),
            "pnl_std_spread_units": float(pnl.std(ddof=1)),
            "hit_rate": float((pnl > 0).mean()),
        })

quality = pd.DataFrame(rows).sort_values(["pair","mr_regime"])
out_path = STATS_DIR / "signal_quality_by_regime.csv"
quality.to_csv(out_path, index=False)

print("Saved:", out_path)
quality

Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/signal_quality_by_regime.csv


Unnamed: 0,pair,mr_regime,n_active_days,avg_daily_pnl_spread_units,pnl_std_spread_units,hit_rate
0,EOG_FANG,1,41,0.006821,0.025071,0.634146
1,FCX_GOLD,1,19,-0.002064,0.020924,0.473684
2,V_MA,1,34,0.000314,0.004528,0.470588
