In [1]:
from pathlib import Path

from google.colab import drive

# The project lives on Google Drive, so the drive has to be mounted before
# any of the paths below can be used.
drive.mount("/content/drive")

PROJECT_ROOT = Path("/content/drive/MyDrive/pairs_trading_project")

# Directories this notebook reads from.
PROC_DIR = PROJECT_ROOT.joinpath("data", "processed")
FEATURES_DIR = PROJECT_ROOT.joinpath("data", "features")
SIGNALS_DIR = FEATURES_DIR.joinpath("signals")

# Directories created below if they are missing.
BACKTEST_DIR = PROJECT_ROOT.joinpath("results", "backtests")
FIG_DIR = PROJECT_ROOT.joinpath("results", "figures")
SRC_DIR = PROJECT_ROOT.joinpath("src", "backtest")
STATS_DIR = PROJECT_ROOT.joinpath("results", "statistics")

# Create the four directories above; an already existing directory is left alone.
for d in (BACKTEST_DIR, FIG_DIR, SRC_DIR, STATS_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)

Mounted at /content/drive
PROJECT_ROOT: /content/drive/MyDrive/pairs_trading_project


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Price panel for every ticker: the index is parsed to datetimes and sorted
# ascending so the rolling windows and reindexing further down run in date order.
prices = pd.read_parquet(PROC_DIR.joinpath("prices_aligned.parquet"))
prices = prices.set_axis(pd.to_datetime(prices.index), axis=0).sort_index()

In [3]:
# ---- COST MODEL (single source of truth) ----
GROSS_NOTIONAL = 10_000  # sizing base; the backtest puts half of it on the Y leg
TRADING_DAYS = 252       # converts the annual borrow rate into a daily charge

SLIPPAGE_PCT = 0.0005         # slippage, as a fraction of traded notional per leg
BIDASK_PCT   = 0.0006         # bid/ask cost, as a fraction of traded notional per leg
COMMISSION_PER_SHARE = 0.005  # commission charged per share traded
COMMISSION_MIN = 1.0          # minimum commission for any non-empty order
BORROW_ANNUAL = 0.01          # annual borrow rate applied to short notional

def commission(shares: float) -> float:
    """Return the commission for trading ``shares`` on one leg.

    The sign of ``shares`` is ignored, so buys and sells cost the same. An
    order of zero shares is not an order and costs nothing; any other order
    pays the per-share rate, subject to the ``COMMISSION_MIN`` floor.
    """
    qty = abs(shares)
    if qty == 0:
        # Without this guard the floor below would bill COMMISSION_MIN for a
        # leg that never traded.
        return 0.0
    return max(COMMISSION_MIN, COMMISSION_PER_SHARE * qty)

def per_leg_spread_cost(notional: float) -> float:
    """Return the slippage plus bid/ask cost of trading ``notional`` on one leg.

    The cost is proportional to the absolute notional, so the sign of the
    trade does not matter.
    """
    return (SLIPPAGE_PCT + BIDASK_PCT) * abs(notional)

In [4]:
# Window length, in rows of the cleaned log-price frame, passed to
# rolling_beta() when pair inputs are loaded.
ROLL_BETA = 252

def rolling_beta(prices: pd.DataFrame, pair: str, roll=252) -> pd.Series:
    """Rolling hedge ratio of a pair, estimated on log prices.

    ``pair`` is ``"X_Y"``; both tickers must be columns of ``prices``. For each
    window of ``roll`` rows the ratio is cov(log Y, log X) / var(log X), i.e.
    the slope of log Y regressed on log X. Rows where either log price is
    missing are dropped before the windows are formed.

    Returns a Series aligned to ``prices.index``; dates without a complete
    window (or dropped as missing) are NaN.
    """
    ticker_x, ticker_y = pair.split("_")

    log_x = np.log(prices[ticker_x]).rename("x")
    log_y = np.log(prices[ticker_y]).rename("y")
    log_pair = pd.concat([log_x, log_y], axis=1).dropna()

    cov_yx = log_pair["y"].rolling(roll).cov(log_pair["x"])
    var_x = log_pair["x"].rolling(roll).var()
    hedge_ratio = cov_yx / var_x

    # Put the dropped dates back (as NaN) so callers can align on prices.index.
    return hedge_ratio.reindex(prices.index)

def load_pair_inputs(pair: str):
    """Collect the aligned inputs the backtest needs for one pair.

    ``pair`` is ``"X_Y"``. The rolling hedge ratio is computed from the global
    ``prices`` frame and the signals are read from
    ``SIGNALS_DIR / "signals_<pair>.csv"``. Both are aligned to
    ``prices.index`` and only dates with a hedge ratio and a ``pos`` value
    are kept.

    Returns ``(X, Y, sig, px, py, b)``: the two tickers, the filtered signal
    frame, and the X prices, Y prices and hedge ratio on the same dates,
    all as float Series.
    """
    X, Y = pair.split("_")
    hedge = rolling_beta(prices, pair, roll=ROLL_BETA)

    signal_file = SIGNALS_DIR / f"signals_{pair}.csv"
    aligned = pd.read_csv(signal_file, index_col=0, parse_dates=True).reindex(prices.index)

    # Keep a date only when both the hedge ratio and the position are known.
    usable = hedge.notna() & aligned["pos"].notna()
    sig = aligned.loc[usable].copy()

    dates = sig.index
    px = prices.loc[dates, X].astype(float)
    py = prices.loc[dates, Y].astype(float)
    b = hedge.loc[dates].astype(float)

    return X, Y, sig, px, py, b

In [5]:
def run_backtest(pair: str):
    """Simulate the after-cost P&L of one pair and return its daily log.

    Inputs come from ``load_pair_inputs(pair)``. For every date after the
    first, the function:

    1. marks the holdings carried from the previous date to today's prices,
    2. charges one day of borrow on the short notional,
    3. trades if the signal changed: any open position is closed, and a new
       one is opened when the new signal is non-zero.

    A new position puts ``GROSS_NOTIONAL / 2`` (signed by the signal) on the
    Y leg and ``-beta * that`` on the X leg, using the previous date's beta so
    sizing never looks ahead.

    Returns a DataFrame with one row per simulated date and columns
    ``date, pair, event, pos_prev, pos_now, beta_used, shares_x, shares_y,
    trade_cost, borrow_cost, daily_pnl, cum_pnl``. ``event`` is one of
    ``"HOLD"``, ``"ENTER"``, ``"EXIT"`` or ``"FLIP"`` (close and reverse on
    the same date). ``daily_pnl`` is net of borrow but not of trade costs;
    ``cum_pnl`` is net of both. The frame is empty when fewer than two dates
    are available.
    """
    X, Y, sig, px, py, b = load_pair_inputs(pair)

    def _legs_cost(y_notional, y_shares, x_notional, x_shares):
        # Spread cost plus commission for trading both legs once.
        return (
            per_leg_spread_cost(y_notional) + commission(y_shares) +
            per_leg_spread_cost(x_notional) + commission(x_shares)
        )

    trades = []
    cash_pnl = 0.0
    shares_x = 0.0
    shares_y = 0.0
    # Tracks whether shares are actually held. The signal alone cannot tell:
    # a series that starts with a non-zero signal has no position behind it.
    in_position = False

    pos = sig["pos"]
    for t in range(1, len(sig.index)):
        date = sig.index[t]
        pos_now = int(pos.iloc[t])
        pos_prev = int(pos.iloc[t - 1])

        px_t = float(px.iloc[t]); py_t = float(py.iloc[t])
        px_prev = float(px.iloc[t - 1]); py_prev = float(py.iloc[t - 1])

        beta_prev = float(b.iloc[t - 1])  # no-lookahead sizing

        # mark-to-market from yesterday holdings
        daily_pnl = shares_x * (px_t - px_prev) + shares_y * (py_t - py_prev)

        # borrow on short notional (daily)
        short_notional = 0.0
        if shares_x < 0: short_notional += abs(shares_x) * px_t
        if shares_y < 0: short_notional += abs(shares_y) * py_t
        borrow_cost = (BORROW_ANNUAL / TRADING_DAYS) * short_notional

        cash_pnl += (daily_pnl - borrow_cost)

        trade_cost = 0.0
        closed = False
        opened = False

        if pos_now != pos_prev:
            # CLOSE whatever is held. This covers a move to flat and a direct
            # reversal; with nothing held, no cost is charged.
            if in_position:
                trade_cost += _legs_cost(
                    shares_y * py_t, shares_y,
                    shares_x * px_t, shares_x,
                )
                shares_x = 0.0
                shares_y = 0.0
                in_position = False
                closed = True

            # OPEN the position the new signal asks for.
            if pos_now != 0:
                y_notional = (GROSS_NOTIONAL / 2.0) * pos_now
                shares_y = y_notional / py_t

                x_notional = -(beta_prev * y_notional)
                shares_x = x_notional / px_t

                trade_cost += _legs_cost(y_notional, shares_y, x_notional, shares_x)
                in_position = True
                opened = True

        if closed and opened:
            event = "FLIP"
        elif opened:
            event = "ENTER"
        elif closed:
            event = "EXIT"
        else:
            event = "HOLD"

        cash_pnl -= trade_cost

        trades.append({
            "date": date, "pair": pair, "event": event,
            "pos_prev": pos_prev, "pos_now": pos_now,
            "beta_used": beta_prev,
            "shares_x": shares_x, "shares_y": shares_y,
            "trade_cost": trade_cost,
            "borrow_cost": borrow_cost,
            "daily_pnl": (daily_pnl - borrow_cost),
            "cum_pnl": cash_pnl
        })

    return pd.DataFrame(trades)

In [6]:
import json
import matplotlib.pyplot as plt

# Pairs to backtest; each name is "X_Y" and must have a matching signals file.
PAIRS = ["EOG_FANG", "FCX_GOLD", "V_MA"]

all_metrics = []
pair_returns = []

for pair in PAIRS:
    tl = run_backtest(pair)

    # An empty log has no columns to index, so there is nothing to score or save.
    if tl.empty:
        print(pair, "skipped: backtest produced no rows")
        continue

    # save trade log per pair (one file each, so pairs don't overwrite one another)
    tl_path = BACKTEST_DIR / f"trade_log_{pair}.csv"
    tl.to_csv(tl_path, index=False)

    # metrics
    tl["date"] = pd.to_datetime(tl["date"])
    tl = tl.sort_values("date")
    equity = tl.set_index("date")["cum_pnl"]
    daily = equity.diff().dropna()

    # Annualise with the notebook-wide TRADING_DAYS rather than a second
    # hard-coded 252, so the Sharpe and the borrow accrual cannot drift apart.
    daily_vol = daily.std(ddof=1)
    sharpe = (daily.mean() / daily_vol) * np.sqrt(TRADING_DAYS) if daily_vol > 0 else np.nan

    # Drawdown is measured from the running peak, floored at the zero starting
    # P&L, so a log that opens below zero is not treated as its own peak.
    running_peak = equity.cummax().clip(lower=0.0)
    max_dd = (equity - running_peak).min()

    metrics = {
        "pair": pair,
        "gross_notional": GROSS_NOTIONAL,
        "total_pnl": float(equity.iloc[-1]),
        "total_trade_cost": float(tl["trade_cost"].sum()),
        "total_borrow_cost": float(tl["borrow_cost"].sum()),
        # JSON has no NaN, so an undefined Sharpe is stored as null.
        "sharpe": None if np.isnan(sharpe) else float(sharpe),
        "max_drawdown": float(max_dd),
        "n_entries": int((tl["event"] == "ENTER").sum()),
        "n_exits": int((tl["event"] == "EXIT").sum()),
        "n_days": int(len(daily)),
    }

    m_path = BACKTEST_DIR / f"metrics_after_costs_{pair}.json"
    m_path.write_text(json.dumps(metrics, indent=2))
    all_metrics.append(metrics)

    # return series for Phase 7 (daily P&L as a fraction of GROSS_NOTIONAL)
    pair_returns.append((daily / GROSS_NOTIONAL).rename(pair))

    print(pair, "saved:", tl_path.name, m_path.name)

metrics_df = pd.DataFrame(all_metrics).sort_values("total_pnl", ascending=False)
metrics_df.to_csv(BACKTEST_DIR / "metrics_after_costs_all_pairs.csv", index=False)
print("Saved:", BACKTEST_DIR / "metrics_after_costs_all_pairs.csv")

# Dates missing for a pair are filled with a zero return.
returns_df = pd.concat(pair_returns, axis=1).fillna(0.0)
returns_df.to_csv(STATS_DIR / "pair_returns_after_costs.csv", index=True)
print("Saved:", STATS_DIR / "pair_returns_after_costs.csv")

metrics_df

EOG_FANG saved: trade_log_EOG_FANG.csv metrics_after_costs_EOG_FANG.json
FCX_GOLD saved: trade_log_FCX_GOLD.csv metrics_after_costs_FCX_GOLD.json
V_MA saved: trade_log_V_MA.csv metrics_after_costs_V_MA.json
Saved: /content/drive/MyDrive/pairs_trading_project/results/backtests/metrics_after_costs_all_pairs.csv
Saved: /content/drive/MyDrive/pairs_trading_project/results/statistics/pair_returns_after_costs.csv


Unnamed: 0,pair,gross_notional,total_pnl,total_trade_cost,total_borrow_cost,sharpe,max_drawdown,n_entries,n_exits,n_days
0,EOG_FANG,10000,1414.36892,206.875212,10.051632,0.538593,-291.522867,7,7,2295
2,V_MA,10000,-295.709024,161.485352,7.082767,-0.467403,-295.709024,6,6,2295
1,FCX_GOLD,10000,-597.699946,126.095885,1.362955,-0.396981,-780.661762,5,5,2295
