# Setup (Paths, Seeds, Reproducibility)

In [1]:
# ==========================
# Global Setup (RUN ONCE)
# ==========================
import os, json, random
import numpy as np
import pandas as pd
from pathlib import Path

# Single seed shared by every RNG seeded in this cell. Only Python's `random`
# module and NumPy's legacy global RNG are seeded here; any other library with
# its own RNG is not touched by this cell.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# All paths hang off PROJECT_DIR, which is a *relative* path, so the directory
# tree is created under whatever the kernel's current working directory is.
PROJECT_DIR = Path("stock_project")
DATA_DIR = PROJECT_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
PROC_DIR = DATA_DIR / "processed"

# Output locations: reports (predictions / tables / figures) and models.
REPORT_DIR = PROJECT_DIR / "reports"
PRED_DIR = REPORT_DIR / "predictions"
TAB_DIR  = REPORT_DIR / "tables"
FIG_DIR  = REPORT_DIR / "figures"
MODEL_DIR = PROJECT_DIR / "models"

# Create every leaf directory. `parents=True` also creates the intermediate
# directories (DATA_DIR, REPORT_DIR, PROJECT_DIR), and `exist_ok=True` makes
# re-running this cell a no-op when the tree already exists.
for d in [RAW_DIR, PROC_DIR, PRED_DIR, TAB_DIR, FIG_DIR, MODEL_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Print the absolute locations so the output records where files will land.
print("PROJECT_DIR:", PROJECT_DIR.resolve())
print("PROC_DIR:", PROC_DIR.resolve())

PROJECT_DIR: /notebooks/stock_project
PROC_DIR: /notebooks/stock_project/data/processed


# Load Panel and Sanity Checks

In [2]:
# The panel is read from the processed-data directory; this cell does not build it.
panel_path = PROC_DIR / "panel.parquet"
# Fail immediately with an actionable message if the file is absent.
# NOTE: `assert` statements are skipped when Python runs with -O.
assert panel_path.exists(), f"Missing {panel_path}. Build the panel first."

panel = pd.read_parquet(panel_path).copy()
# Coerce both date columns with pd.to_datetime so later date comparisons and
# merges operate on datetime values regardless of how they were stored.
panel["Date"] = pd.to_datetime(panel["Date"])
panel["target_date"] = pd.to_datetime(panel["target_date"])

# Sanity checks: overall shape, number of distinct tickers, rows per split
# (dropna=False so rows with a missing split label are counted too), and a
# preview of the key columns.
print("panel shape:", panel.shape)
print("tickers:", panel["Ticker"].nunique())
print(panel["split"].value_counts(dropna=False))
print(panel[["Date","target_date","Ticker","target_ret","split"]].head())

panel shape: (19776, 23)
tickers: 8
split
train    7720
val      6048
test     6008
Name: count, dtype: int64
        Date target_date Ticker  target_ret  split
0 2016-03-02  2016-03-03   AAPL    0.007417  train
1 2016-03-03  2016-03-04   AAPL    0.014767  train
2 2016-03-04  2016-03-07   AAPL   -0.011129  train
3 2016-03-07  2016-03-08   AAPL   -0.008280  train
4 2016-03-08  2016-03-09   AAPL    0.000890  train


In [3]:
# =========================================
# 3. Metrics & Utilities (Publication-grade)
# =========================================
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

def directional_accuracy(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 0.0) -> float:
    """
    Fraction of observations where prediction and realisation agree on direction.

    A value is classified as "up" when it is strictly greater than ``eps``;
    everything else (including values in ``(0, eps]``) is classified as
    "not up". Agreement is measured on that boolean label, not on
    ``np.sign`` equality, so a zero is treated the same as a negative value.

    Parameters
    ----------
    y_true, y_pred : array-like
        Realised and predicted values. Both are flattened to 1-D first.
    eps : float, default 0.0
        Threshold applied to both inputs (e.g. 5 bps => ``eps=0.0005``).

    Returns
    -------
    float
        Share of positions whose "up" labels match.
    """
    actual_up = np.ravel(np.asarray(y_true)) > eps
    predicted_up = np.ravel(np.asarray(y_pred)) > eps
    agreement = actual_up == predicted_up
    return float(np.mean(agreement))

def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """
    Compute point-forecast error metrics for one set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Realised and predicted values. Both are flattened to 1-D first.

    Returns
    -------
    dict
        ``MAE``            : mean absolute error.
        ``RMSE``           : root mean squared error.
        ``Corr``           : Pearson correlation, or NaN when either input
                             has zero standard deviation.
        ``DirectionalAcc`` : ``directional_accuracy`` with ``eps=0.0``.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)

    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas sqrt(MSE) gives the same number on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Pearson correlation is undefined for a constant series, so report NaN
    # rather than letting np.corrcoef divide by zero.
    corr = np.nan
    if np.std(y_true) > 0 and np.std(y_pred) > 0:
        corr = float(np.corrcoef(y_true, y_pred)[0, 1])

    dir_acc = directional_accuracy(y_true, y_pred, eps=0.0)

    return {
        "MAE": float(mae),
        "RMSE": float(rmse),
        "Corr": corr,
        "DirectionalAcc": float(dir_acc),
    }

def oos_r2_vs_baseline_matched(pred_df: pd.DataFrame, baseline_pred_path: Path) -> dict:
    """
    Per-split out-of-sample R^2 of a model relative to stored baseline predictions.

    The baseline predictions are left-joined onto ``pred_df`` by
    ``(Ticker, target_date)`` and only rows that found a baseline value are
    scored, so both MSEs are computed on exactly the same rows.

    Parameters
    ----------
    pred_df : pd.DataFrame
        Needs columns ``Ticker``, ``target_date``, ``split``, ``y_true``, ``y_pred``.
    baseline_pred_path : Path
        Parquet file with columns ``Ticker``, ``target_date``, ``y_pred``.
        It must hold at most one row per ``(Ticker, target_date)``; the merge
        is validated as many-to-one and raises otherwise.

    Returns
    -------
    dict
        ``{"train": r2, "val": r2, "test": r2}`` with
        ``r2 = 1 - MSE_model / MSE_baseline``. A split maps to NaN when the
        baseline file is missing, when no rows of that split matched the
        baseline, or when the baseline MSE is not positive.
    """
    split_names = ("train", "val", "test")

    # No baseline file -> nothing to compare against for any split.
    if not baseline_pred_path.exists():
        return dict.fromkeys(split_names, np.nan)

    baseline = pd.read_parquet(baseline_pred_path).copy()
    baseline["target_date"] = pd.to_datetime(baseline["target_date"])
    baseline = baseline.rename(columns={"y_pred": "y_pred_baseline"})
    baseline = baseline[["Ticker", "target_date", "y_pred_baseline"]]

    joined = pred_df.merge(
        baseline,
        on=["Ticker", "target_date"],
        how="left",
        validate="many_to_one",
    )
    # Rows without a baseline counterpart are excluded from both MSEs.
    has_baseline = joined["y_pred_baseline"].notna()

    scores = {}
    for name in split_names:
        rows = joined[(joined["split"] == name) & has_baseline].copy()
        if len(rows) == 0:
            scores[name] = np.nan
            continue

        model_sq_err = (rows["y_true"] - rows["y_pred"]) ** 2
        baseline_sq_err = (rows["y_true"] - rows["y_pred_baseline"]) ** 2
        model_mse = float(np.mean(model_sq_err))
        baseline_mse = float(np.mean(baseline_sq_err))

        # A non-positive baseline MSE would make the ratio undefined.
        if baseline_mse <= 0:
            scores[name] = np.nan
        else:
            scores[name] = float(1.0 - model_mse / baseline_mse)

    return scores

# Emit a confirmation naming the helper functions defined in this cell.
print("Utilities loaded: regression_metrics, directional_accuracy, oos_r2_vs_baseline_matched")

Utilities loaded: regression_metrics, directional_accuracy, oos_r2_vs_baseline_matched


#  Feature Set Definition (Consistent across models)

In [4]:
from pandas.api.types import is_numeric_dtype

# Columns that must never be used as model inputs, whatever their dtype.
NON_FEATURES = {
    "Date",
    "target_date",
    "split",
    "Ticker",
    "target_ret",
    "has_garch",
}

def numeric_feature_candidates(df: pd.DataFrame) -> list[str]:
    """
    List the numeric columns of ``df`` that are eligible as features.

    A column qualifies when its name is not in ``NON_FEATURES`` and pandas
    reports its dtype as numeric. The names are returned in sorted order so
    the result does not depend on the column order of ``df``.
    """
    return sorted(
        col
        for col in df.columns
        if col not in NON_FEATURES and is_numeric_dtype(df[col])
    )

# Numeric, non-excluded columns of the loaded panel, in sorted order.
feature_candidates_base = numeric_feature_candidates(panel)
print("Num features (base):", len(feature_candidates_base))
# Show only the first 15 names; the trailing "..." is printed unconditionally.
print(feature_candidates_base[:15], "...")

Num features (base): 18
['DFF_diff_lag1', 'DFF_lag1', 'DGS10_diff_lag1', 'DGS10_lag1', 'SP500_lag1', 'mkt_ret_lag1', 'ret', 'ret_lag1', 'ret_lag10', 'ret_lag2', 'ret_lag3', 'ret_lag5', 'ret_vol10', 'ret_vol20', 'ret_vol5'] ...


#  Baseline Mean Models (Zero, Ticker-Mean, AR1, Ridge, RF)