# Setup (Paths, Seeds, Reproducibility)

In [1]:
# ==========================
# Global Setup (RUN ONCE)
# ==========================
import os, json, random
import numpy as np
import pandas as pd
from pathlib import Path

# One seed drives every random number generator seeded in this cell.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Project layout: everything lives under PROJECT_DIR, relative to the working directory.
PROJECT_DIR = Path("stock_project")

DATA_DIR = PROJECT_DIR.joinpath("data")
RAW_DIR, PROC_DIR = DATA_DIR.joinpath("raw"), DATA_DIR.joinpath("processed")

REPORT_DIR = PROJECT_DIR.joinpath("reports")
PRED_DIR = REPORT_DIR.joinpath("predictions")
TAB_DIR = REPORT_DIR.joinpath("tables")
FIG_DIR = REPORT_DIR.joinpath("figures")

MODEL_DIR = PROJECT_DIR.joinpath("models")

# Create the leaf directories (and their parents); already-existing ones are left alone.
for d in (RAW_DIR, PROC_DIR, PRED_DIR, TAB_DIR, FIG_DIR, MODEL_DIR):
    os.makedirs(d, exist_ok=True)

# Show the absolute locations so the reader can see where files will be read and written.
print("PROJECT_DIR:", PROJECT_DIR.resolve())
print("PROC_DIR:", PROC_DIR.resolve())

PROJECT_DIR: /notebooks/stock_project
PROC_DIR: /notebooks/stock_project/data/processed


# Load Panel and Sanity Checks

In [2]:
panel_path = PROC_DIR.joinpath("panel.parquet")
assert panel_path.exists(), f"Missing {panel_path}. Build the panel first."

# Read the panel and coerce both date columns to datetime in one step;
# assign() returns a new frame, so `panel` is independent of the reader's buffer.
panel = pd.read_parquet(panel_path).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    target_date=lambda frame: pd.to_datetime(frame["target_date"]),
)

# Sanity checks: overall size, ticker count, split balance, and a peek at the key columns.
print("panel shape:", panel.shape)
print("tickers:", panel["Ticker"].nunique())
print(panel["split"].value_counts(dropna=False))
print(panel.loc[:, ["Date","target_date","Ticker","target_ret","split"]].head())

panel shape: (19776, 23)
tickers: 8
split
train    7720
val      6048
test     6008
Name: count, dtype: int64
        Date target_date Ticker  target_ret  split
0 2016-03-02  2016-03-03   AAPL    0.007417  train
1 2016-03-03  2016-03-04   AAPL    0.014767  train
2 2016-03-04  2016-03-07   AAPL   -0.011129  train
3 2016-03-07  2016-03-08   AAPL   -0.008280  train
4 2016-03-08  2016-03-09   AAPL    0.000890  train


In [3]:
# =========================================
# 3. Metrics & Utilities (Publication-grade)
# =========================================
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

def directional_accuracy(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 0.0) -> float:
    """
    Fraction of observations where truth and prediction fall on the same side of ``eps``.

    Each value is labelled "up" when it is strictly greater than ``eps`` and
    "not up" otherwise (so zeros, negatives and NaNs are all "not up"). This is
    deliberately NOT ``np.sign`` equality. The score is the share of positions
    whose two labels agree.

    Note: ``eps`` moves the up / not-up threshold (e.g. 5 bps => eps=0.0005);
    it does not remove small moves from the sample. A constant prediction at or
    below ``eps`` therefore scores the share of "not up" observations.

    Parameters
    ----------
    y_true, y_pred : array-like
        Realised and predicted values; both are flattened to 1-D.
    eps : float, default 0.0
        Threshold a value must exceed to be labelled "up".

    Returns
    -------
    float
        Agreement rate in [0, 1], or NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the flattened inputs do not have the same length.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)

    # Without this check a length-1 array would silently broadcast against the other.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}."
        )

    # np.mean of an empty array returns NaN but also emits a RuntimeWarning; be explicit instead.
    if y_true.size == 0:
        return float("nan")

    return float(np.mean((y_true > eps) == (y_pred > eps)))

def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """
    Compute the standard point-forecast metrics for one set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Realised and predicted values; both are flattened to 1-D.

    Returns
    -------
    dict
        ``MAE``  - mean absolute error.
        ``RMSE`` - root mean squared error.
        ``Corr`` - Pearson correlation, or NaN when either array has zero
        standard deviation (e.g. a constant predictor).
        ``DirectionalAcc`` - ``directional_accuracy(y_true, y_pred, eps=0.0)``.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)

    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root ourselves: the `squared=False` flag was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so relying on it breaks on current releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Pearson correlation is undefined for constant arrays; report NaN instead of
    # letting np.corrcoef divide by zero.
    corr = np.nan
    if np.std(y_true) > 0 and np.std(y_pred) > 0:
        corr = float(np.corrcoef(y_true, y_pred)[0, 1])

    dir_acc = directional_accuracy(y_true, y_pred, eps=0.0)

    return {
        "MAE": float(mae),
        "RMSE": float(rmse),
        "Corr": corr,
        "DirectionalAcc": float(dir_acc),
    }

def oos_r2_vs_baseline_matched(pred_df: pd.DataFrame, baseline_pred_path: Path) -> dict:
    """
    Out-of-sample R^2 of a model relative to a stored baseline, per split.

    The baseline predictions are left-joined onto ``pred_df`` by
    (Ticker, target_date), and rows without a baseline value are dropped, so
    both mean squared errors are computed on exactly the same rows.

    pred_df needs the columns: Ticker, target_date, split, y_true, y_pred.
    The baseline parquet needs: Ticker, target_date, y_pred.

    Returns a dict keyed by "train" / "val" / "test" holding
    ``1 - MSE_model / MSE_baseline``. A value is NaN when the baseline file
    does not exist, when a split has no matched rows, or when the baseline
    MSE is not positive.
    """
    split_names = ("train", "val", "test")
    if not baseline_pred_path.exists():
        return dict.fromkeys(split_names, np.nan)

    join_keys = ["Ticker", "target_date"]

    baseline = pd.read_parquet(baseline_pred_path).copy()
    baseline["target_date"] = pd.to_datetime(baseline["target_date"])
    baseline = baseline.rename(columns={"y_pred": "y_pred_baseline"})
    baseline = baseline[join_keys + ["y_pred_baseline"]]

    # many_to_one makes pandas raise if the baseline has duplicate join keys.
    joined = pred_df.merge(baseline, on=join_keys, how="left", validate="many_to_one")

    scores = {}
    for name in split_names:
        rows = joined[joined["split"] == name].dropna(subset=["y_pred_baseline"]).copy()
        if rows.shape[0] == 0:
            scores[name] = np.nan
            continue

        model_err = rows["y_true"] - rows["y_pred"]
        base_err = rows["y_true"] - rows["y_pred_baseline"]
        mse_model = float(np.mean(model_err ** 2))
        mse_base = float(np.mean(base_err ** 2))

        if mse_base <= 0:
            scores[name] = np.nan
        else:
            scores[name] = float(1.0 - mse_model / mse_base)

    return scores

# Confirmation banner naming the three helper functions defined in this cell.
print("Utilities loaded: regression_metrics, directional_accuracy, oos_r2_vs_baseline_matched")

Utilities loaded: regression_metrics, directional_accuracy, oos_r2_vs_baseline_matched


#  Feature Set Definition (Consistent across models)

In [4]:
from pandas.api.types import is_numeric_dtype

# Columns that are never offered to a model, whatever their dtype:
# dates, the split label, the ticker id, the target, and the has_garch column.
NON_FEATURES = {"Date", "target_date", "split", "Ticker", "target_ret", "has_garch"}

def numeric_feature_candidates(df: pd.DataFrame) -> list[str]:
    """
    Return the sorted names of every numeric column of ``df`` that is not
    listed in ``NON_FEATURES``.
    """
    return sorted(
        col
        for col in df.columns
        if col not in NON_FEATURES and is_numeric_dtype(df[col])
    )

# Numeric feature list derived from the loaded panel; later cells read this name.
feature_candidates_base = numeric_feature_candidates(panel)
# Report how many features were found and preview the first 15 (the list is sorted by name).
print("Num features (base):", len(feature_candidates_base))
print(feature_candidates_base[:15], "...")

Num features (base): 18
['DFF_diff_lag1', 'DFF_lag1', 'DGS10_diff_lag1', 'DGS10_lag1', 'SP500_lag1', 'mkt_ret_lag1', 'ret', 'ret_lag1', 'ret_lag10', 'ret_lag2', 'ret_lag3', 'ret_lag5', 'ret_vol10', 'ret_vol20', 'ret_vol5'] ...


#  Baseline Mean Models (Zero, Ticker-Mean, AR1, Ridge, RF)

In [5]:
# ============================================================
# STEP 5 — Baseline Mean Models (FULL COPY/PASTE, RUN TOP->BOTTOM)
# Models:
#   1) baseline_zero
#   2) baseline_ticker_mean  (this becomes the reference baseline)
#   3) ar1_per_ticker
#   4) ridge_pooled (numeric + ticker one-hot)
#   5) rf_pooled    (numeric + ticker one-hot)
# Saves:
#   - reports/predictions/<model>.parquet
#   - reports/tables/baseline_metrics.csv
# ============================================================

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# ----------------------------
# Preconditions (must exist from earlier steps)
# - panel (loaded from panel.parquet)
# - feature_candidates_base (from numeric_feature_candidates(panel))
# - PRED_DIR, TAB_DIR
# - regression_metrics(...)
# - oos_r2_vs_baseline_matched(...)
# ----------------------------

# Fail fast, before any prediction file is written, if an earlier cell has not been run.
assert "panel" in globals(), "Run Step 2 (load panel) first."
assert "feature_candidates_base" in globals(), "Run Step 4 (feature candidates) first."
assert "PRED_DIR" in globals() and "TAB_DIR" in globals(), "Run Step 1 (paths) first."
assert "regression_metrics" in globals(), "Run Step 3 (metrics utilities) first."
# Also required (it is called by _attach_matched_oos_r2 further down in this cell).
assert "oos_r2_vs_baseline_matched" in globals(), "Run Step 3 (metrics utilities) first."

# Work on a private copy so this cell never mutates the frame loaded in Step 2.
df = panel.copy()
df["Date"] = pd.to_datetime(df["Date"])
df["target_date"] = pd.to_datetime(df["target_date"])

# Ensure target exists
assert "target_ret" in df.columns, "panel must contain target_ret."

# A strict time-respecting split is assumed already encoded in df['split']
assert "split" in df.columns, "panel must contain split column with train/val/test."

# Every model below is fitted on the train split only; with no train rows the
# per-ticker and global means would be NaN and propagate silently into all outputs.
assert (df["split"] == "train").any(), "panel has no rows with split == 'train'; baselines cannot be fitted."

# ----------------------------
# Helpers
# ----------------------------

def _save_pred_df(pred_df: pd.DataFrame, model_name: str) -> Path:
    """
    Write one model's predictions to ``PRED_DIR/<model_name>.parquet``.

    ``target_date`` is converted to datetime on a copy, so the caller's frame
    is not modified. Prints a one-line confirmation and returns the file path.
    """
    # assign() builds a new frame; the input stays untouched.
    to_write = pred_df.assign(target_date=pd.to_datetime(pred_df["target_date"]))
    out_file = PRED_DIR / f"{model_name}.parquet"
    to_write.to_parquet(out_file, index=False)
    print(f"Saved predictions: {out_file} | rows={len(to_write):,}")
    return out_file

def _metrics_from_pred(pred_df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Build a metrics table with one row per non-empty split (train, val, test).

    Each row holds the model name, the split, the row count ``n`` and the
    values returned by ``regression_metrics`` for that split's y_true / y_pred.
    """
    records = []
    for split_name in ("train", "val", "test"):
        subset = pred_df.loc[pred_df["split"] == split_name]
        n_rows = len(subset)
        if n_rows:
            scores = regression_metrics(subset["y_true"].to_numpy(), subset["y_pred"].to_numpy())
            records.append({"model": model_name, "split": split_name, "n": int(n_rows), **scores})
    return pd.DataFrame(records)

def _attach_matched_oos_r2(metrics_df: pd.DataFrame, pred_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``metrics_df`` with an ``OOS_R2_vs_baseline`` column.

    The per-split values come from ``oos_r2_vs_baseline_matched``, evaluated
    against the saved ``baseline_ticker_mean`` predictions in ``PRED_DIR``.
    """
    reference_path = PRED_DIR / "baseline_ticker_mean.parquet"
    r2_by_split = oos_r2_vs_baseline_matched(pred_df, reference_path)
    # Look up each row's split in the {split: R^2} dict.
    return metrics_df.assign(OOS_R2_vs_baseline=metrics_df["split"].map(r2_by_split))

def _basic_pred_frame(df_sub: pd.DataFrame, y_pred: np.ndarray, model_name: str) -> pd.DataFrame:
    out = df_sub[["Date", "target_date", "Ticker", "split"]].copy()
    out["model"] = model_name
    out["y_true"] = df_sub["target_ret"].to_numpy()
    out["y_pred"] = np.asarray(y_pred).reshape(-1)
    out["residual"] = out["y_true"] - out["y_pred"]
    return out

def _find_ar1_feature(df: pd.DataFrame) -> str:
    """
    We prefer a 1-day lag of returns as AR(1) regressor.
    This tries to find an existing column. If none exists, it builds one
    from target_ret by shifting within ticker (safe: uses past info).
    """
    candidates = [
        "ret_lag1", "ret_lag_1", "ret_l1", "lag1_ret", "return_lag1", "logret_lag1"
    ]
    for c in candidates:
        if c in df.columns:
            return c

    # Try fuzzy find
    fuzzy = [c for c in df.columns if ("lag" in c.lower() and "ret" in c.lower() and "1" in c)]
    if len(fuzzy) > 0:
        return fuzzy[0]

    # Build from target_ret: ret at t is target_ret shifted back one day within ticker
    # target_ret at t corresponds to ret_{t+1}; shifting by 1 gives a proxy for ret_t.
    df["ar1_x"] = df.groupby("Ticker", sort=False)["target_ret"].shift(1)
    return "ar1_x"

# ----------------------------
# Create base split frames
# ----------------------------
# One independent copy of the panel per split, in train / val / test order.
train_df, val_df, test_df = (
    df.loc[df["split"] == split_name].copy() for split_name in ("train", "val", "test")
)

print("Split sizes:", {k: int(v) for k, v in df["split"].value_counts().items()})

# Each model below appends its per-split metrics frame to this list.
all_metrics = list()

# ============================================================
# 5.1 Baseline: Zero predictor
# ============================================================
# Zero predictor: forecasts a return of exactly 0 for every row of the panel.
model_name = "baseline_zero"
y_pred = np.zeros(len(df), dtype=float)
pred_df = _basic_pred_frame(df, y_pred, model_name)
_save_pred_df(pred_df, model_name)

m = _metrics_from_pred(pred_df, model_name)
# OOS_R2_vs_baseline is intentionally not attached here: the reference baseline file
# is only written in step 5.2, so this model's column is filled with NaN when the
# metrics are consolidated in step 5.6.
all_metrics.append(m)

print(m)

# ============================================================
# 5.2 Baseline: Per-ticker mean (TRAIN only)  <-- reference baseline
# ============================================================
# Per-ticker mean predictor: each ticker's forecast is its mean target_ret on TRAIN rows only.
model_name = "baseline_ticker_mean"
ticker_mean = train_df.groupby("Ticker")["target_ret"].mean()

# If a ticker is missing in train (rare), fall back to global train mean
global_mean = float(train_df["target_ret"].mean())
y_pred = df["Ticker"].map(ticker_mean).fillna(global_mean).to_numpy()

pred_df = _basic_pred_frame(df, y_pred, model_name)
# This file is the reference that _attach_matched_oos_r2 reads for every later model.
baseline_path = _save_pred_df(pred_df, model_name)

m = _metrics_from_pred(pred_df, model_name)
m = _attach_matched_oos_r2(m, pred_df)  # baseline now exists; compared with itself the OOS R^2 is exactly 0
all_metrics.append(m)

print(m)

# ============================================================
# 5.3 AR(1) per ticker (fit on TRAIN only within each ticker)
# ============================================================
model_name = "ar1_per_ticker"

ar1_col = _find_ar1_feature(df)
print("AR1 regressor column:", ar1_col)

pred_rows = []
for tkr, g in df.sort_values(["Ticker", "Date"]).groupby("Ticker", sort=False):
    # Keep only the rows where both the regressor and the target are present.
    g = g.copy().dropna(subset=[ar1_col, "target_ret"])
    g_tr = g.loc[g["split"] == "train"]

    if len(g) >= 50 and len(g_tr) >= 30:
        # Enough history: fit y = a + b * x on this ticker's TRAIN rows, then score all its rows.
        X_tr = g_tr[[ar1_col]].to_numpy()
        y_tr = g_tr["target_ret"].to_numpy()

        lr = LinearRegression()
        lr.fit(X_tr, y_tr)

        X_all = g[[ar1_col]].to_numpy()
        yhat = lr.predict(X_all)
    else:
        # Too few usable rows overall (< 50) or in train (< 30): use the ticker's train mean.
        yhat = np.full(len(g), float(ticker_mean.get(tkr, global_mean)))

    pred_rows.append(_basic_pred_frame(g, yhat, model_name))

pred_df = pd.concat(pred_rows, ignore_index=True)
_save_pred_df(pred_df, model_name)

m = _attach_matched_oos_r2(_metrics_from_pred(pred_df, model_name), pred_df)
all_metrics.append(m)

print(m)

# ============================================================
# 5.4 Ridge (pooled across tickers, numeric + ticker one-hot)
# ============================================================
model_name = "ridge_pooled"

# Choose a stable numeric feature set
num_features = feature_candidates_base.copy()
cat_features = ["Ticker"]

# Drop any features that are all-NaN in train (safety): the median imputer
# below has nothing to learn from such a column.
allnan = [c for c in num_features if train_df[c].isna().all()]
if len(allnan) > 0:
    print("Dropping all-NaN train features:", allnan[:20], "..." if len(allnan) > 20 else "")
    num_features = [c for c in num_features if c not in allnan]

print("Ridge num features:", len(num_features))

# Numerics: median-impute then standardise. Ticker: one-hot, unseen tickers encode as all zeros.
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("impute", SimpleImputer(strategy="median")),
            ("scale", StandardScaler())
        ]), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ],
    remainder="drop"
)

# Use the notebook-wide SEED rather than a second hard-coded 42, so that changing
# the seed in the setup cell reaches every model.
ridge = Ridge(alpha=1000.0, random_state=SEED)

pipe = Pipeline([
    ("prep", preprocess),
    ("model", ridge)
])

# Fit on TRAIN only
train_fit = train_df.dropna(subset=["target_ret"]).copy()
pipe.fit(train_fit[num_features + cat_features], train_fit["target_ret"])

# Predict on all rows where features exist (imputer handles NaNs in numerics)
yhat = pipe.predict(df[num_features + cat_features])
pred_df = _basic_pred_frame(df, yhat, model_name)
_save_pred_df(pred_df, model_name)

m = _metrics_from_pred(pred_df, model_name)
m = _attach_matched_oos_r2(m, pred_df)
all_metrics.append(m)

print(m)

# ============================================================
# 5.5 RandomForest (pooled, numeric + ticker one-hot)
# Notes:
# - No scaling needed
# - Still uses imputation for numerics
# ============================================================
model_name = "rf_pooled"

# Same inputs as the ridge model (num_features / cat_features), minus the scaler.
rf_preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ],
    remainder="drop"
)

# random_state controls the forest's bootstrap and feature sampling, so it is tied
# to the notebook-wide SEED instead of a second hard-coded 42.
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=50,
    random_state=SEED,
    n_jobs=-1
)

rf_pipe = Pipeline([
    ("prep", rf_preprocess),
    ("model", rf)
])

# Fit on TRAIN only, on rows that have a target.
train_fit = train_df.dropna(subset=["target_ret"]).copy()
rf_pipe.fit(train_fit[num_features + cat_features], train_fit["target_ret"])

# Predict for every row of the panel (train, val and test).
yhat = rf_pipe.predict(df[num_features + cat_features])
pred_df = _basic_pred_frame(df, yhat, model_name)
_save_pred_df(pred_df, model_name)

m = _metrics_from_pred(pred_df, model_name)
m = _attach_matched_oos_r2(m, pred_df)
all_metrics.append(m)

print(m)

# ============================================================
# 5.6 Consolidate and save baseline metrics table
# ============================================================
baseline_metrics = pd.concat(all_metrics, ignore_index=True)

# Fixed column layout; reindex creates any column that is absent (filled with NaN)
# and puts all of them in this order.
col_order = ["model", "split", "n", "MAE", "RMSE", "Corr", "DirectionalAcc", "OOS_R2_vs_baseline"]
baseline_metrics = baseline_metrics.reindex(columns=col_order)

# Within each split, the lowest RMSE comes first; MAE breaks ties.
baseline_metrics = (
    baseline_metrics
    .sort_values(by=["split", "RMSE", "MAE"], ascending=True)
    .reset_index(drop=True)
)

out_path = TAB_DIR.joinpath("baseline_metrics.csv")
baseline_metrics.to_csv(out_path, index=False)
print("Saved baseline metrics:", out_path)

baseline_metrics

Split sizes: {'train': 7720, 'val': 6048, 'test': 6008}
Saved predictions: stock_project/reports/predictions/baseline_zero.parquet | rows=19,776
           model  split     n       MAE      RMSE  Corr  DirectionalAcc
0  baseline_zero  train  7720  0.012403  0.019210   NaN        0.461140
1  baseline_zero    val  6048  0.019788  0.029259   NaN        0.477844
2  baseline_zero   test  6008  0.016531  0.024833   NaN        0.457390
Saved predictions: stock_project/reports/predictions/baseline_ticker_mean.parquet | rows=19,776
                  model  split     n       MAE      RMSE      Corr  \
0  baseline_ticker_mean  train  7720  0.012353  0.019176  0.025203   
1  baseline_ticker_mean    val  6048  0.019762  0.029256  0.007244   
2  baseline_ticker_mean   test  6008  0.016471  0.024781  0.015846   

   DirectionalAcc  OOS_R2_vs_baseline  
0        0.538860                 0.0  
1        0.522156                 0.0  
2        0.542610                 0.0  
AR1 regressor column: ret_lag1

Unnamed: 0,model,split,n,MAE,RMSE,Corr,DirectionalAcc,OOS_R2_vs_baseline
0,baseline_ticker_mean,test,6008,0.016471,0.024781,0.015846,0.54261,0.0
1,ar1_per_ticker,test,6008,0.016473,0.024786,0.020092,0.54261,-0.000373
2,baseline_zero,test,6008,0.016531,0.024833,,0.45739,
3,ridge_pooled,test,6008,0.017224,0.025323,0.039794,0.46022,-0.044188
4,rf_pooled,test,6008,0.019257,0.027006,0.045119,0.462051,-0.187616
5,rf_pooled,train,7720,0.011018,0.017436,0.580693,0.704793,0.17329
6,ridge_pooled,train,7720,0.012329,0.019092,0.098605,0.540026,0.008793
7,ar1_per_ticker,train,7720,0.012353,0.019163,0.044651,0.536269,0.001359
8,baseline_ticker_mean,train,7720,0.012353,0.019176,0.025203,0.53886,0.0
9,baseline_zero,train,7720,0.012403,0.01921,,0.46114,


In [6]:
# =========================================
# 6. Deep Learning: Shared Setup & Utilities
# =========================================
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler

# Seed TensorFlow's global random generator with the notebook-wide SEED from the setup cell.
tf.random.set_seed(SEED)

def make_sequences(df: pd.DataFrame, feature_cols: list[str], seq_len: int):
    """
    Build fixed-length feature windows per ticker, in Date order.

    For every ticker, rows with a NaN in any feature or in ``target_ret`` are
    dropped first. For each remaining row i >= seq_len, one sample is emitted:
    the features of rows i-seq_len .. i-1 as input and ``target_ret`` of row i
    as label. Row i's own features are NOT part of its window.

    Because rows are dropped before windowing, a window can span a gap in the
    calendar. Windows are built across split boundaries; each sample takes the
    split label of its target row i.

    Parameters
    ----------
    df : DataFrame with Ticker, Date, target_date, split, target_ret and the feature columns.
    feature_cols : names of the feature columns, in the order used for the last axis of X.
    seq_len : number of past rows per window; must be an integer >= 1.

    Returns
    -------
    X : float32 array of shape (N, seq_len, F)
    y : float32 array of shape (N,)
    meta : DataFrame with Ticker, Date, target_date, split of each sample's target row,
        in the same order as X and y.

    Raises
    ------
    ValueError
        If ``seq_len`` is not a positive integer, or if no ticker has more than
        ``seq_len`` usable rows.
    """
    # A zero or negative length would make the slices below return empty or misaligned windows.
    if not isinstance(seq_len, (int, np.integer)) or seq_len < 1:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len!r}.")

    feature_cols = list(feature_cols)
    df = df.sort_values(["Ticker", "Date"]).copy()
    X_parts, y_parts, meta_parts = [], [], []

    for tkr, g in df.groupby("Ticker", sort=False):
        g = g.dropna(subset=feature_cols + ["target_ret"])
        if len(g) <= seq_len:
            # Not enough rows for even one window plus its target row.
            continue

        vals = g[feature_cols].to_numpy(dtype=np.float32)
        y = g["target_ret"].to_numpy(dtype=np.float32)

        # Window k covers rows k .. k+seq_len-1 and has shape (F, seq_len). The last
        # window has no following target row, so it is dropped; the axes are then
        # swapped to (seq_len, F).
        windows = np.lib.stride_tricks.sliding_window_view(vals, seq_len, axis=0)
        X_parts.append(windows[:-1].transpose(0, 2, 1))
        y_parts.append(y[seq_len:])

        # Take the metadata columns in one slice instead of one .iloc lookup per sample.
        target_rows = g.iloc[seq_len:]
        meta_parts.append(pd.DataFrame({
            "Ticker": tkr,
            "Date": target_rows["Date"].to_numpy(),
            "target_date": target_rows["target_date"].to_numpy(),
            "split": target_rows["split"].to_numpy(),
        }))

    if len(X_parts) == 0:
        raise ValueError("No sequences created. Check NaNs, feature_cols, or seq_len.")

    # concatenate copies the strided views into one contiguous array.
    X = np.concatenate(X_parts, axis=0)
    y = np.concatenate(y_parts).astype(np.float32)
    meta = pd.concat(meta_parts, ignore_index=True)
    return X, y, meta

def build_gru(input_shape):
    """
    Return a compiled Keras model: Input -> GRU(64) -> Dense(32, relu) -> Dense(1).

    ``input_shape`` is passed unchanged to ``layers.Input(shape=...)``.
    The model is compiled with Adam (learning rate 1e-3) and mean squared error loss.
    """
    layer_stack = [
        layers.Input(shape=input_shape),
        layers.GRU(64),
        layers.Dense(32, activation="relu"),
        layers.Dense(1),  # single linear output unit
    ]
    net = keras.Sequential(layer_stack)

    adam = keras.optimizers.Adam(1e-3)
    net.compile(loss="mse", optimizer=adam)
    return net

# Confirmation banner naming the two helper functions defined in this cell.
print("DL utilities loaded: make_sequences, build_gru")

2026-02-28 15:41:30.333497: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-28 15:41:30.334244: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-28 15:41:30.408146: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-28 15:41:30.581574: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


DL utilities loaded: make_sequences, build_gru
