In [9]:
# ==========================
# Global Setup
# ==========================
import os, random
import numpy as np
import pandas as pd
from pathlib import Path

# Seed both the stdlib and NumPy global generators so stochastic steps are repeatable.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

# Project layout; every path is relative to the current working directory.
PROJECT_DIR = Path("stock_project")

DATA_DIR = PROJECT_DIR / "data"
RAW_DIR, PROC_DIR = DATA_DIR / "raw", DATA_DIR / "processed"

REPORT_DIR = PROJECT_DIR / "reports"
PRED_DIR, TAB_DIR, FIG_DIR = (
    REPORT_DIR / sub for sub in ("predictions", "tables", "figures")
)
MODEL_DIR = PROJECT_DIR / "models"

# Create every leaf directory (and its parents); directories that already exist are left alone.
for out_dir in (RAW_DIR, PROC_DIR, PRED_DIR, TAB_DIR, FIG_DIR, MODEL_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Echo the absolute locations so it is clear where artifacts will be read and written.
for label, location in (
    ("PROJECT_DIR:", PROJECT_DIR),
    ("PROC_DIR:", PROC_DIR),
    ("PRED_DIR:", PRED_DIR),
    ("TAB_DIR :", TAB_DIR),
    ("FIG_DIR :", FIG_DIR),
):
    print(label, location.resolve())

PROJECT_DIR: /notebooks/stock_project
PROC_DIR: /notebooks/stock_project/data/processed
PRED_DIR: /notebooks/stock_project/reports/predictions
TAB_DIR : /notebooks/stock_project/reports/tables
FIG_DIR : /notebooks/stock_project/reports/figures


In [10]:
# ==========================
# Load Base Panel
# ==========================
# Location of the pre-built modelling panel; this notebook only consumes it.
panel_path = PROC_DIR / "panel.parquet"
assert panel_path.exists(), f"Missing {panel_path}. Build your panel first."

panel = pd.read_parquet(panel_path).copy()

# Validate the schema BEFORE touching any column: otherwise a missing "Date" or
# "target_date" surfaces as a bare KeyError from the conversions below and the
# descriptive message here is never reached.
required = ["Date","target_date","Ticker","split","target_ret"]
missing = [c for c in required if c not in panel.columns]
assert not missing, f"panel missing columns: {missing}"

# Normalise both date columns to pandas datetimes so later comparisons, sorts
# and merges on them behave consistently.
panel["Date"] = pd.to_datetime(panel["Date"])
panel["target_date"] = pd.to_datetime(panel["target_date"])

# Quick orientation: size, ticker count, rows per split, and a peek at the key columns.
print("panel shape:", panel.shape)
print("tickers:", panel["Ticker"].nunique())
print("split counts:\n", panel["split"].value_counts(dropna=False))
print(panel[["Date","target_date","Ticker","target_ret","split"]].head())

panel shape: (19776, 23)
tickers: 8
split counts:
 split
train    7720
val      6048
test     6008
Name: count, dtype: int64
        Date target_date Ticker  target_ret  split
0 2016-03-02  2016-03-03   AAPL    0.007417  train
1 2016-03-03  2016-03-04   AAPL    0.014767  train
2 2016-03-04  2016-03-07   AAPL   -0.011129  train
3 2016-03-07  2016-03-08   AAPL   -0.008280  train
4 2016-03-08  2016-03-09   AAPL    0.000890  train


In [11]:
# ==========================
# Integrity & Leakage Checks
# ==========================
# Work on a private copy so none of the checks below can disturb `panel`.
df = panel.copy()

# Count missing targets.
print("NaNs in target_ret:", df["target_ret"].isna().sum())

# Date span covered by each split (ideally train < val < test).
df = df.sort_values(["Ticker", "target_date"])
split_minmax = (
    df.groupby("split")["target_date"]
    .agg(["min", "max"])
    .sort_index()
)
print("target_date min/max by split:\n", split_minmax)

# The target must never be dated before the row it is attached to.
bad = df["target_date"].lt(df["Date"]).sum()
print("Rows where target_date < Date:", bad)

# Boundary dates of each split, printed for a visual overlap check
# (this cell reports them; it does not enforce non-overlap).
target_by_split = {
    name: df.loc[df["split"] == name, "target_date"]
    for name in ("train", "val", "test")
}
train_max = target_by_split["train"].max()
val_min   = target_by_split["val"].min()
val_max   = target_by_split["val"].max()
test_min  = target_by_split["test"].min()

print("train_max:", train_max)
print("val_min  :", val_min, "val_max:", val_max)
print("test_min :", test_min)

NaNs in target_ret: 0
target_date min/max by split:
              min        max
split                      
test  2023-01-03 2025-12-30
train 2016-03-03 2019-12-31
val   2020-01-02 2022-12-30
Rows where target_date < Date: 0
train_max: 2019-12-31 00:00:00
val_min  : 2020-01-02 00:00:00 val_max: 2022-12-30 00:00:00
test_min : 2023-01-03 00:00:00


In [12]:
# ==========================
#  Metrics Utilities 
# ==========================
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

def directional_accuracy(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 0.0) -> float:
    """Share of observations where truth and prediction lie on the same side of ``eps``.

    Both inputs are flattened to 1-D. An observation counts as a hit when
    ``y_true > eps`` and ``y_pred > eps`` are either both true or both false,
    so a value exactly equal to ``eps`` is grouped with the values below it.

    Returns:
        The hit rate as a Python float.
    """
    truth_above = np.asarray(y_true).ravel() > eps
    pred_above = np.asarray(y_pred).ravel() > eps
    hits = truth_above == pred_above
    return float(hits.mean())

def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute the standard error/association metrics for one set of predictions.

    Both inputs are flattened to 1-D before any metric is computed.

    Returns:
        A dict with keys ``"MAE"``, ``"RMSE"``, ``"Corr"`` and ``"DirectionalAcc"``.
        ``"Corr"`` is NaN when either input has zero standard deviation, since
        the Pearson correlation is undefined in that case.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)

    mae = mean_absolute_error(y_true, y_pred)
    # Take the root manually instead of passing `squared=False`: that keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    corr = np.nan
    if np.std(y_true) > 0 and np.std(y_pred) > 0:
        corr = float(np.corrcoef(y_true, y_pred)[0, 1])

    dir_acc = directional_accuracy(y_true, y_pred, eps=0.0)

    return {"MAE": float(mae), "RMSE": float(rmse), "Corr": corr, "DirectionalAcc": float(dir_acc)}

def oos_r2_vs_baseline_matched(pred_df: pd.DataFrame, baseline_pred_path: Path) -> dict:
    """R-squared of a model relative to stored baseline predictions, per split.

    The baseline predictions are read from ``baseline_pred_path`` and left-joined
    onto ``pred_df`` by ``(Ticker, target_date)``; the join requires the baseline
    to hold at most one row per key. For each of the train/val/test splits, rows
    without a baseline match are dropped and the result is
    ``1 - MSE(model) / MSE(baseline)``.

    Returns:
        A dict keyed by ``"train"``, ``"val"`` and ``"test"``. A value is NaN when
        the baseline file does not exist, when a split has no matched rows, or
        when the baseline MSE is not strictly positive.
    """
    split_names = ("train", "val", "test")
    if not baseline_pred_path.exists():
        return dict.fromkeys(split_names, np.nan)

    baseline = pd.read_parquet(baseline_pred_path).copy()
    baseline["target_date"] = pd.to_datetime(baseline["target_date"])
    baseline = baseline.rename(columns={"y_pred":"y_pred_baseline"})
    baseline = baseline[["Ticker","target_date","y_pred_baseline"]]

    joined = pred_df.merge(
        baseline,
        on=["Ticker","target_date"],
        how="left",
        validate="many_to_one",
    )

    def _relative_r2(split_name):
        # Score only rows of this split that have a baseline counterpart.
        rows = joined[joined["split"] == split_name].dropna(subset=["y_pred_baseline"])
        if len(rows) == 0:
            return np.nan
        model_err = rows["y_true"] - rows["y_pred"]
        base_err = rows["y_true"] - rows["y_pred_baseline"]
        mse_model = float(np.mean(model_err**2))
        mse_base = float(np.mean(base_err**2))
        if mse_base <= 0:
            return np.nan
        return float(1.0 - mse_model/mse_base)

    return {split_name: _relative_r2(split_name) for split_name in split_names}

print("Loaded: regression_metrics, directional_accuracy, oos_r2_vs_baseline_matched")

Loaded: regression_metrics, directional_accuracy, oos_r2_vs_baseline_matched


In [13]:
# ==========================
#  Feature Candidates 
# ==========================
from pandas.api.types import is_numeric_dtype

# Columns excluded from the feature list regardless of their dtype.
NON_FEATURES = {"Date","target_date","split","Ticker","target_ret","has_garch"}

def numeric_feature_candidates(df: pd.DataFrame) -> list[str]:
    """Return the alphabetically sorted names of the numeric columns of ``df``.

    Columns listed in ``NON_FEATURES`` are skipped; every remaining column whose
    dtype pandas considers numeric is kept.
    """
    return sorted(
        col
        for col in df.columns
        if col not in NON_FEATURES and is_numeric_dtype(df[col])
    )

# Numeric columns of the base panel that may serve as model inputs.
feature_candidates_base = numeric_feature_candidates(panel)

n_feature_candidates = len(feature_candidates_base)
print("Number of numeric features:", n_feature_candidates)
print("Sample features:", feature_candidates_base[:20])

Number of numeric features: 18
Sample features: ['DFF_diff_lag1', 'DFF_lag1', 'DGS10_diff_lag1', 'DGS10_lag1', 'SP500_lag1', 'mkt_ret_lag1', 'ret', 'ret_lag1', 'ret_lag10', 'ret_lag2', 'ret_lag3', 'ret_lag5', 'ret_vol10', 'ret_vol20', 'ret_vol5', 'sp500_ret_lag1', 'vix_level_lag1', 'vix_ret_lag1']


In [None]:
# ============================================================
# STEP 6 — Baseline Mean Models (FULL COPY/PASTE, RUN TOP->BOTTOM)
# ============================================================

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# Fresh working copy of the panel with both date columns as pandas datetimes.
df = panel.copy()
for _date_col in ("Date", "target_date"):
    df[_date_col] = pd.to_datetime(df[_date_col])

# Independent copies of the rows belonging to each split.
train_df, val_df, test_df = (
    df[df["split"] == _split_name].copy()
    for _split_name in ("train", "val", "test")
)

def _save_pred_df(pred_df: pd.DataFrame, model_name: str) -> Path:
    """Write a prediction frame to ``PRED_DIR / "<model_name>.parquet"``.

    ``target_date`` is converted to datetime on a copy, so the caller's frame is
    not modified. The row index is not written. Returns the path written to.
    """
    path = PRED_DIR / f"{model_name}.parquet"
    to_write = pred_df.assign(target_date=pd.to_datetime(pred_df["target_date"]))
    to_write.to_parquet(path, index=False)
    print(f"Saved predictions: {path} | rows={len(to_write):,}")
    return path

def _metrics_from_pred(pred_df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Build one metrics row per non-empty split of ``pred_df``.

    Each row carries ``model``, ``split``, the row count ``n`` and every entry
    returned by ``regression_metrics`` for that split's ``y_true``/``y_pred``.
    Splits with no rows are omitted.
    """
    by_split = {
        split_name: pred_df[pred_df["split"] == split_name]
        for split_name in ("train", "val", "test")
    }
    records = [
        {
            "model": model_name,
            "split": split_name,
            "n": int(len(part)),
            **regression_metrics(part["y_true"].to_numpy(), part["y_pred"].to_numpy()),
        }
        for split_name, part in by_split.items()
        if len(part) > 0
    ]
    return pd.DataFrame(records)

def _attach_matched_oos_r2(metrics_df: pd.DataFrame, pred_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``metrics_df`` with an ``OOS_R2_vs_baseline`` column.

    The per-split values come from ``oos_r2_vs_baseline_matched`` evaluated
    against ``PRED_DIR / "baseline_ticker_mean.parquet"`` and are mapped onto the
    rows through their ``split`` label.
    """
    oos_by_split = oos_r2_vs_baseline_matched(
        pred_df, PRED_DIR / "baseline_ticker_mean.parquet"
    )
    return metrics_df.assign(
        OOS_R2_vs_baseline=metrics_df["split"].map(oos_by_split)
    )

def _basic_pred_frame(df_sub: pd.DataFrame, y_pred: np.ndarray, model_name: str) -> pd.DataFrame:
    out = df_sub[["Date","target_date","Ticker","split"]].copy()
    out["model"] = model_name
    out["y_true"] = df_sub["target_ret"].to_numpy()
    out["y_pred"] = np.asarray(y_pred).reshape(-1)
    out["residual"] = out["y_true"] - out["y_pred"]
    return out

def _find_ar1_feature(df: pd.DataFrame) -> str:
    candidates = ["ret_lag1","ret_lag_1","ret_l1","lag1_ret","return_lag1","logret_lag1"]
    for c in candidates:
        if c in df.columns:
            return c
    fuzzy = [c for c in df.columns if ("lag" in c.lower() and "ret" in c.lower() and "1" in c)]
    if len(fuzzy)>0:
        return fuzzy[0]
    df["ar1_x"] = df.groupby("Ticker", sort=False)["target_ret"].shift(1)
    return "ar1_x"

# Collects one metrics table per model; the tables are concatenated at the end of the cell.
all_metrics = []

# 6.1 baseline_zero
# Predict a return of exactly zero for every row of every split.
model_name = "baseline_zero"
zero_forecast = np.zeros(len(df))
pred_df = _basic_pred_frame(df, zero_forecast, model_name)
_save_pred_df(pred_df, model_name)

m = _metrics_from_pred(pred_df, model_name)
all_metrics.append(m)
print(m)

# 6.2 baseline_ticker_mean (TRAIN only)
# Predict each ticker's mean TRAIN return; tickers with no train mean get the pooled train mean.
model_name = "baseline_ticker_mean"

train_target = train_df["target_ret"]
ticker_mean = train_target.groupby(train_df["Ticker"]).mean()
global_mean = float(train_target.mean())

per_row_mean = df["Ticker"].map(ticker_mean)
y_pred = per_row_mean.fillna(global_mean).to_numpy()

pred_df = _basic_pred_frame(df, y_pred, model_name)
_save_pred_df(pred_df, model_name)

m = _attach_matched_oos_r2(_metrics_from_pred(pred_df, model_name), pred_df)
all_metrics.append(m)
print(m)

# 6.3 AR1 per ticker
# One univariate linear regression per ticker, fitted on that ticker's TRAIN rows only.
model_name = "ar1_per_ticker"
ar1_col = _find_ar1_feature(df)
print("AR1 regressor column:", ar1_col)

pred_rows = []
df_by_ticker_date = df.sort_values(["Ticker","Date"])
for tkr, g in df_by_ticker_date.groupby("Ticker", sort=False):
    # Rows lacking the regressor or the target can be neither fitted nor scored.
    g = g.dropna(subset=[ar1_col,"target_ret"]).copy()
    g_tr = g[g["split"]=="train"]

    if len(g) < 50 or len(g_tr) < 30:
        # Too little history for a regression: fall back to the ticker's train mean
        # (or the pooled train mean if the ticker has none).
        yhat = np.full(len(g), float(ticker_mean.get(tkr, global_mean)))
    else:
        lr = LinearRegression()
        lr.fit(g_tr[[ar1_col]].to_numpy(), g_tr["target_ret"].to_numpy())
        yhat = lr.predict(g[[ar1_col]].to_numpy())

    pred_rows.append(_basic_pred_frame(g, yhat, model_name))

pred_df = pd.concat(pred_rows, ignore_index=True)
_save_pred_df(pred_df, model_name)

m = _attach_matched_oos_r2(_metrics_from_pred(pred_df, model_name), pred_df)
all_metrics.append(m)
print(m)

# 6.4 ridge_pooled
# A single ridge regression fitted on the TRAIN rows of all tickers together,
# using every numeric feature candidate plus a one-hot encoding of the ticker.
model_name = "ridge_pooled"
num_features = feature_candidates_base.copy()
cat_features = ["Ticker"]

# A feature with no observed value in TRAIN has no median to impute with, so drop it.
allnan = [c for c in num_features if train_df[c].isna().all()]
if allnan:
    print("Dropping all-NaN train features:", allnan[:20])
    num_features = [c for c in num_features if c not in allnan]

# Numeric columns: median-impute then standardise. Ticker: one-hot, with tickers
# unseen during fitting encoded as all zeros instead of raising.
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("impute", SimpleImputer(strategy="median")),
            ("scale", StandardScaler())
        ]), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ],
    remainder="drop"
)

# Use the notebook-wide SEED rather than a second hard-coded 42, so changing the
# seed in the setup cell propagates to this model as well.
pipe = Pipeline([
    ("prep", preprocess),
    ("model", Ridge(alpha=1000.0, random_state=SEED))
])

# Fit on TRAIN only, then predict every split with the fitted pipeline.
pipe.fit(train_df[num_features + cat_features], train_df["target_ret"])
yhat = pipe.predict(df[num_features + cat_features])
pred_df = _basic_pred_frame(df, yhat, model_name)
_save_pred_df(pred_df, model_name)
m = _metrics_from_pred(pred_df, model_name)
m = _attach_matched_oos_r2(m, pred_df)
all_metrics.append(m)
print(m)

# 6.5 rf_pooled
# A single random forest fitted on the TRAIN rows of all tickers together.
# Reuses `num_features` / `cat_features` from the ridge step above.
model_name = "rf_pooled"

# Numeric columns are only median-imputed (no scaling step here); the ticker is
# one-hot encoded, with unseen tickers encoded as all zeros.
rf_preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ],
    remainder="drop"
)

# Use the notebook-wide SEED rather than a second hard-coded 42, so changing the
# seed in the setup cell propagates to this model as well.
rf_pipe = Pipeline([
    ("prep", rf_preprocess),
    ("model", RandomForestRegressor(
        n_estimators=400,
        min_samples_leaf=50,
        random_state=SEED,
        n_jobs=-1
    ))
])

# Fit on TRAIN only, then predict every split with the fitted pipeline.
rf_pipe.fit(train_df[num_features + cat_features], train_df["target_ret"])
yhat = rf_pipe.predict(df[num_features + cat_features])
pred_df = _basic_pred_frame(df, yhat, model_name)
_save_pred_df(pred_df, model_name)
m = _metrics_from_pred(pred_df, model_name)
m = _attach_matched_oos_r2(m, pred_df)
all_metrics.append(m)
print(m)

# Save baseline metrics
# Stack the per-model tables, force a fixed column layout (columns a model did
# not report are filled with NaN), and order rows by split, then by RMSE.
col_order = ["model","split","n","MAE","RMSE","Corr","DirectionalAcc","OOS_R2_vs_baseline"]
baseline_metrics = (
    pd.concat(all_metrics, ignore_index=True)
    .reindex(columns=col_order)
    .sort_values(["split","RMSE"])
    .reset_index(drop=True)
)

out_path = TAB_DIR / "baseline_metrics.csv"
baseline_metrics.to_csv(out_path, index=False)
print("Saved baseline metrics:", out_path)
baseline_metrics