# PackBoost Numerai GPU Demo

In [None]:
# Optional: install dependencies in fresh environments
# !pip install -q numerapi

import gc
import json
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
import torch
from numerapi import NumerAPI

from packboost.booster import PackBoost
from packboost.config import PackBoostConfig

## Configuration

In [None]:
# Numerai API credentials. Per the original note these are only required
# when uploading diagnostics.
your_public_id = ""
your_secret_key = ""
your_model_slot_name = ""

# Boosting schedule
NUM_ROUNDS = 20             # how many boosting rounds to run
PACK_SIZE = 8               # trees grown in each round
LEARNING_RATE = 0.05

# Tree shape and split search
MAX_DEPTH = 6
MIN_SAMPLES_LEAF = 20
MAX_BINS = 64
K_CUTS = 0                  # 0 means every threshold is used

# Era grouping
ERA_BUCKET_SIZE = 64        # consecutive eras folded into one bucket (DES stability)

# Dataset location: files live in a directory named after the data version.
DATA_VERSION = "v5.0"
DATA_DIR = Path(DATA_VERSION)
DATA_DIR.mkdir(parents=False, exist_ok=True)

# Prefer CUDA whenever torch reports it as available, otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

## Download Numerai data

In [None]:
napi = NumerAPI(your_public_id, your_secret_key)

# Local targets mirror the remote file names inside DATA_DIR.
feature_path = DATA_DIR / "features.json"
train_path = DATA_DIR / "train.parquet"
valid_path = DATA_DIR / "validation.parquet"

# Fetch only the files that are not already on disk.
for local_path in (feature_path, train_path, valid_path):
    if local_path.exists():
        continue
    napi.download_dataset(f"{DATA_VERSION}/{local_path.name}", str(local_path))

## Load and preprocess

In [None]:
# Feature names: the "all" feature set listed in features.json.
features_meta: Dict = json.loads(feature_path.read_text(encoding="utf-8"))
FEATURES: List[str] = features_meta["feature_sets"]["all"]

# ---- Training split ----
train_df = pd.read_parquet(train_path)
train_df["era"] = train_df["era"].astype(np.int32)
# Rows without a target are dropped before anything is extracted.
train_df = train_df.dropna(subset=["target"])
train_df = train_df.reset_index(drop=True)
train_df["era_bucket"] = train_df["era"].floordiv(ERA_BUCKET_SIZE).astype(np.int32)

# Missing feature values are replaced by 2 before the uint8 cast.
Xt = train_df[FEATURES].fillna(2).astype(np.uint8).to_numpy()
yt = train_df["target"].astype(np.float32).to_numpy()
Et = train_df["era_bucket"].to_numpy(dtype=np.int32)

# The training frame is no longer needed once the arrays exist.
del train_df
gc.collect()

# ---- Validation split (the frame is kept for later use) ----
valid_df = pd.read_parquet(valid_path)
valid_df = valid_df.dropna(subset=["target"])
valid_df = valid_df.reset_index(drop=True)
valid_df["era"] = valid_df["era"].astype(np.int32)
valid_df["era_bucket"] = valid_df["era"].floordiv(ERA_BUCKET_SIZE).astype(np.int32)

Xv = valid_df[FEATURES].fillna(2).astype(np.uint8).to_numpy()
Yv = valid_df["target"].astype(np.float32).to_numpy()
Ev = valid_df["era_bucket"].to_numpy(dtype=np.int32)

## Train PackBoost

In [None]:
# Assemble the booster configuration from the notebook-level hyperparameters.
config = PackBoostConfig(
    # where to run
    device=str(DEVICE),
    # ensemble schedule
    pack_size=PACK_SIZE,
    learning_rate=LEARNING_RATE,
    # tree growth limits
    max_depth=MAX_DEPTH,
    min_samples_leaf=MIN_SAMPLES_LEAF,
    # split candidates; prebinned=True presumably tells the booster the
    # uint8 features are already bin codes -- confirm against PackBoostConfig
    max_bins=MAX_BINS,
    k_cuts=K_CUTS,
    prebinned=True,
    # remaining knobs held at fixed values for this demo
    lambda_l2=1e-6,
    lambda_dro=0.0,
    direction_weight=0.0,
)

# Fit on the training arrays, using era buckets as the era labels.
booster = PackBoost(config)
booster.fit(Xt, yt, Et, num_rounds=NUM_ROUNDS)

## Per-round train/validation correlation

In [None]:
def era_correlation(eras: np.ndarray, target: np.ndarray, pred: np.ndarray) -> float:
    """Return the mean per-era Pearson correlation of ``pred`` against ``target``.

    Rows are grouped by ``eras`` in order of first appearance. An era is left
    out of the mean when either column has zero population standard deviation
    inside it, or when the resulting correlation is not finite.

    Returns NaN when no era contributes a value.
    """
    table = pd.DataFrame({"era": eras, "target": target, "pred": pred})

    def _era_score(rows):
        # A column with zero spread has no defined correlation.
        if rows["target"].std(ddof=0) == 0:
            return None
        if rows["pred"].std(ddof=0) == 0:
            return None
        score = np.corrcoef(rows["target"], rows["pred"])[0, 1]
        return score if np.isfinite(score) else None

    candidates = (_era_score(rows) for _, rows in table.groupby("era", sort=False))
    scores = [score for score in candidates if score is not None]

    if not scores:
        return float("nan")
    return float(np.mean(scores))


def compute_round_correlations(model: PackBoost) -> pd.DataFrame:
    """Score the ensemble after every boosting round on train and validation.

    Trees in ``model._trees`` are consumed in consecutive packs of ``B`` trees.
    After each pack has been added to the running predictions, the mean
    per-era correlation is recorded for both splits.

    Reads the module-level arrays ``Xt``/``yt``/``Et`` (train) and
    ``Xv``/``Yv``/``Ev`` (validation).

    Args:
        model: A fitted ``PackBoost`` instance.

    Returns:
        DataFrame with one row per pack and the columns ``round`` (1-based),
        ``train_corr`` and ``valid_corr``.
    """
    B = model._trained_pack_size or model.config.pack_size
    weight = float(model._tree_weight or (model.config.learning_rate / model.config.pack_size))
    # Integer division: trailing trees that do not fill a whole pack are ignored.
    num_packs = len(model._trees) // B

    bins_train = torch.from_numpy(Xt).to(model._device, dtype=torch.int32)
    bins_valid = torch.from_numpy(Xv).to(model._device, dtype=torch.int32)

    # Targets are only consumed on the host by era_correlation, and yt / Yv
    # are already float32 numpy arrays. Using them directly avoids a device
    # copy plus a device-to-host transfer on every round.
    preds_train = torch.zeros(yt.shape, dtype=torch.float32, device=model._device)
    preds_valid = torch.zeros(Yv.shape, dtype=torch.float32, device=model._device)

    records = []
    for pack_idx in range(num_packs):
        start = pack_idx * B
        end = start + B
        # Accumulate this round's trees into the running ensemble prediction.
        for tree in model._trees[start:end]:
            preds_train += weight * tree.predict_bins(bins_train)
            preds_valid += weight * tree.predict_bins(bins_valid)

        corr_train = era_correlation(Et, yt, preds_train.cpu().numpy())
        corr_valid = era_correlation(Ev, Yv, preds_valid.cpu().numpy())
        records.append({"round": pack_idx + 1, "train_corr": corr_train, "valid_corr": corr_valid})

    return pd.DataFrame(records)

# Per-round train/validation correlations for the fitted booster. In a
# notebook the trailing bare name is displayed as the cell's output.
round_stats = compute_round_correlations(booster)
round_stats

## Validation predictions & submission scaffold

In [None]:
# Predict on the validation feature matrix with the fitted booster.
pred_valid = booster.predict(Xv)

def normalize_preds(preds: np.ndarray) -> np.ndarray:
    """Min-max rescale predictions into [0.01, 0.99] for Numerai diagnostics.

    The input is never modified. Floating-point inputs keep their dtype;
    any other dtype is converted to float64 so the in-place division cannot
    fail on integer arrays.

    Args:
        preds: Raw predictions (any array-like of real numbers).

    Returns:
        A new array of the same shape. When all values are equal every output
        is 0.01; an empty input yields an empty array.
    """
    scaled = np.asarray(preds)
    if np.issubdtype(scaled.dtype, np.floating):
        scaled = scaled.copy()
    else:
        # astype always returns a fresh array, so the caller's data stays intact.
        scaled = scaled.astype(np.float64)

    # min()/max() raise on zero-size arrays; there is nothing to rescale anyway.
    if scaled.size == 0:
        return scaled

    scaled -= scaled.min()
    peak = scaled.max()
    if peak > 0:
        scaled /= peak
    # Shrink [0, 1] to [0.01, 0.99]; the clip only guards against rounding drift.
    return np.clip(scaled * 0.98 + 0.01, 0.0, 1.0)

# Attach the rescaled predictions (cast to float32 first) as a new column.
valid_df["prediction"] = normalize_preds(pred_valid.astype(np.float32))

# Write only the prediction column, without the index.
# NOTE(review): the validation frame's index was reset on load, so this CSV
# carries no row identifiers -- confirm that is what the upload expects.
valid_df.to_csv("packboost_predictions.csv", columns=["prediction"], index=False)

print("Saved packboost_predictions.csv (run napi.upload_diagnostics manually if desired).")

gc.collect()
