[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Pranshu-Bahadur/PackBoost/blob/main/notebooks/numerai_gpu_demo.ipynb)


# PackBoost Numerai GPU Demo

Train PackBoost on Numerai v5.0 data with the CUDA frontier enabled, bucket eras into
user-defined groups, and monitor per-round train/validation correlation as well as trees per second.


In [None]:
#@title Install PackBoost and dependencies
REPO_URL = "https://github.com/Pranshu-Bahadur/PackBoost.git"  # change if using a fork
REPO_DIR = 'PackBoost'

import os
import subprocess
import sys

# The first run of this cell leaves the kernel inside the checkout. Detect that
# case so a re-run does not clone a second, nested copy and descend into it.
already_in_repo = (
    os.path.basename(os.getcwd()) == REPO_DIR
    and os.path.exists('setup_native.py')
)

if not already_in_repo:
    if not os.path.exists(REPO_DIR):
        subprocess.run(['git', 'clone', REPO_URL, REPO_DIR], check=True)
    os.chdir(REPO_DIR)

# Build native extensions (enables CUDA frontier when nvcc is present)
subprocess.run([sys.executable, 'setup_native.py', 'build_ext', '--inplace'], check=True)

# Install in editable mode so notebooks can import packboost
subprocess.run([sys.executable, '-m', 'pip', 'install', '-e', '.'], check=True)

print('PackBoost ready.')


## Download Numerai data

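This cell downloads the v5.0 `features.json` together with the training and validation parquet
files, skipping any file that already exists locally. The downloads below use `NumerAPI()` without
credentials; pass your API keys to `NumerAPI(...)` only if you plan to upload diagnostics later.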


In [None]:
from numerapi import NumerAPI
from pathlib import Path
import json
import pandas as pd

DATA_VERSION = 'v5.0'
DATA_DIR = Path('numerai-data')
DATA_DIR.mkdir(exist_ok=True)

napi = NumerAPI()

feature_path = DATA_DIR / 'features.json'
train_path = DATA_DIR / 'train.parquet'
valid_path = DATA_DIR / 'validation.parquet'

# (remote file name, local destination) pairs, fetched in this order.
dataset_files = (
    ('features.json', feature_path),
    ('train.parquet', train_path),
    ('validation.parquet', valid_path),
)

# Only download files that are not already present in DATA_DIR.
for remote_name, local_path in dataset_files:
    if local_path.exists():
        continue
    napi.download_dataset(f"{DATA_VERSION}/{remote_name}", str(local_path))

# Use the 'all' feature set listed in the downloaded metadata.
feature_metadata = json.loads(feature_path.read_text(encoding='utf-8'))
FEATURES = feature_metadata['feature_sets']['all']


## Preprocess

* Convert targets to float32 and drop rows where the target is NaN.
* Bucket consecutive eras into groups of configurable size (default 64).
* Keep the features as their existing integer bins (cast to uint8) so we can set `prebinned=True`.


In [None]:
import numpy as np
import torch
import gc

ERA_BUCKET_SIZE = 64  # feel free to tweak


def _load_era_frame(parquet_path, bucket_size):
    """Read a parquet split, drop NaN-target rows and add an ``era_bucket`` column.

    ``era`` is cast to int32 and ``era_bucket`` is ``era // bucket_size``.
    The returned frame has a fresh 0..n-1 index.
    """
    frame = pd.read_parquet(parquet_path)
    frame = frame.dropna(subset=['target']).reset_index(drop=True)
    frame['era'] = frame['era'].astype(np.int32)
    frame['era_bucket'] = (frame['era'] // bucket_size).astype(np.int32)
    return frame


def _to_model_arrays(frame):
    """Return ``(features uint8, target float32, era_bucket int32)`` arrays for *frame*."""
    features = frame[FEATURES].astype(np.uint8).values
    target = frame['target'].astype(np.float32).values
    buckets = frame['era_bucket'].to_numpy(np.int32)
    return features, target, buckets


train_df = _load_era_frame(train_path, ERA_BUCKET_SIZE)
Xt, yt, Et = _to_model_arrays(train_df)

# Release the training frame before the validation parquet is read so both
# frames are never resident at the same time.
del train_df
gc.collect()

valid_df = _load_era_frame(valid_path, ERA_BUCKET_SIZE)
Xv, Yv, Ev = _to_model_arrays(valid_df)


## Train PackBoost with CUDA frontier

The training loop logs metrics each round (train/validation correlation and trees per second).


In [None]:
from packboost.booster import PackBoost
from packboost.config import PackBoostConfig
from packboost import backends
from time import perf_counter
import pandas as pd

print('CUDA backend available:', backends.cuda_available())

# Run on the GPU whenever torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Settings handed to PackBoostConfig as keyword arguments.
config_kwargs = {
    'pack_size': 8,
    'max_depth': 6,
    'learning_rate': 0.05,
    'lambda_l2': 1e-6,
    'lambda_dro': 0.0,
    'direction_weight': 0.0,
    'min_samples_leaf': 20,
    'max_bins': 64,
    'k_cuts': 0,
    'device': str(DEVICE),
    # Features were already cast to uint8 bins in the preprocessing cell.
    'prebinned': True,
}
config = PackBoostConfig(**config_kwargs)

round_logs = []


def log_round(idx: int, metrics: dict[str, float]) -> None:
    """Print a one-line summary of a boosting round and record its metrics.

    Args:
        idx: Unused here; the printed round number is taken from
            ``metrics['round']``.
        metrics: Must contain ``'round'``, ``'train_corr'`` and
            ``'trees_per_second'``. ``'valid_corr'`` is optional and is
            printed as NaN when absent.

    The ``metrics`` dict itself (not a copy) is appended to ``round_logs``.
    """
    valid_corr = metrics.get('valid_corr', float('nan'))
    summary = (
        f"Round {metrics['round']:>3}: train_corr={metrics['train_corr']:+.4f} "
        f"valid_corr={valid_corr:+.4f} "
        f"trees/s={metrics['trees_per_second']:.2f}"
    )
    print(summary)
    round_logs.append(metrics)

booster = PackBoost(config)

# Keyword options for fit(): round count, the validation eval set, and the
# per-round logging callback.
fit_options = {
    'num_rounds': 20,
    'eval_sets': [('valid', Xv, Yv, Ev)],
    'round_callback': log_round,
}

# Time only the fit() call itself.
start = perf_counter()
booster.fit(Xt, yt, Et, **fit_options)
elapsed = perf_counter() - start
print(f"Finished training in {elapsed:.2f} seconds")

# One row per logged round; left as the final expression so the notebook displays it.
metrics_df = pd.DataFrame(round_logs)
metrics_df


## Plot per-round correlation


In [None]:
import matplotlib.pyplot as plt
# Plot whichever correlation series were actually logged. 'valid_corr' is
# absent when training ran without an eval set, and asking pandas to plot a
# missing column raises KeyError.
if not metrics_df.empty:
    corr_columns = [
        column for column in ('train_corr', 'valid_corr')
        if column in metrics_df.columns
    ]
    if corr_columns:
        ax = metrics_df.plot(x='round', y=corr_columns, marker='o')
        ax.set_ylabel('Correlation')
        ax.set_title('PackBoost correlation per round')
        ax.grid(True)
        plt.show()


## Generate validation predictions

Predictions are min-max scaled into [0.01, 0.99], which lies inside the [0, 1] range expected by
Numerai diagnostics, and saved to `packboost_predictions.csv`.


In [None]:
pred_valid = booster.predict(Xv)

# Min-max scale a copy so the raw predictions stay untouched.
pred_norm = pred_valid.copy()
lowest = pred_norm.min()
pred_norm -= lowest
highest = pred_norm.max()
if highest > 0:
    # Skipped when every prediction is identical (the shifted values are all 0).
    pred_norm /= highest

# Squeeze into [0.01, 0.99]; the clip keeps the result within [0, 1].
pred_norm = np.clip(0.01 + 0.98 * pred_norm, 0.0, 1.0)

submission = pd.DataFrame({'prediction': pred_norm})
submission.to_csv('packboost_predictions.csv', index=False)
print('Saved packboost_predictions.csv')

gc.collect()
