# PackBoost Numerai Demo

This notebook downloads the Numerai v5.0 dataset, keeps only the most recent
eras, bins the features, and trains PackBoost on the GPU when CUDA is
available (otherwise on the CPU). It is designed for Google Colab – adjust the
installation cell if you prefer a local environment.

In [None]:
#@title Install dependencies (edit REPO_URL if you forked the project)
REPO_URL = "https://github.com/pranshubahadur/PackBoost.git"  # TODO: update if needed

!pip install -q numerapi pandas~=2.2 pyarrow~=15.0 catboost lightgbm xgboost
# Install CuPy for GPU acceleration. Pick the wheel matching your CUDA version.
try:
    import cupy  # noqa: F401
except ImportError:
    !pip install -q cupy-cuda12x

# Clone PackBoost and install in editable mode
import pathlib, sys
work_dir = pathlib.Path.cwd()
if not (work_dir / "PackBoost").exists():
    !git clone {REPO_URL} PackBoost
!pip install -q -e PackBoost
sys.path.append(str(work_dir / "PackBoost"))

## Download Numerai data
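This cell downloads the v5.0 training data, validation data, and feature
metadata, then keeps only the most recent eras so the demo runs quickly.
The download itself uses an unauthenticated client; API keys are only needed
if you plan to upload diagnostics at the end.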

In [None]:
import json
from pathlib import Path

import pandas as pd
from numerapi import NumerAPI

# Local directory that holds the downloaded Numerai files.
DATA_DIR = Path("numerai-data")
DATA_DIR.mkdir(exist_ok=True)

# An unauthenticated client is used for the dataset downloads.
napi = NumerAPI()
napi.download_dataset('v5.0/train.parquet', str(DATA_DIR / 'train.parquet'))
napi.download_dataset('v5.0/validation.parquet', str(DATA_DIR / 'validation.parquet'))
napi.download_dataset('v5.0/features.json', str(DATA_DIR / 'features.json'))

train_df = pd.read_parquet(DATA_DIR / 'train.parquet')
valid_df = pd.read_parquet(DATA_DIR / 'validation.parquet')

# features.json is a nested JSON document, not a table, so it is parsed with
# the json module instead of being forced into a DataFrame by pd.read_json.
with open(DATA_DIR / 'features.json') as features_file:
    features_json = json.load(features_file)
feature_list = features_json['feature_sets']['all']


def _keep_recent_eras(frame, n_back):
    """Return the labelled rows of `frame` from the most recent eras.

    Rows whose `target` is missing are dropped first, then rows with
    `era >= max(era) - n_back` are kept (i.e. `n_back + 1` eras when eras are
    consecutive). The `era` column is compared numerically, so both integer
    eras and zero-padded string eras such as "0123" are handled; the column
    itself is returned unchanged.
    """
    # NOTE(review): the newest validation eras are expected to have unresolved
    # (NaN) targets, which would break scoring later - confirm against the
    # Numerai data docs. Dropping NaNs is a no-op when every row is labelled.
    labelled = frame.dropna(subset=['target'])
    era_number = labelled['era'].astype(int)
    return labelled[era_number >= era_number.max() - n_back]


# keep recent eras for the demo to speed things up
train_df = _keep_recent_eras(train_df, 10)
valid_df = _keep_recent_eras(valid_df, 4)

print('Train shape:', train_df.shape)
print('Valid shape:', valid_df.shape)

## Binning and feature prep

In [None]:
import numpy as np
from packboost.utils.binning import quantile_binning, apply_binning


def _frame_to_arrays(frame):
    """Extract `(features, target, era)` NumPy arrays from a Numerai frame.

    Features and target are cast to float32 and the era column to int32.
    The feature columns are taken from the notebook-level `feature_list`.
    """
    features = frame[feature_list].astype(np.float32).to_numpy()
    target = frame['target'].astype(np.float32).to_numpy()
    eras = frame['era'].astype(np.int32).to_numpy()
    return features, target, eras


X_train, y_train, era_train = _frame_to_arrays(train_df)
X_valid, y_valid, era_valid = _frame_to_arrays(valid_df)

# Fit the bin edges on the training features only, then reuse those edges for
# the validation features.
# NOTE(review): the training cell of this notebook passes the raw
# X_train / X_valid arrays to PackBoost, so the binned arrays below are
# currently unused - confirm whether PackBoost bins internally.
binning_result = quantile_binning(X_train, max_bins=128, random_state=42)
X_train_binned, bin_edges = binning_result
X_valid_binned = apply_binning(X_valid, bin_edges)


## Train PackBoost

In [None]:
from packboost import PackBoost, PackBoostConfig
from packboost.gpu import has_cuda
from sklearn.metrics import r2_score
import time

# Train on the GPU when PackBoost reports CUDA support, otherwise on the CPU.
if has_cuda():
    device = 'cuda'
else:
    device = 'cpu'

# Hyper-parameters for the demo run, collected in one place for easy tweaking.
hyperparams = dict(
    pack_size=8,
    max_depth=5,
    learning_rate=0.05,
    lambda_l2=1.0,
    lambda_dro=0.5,
    max_bins=128,
    min_samples_leaf=32,
    random_state=42,
    layer_feature_fraction=0.5,
    device=device,
)
config = PackBoostConfig(**hyperparams)
booster = PackBoost(config)

# Wall-clock timing of the training call.
start = time.perf_counter()
booster.fit(X_train, y_train, era_train, num_rounds=20)
fit_time = time.perf_counter() - start

# Wall-clock timing of inference on the validation features.
start = time.perf_counter()
preds = booster.predict(X_valid)
pred_time = time.perf_counter() - start

report_lines = (
    f'Device: {device}',
    f'Fit time: {fit_time:.2f}s',
    f'Predict time: {pred_time:.2f}s',
)
for report_line in report_lines:
    print(report_line)

validation_r2 = r2_score(y_valid, preds)
print(f'Validation R^2: {validation_r2:.4f}')


## Upload diagnostics (optional)

In [None]:
# Optional: upload the validation predictions to Numerai diagnostics.
# Uncomment the lines below and provide your credentials to enable the upload.
# Prefer loading the keys from environment variables (or getpass) instead of
# pasting them into the notebook, so they are never saved with it.
# NOTE(review): the CSV below contains only a `prediction` column and no row
# ids - confirm the format the diagnostics upload expects before enabling.
# napi = NumerAPI(public_id='PUBLIC', secret_key='SECRET')
# submission = pd.DataFrame({'prediction': preds}).clip(0, 1)
# submission.to_csv('packboost_preds.csv', index=False)
# napi.upload_diagnostics('packboost_preds.csv', model_id='YOUR_MODEL_ID')
# With the snippet commented out, the cell only reports that nothing was uploaded.
print('Diagnostics upload skipped.')