[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Pranshu-Bahadur/PackBoost/blob/main/notebooks/numerai_gpu_demo.ipynb)

# PackBoost Numerai Demo

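This notebook downloads the Numerai v5.0 training and validation data, keeps
only the most recent eras, converts the features to NumPy arrays, and trains
PackBoost on the GPU when CUDA is available (falling back to the CPU
otherwise). It is designed for Google Colab – adjust the installation cell if
you prefer a local environment.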

In [None]:
#@title Install dependencies (edit REPO_URL if you forked the project)
REPO_URL = "https://github.com/Pranshu-Bahadur/PackBoost.git"  # TODO: update if needed
FORCE_CPU_BACKEND = False  # Set True to skip CUDA build (uses PACKBOOST_DISABLE_CUDA)

from pathlib import Path
import os
cuda_bin = Path('/usr/local/cuda/bin')
if cuda_bin.exists() and str(cuda_bin) not in os.environ['PATH']:
    os.environ['PATH'] = os.environ['PATH'] + os.pathsep + str(cuda_bin)
!pip install -q numerapi pandas~=2.2 pyarrow~=15.0
!pip install -q catboost lightgbm xgboost

if FORCE_CPU_BACKEND:
    os.environ['PACKBOOST_DISABLE_CUDA'] = '1'
else:
    os.environ.setdefault('PACKBOOST_DISABLE_CUDA', '0')

# Build native frontier backend
!pip install -q pybind11
%cd /content
if not (Path('PackBoost').exists()):
    !git clone {REPO_URL} PackBoost
%cd PackBoost
!python3 setup_native.py build_ext --inplace
!pip install -q -e .
import sys; sys.path.append('/content/PackBoost')



## Download Numerai data
This cell pulls the v5.0 training and validation data together with the
feature metadata (`features.json`). API keys are only needed if you plan to
upload diagnostics at the end.

In [None]:
from numerapi import NumerAPI
import json
import pandas as pd
from pathlib import Path

DATA_DIR = Path("numerai-data")
DATA_DIR.mkdir(exist_ok=True)

# Number of eras kept *before* the latest one in each split (demo speed-up).
TRAIN_ERA_LOOKBACK = 10
VALID_ERA_LOOKBACK = 4

napi = NumerAPI()
napi.download_dataset('v5.0/train.parquet', str(DATA_DIR / 'train.parquet'))
napi.download_dataset('v5.0/validation.parquet', str(DATA_DIR / 'validation.parquet'))
napi.download_dataset('v5.0/features.json', str(DATA_DIR / 'features.json'))

train_df = pd.read_parquet(DATA_DIR / 'train.parquet')
valid_df = pd.read_parquet(DATA_DIR / 'validation.parquet')

# features.json is nested metadata rather than a table, so parse it as plain
# JSON; only the ['feature_sets']['all'] list is needed here.
with open(DATA_DIR / 'features.json') as fh:
    features_json = json.load(fh)
feature_list = features_json['feature_sets']['all']

# Rows without a target cannot be trained on or scored (r2_score rejects NaN),
# so drop them before picking eras. This is a no-op when every row is labelled.
train_df = train_df.dropna(subset=['target'])
valid_df = valid_df.dropna(subset=['target'])

# keep recent eras for the demo to speed things up
# The era labels are cast to int for the arithmetic so the comparison works
# whether the column is stored as integers or as numeric strings.
train_eras = train_df['era'].astype(int)
train_df = train_df[train_eras >= train_eras.max() - TRAIN_ERA_LOOKBACK]
valid_eras = valid_df['era'].astype(int)
valid_df = valid_df[valid_eras >= valid_eras.max() - VALID_ERA_LOOKBACK]

print('Train shape:', train_df.shape)
print('Valid shape:', valid_df.shape)

## Binning and feature prep

In [None]:
import numpy as np
from packboost import PackBoost, PackBoostConfig

def _frame_to_arrays(frame):
    """Return ``(features, target, era)`` NumPy arrays extracted from ``frame``.

    Features are taken from the ``feature_list`` columns and cast to int8, the
    ``target`` column is cast to float32 and the ``era`` column to int16.
    """
    feature_block = frame[feature_list].astype(np.int8).to_numpy()
    target_vec = frame['target'].astype(np.float32).to_numpy()
    era_vec = frame['era'].astype(np.int16).to_numpy()
    return feature_block, target_vec, era_vec


# Same conversion for both splits.
X_train, y_train, era_train = _frame_to_arrays(train_df)
X_valid, y_valid, era_valid = _frame_to_arrays(valid_df)


## Train PackBoost

In [None]:

from typing import List
import numpy as np

class LoggingCallback:
    """Per-round callback that prints and records correlation metrics.

    On every round whose index is a multiple of ``every`` the callback prints
    the metrics found in ``info`` and appends one row to ``self.logs``. Other
    rounds are ignored entirely. The instance can be invoked either as
    ``callback.on_round(booster, info)`` or as ``callback(booster, info)``.

    Parameters
    ----------
    every:
        Logging interval in rounds; values below 1 are clamped to 1.
    print_train:
        When True, the training correlation is included in the printed line.
    """

    # Column order of the frame returned by ``to_frame``.
    _LOG_COLUMNS = ("round", "train_corr", "valid_corr")

    def __init__(self, every: int = 5, print_train: bool = False) -> None:
        self.every = max(1, int(every))
        self.print_train = print_train
        self.logs: List[dict] = []

    @staticmethod
    def _as_float(value, default: float) -> float:
        """Convert ``value`` to float, using ``default`` when it is None."""
        return float(default) if value is None else float(value)

    def _record(self, info: dict) -> None:
        """Append one log row built from ``info``.

        A metric that is absent *or* explicitly ``None`` falls back to its
        default (0.0 for ``train_corr``, NaN for ``valid_corr``), so a round
        without validation data no longer raises ``TypeError``.
        """
        self.logs.append(
            {
                "round": int(info.get("round", 0)),
                "train_corr": self._as_float(info.get("train_corr"), 0.0),
                "valid_corr": self._as_float(info.get("valid_corr"), np.nan),
            }
        )

    def on_round(self, booster, info: dict) -> None:
        """Print and record the metrics in ``info`` for a logged round.

        ``booster`` is accepted for the callback signature but not used.
        ``info`` is read for the keys ``round``, ``train_corr`` and
        ``valid_corr``; all three are optional.
        """
        round_idx = int(info.get("round", 0))
        if round_idx % self.every != 0:
            return
        train_corr = self._as_float(info.get("train_corr"), 0.0)
        valid_corr = info.get("valid_corr")

        metrics_parts: List[str] = []
        if self.print_train and np.isfinite(train_corr):
            metrics_parts.append(f"train corr = {train_corr:.4f}")
        if valid_corr is not None:
            if np.isfinite(valid_corr):
                metrics_parts.append(f"valid corr = {float(valid_corr):.4f}")
            else:
                metrics_parts.append("valid corr = nan")
        elif self.print_train and not metrics_parts:
            # No validation metric and a non-finite train metric: still print
            # the train value so the round shows up in the output.
            metrics_parts.append(f"train corr = {train_corr:.4f}")

        if metrics_parts:
            print(f"Round {round_idx}: {', '.join(metrics_parts)}")
        self._record(info)

    __call__ = on_round

    def to_frame(self):
        """Return the recorded rows as a DataFrame.

        The frame always has the columns ``round``, ``train_corr`` and
        ``valid_corr``, even when nothing has been logged yet.
        """
        import pandas as pd
        return pd.DataFrame(self.logs, columns=list(self._LOG_COLUMNS))



In [None]:

from packboost import PackBoost, PackBoostConfig
from packboost import backends as pb_backends
from packboost.backends import cuda_available
from sklearn.metrics import r2_score
import time

# Use the CUDA device when the backend reports it as available, else the CPU.
device = 'cuda' if cuda_available() else 'cpu'
config = PackBoostConfig(
    pack_size=8,
    max_depth=5,
    learning_rate=0.05,
    lambda_l2=1.0,
    lambda_dro=0.5,
    max_bins=128,
    min_samples_leaf=32,
    random_state=42,
    layer_feature_fraction=0.5,
    device=device,
)

booster = PackBoost(config)
callback = LoggingCallback(every=5, print_train=True)
start = time.perf_counter()
booster.fit(
    X_train,
    y_train,
    era_train,
    num_rounds=20,
    eval_set=(X_valid, y_valid, era_valid),
    callbacks=[callback],
)
fit_time = time.perf_counter() - start

start = time.perf_counter()
preds = booster.predict(X_valid)
# Stop the clock immediately after predict() so the reported time covers the
# prediction call only, not the metric bookkeeping below.
pred_time = time.perf_counter() - start

# Per-round metrics collected by the callback (used by the plotting cell).
metrics = callback.to_frame()

native_frontier = getattr(pb_backends, 'cuda_frontier_evaluate', None) is not None
print(f'CUDA frontier available: {native_frontier}')
# _use_gpu is a private attribute of the booster; fall back to a placeholder
# rather than failing the whole cell if it is absent.
print(f'Using GPU frontier: {getattr(booster, "_use_gpu", "unknown")}')
print(f'Device requested: {device}')
print(f'Fit time: {fit_time:.2f}s')
print(f'Predict time: {pred_time:.2f}s')
print(f'Validation R^2: {r2_score(y_valid, preds):.4f}')



## Upload diagnostics (optional)

In [None]:
# Optional upload of the validation predictions to Numerai diagnostics.
# The steps are left commented out so the notebook runs end-to-end without
# credentials; uncomment them and fill in the placeholders to enable the upload.
# NOTE(review): prefer reading the keys from environment variables or
# getpass.getpass() instead of pasting them into the notebook.
# NOTE(review): the CSV below is written with index=False from a frame that
# only has a 'prediction' column, so it carries no row identifiers — confirm
# the file format Numerai expects before uploading.
# Provide your credentials to upload to Numerai
# napi = NumerAPI(public_id='PUBLIC', secret_key='SECRET')
# submission = pd.DataFrame({'prediction': preds}).clip(0, 1)
# submission.to_csv('packboost_preds.csv', index=False)
# napi.upload_diagnostics('packboost_preds.csv', model_id='YOUR_MODEL_ID')
print('Diagnostics upload skipped.')

In [None]:
# Plot validation correlation over rounds
import matplotlib.pyplot as plt
if not metrics.empty:
    # The callback logs the validation metric under 'valid_corr' (there is no
    # 'corr' column), so that is the series to plot.
    fig, ax = plt.subplots()
    metrics.plot(x='round', y='valid_corr', marker='o', ax=ax)
    ax.set_title('Validation correlation per round')
    ax.set_xlabel('Trees built')
    ax.set_ylabel('Correlation')
    ax.grid(True)
    plt.show()
else:
    print('No metrics logged (adjust callback frequency).')
