# NumerAI Colab Training Runner

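This notebook is intentionally thin: it captures runtime inputs, safely syncs the fixed GitHub repo, installs dependencies, mounts Google Drive when available, loads credentials from `.env.colab` or Colab Secrets, copies cached datasets to local disk, and then starts `python -m numerai_re.cli.train_colab` with `PYTHONPATH=src`. Run the cells in order from top to bottom: later cells read the environment variables exported by earlier ones.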


In [None]:
# What this cell does: captures configuration for the fixed public repository.
import os
import re
from pathlib import Path

# Fixed upstream repository; only the ref and the checkout directory are configurable.
REPO_URL = "https://github.com/WilliamQD/Numerai-Re.git"

# Spellings treated as "true" for boolean environment flags (compared case-insensitively).
TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})


def env_flag(name: str, default: str = "0") -> bool:
    """Return True when environment variable `name` holds a truthy value.

    Args:
        name: Environment variable to read.
        default: Raw value used when the variable is unset.

    Returns:
        True if the stripped, lower-cased value is one of 1/true/yes/on.
    """
    return os.getenv(name, default).strip().lower() in TRUTHY_ENV_VALUES


REPO_REF = os.getenv("REPO_REF", "").strip()  # optional: full 40-char commit SHA override
# A blank REPO_DIR falls back to the default instead of resolving to the current directory.
REPO_DIR = Path(os.getenv("REPO_DIR", "").strip() or "/content/Numerai-Re")
ALLOW_UNPINNED_REF = env_flag("ALLOW_UNPINNED_REF")

# Only a full commit SHA is accepted as an override; branch or tag names are rejected.
if REPO_REF and not re.fullmatch(r"[0-9a-fA-F]{40}", REPO_REF):
    raise ValueError("REPO_REF must be a full 40-character commit SHA.")

# Export the normalised values so later cells (and child processes) read the same settings.
os.environ["REPO_URL"] = REPO_URL
os.environ["REPO_REF"] = REPO_REF
os.environ["REPO_DIR"] = str(REPO_DIR)
os.environ["ALLOW_UNPINNED_REF"] = "1" if ALLOW_UNPINNED_REF else "0"

print(f"REPO_URL={REPO_URL}")
print(f"REPO_REF={REPO_REF or '(main/latest)'}")
print(f"REPO_DIR={REPO_DIR}")


In [None]:
# What this cell does: syncs the repo, installs deps, and mounts Drive (if available).
import os
import subprocess
from pathlib import Path

# Settings exported by the configuration cell. REPO_URL and REPO_DIR are required
# (a missing one raises KeyError); REPO_REF is optional and defaults to empty.
repo_url = os.environ["REPO_URL"]
repo_dir = Path(os.environ["REPO_DIR"])
repo_ref = os.getenv("REPO_REF", "").strip()

def run(cmd: list[str], cwd: str | None = None) -> None:
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True, cwd=cwd)

def canonical_git_url(url: str) -> str:
    """Normalise a Git remote URL so equivalent spellings compare equal.

    Surrounding whitespace, trailing slashes and a trailing ".git" are removed and
    the result is lower-cased.

    Args:
        url: Remote URL as configured or as reported by git.

    Returns:
        The normalised URL, intended only for equality comparison.
    """
    # Trailing slashes must go first: otherwise "….git/" does not end with ".git"
    # and the suffix would survive normalisation.
    trimmed = url.strip().rstrip("/")
    return trimmed.removesuffix(".git").rstrip("/").lower()

# Every git call after the initial clone targets the checkout directory through `-C`.
git_in_repo = ["git", "-C", str(repo_dir)]

if (repo_dir / ".git").is_dir():
    # Reuse an existing checkout only when its origin is the expected repository.
    current_origin = subprocess.check_output(
        [*git_in_repo, "remote", "get-url", "origin"],
        text=True,
    ).strip()
    origin_matches = canonical_git_url(current_origin) == canonical_git_url(repo_url)
    if not origin_matches:
        raise RuntimeError(
            f"Refusing to use existing repo at {repo_dir}: origin is {current_origin!r}, expected {repo_url!r}."
        )
else:
    run(["git", "clone", repo_url, str(repo_dir)])

run([*git_in_repo, "fetch", "--tags", "--prune", "origin"])
if not repo_ref:
    # No pinned commit: follow main, allowing fast-forward updates only.
    run([*git_in_repo, "checkout", "main"])
    run([*git_in_repo, "pull", "--ff-only", "origin", "main"])
else:
    # Pinned commit: check it out detached so no local branch is moved.
    run([*git_in_repo, "checkout", "--detach", repo_ref])

import sys  # needed here for the path of the interpreter running this kernel

# Install with the kernel's own interpreter rather than whichever `python` is first on
# PATH, so the packages land in the environment this notebook actually imports from.
run([sys.executable, "-m", "pip", "install", "--quiet", "--upgrade", "pip"])
run([sys.executable, "-m", "pip", "install", "--quiet", "-r", str(repo_dir / "requirements-train.txt")])

# Drive support is optional: if the Colab helper cannot be imported (presumably because
# the notebook is not running on Colab), mounting is skipped instead of failing the cell.
try:
    from google.colab import drive
except Exception:
    drive = None

if drive is not None:
    drive.mount("/content/drive", force_remount=False)

# Supply a local dataset directory only when NUMERAI_DATA_DIR is unset or blank.
data_dir_setting = os.getenv("NUMERAI_DATA_DIR", "")
if data_dir_setting.strip() == "":
    os.environ["NUMERAI_DATA_DIR"] = "/content/numerai_data"


In [None]:
# What this cell does: loads .env.colab (Drive-first), backfills missing secrets from Colab Secrets, and syncs cached data to local disk.
import os
import shutil
from pathlib import Path

# Candidate env files: the Drive copy takes precedence over the one in the repo checkout.
repo_env = Path(os.environ["REPO_DIR"]) / ".env.colab"
drive_env = Path(os.getenv("COLAB_ENV_PATH", "/content/drive/MyDrive/Numerai-Re/.env.colab"))
# is_file() (not exists()) so a directory at either path is skipped instead of crashing
# later in read_text().
env_file = drive_env if drive_env.is_file() else (repo_env if repo_env.is_file() else None)

if env_file:
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order mark, which
    # would otherwise become part of the first key.
    for raw in env_file.read_text(encoding="utf-8-sig").splitlines():
        line = raw.strip()
        # Skip blanks, comments, and anything that is not a KEY=VALUE assignment.
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        # Accept shell-style `export KEY=value` lines by dropping the keyword.
        name_parts = key.split()
        if len(name_parts) == 2 and name_parts[0] == "export":
            key = name_parts[1]
        value = value.strip().strip('"').strip("'")
        # File values overwrite existing variables; empty values never clear one.
        if key and value:
            os.environ[key] = value
    print(f"Loaded env from {env_file}")
else:
    print(f"No env file found at {drive_env} or {repo_env}")

# Colab Secrets are optional: if the helper cannot be imported, backfilling is skipped.
try:
    from google.colab import userdata
except Exception:
    userdata = None

# Variables that may be filled from Colab Secrets when they are not already set.
secret_names = (
    "WANDB_API_KEY",
    "NUMERAI_PUBLIC_ID",
    "NUMERAI_SECRET_KEY",
    "NUMERAI_MODEL_NAME",
    "WANDB_ENTITY",
    "WANDB_PROJECT",
)
# Exception class names that mean "this secret is not available"; matched by name so
# the exception classes themselves do not have to be imported.
unavailable_secret_errors = {"SecretNotFoundError", "NotebookAccessError"}

if userdata is not None:
    for key in secret_names:
        already_set = bool(os.getenv(key, "").strip())
        if already_set:
            continue
        try:
            secret = userdata.get(key)
        except Exception as exc:
            # Anything other than a missing or inaccessible secret is a real failure.
            if type(exc).__name__ not in unavailable_secret_errors:
                raise
            secret = None
        value = (secret or "").strip()
        if value:
            os.environ[key] = value

def copy_if_missing(source: Path, target: Path) -> bool:
    """Copy `source` to `target` unless the source is absent or the target exists.

    The data is first written to a temporary sibling file and then renamed into
    place, so an interrupted copy does not leave a truncated file at `target`
    that later runs would mistake for a complete one.

    Args:
        source: File to copy from.
        target: Destination path; parent directories are created as needed.

    Returns:
        True if a copy was made, False if nothing needed to be done.
    """
    if not source.exists() or target.exists():
        return False
    target.parent.mkdir(parents=True, exist_ok=True)
    partial = target.with_name(target.name + ".part")
    try:
        shutil.copy2(source, partial)
        partial.replace(target)
    finally:
        # No-op after a successful rename; removes the leftover when the copy failed.
        partial.unlink(missing_ok=True)
    return True


dataset_version = os.getenv("NUMERAI_DATASET_VERSION", "v5.2").strip() or "v5.2"
local_root = Path(os.getenv("NUMERAI_DATA_DIR", "/content/numerai_data"))
drive_root = Path(os.getenv("COLAB_DRIVE_DATA_ROOT", "/content/drive/MyDrive/Numerai-Re/datasets/numerai"))
# Syncing is on by default; set COLAB_SYNC_DATA_FROM_DRIVE to a non-truthy value to skip it.
if os.getenv("COLAB_SYNC_DATA_FROM_DRIVE", "true").strip().lower() in {"1", "true", "yes", "on"}:
    src = drive_root / dataset_version
    dst = local_root / dataset_version
    # Nothing to do when Drive has no copy of this version or both roots are the same path.
    if src.exists() and src != dst:
        copied = 0
        for rel in (
            "train.parquet",
            "validation.parquet",
            "features.json",
            "benchmarks/train_benchmark_models.parquet",
            "benchmarks/validation_benchmark_models.parquet",
            "benchmarks/live_benchmark_models.parquet",
        ):
            if copy_if_missing(src / rel, dst / rel):
                copied += 1
        print(f"Dataset sync copied={copied} src={src} dst={dst}")

print(f"NUMERAI_DATA_DIR={os.getenv('NUMERAI_DATA_DIR', '')}")
print(f"LOAD_MODE={os.getenv('LOAD_MODE', '')}")


In [None]:
# What this cell does: starts training from the prepared repo directory.
# `%cd` changes the kernel's working directory. `$REPO_DIR` is presumably expanded by
# IPython from the notebook variable REPO_DIR set in the first cell — TODO confirm it
# still resolves when only some of the earlier cells were re-run.
%cd $REPO_DIR
# PYTHONPATH=src is relative to the directory entered above, so the `numerai_re`
# package is expected to live under <repo>/src.
!PYTHONPATH=src python -m numerai_re.cli.train_colab
