# NumerAI Colab Training Runner

This notebook is intentionally thin: it captures runtime inputs, safely syncs the fixed GitHub repo, installs dependencies, mounts Google Drive for persistent data when available, loads credentials from Colab Secrets, and starts `src/train_colab.py`.


In [None]:
# What this cell does: captures configuration for the fixed public repository.
import os
import re
from pathlib import Path

# The repository location is fixed; only the ref and checkout directory are configurable.
REPO_URL = "https://github.com/WilliamQD/Numerai-Re.git"
# Optional override: when set, it must be a full 40-character commit SHA.
REPO_REF = os.environ.get("REPO_REF", "").strip()
REPO_DIR = Path(os.environ.get("REPO_DIR", "/content/Numerai-Re"))
# Only the exact string "1" (after trimming whitespace) switches this flag on.
ALLOW_UNPINNED_REF = os.environ.get("ALLOW_UNPINNED_REF", "0").strip() == "1"

# An empty ref is accepted; anything else must look like a complete SHA-1.
if REPO_REF and re.fullmatch(r"[0-9a-fA-F]{40}", REPO_REF) is None:
    raise ValueError("REPO_REF must be a full 40-character commit SHA.")

# Publish the resolved values through the process environment, where the
# sync cell reads the repo settings from.
os.environ.update(
    {
        "REPO_URL": REPO_URL,
        "REPO_REF": REPO_REF,
        "REPO_DIR": str(REPO_DIR),
        "ALLOW_UNPINNED_REF": str(int(ALLOW_UNPINNED_REF)),
    }
)

print(f"REPO_URL={REPO_URL}")
print(f"REPO_REF={REPO_REF or '(main/latest)'}")
print(f"REPO_DIR={REPO_DIR}")


In [None]:
# What this cell does: clones/updates repo safely in Colab, installs dependencies, mounts Drive for persistent data, and loads Colab Secrets into the environment.
import os
import subprocess
import sys
from pathlib import Path

# Repo settings are taken from the process environment. REPO_URL and REPO_DIR
# are required (a missing variable raises KeyError); REPO_REF is optional and
# falls back to an empty string.
repo_url, repo_dir = os.environ["REPO_URL"], os.environ["REPO_DIR"]
repo_ref = os.getenv("REPO_REF", "").strip()

def run(cmd: list[str], cwd: str | None = None) -> None:
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True, cwd=cwd)

def canonical_git_url(url: str) -> str:
    """Normalize a Git remote URL so equivalent spellings compare equal.

    Surrounding whitespace, letter case, trailing slashes and a trailing
    ``.git`` suffix are all ignored.

    Args:
        url: Remote URL as configured or as reported by ``git remote get-url``.

    Returns:
        The lower-cased URL without trailing slashes or a ``.git`` suffix.
    """
    # Trailing slashes must go BEFORE the suffix check; otherwise "repo.git/"
    # keeps its ".git" and fails to match "repo.git".
    trimmed = url.strip().lower().rstrip("/")
    # A second rstrip covers ".../repo/.git", where removing the suffix exposes a slash.
    return trimmed.removesuffix(".git").rstrip("/")

# Sync the working tree: clone on first use; otherwise reuse the existing
# checkout only if its origin matches the expected repository.
if not os.path.isdir(f"{repo_dir}/.git"):
    run(["git", "clone", repo_url, repo_dir])
else:
    current_origin = subprocess.check_output(
        ["git", "-C", repo_dir, "remote", "get-url", "origin"],
        text=True,
    ).strip()
    if canonical_git_url(current_origin) != canonical_git_url(repo_url):
        raise RuntimeError(
            f"Refusing to use existing repo at {repo_dir}: origin is {current_origin!r}, expected {repo_url!r}."
        )

# Refresh remote refs so a pinned commit (or the latest main) is available locally.
run(["git", "-C", repo_dir, "fetch", "--tags", "--prune", "origin"])
if repo_ref:
    # Pinned run: detach HEAD at the requested commit.
    run(["git", "-C", repo_dir, "checkout", "--detach", repo_ref])
else:
    # Unpinned run: follow main, accepting fast-forward updates only.
    # NOTE(review): ALLOW_UNPINNED_REF is exported by the configuration cell but is
    # not consulted here - confirm whether unpinned runs should be gated on it.
    run(["git", "-C", repo_dir, "checkout", "main"])
    run(["git", "-C", repo_dir, "pull", "--ff-only", "origin", "main"])

# Install with the interpreter running this kernel so packages land in the
# environment the notebook imports from; a bare "python" is whatever is first
# on PATH. Fall back to "python" if the interpreter path is unavailable.
python_exe = sys.executable or "python"
run([python_exe, "-m", "pip", "install", "--quiet", "--upgrade", "pip"])
run([python_exe, "-m", "pip", "install", "--quiet", "-r", f"{repo_dir}/requirements-train.txt"])


# Root folder for data that should outlive the runtime; the default lives
# under the Drive mount point used below.
PERSISTENT_ROOT = Path(os.environ.get("PERSISTENT_ROOT", "/content/drive/MyDrive/Numerai-Re"))

# Best effort: if google.colab cannot be imported for any reason, skip mounting.
try:
    from google.colab import drive
except Exception:
    drive = None

if drive is not None:
    drive.mount("/content/drive", force_remount=False)
    PERSISTENT_ROOT.mkdir(parents=True, exist_ok=True)

# Respect a non-blank NUMERAI_DATA_DIR supplied by the caller; otherwise
# default to a dataset folder under PERSISTENT_ROOT and export it.
if os.environ.get("NUMERAI_DATA_DIR", "").strip() == "":
    numerai_data_dir = PERSISTENT_ROOT / "datasets" / "numerai"
    numerai_data_dir.mkdir(parents=True, exist_ok=True)
    os.environ["NUMERAI_DATA_DIR"] = str(numerai_data_dir)
    print(f"Set NUMERAI_DATA_DIR={numerai_data_dir}")

# Best effort: if the Colab secrets API cannot be imported, skip secret loading.
try:
    from google.colab import userdata
except Exception:
    userdata = None

if userdata is not None:
    secret_names = (
        "WANDB_API_KEY",
        "NUMERAI_PUBLIC_ID",
        "NUMERAI_SECRET_KEY",
        "NUMERAI_MODEL_NAME",
        "WANDB_ENTITY",
        "WANDB_PROJECT",
    )
    # Lookup errors treated as "no value for this secret". They are matched by
    # class name, so no exception types have to be imported.
    ignorable_errors = {"SecretNotFoundError", "NotebookAccessError"}
    for name in secret_names:
        # A non-blank environment value takes precedence over the stored secret.
        if os.environ.get(name, "").strip():
            continue
        try:
            raw_value = userdata.get(name)
        except Exception as exc:
            if type(exc).__name__ not in ignorable_errors:
                raise
            raw_value = None
        value = (raw_value or "").strip()
        if not value:
            continue
        os.environ[name] = value
        # Only the secret's name is printed, never its value.
        print(f"Loaded {name} from Colab Secrets for this runtime.")


In [None]:
# What this cell does: starts training from the prepared repo directory.
# `$REPO_DIR` is substituted by IPython with the Python variable REPO_DIR assigned
# in the configuration cell, so that cell must have run in this kernel first.
# NOTE(review): a `!` shell command presumably does not fail the cell when the
# script exits non-zero - confirm by checking the training output for errors.
%cd $REPO_DIR
!python src/train_colab.py
