# NumerAI Colab Training Runner

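This notebook is intentionally thin: it captures runtime inputs, safely syncs the fixed GitHub repo, installs dependencies, mounts Google Drive and loads Colab Secrets plus an optional `.env.colab` file, optionally syncs the dataset cache from Drive, and then starts `python -m numerai_re.cli.train_colab` with `PYTHONPATH=src`.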


In [None]:
# What this cell does: captures configuration for the fixed public repository.
import os
import re
from pathlib import Path

# The repository URL is fixed; only the ref, the checkout directory and the
# unpinned-ref flag can be overridden through environment variables.
REPO_URL = "https://github.com/WilliamQD/Numerai-Re.git"
REPO_REF = os.environ.get("REPO_REF", "").strip()  # optional: full 40-char commit SHA override
REPO_DIR = Path(os.environ.get("REPO_DIR", "/content/Numerai-Re"))
ALLOW_UNPINNED_REF = os.environ.get("ALLOW_UNPINNED_REF", "0").strip() == "1"

# An override, when given, has to be a complete commit hash.
if REPO_REF:
    if re.fullmatch(r"[0-9a-fA-F]{40}", REPO_REF) is None:
        raise ValueError("REPO_REF must be a full 40-character commit SHA.")

# Publish the normalized values so later cells can read them from the environment.
os.environ.update(
    {
        "REPO_URL": REPO_URL,
        "REPO_REF": REPO_REF,
        "REPO_DIR": str(REPO_DIR),
        "ALLOW_UNPINNED_REF": "1" if ALLOW_UNPINNED_REF else "0",
    }
)

print(
    f"REPO_URL={REPO_URL}",
    f"REPO_REF={REPO_REF or '(main/latest)'}",
    f"REPO_DIR={REPO_DIR}",
    sep="\n",
)


In [None]:
# What this cell does: clones/updates the repo safely in Colab, installs dependencies, mounts Google Drive, and loads Colab Secrets into the environment.
import os
import subprocess
import sys
from pathlib import Path

# Inputs for this cell are read back from the process environment.
# REPO_URL and REPO_DIR are mandatory (KeyError if absent); REPO_REF is optional.
repo_url, repo_dir = os.environ["REPO_URL"], os.environ["REPO_DIR"]
repo_ref = os.getenv("REPO_REF", "").strip()

def run(cmd: list[str], cwd: str | None = None) -> None:
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True, cwd=cwd)

def canonical_git_url(url: str) -> str:
    """Normalize a Git remote URL so equivalent spellings compare equal.

    Surrounding whitespace, letter case, trailing slashes and a trailing
    ``.git`` suffix are all ignored, so ``https://host/Org/Repo.git/`` and
    ``https://host/org/repo`` produce the same result.

    Args:
        url: Remote URL as configured or as reported by ``git remote get-url``.

    Returns:
        The lower-cased URL without trailing slashes or a ``.git`` suffix.
    """
    # Lower-case and drop trailing slashes first; otherwise a URL ending in
    # ".git/" or ".GIT" would keep its suffix and fail the comparison.
    normalized = url.strip().lower().rstrip("/")
    return normalized.removesuffix(".git").rstrip("/")

# Clone on first use; otherwise refuse to touch a checkout whose origin is not
# the expected repository.
if not os.path.isdir(f"{repo_dir}/.git"):
    run(["git", "clone", repo_url, repo_dir])
else:
    current_origin = subprocess.check_output(
        ["git", "-C", repo_dir, "remote", "get-url", "origin"],
        text=True,
    ).strip()
    if canonical_git_url(current_origin) != canonical_git_url(repo_url):
        raise RuntimeError(
            f"Refusing to use existing repo at {repo_dir}: origin is {current_origin!r}, expected {repo_url!r}."
        )

# Bring remote refs up to date, then check out the pinned commit if one was
# given, or fast-forward main otherwise (--ff-only fails on divergence).
# NOTE(review): ALLOW_UNPINNED_REF is exported by the configuration cell but is
# not consulted here — confirm whether the unpinned path should be gated on it.
run(["git", "-C", repo_dir, "fetch", "--tags", "--prune", "origin"])
if repo_ref:
    run(["git", "-C", repo_dir, "checkout", "--detach", repo_ref])
else:
    run(["git", "-C", repo_dir, "checkout", "main"])
    run(["git", "-C", repo_dir, "pull", "--ff-only", "origin", "main"])

# Invoke pip through the interpreter running this kernel so packages land in
# the environment that will import them, not whichever "python" is first on PATH.
run([sys.executable, "-m", "pip", "install", "--quiet", "--upgrade", "pip"])
run([sys.executable, "-m", "pip", "install", "--quiet", "-r", f"{repo_dir}/requirements-train.txt"])


# Root folder for persistent artifacts; defaults to a Google Drive path.
PERSISTENT_ROOT = Path(os.environ.get("PERSISTENT_ROOT", "/content/drive/MyDrive/Numerai-Re"))

# If google.colab cannot be imported, the Drive mount below is skipped.
try:
    from google.colab import drive
except Exception:
    drive = None

if drive is not None:
    drive.mount("/content/drive", force_remount=False)
    PERSISTENT_ROOT.mkdir(parents=True, exist_ok=True)

# Respect an explicit NUMERAI_DATA_DIR; otherwise default it to a folder under
# PERSISTENT_ROOT and export it to the environment.
data_dir_is_preset = bool(os.environ.get("NUMERAI_DATA_DIR", "").strip())
if not data_dir_is_preset:
    numerai_data_dir = PERSISTENT_ROOT.joinpath("datasets", "numerai")
    numerai_data_dir.mkdir(parents=True, exist_ok=True)
    os.environ["NUMERAI_DATA_DIR"] = str(numerai_data_dir)
    print(f"Set NUMERAI_DATA_DIR={numerai_data_dir}")

# If google.colab cannot be imported, secret loading is skipped.
try:
    from google.colab import userdata
except Exception:
    userdata = None

# Exception class names treated as "this secret is unavailable"; anything else is re-raised.
ignorable_secret_errors = {"SecretNotFoundError", "NotebookAccessError"}

if userdata is not None:
    secret_names = (
        "WANDB_API_KEY",
        "NUMERAI_PUBLIC_ID",
        "NUMERAI_SECRET_KEY",
        "NUMERAI_MODEL_NAME",
        "WANDB_ENTITY",
        "WANDB_PROJECT",
    )
    for name in secret_names:
        # A non-blank value already in the environment takes precedence.
        already_set = bool(os.environ.get(name, "").strip())
        if already_set:
            continue
        try:
            value = (userdata.get(name) or "").strip()
        except Exception as exc:
            if type(exc).__name__ not in ignorable_secret_errors:
                raise
            value = ""
        if not value:
            continue
        os.environ[name] = value
        # Only the secret's name is printed, never its value.
        print(f"Loaded {name} from Colab Secrets for this runtime.")


In [None]:
# What this cell does: loads optional .env.colab (Drive-first) and optionally syncs dataset cache to local disk.
import os
import shutil
from pathlib import Path

repo_env_path = Path(os.environ["REPO_DIR"]) / ".env.colab"
drive_env_path = Path(os.getenv("COLAB_ENV_PATH", "/content/drive/MyDrive/Numerai-Re/.env.colab"))


def parse_env_line(raw_line: str) -> tuple[str, str] | None:
    """Parse one ``KEY=VALUE`` line of an env file.

    Args:
        raw_line: A single line of the file, without its line terminator.

    Returns:
        ``(key, value)`` for an assignment, or ``None`` for blank lines,
        ``#`` comments, lines without ``=`` and lines with an empty key.
    """
    line = raw_line.strip()
    if not line or line.startswith("#") or "=" not in line:
        return None
    key, value = line.split("=", 1)
    key = key.strip()
    if not key:
        return None
    value = value.strip()
    # Remove exactly one pair of matching surrounding quotes. Stripping quote
    # characters unconditionally would corrupt values that legitimately begin
    # or end with a quote (e.g. a secret ending in an apostrophe).
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
        value = value[1:-1]
    return key, value


# The Drive copy takes precedence over the one inside the repo checkout.
candidate_paths = [drive_env_path, repo_env_path]
loaded_path = next((path for path in candidate_paths if path.exists()), None)

if loaded_path is not None:
    loaded = 0
    # utf-8-sig drops a leading byte-order mark so it cannot end up glued to
    # the first key; BOM-less UTF-8 files are read exactly as before.
    for raw_line in loaded_path.read_text(encoding="utf-8-sig").splitlines():
        parsed = parse_env_line(raw_line)
        if parsed is None:
            continue
        key, value = parsed
        # Values from the file overwrite anything already in the environment.
        os.environ[key] = value
        loaded += 1
    print(f"Loaded {loaded} env vars from {loaded_path}")
else:
    print(
        f"No env file found. Checked {drive_env_path} and {repo_env_path}; using defaults + existing env."
    )

dataset_version = os.getenv("NUMERAI_DATASET_VERSION", "v5.2").strip() or "v5.2"
local_data_root = Path(os.getenv("NUMERAI_DATA_DIR", "/content/numerai_data"))
drive_data_root = Path(os.getenv("COLAB_DRIVE_DATA_ROOT", "/content/drive/MyDrive/Numerai-Re/datasets/numerai"))
sync_from_drive = (os.getenv("COLAB_SYNC_DATA_FROM_DRIVE", "true").strip().lower() in {"1", "true", "yes", "on"})

src_version_dir = drive_data_root / dataset_version
dst_version_dir = local_data_root / dataset_version


def copy_file_atomically(src_path: Path, dst_path: Path) -> None:
    """Copy ``src_path`` to ``dst_path`` so that ``dst_path`` is never partial.

    The data is written to a ``.partial`` sibling and moved into place with
    ``os.replace`` only after the copy has finished. An interrupted copy
    therefore cannot leave a truncated file under the final name.

    Args:
        src_path: Existing file to copy.
        dst_path: Final location; its parent directory must already exist.
    """
    partial_path = dst_path.with_name(dst_path.name + ".partial")
    try:
        shutil.copy2(src_path, partial_path)
        os.replace(partial_path, dst_path)
    finally:
        # No-op after a successful replace; cleans up after a failed copy.
        partial_path.unlink(missing_ok=True)


def sync_dataset_files(src_dir: Path, dst_dir: Path, rel_paths: list[str]) -> tuple[int, int]:
    """Copy the listed files from ``src_dir`` into ``dst_dir`` when missing.

    Args:
        src_dir: Directory holding the cached files.
        dst_dir: Directory to populate.
        rel_paths: File paths relative to both directories.

    Returns:
        ``(copied, reused)``: files copied now, and files that already existed
        at the destination. Paths missing from ``src_dir`` count as neither.
    """
    copied = 0
    reused = 0
    for rel in rel_paths:
        src_path = src_dir / rel
        if not src_path.exists():
            continue
        dst_path = dst_dir / rel
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        if dst_path.exists():
            reused += 1
            continue
        copy_file_atomically(src_path, dst_path)
        copied += 1
    return copied, reused


if sync_from_drive and src_version_dir.exists() and dst_version_dir != src_version_dir:
    required_rel_paths = [
        "train.parquet",
        "validation.parquet",
        "features.json",
        "benchmarks/train_benchmark_models.parquet",
        "benchmarks/validation_benchmark_models.parquet",
        "benchmarks/live_benchmark_models.parquet",
    ]
    copied, reused = sync_dataset_files(src_version_dir, dst_version_dir, required_rel_paths)
    print(
        f"Dataset sync: source={src_version_dir} target={dst_version_dir} copied={copied} reused={reused}"
    )
else:
    print(
        f"Dataset sync skipped (enabled={sync_from_drive}, source_exists={src_version_dir.exists()}, same_path={dst_version_dir == src_version_dir})"
    )

# These two are always shown with the resolved paths used above, even when
# the corresponding variable is not set in the environment.
resolved_settings = {
    "COLAB_ENV_PATH": drive_env_path,
    "COLAB_DRIVE_DATA_ROOT": drive_data_root,
}

for key in (
    "COLAB_ENV_PATH",
    "COLAB_DRIVE_DATA_ROOT",
    "COLAB_SYNC_DATA_FROM_DRIVE",
    "NUMERAI_DATA_DIR",
    "LOAD_MODE",
    "STATUS_UPDATE_SECONDS",
    "MAX_FEATURES_PER_MODEL",
):
    if key in resolved_settings:
        print(f"{key}={resolved_settings[key]}")
        continue
    # Everything else is echoed only when it has a non-blank value.
    if os.getenv(key, "").strip():
        print(f"{key}={os.environ[key]}")


In [None]:
# What this cell does: starts training from the prepared repo directory.
# Change into the checkout first so the relative PYTHONPATH below resolves against it.
# NOTE(review): $REPO_DIR is presumably expanded by IPython from the REPO_DIR variable
# set in the configuration cell — confirm that cell has been run in this kernel.
%cd $REPO_DIR
# Launch the training CLI as a module, with PYTHONPATH=src set for that process only.
!PYTHONPATH=src python -m numerai_re.cli.train_colab
