# Train Gated Fusion MLP (Colab)
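This notebook trains one model family (`gated_fusion_mlp`) across every combination of strategies (`concat`, `sum_pool`, `max_pool`) and horizons (`7`, `30`) — 6 combinations in total — and uploads snapshots to S3.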


### 1) Runtime
Set runtime to **GPU** (`Runtime` → `Change runtime type` → GPU).


In [None]:
# Upgrade pip itself first, then quietly (-q) install the packages listed below.
!pip -q install --upgrade pip
!pip -q install boto3 pandas numpy pyarrow scikit-learn torch joblib


### 2) Bootstrap repo + credentials from Colab Secrets


In [None]:
import os
import subprocess
from pathlib import Path

# `userdata` provides the Colab Secrets lookup used by `_secret`; if it cannot be
# imported, stop immediately with an explicit message and keep the original
# error attached as the cause.
try:
    from google.colab import userdata
except Exception as import_error:
    raise RuntimeError("This notebook must run in Google Colab") from import_error

def _secret(name: str, default: str = "") -> str:
    """Return the Colab secret ``name`` with surrounding whitespace removed.

    Args:
        name: Key passed to ``userdata.get``.
        default: Value used when the lookup raises or yields an empty result.

    Returns:
        The stripped secret, or the stripped ``default`` as a fallback.
    """
    try:
        found = userdata.get(name)
    except Exception:
        # Best-effort lookup: any failure is treated as "secret not set".
        return default.strip()
    resolved = found if found else default
    return resolved.strip()

# ===== User-editable settings =====
REPO_SLUG = "<ORG>/<REPO>"  # e.g. Western-Artificial-Intelligence/video-virality-predictor
BRANCH = "main"
REPO_DIR = Path("/content/video-virality-predictor")
# ==================================

# Credentials are read through `_secret`, which returns "" when a secret is absent.
AWS_ACCESS_KEY_ID = _secret("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = _secret("AWS_SECRET_ACCESS_KEY")
GITHUB_TOKEN = _secret("GITHUB_TOKEN", "")  # optional; needed for private repos

# Both AWS values are mandatory; the GitHub token is not checked here.
if not (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY):
    raise ValueError("Missing Colab secrets: AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY")

# A slug that still begins with "<" is the unedited "<ORG>/<REPO>" placeholder.
if REPO_SLUG.startswith("<"):
    raise ValueError("Set REPO_SLUG first (e.g. org/repo)")

# Build the clone URL. When a GitHub token is available it is embedded in the
# URL as HTTP credentials (user "x-access-token") so private repos can be cloned.
clone_url = f"https://github.com/{REPO_SLUG}.git"
if GITHUB_TOKEN:
    clone_url = f"https://x-access-token:{GITHUB_TOKEN}@github.com/{REPO_SLUG}.git"

# Clone only on the first run; later runs reuse the existing checkout.
if not REPO_DIR.exists():
    try:
        subprocess.run(["git", "clone", clone_url, str(REPO_DIR)], check=True)
    except subprocess.CalledProcessError as exc:
        # The CalledProcessError message includes the full command line, i.e. the
        # token-bearing URL. Re-raise without it (and without the chained
        # context) so the secret is not written into the notebook output.
        raise RuntimeError(
            f"git clone of {REPO_SLUG} failed with exit code {exc.returncode}"
        ) from None

# Refresh remotes and switch to the requested branch; failures here are fatal.
subprocess.run(["git", "-C", str(REPO_DIR), "fetch", "--all"], check=True)
subprocess.run(["git", "-C", str(REPO_DIR), "checkout", BRANCH], check=True)

# The fast-forward pull stays best-effort (it never aborts the notebook), but a
# failure is reported instead of being silently ignored.
pull_result = subprocess.run(
    ["git", "-C", str(REPO_DIR), "pull", "--ff-only"], check=False
)
if pull_result.returncode != 0:
    print(
        f"WARNING: 'git pull --ff-only' exited with code {pull_result.returncode}; "
        "the local checkout may not match the remote branch."
    )

# Work from the repo checkout so later relative paths resolve against it.
os.chdir(REPO_DIR)

# Export the AWS credentials to this process's environment (inherited by any
# subprocess started afterwards).
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
        "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
    }
)

print(f"cwd: {os.getcwd()}")
print(f"repo ready on branch: {BRANCH}")


### 3) Training parameters


In [None]:
import os

# --- What to train ---
MODEL_FAMILY = "gated_fusion_mlp"
STRATEGIES = "concat,sum_pool,max_pool"  # comma-separated list
HORIZONS = "7,30"  # comma-separated list
RUN_ID = ""  # optional; leave blank to auto-generate

# --- Where snapshots go ---
S3_BUCKET = "clipfarm-prod-us-west-2"
# NOTE(review): the bucket name ends in "us-west-2" while the region below is
# "ca-central-1" — confirm the bucket really lives in this region.
AWS_REGION = "ca-central-1"
SNAPSHOT_PREFIX = "clipfarm/models/snapshots"

# --- Training hyperparameters ---
SEED = 42
MAX_EPOCHS = 40
PATIENCE = 6
PROJECTOR_DIM = 128
RANK_METRIC = "rmse_log"

# Refuse to continue without a destination bucket.
if not S3_BUCKET:
    raise ValueError("S3_BUCKET is required")

# Publish bucket and region through the process environment as well.
os.environ.update({"S3_BUCKET": S3_BUCKET, "AWS_REGION": AWS_REGION})


### 4) Run model-family matrix (6 outputs)


In [None]:
import os
import subprocess
import sys

# Runner script path; it is relative, so it resolves against the current
# working directory.
runner = "Super_Predict/run_model_colab_matrix.py"

# (flag, value) pairs in the order they are passed; numeric settings are
# converted to strings because subprocess arguments must be text.
options = [
    ("--model_family", MODEL_FAMILY),
    ("--s3_bucket", S3_BUCKET),
    ("--s3_region", AWS_REGION),
    ("--snapshot_prefix", SNAPSHOT_PREFIX),
    ("--strategies", STRATEGIES),
    ("--horizons", HORIZONS),
    ("--seed", str(SEED)),
    ("--max_epochs", str(MAX_EPOCHS)),
    ("--patience", str(PATIENCE)),
    ("--projector_dim", str(PROJECTOR_DIM)),
    ("--rank_metric", RANK_METRIC),
]

# Only pass --run_id when a non-blank value was supplied.
run_id = RUN_ID.strip()
if run_id:
    options.append(("--run_id", run_id))

# Launch the runner with the same interpreter that executes this notebook.
cmd = [sys.executable, runner]
for flag, value in options:
    cmd.extend((flag, value))

print("Running:", " ".join(cmd))
subprocess.run(cmd, check=True)
