# Tic-Tac-Toe REINFORCE (Colab)

This notebook mirrors `scripts/ttt_reinforce.py` with smaller defaults for Colab.
Use a GPU runtime (T4/A100) for vLLM + training.


In [None]:
# Clone the nanorl repository and install its packages.
# Replace RedTachyon with your GitHub org/user (e.g. when cloning your own fork).
!git clone https://github.com/RedTachyon/nanorl.git
# Change into the checkout so the relative package paths below resolve.
%cd nanorl
# Editable (-e) installs of both packages from the checkout; -q reduces pip output.
!pip -q install -e packages/gyllm -e packages/nanorl


In [None]:
import os

# Set in the process environment before vllm is imported further down.
# NOTE(review): presumably this stops vLLM from installing its own logging
# configuration and has to be in place at import time — confirm against vLLM docs.
os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

import torch
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import SamplingParams

import gyllm
from nanorl.rl import compute_reinforce_loss
from nanorl.rollout import rollout_autoreset_batched
from nanorl.rollout.reporting import summarize_rollouts
from nanorl.rollout import NanoLLM
from gyllm.envs import AutoResetWrapper


In [None]:
# Small defaults for Colab. Adjust as needed.
# These names are notebook globals read by the setup and training cells.
model_id = "Qwen/Qwen3-0.6B"  # identifier passed to the tokenizer/model from_pretrained calls
num_envs = 2  # number of environments requested from gyllm.make
episodes = 4 * num_envs  # passed as max_episodes to each rollout collection
num_updates = 2  # number of collect-then-train iterations
lr = 1e-5  # AdamW learning rate
max_grad_norm = 1.0  # threshold for gradient-norm clipping before each optimizer step
minibatch_size = 2  # number of rollouts per compute_reinforce_loss call

# Initialise wandb in disabled mode.
# NOTE(review): presumably so wandb calls made by the library code become no-ops
# and no login is required in Colab — confirm.
wandb.init(mode="disabled")


In [None]:
# Build everything the training loop needs: tokenizer, policy model, sampling
# engine, environment, sampling parameters and optimizer.

# Use the GPU with bfloat16 weights when available, otherwise CPU with float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(model_id, dtype=dtype)
model.to(device)
# Rebind `device` from the string above to the torch.device the parameters live on.
device = next(model.parameters()).device

# NOTE(review): NanoLLM presumably wraps the same `model` in a vLLM-backed
# generation engine; enable_sleep_mode is what the training loop relies on when
# it calls llm.sleep()/llm.wake_up(), and gpu_memory_utilization=0.4 presumably
# leaves GPU memory free for training — confirm against nanorl.rollout.
llm = NanoLLM(
    model,
    tokenizer=tokenizer,
    gpu_memory_utilization=0.4,
    enable_sleep_mode=True,
)

# Tic-tac-toe environments configured with a "random" opponent.
env = gyllm.make(
    "simple/tic_tac_toe",
    env_kwargs={"opponent": "random"},
    num_envs=num_envs,
)
# NOTE(review): presumably restarts finished episodes automatically, as required
# by rollout_autoreset_batched — confirm against gyllm.envs.
env = AutoResetWrapper(env)

# Sample at temperature 1.0 with at most 20 generated tokens per request.
sampling_params = SamplingParams(temperature=1.0, max_tokens=20)
# The optimizer is created after NanoLLM has been given the model; keep this order.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)


In [None]:
# Training loop. Each update:
#   1. collects a fresh batch of rollouts with the sampling engine,
#   2. accumulates REINFORCE gradients over minibatches of those rollouts,
#   3. clips the gradient norm and takes a single optimizer step,
#   4. prints a one-line summary.
# The try/finally guarantees that the environment is closed and the wandb run is
# finished even when an update raises or the cell is interrupted.
try:
    for update in range(num_updates):
        # --- Rollout phase: sampling only, so no autograd graph is needed. ---
        model.eval()
        with torch.no_grad():
            llm.wake_up()
            rollouts = rollout_autoreset_batched(
                env,
                llm,
                tokenizer,
                sampling_params,
                max_episodes=episodes,
                max_steps=None,
            )
        # NOTE(review): presumably releases the sampling engine's GPU memory
        # before the training pass — confirm the meaning of level 1 in NanoLLM.
        llm.sleep(1)

        # The returned values are not used further in this cell; the call is kept
        # as-is in case it has side effects.
        _tokens, mean_reward, _sample_text = summarize_rollouts(rollouts, tokenizer)

        # --- Training phase: accumulate gradients over all minibatches. ---
        model.train()
        optimizer.zero_grad(set_to_none=True)
        total_rollouts = len(rollouts)
        total_loss_value = 0.0
        total_assistant_tokens = 0.0
        total_logprob = 0.0
        reward_sum = 0.0

        for start in range(0, total_rollouts, minibatch_size):
            # The last minibatch may be shorter than minibatch_size.
            minibatch = rollouts[start : start + minibatch_size]
            loss, mb_metrics = compute_reinforce_loss(
                minibatch,
                model,
                tokenizer,
                device=device,
            )
            mb_size = len(minibatch)
            # Undo the per-minibatch averaging so the totals can be re-averaged
            # over the whole batch below.
            reward_sum += mb_metrics["avg_reward"] * mb_size
            total_assistant_tokens += mb_metrics["assistant_tokens"]
            total_logprob += mb_metrics["avg_logprob"] * mb_metrics["assistant_tokens"]

            # A minibatch without assistant tokens contributes no gradient.
            if mb_metrics["assistant_tokens"] <= 0:
                continue

            # Weight each minibatch by its share of the rollouts, so the
            # accumulated gradient is that of the size-weighted mean loss.
            scale = mb_size / max(total_rollouts, 1)
            (loss * scale).backward()
            total_loss_value += float(loss.item()) * scale

        avg_reward = reward_sum / max(total_rollouts, 1)
        metrics = {
            "avg_reward": avg_reward,
            "baseline": avg_reward,
            "avg_logprob": total_logprob / max(total_assistant_tokens, 1.0),
            "assistant_tokens": float(total_assistant_tokens),
        }

        # Skip the step when no minibatch produced a gradient.
        if metrics["assistant_tokens"] > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

        print(
            f"update={update} loss={total_loss_value:.4f} "
            f"avg_reward={metrics['avg_reward']:.3f} "
            f"assistant_tokens={metrics['assistant_tokens']:.0f}"
        )
finally:
    # Always release resources; finish wandb even if closing the env fails.
    try:
        env.close()
    finally:
        wandb.finish()
