# NanoRL data collection loop

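This notebook builds a simple data collection loop using an LLM agent: it loads `Qwen/Qwen2.5-3B-Instruct`, wraps it in `NanoLLM`, lets an `InstructAgent` play batched tic-tac-toe against a random opponent, collects the finished rollouts, and renders them as an HTML summary.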


## Setup


In [1]:
import torch
from IPython.display import HTML, display
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import SamplingParams

import gyllm
from gyllm.envs import AutoResetWrapper
from nanorl.agent import AgentState, InstructAgent
from nanorl.rollout import NanoLLM
from nanorl.rollout.reporting import render_rollouts_html


## Model and LLM wrapper


In [2]:
# Load the tokenizer and the Hugging Face model, then wrap the model in NanoLLM.
model_id = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Load the weights in bfloat16 directly onto the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="cuda",
)
# NanoLLM receives the already-loaded model and tokenizer. The engine log printed
# by this cell lists gpu_memory_utilization=0.4 and enable_sleep_mode=True among
# the non-default vLLM arguments, so these two settings appear to be forwarded
# to the vLLM engine — NOTE(review): confirm against the NanoLLM implementation.
llm = NanoLLM(
    model,
    tokenizer=tokenizer,
    gpu_memory_utilization=0.4,
    enable_sleep_mode=True,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO 01-11 23:24:17 [__init__.py:216] Automatically detected platform cuda.
INFO 01-11 23:24:18 [utils.py:233] non-default args: {'tokenizer': '/tmp/nanollm_tokenizer_ukxwrlzg', 'load_format': 'dummy', 'dtype': torch.bfloat16, 'distributed_executor_backend': 'uni', 'gpu_memory_utilization': 0.4, 'disable_log_stats': True, 'enforce_eager': True, 'enable_sleep_mode': True, 'model_impl': 'transformers', 'model': 'Qwen/Qwen2.5-3B-Instruct'}
INFO 01-11 23:24:19 [model.py:547] Resolved architecture: TransformersForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 01-11 23:24:19 [model.py:1510] Using max model len 32768
INFO 01-11 23:24:21 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-11 23:24:21 [__init__.py:381] Cudagraph is disabled under eager mode
INFO 01-11 23:24:21 [core.py:77] Initializing a V1 LLM engine (v0.11.0+582e4e37.nv25.11) with config: model='Qwen/Qwen2.5-3B-Instruct', speculative_config=None, tokenizer='/tmp/nanollm_tokenizer_ukxwrlzg', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=dummy, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''),

`torch_dtype` is deprecated! Use `dtype` instead!


INFO 01-11 23:24:23 [cuda.py:366] Using Flash Attention backend on V1 engine.
INFO 01-11 23:24:23 [gpu_model_runner.py:2653] Model loading took 5.8424 GiB and 0.371737 seconds
INFO 01-11 23:24:25 [gpu_worker.py:298] Available KV cache memory: 41.14 GiB
INFO 01-11 23:24:25 [kv_cache_utils.py:1087] GPU KV cache size: 1,198,304 tokens
INFO 01-11 23:24:25 [kv_cache_utils.py:1091] Maximum concurrency for 32,768 tokens per request: 36.57x


2026-01-11 23:24:27,302 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
2026-01-11 23:24:28,237 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends


INFO 01-11 23:24:28 [core.py:210] init engine (profile, create kv cache, warmup model) took 4.43 seconds
INFO 01-11 23:24:28 [llm.py:306] Supported_tasks: ('generate',)


## Environment and agent


In [3]:
# Two parallel tic-tac-toe environments against a random opponent, wrapped in
# AutoResetWrapper (presumably so finished episodes restart without manual
# resets — confirm against gyllm.envs).
env = AutoResetWrapper(
    gyllm.make(
        "simple/tic_tac_toe",
        env_kwargs={"opponent": "random", "repeat_invalid_action": True},
        num_envs=2,
    )
)

# Generation settings used for every agent turn: temperature 0.8, at most 64 new tokens.
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

# The agent is given both the raw model and its NanoLLM wrapper, plus the tokenizer.
agent = InstructAgent(
    model=model, llm=llm, tokenizer=tokenizer, sampling_params=sampling_params
)


## Collect rollouts (explicit loop)

This mirrors the production loop but is small enough to read end-to-end.


In [5]:
def collect_rollouts_batched(env, agent, *, max_episodes=None, max_steps=None):
    """Collect rollouts with an explicit batched loop.

    This is a small, readable version of the production rollout helper.

    Each iteration asks the agent for actions on the current batch of
    requests, harvests whatever rollouts the agent state reports as completed,
    and then steps the environment with those actions.

    Args:
        env: Batched environment. ``reset()`` and ``step(actions)`` must each
            return the next batch of requests; every request must support
            ``.get("needs_action")``.
        agent: Object whose ``act(requests, state)`` returns
            ``(actions, state)``.
        max_episodes: Stop once at least this many rollouts have completed,
            and return at most this many. ``None`` disables the limit.
        max_steps: Stop once the number of actions sent to ``env.step`` has
            reached this value. The counter grows by ``len(actions)`` per
            batch and is checked once per iteration, so the limit can be
            overshot by up to one batch. ``None`` disables the limit.

    Returns:
        A list of completed rollouts as produced by ``state.pop_completed()``.
        Empty when ``env.reset()`` yields no requests.

    Raises:
        ValueError: If both ``max_episodes`` and ``max_steps`` are ``None``.
    """
    # At least one stopping criterion is required, otherwise the loop below
    # only ends when no request needs an action.
    if max_episodes is None and max_steps is None:
        raise ValueError("Set max_episodes or max_steps.")

    requests = env.reset()
    if not requests:
        return []

    state = AgentState()
    completed = []
    step_count = 0

    while True:
        # repeat_count requests are handled inside agent.act; no special casing here.
        actions, state = agent.act(requests, state)
        completed.extend(state.pop_completed())

        pending = [req for req in requests if req.get("needs_action")]
        if max_episodes is not None and len(completed) >= max_episodes:
            break
        if max_steps is not None and step_count >= max_steps:
            break
        if not pending:
            break

        requests = env.step(actions)
        step_count += len(actions)

    # A single batch can finish several episodes at once; trim to the cap.
    if max_episodes is not None:
        return completed[:max_episodes]
    return completed


In [6]:
# Gather four finished episodes, then average their summed per-step rewards.
rollouts = collect_rollouts_batched(env, agent, max_episodes=4)
episode_rewards = [sum(rollout.rewards) for rollout in rollouts]
if episode_rewards:
    mean_reward = sum(episode_rewards) / len(episode_rewards)
else:
    # No completed episodes: report 0.0 instead of dividing by zero.
    mean_reward = 0.0
mean_reward


-0.25

## Render a rollout summary


In [8]:
# Render the collected rollouts to HTML and show the report inline.
# HTML and display are imported in the notebook's top import cell.
html = render_rollouts_html(rollouts)
display(HTML(html))
