# NanoRL agent manual stepping

This notebook shows how to build a NanoRL agent and step it through an environment
one turn at a time, passing the AgentState explicitly between calls.


## Notes
These examples load a real LLM. Run them in a GPU-enabled environment (e.g., a uv venv with CUDA and vLLM installed, or a container).


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import SamplingParams

import gyllm
from nanorl.agent import InstructAgent
from nanorl.rollout import NanoLLM


## Load the model and tokenizer


In [2]:
# One Hugging Face checkpoint id serves both the tokenizer and the weights.
model_id = "Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# No pad token configured: reuse the EOS id so a pad id is always defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the causal-LM weights in bfloat16 and place them on the CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_id, dtype="bfloat16", device_map="cuda")


## Create a NanoLLM wrapper


In [3]:
# Wrap the already-loaded HF model for generation. The engine log below shows
# gpu_memory_utilization and enable_sleep_mode being forwarded to vLLM.
llm = NanoLLM(model, tokenizer=tokenizer, gpu_memory_utilization=0.4, enable_sleep_mode=True)


INFO 01-11 23:11:59 [__init__.py:216] Automatically detected platform cuda.
INFO 01-11 23:12:00 [utils.py:233] non-default args: {'tokenizer': '/tmp/nanollm_tokenizer_p9usoozb', 'load_format': 'dummy', 'dtype': torch.bfloat16, 'distributed_executor_backend': 'uni', 'gpu_memory_utilization': 0.4, 'disable_log_stats': True, 'enforce_eager': True, 'enable_sleep_mode': True, 'model_impl': 'transformers', 'model': 'Qwen/Qwen2.5-0.5B-Instruct'}
INFO 01-11 23:12:05 [model.py:547] Resolved architecture: TransformersForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 01-11 23:12:05 [model.py:1510] Using max model len 32768
INFO 01-11 23:12:07 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-11 23:12:07 [__init__.py:381] Cudagraph is disabled under eager mode
INFO 01-11 23:12:08 [core.py:77] Initializing a V1 LLM engine (v0.11.0+582e4e37.nv25.11) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='/tmp/nanollm_tokenizer_p9usoozb', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=dummy, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''

`torch_dtype` is deprecated! Use `dtype` instead!


INFO 01-11 23:12:17 [cuda.py:366] Using Flash Attention backend on V1 engine.
INFO 01-11 23:12:17 [gpu_model_runner.py:2653] Model loading took 0.9249 GiB and 0.147455 seconds
INFO 01-11 23:12:43 [gpu_worker.py:298] Available KV cache memory: 45.28 GiB
INFO 01-11 23:12:43 [kv_cache_utils.py:1087] GPU KV cache size: 3,956,720 tokens
INFO 01-11 23:12:43 [kv_cache_utils.py:1091] Maximum concurrency for 32,768 tokens per request: 120.75x


2026-01-11 23:12:45,179 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
2026-01-11 23:12:45,383 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends


INFO 01-11 23:12:45 [core.py:210] init engine (profile, create kv cache, warmup model) took 27.69 seconds
INFO 01-11 23:12:45 [llm.py:306] Supported_tasks: ('generate',)


## Create an environment


In [4]:
# Build the "simple/reverse_echo" environment, configured for two turns.
env = gyllm.make(
    "simple/reverse_echo",
    env_kwargs={"num_turns": 2},
)


## Build an agent


In [5]:
# Decoding settings used for every agent turn: temperature 0.7 sampling with
# replies capped at 64 new tokens. Sampling is stochastic, so a fixed seed is
# set to make the transcript below repeatable across Restart & Run All
# (up to backend nondeterminism). Drop the seed if varied samples are wanted.
sampling_params = SamplingParams(temperature=0.7, max_tokens=64, seed=0)

# The agent receives the HF model, the NanoLLM wrapper built around it, the
# tokenizer, and the sampling settings defined above.
agent = InstructAgent(
    model=model,
    llm=llm,
    tokenizer=tokenizer,
    sampling_params=sampling_params,
)


## Step through the environment


In [9]:
# No agent state exists before the first `act` call, so start from None.
state = None
# Reset the environment; the requests it returns are shown as the cell output.
requests = env.reset()
requests

[{'actor': 'agent',
  'reward': 0.0,
  'system_message': {'role': 'system',
   'content': 'You are in ReverseEcho.\nEach turn, the environment will send you a message.\nYour task is to reply with the exact same message.\nDo not add extra words.\nLeading/trailing whitespace is ignored.\n'},
  'message': {'role': 'user', 'content': 'harbor'},
  'needs_action': True,
  'info': {'turn': 0, 'num_turns': 2},
  'episode_id': 2,
  'episode_start': True,
  'episode_end': False}]

In [10]:
# Let the agent answer the pending requests; `act` returns the actions together
# with the updated state, which is passed back in on the next call.
actions, state = agent.act(requests, state)
# Apply the actions to the environment and show the requests it returns.
requests = env.step(actions)
requests

[{'actor': 'agent',
  'reward': 1.0,
  'message': {'role': 'user', 'content': 'delta'},
  'needs_action': True,
  'info': {'turn': 1, 'num_turns': 2},
  'episode_id': 2,
  'episode_start': False,
  'episode_end': False}]

In [11]:
# Repeat the act/step cycle, feeding in the state returned by the previous
# `act` call so the agent continues from where it left off.
actions, state = agent.act(requests, state)
# Advance the environment with the new actions and display its response.
requests = env.step(actions)
requests

[{'actor': 'agent',
  'reward': 1.0,
  'message': {'role': 'user', 'content': 'Done.'},
  'needs_action': False,
  'info': {'turn': 2, 'num_turns': 2},
  'episode_id': 2,
  'episode_start': False,
  'episode_end': True}]

In [12]:
# The last request shown above has needs_action=False and episode_end=True; it
# is still passed to `act`, and the state returned afterwards is inspected below.
actions, state = agent.act(requests, state)
# Finished rollouts held on the state (messages, rewards, and actions per episode).
state.completed

[EpisodeRollout(actor='agent', messages=[{'role': 'system', 'content': 'You are in ReverseEcho.\nEach turn, the environment will send you a message.\nYour task is to reply with the exact same message.\nDo not add extra words.\nLeading/trailing whitespace is ignored.\n'}, {'role': 'user', 'content': 'harbor'}, {'role': 'assistant', 'content': 'harbor'}, {'role': 'user', 'content': 'delta'}, {'role': 'assistant', 'content': 'delta'}, {'role': 'user', 'content': 'Done.'}], rewards=[1.0, 1.0], actions=['harbor', 'delta'])]