# RL w/ Verifiable Rewards Experiments
This notebook defaults to the left-to-right (LTR) arithmetic task and relies on the shared helpers in `rlvr_math.py`.

In [1]:
%load_ext autoreload
%autoreload 2

import logging
import torch
from rlvr_math import (
    DEFAULT_MODEL_ID,
    TaskConfig,
    TrainConfig,
    measure_baseline_accuracy,
    train_grpo_integer_math,
    load_trainer_logs,
    load_mem_log,
    plot_losses,
    plot_memory,
    summarize_logs,
)

# Configure the root logger for the notebook: INFO level, with each record
# printed as its bare message text (no timestamp, level, or logger name).
# NOTE: basicConfig() does nothing if the root logger already has handlers,
# so in such an environment this call will not take effect unless
# force=True is passed.
logging.basicConfig(level=logging.INFO, format="%(message)s")


### Baseline accuracy (matches training task)
The baseline is measured on the same left-to-right (LTR) arithmetic task that is used for RL training, so the evaluation stays aligned with the training objective.

In [None]:
# Seed for the baseline evaluation; the training cell passes the same
# `eval_seed` into its TrainConfig.
eval_seed = 123

# Left-to-right ("ltr") arithmetic task definition. The same object is handed
# to the baseline measurement here and to the GRPO training config later.
task_cfg = TaskConfig(
    task_mode="ltr",
    ltr_min_steps=4, ltr_max_steps=7,
    val_range=9999, mul_range=50,
)

# Measure the untrained model on this task: a small run (n_eval=10) on CUDA
# with the model loaded in 4-bit and float32 requested as dtype.
baseline = measure_baseline_accuracy(
    model_id=DEFAULT_MODEL_ID,
    task_cfg=task_cfg,
    n_eval=10,
    eval_seed=eval_seed,
    device="cuda",
    dtype=torch.float32,
    load_in_4bit=True,
)

# Trailing bare expression so the notebook displays the result.
baseline

We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


### GRPO training
The configuration below targets the LTR task defined above, with concise logging (`logging_steps=5`) and quick-run sizing (`quick_run=True`).

In [None]:
# Training configuration for the GRPO run. It reuses `task_cfg` and
# `eval_seed` from the baseline cell, so training and the baseline refer to
# the same task definition and evaluation seed.
cfg = TrainConfig(
    model_id=DEFAULT_MODEL_ID,
    task=task_cfg,
    train_seed=42,
    eval_seed=eval_seed,
    # Quick-run sizing; the exact sizes are decided inside rlvr_math.
    quick_run=True,
    run_name="grpo-math-quick",
    # The diagnostics cell reads its logs back from this directory
    # (via cfg.output_dir).
    output_dir="qwen3-06b-grpo-math-quick",
    logging_steps=5,
)
# Launch training with the config above; the return value is kept as
# `trainer` (presumably the trainer object -- confirm in rlvr_math).
trainer = train_grpo_integer_math(cfg)

### Training diagnostics

In [None]:
# Load the trainer log and the memory log from the run's output directory.
df = load_trainer_logs(cfg.output_dir)
dfm = load_mem_log(cfg.output_dir)
# Plot the loaded logs: losses from `df`, memory from `dfm`.
plot_losses(df)
plot_memory(dfm)
# Last expression in the cell, so its return value (if any) is what the
# notebook displays.
summarize_logs(df, dfm)