# nanoAhaMoment: Single File R1-Zero Training

## Installation Requirements

Install the required packages:

```txt
# Core dependencies
torch==2.9.1

# Deep learning frameworks
transformers>=4.57.0
datasets>=4.5.0
deepspeed>=0.18.0
accelerate

# Flash Attention for A100
flash-attn

# vLLM for inference
vllm>=0.14.1

# Training and logging (optional - see USE_WANDB flag)
wandb
tqdm

# Standard scientific computing
numpy
```

In [None]:
!pip install torch==2.9.1
!pip install transformers datasets deepspeed accelerate vllm tqdm numpy
# Optional: install wandb if you want to use it (set USE_WANDB=True)
# !pip install wandb
!pip install flash-attn --no-build-isolation

In [1]:
import os
from pathlib import Path

# Storage root: the RunPod network volume at /workspace when it exists,
# otherwise the directory the notebook was started from (local development).
_runpod_volume = Path("/workspace")
if _runpod_volume.exists():
    WORKSPACE = _runpod_volume
else:
    WORKSPACE = Path.cwd()

# Scratch area under the storage root; created up front so later cells can write to it.
SCRATCH = WORKSPACE / "scratch"
SCRATCH.mkdir(parents=True, exist_ok=True)

# Keep the Hugging Face cache inside the scratch area.
_hf_home = SCRATCH / "hf_home"
os.environ["HF_HOME"] = str(_hf_home)

In [2]:
from __future__ import annotations

import gc
import re
import time
import sys
from typing import Any, Dict, List, Tuple, Union, TYPE_CHECKING

import torch

import deepspeed
import numpy as np
from datasets import load_dataset
from deepspeed import DeepSpeedEngine
from tqdm import trange
from transformers import AutoModelForCausalLM, AutoTokenizer

# Direct imports to avoid vLLM lazy import issues
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

if TYPE_CHECKING:
    from transformers import PreTrainedModel

try:
    import wandb
except ImportError:
    wandb = None
    print("wandb not installed. Set USE_WANDB=False to run without it.")

from utils import (
    compute_token_log_probs,
    dump_episodes,
    evaluate_on_test_set,
    find_free_port,
    find_last_checkpoint,
    prepare_model_inputs,
    load_model_into_vllm
)

os.environ.update(
    {
        # Single-process rendezvous settings; needed to stop DeepSpeed from complaining.
        "MASTER_ADDR": "localhost",
        "MASTER_PORT": str(find_free_port()),
        "RANK": "0",
        "LOCAL_RANK": "0",
        "WORLD_SIZE": "1",
        # Must be set BEFORE creating the vLLM engine so the engine core subprocess
        # inherits it and can deserialize callables sent via apply_model / collective_rpc.
        "VLLM_ALLOW_INSECURE_SERIALIZATION": "1",
    }
)

/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


## Hyperparameters

In [3]:
# Model configuration
# Base model that is trained, used as the reference model and served by vLLM.
MODEL_NAME = "Qwen/Qwen2.5-3B"
# Instruct variant; in this notebook only its tokenizer (chat template) is loaded.
MODEL_CHAT_NAME = MODEL_NAME + "-Instruct"

# Dataset configuration
DATASET_NAME = "Jiayi-Pan/Countdown-Tasks-3to4"

# Total number of training iterations
NUM_ITERATIONS = 1000
# Number of episodes to collect per iteration for training
# (the training loop samples EPISODES_PER_ITERATION // GENERATIONS_PER_SAMPLE prompts)
EPISODES_PER_ITERATION = 64
# Number of responses to generate for each input prompt (i.e. group size in GRPO)
GENERATIONS_PER_SAMPLE = 4
# Controls how much the policy can deviate from the reference model
KL_COEFFICIENT = 0.001

# Training hyperparameters
# Batch size for each GPU device during training
PER_DEVICE_BATCH_SIZE = 4
# Learning rate for model updates
LEARNING_RATE = 1e-6

# Sampling parameters
# Maximum number of tokens to generate in each response
MAX_RESPONSE_TOKENS = 1024
# Controls randomness in generation (higher = more random)
TEMPERATURE = 1.0
# Nucleus sampling parameter (1.0 = disabled)
TOP_P = 1.0
# Top-k sampling parameter (-1 = disabled)
TOP_K = -1  # no top k

# Logging configuration
USE_WANDB = False  # Set to True to enable wandb logging

# DeepSpeed configuration
# DeepSpeed config for the policy model
# One optimizer step consumes a full iteration's episodes:
# micro batch (PER_DEVICE_BATCH_SIZE) x accumulation steps == EPISODES_PER_ITERATION
# on the single process configured via WORLD_SIZE=1.
deepspeed_config = {
    "bf16": {"enabled": True},
    "zero_optimization": {"stage": 2, "overlap_comm": False},
    "train_batch_size": EPISODES_PER_ITERATION,
    "train_micro_batch_size_per_gpu": PER_DEVICE_BATCH_SIZE,
    "gradient_accumulation_steps": EPISODES_PER_ITERATION // PER_DEVICE_BATCH_SIZE,
    "gradient_clipping": 1.0,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": LEARNING_RATE,
            "betas": (0.9, 0.999),
            "eps": 1e-8,
            "weight_decay": 0.0,
            "torch_adam": True,
        },
    },
}
# DeepSpeed config for the reference model
ref_deepspeed_config = {
    "bf16": {"enabled": True},
    # Note that we don't train the reference model
    # These are just for compatibility with DeepSpeed.
    "train_batch_size": EPISODES_PER_ITERATION,
    "train_micro_batch_size_per_gpu": PER_DEVICE_BATCH_SIZE,
    "gradient_accumulation_steps": EPISODES_PER_ITERATION // PER_DEVICE_BATCH_SIZE,
}

# Per-run output directory under the scratch area; created eagerly so later
# cells can write episode dumps and checkpoints into it.
RUN_NAME = "r1-zero"
EXP_DIR = SCRATCH / "deepseek_r1z_hackathon" / RUN_NAME
EXP_DIR.mkdir(parents=True, exist_ok=True)
print(f"Logs and Checkpoints will be saved to: {EXP_DIR}")

Logs and Checkpoints will be saved to: /workspace/scratch/deepseek_r1z_hackathon/r1-zero


## Generating the training prompts

In [4]:
# System-role message placed at the start of every chat prompt.
SYSTEM_MESSAGE = (
    "You are a helpful assistant. You first think about the reasoning process in the mind "
    "and then provide the user with the answer."
)
# User-role task description; `{numbers}` and `{target}` are filled in per
# dataset example via str.format in preprocess_example.
PROMPT_TEMPLATE = (
    "Using the numbers {numbers}, create an equation that equals {target}. "
    "You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. "
    "Show your work in <think> </think> tags. And return the final equation and answer in "
    "<answer> </answer> tags, for example <answer>(1 + 2) / (3 * 5)</answer>."
)

In [5]:
# Load and process dataset
def preprocess_example(example: Dict[str, Any]):
    """Turn one Countdown dataset row into a chat-formatted generation prompt.

    Args:
        example: Dataset row providing ``nums`` (the available numbers) and
            ``target`` (the value the equation must reach).

    Returns:
        A dict with ``input_ids`` (token ids of the chat prefix) and
        ``prompt`` (those ids decoded back to text, special tokens kept).
    """
    task_numbers: List[int] = example["nums"]
    task_target: int = example["target"]
    user_message = PROMPT_TEMPLATE.format(numbers=task_numbers, target=task_target)

    # The assistant turn is pre-filled and left open (continue_final_message=True)
    # so that generation continues right after the opening <think> tag.
    conversation = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": "Let me solve this step by step.\n<think>"},
    ]
    token_ids = tokenizer.apply_chat_template(
        conversation, tokenize=True, continue_final_message=True
    )
    prompt_text = tokenizer.decode(
        token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )
    return {"prompt": prompt_text, "input_ids": token_ids}

# Note that the base model and "instruct" model have different eos token.
# Here we make sure to use the correct one: the chat template comes from the
# instruct tokenizer, while the EOS id is read from the base model's tokenizer
# (the base model is the one that generates and is trained).
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHAT_NAME)
EOS_TOKEN_ID = AutoTokenizer.from_pretrained(MODEL_NAME).eos_token_id
# String form of the EOS token; format_reward_func strips it from completions.
EOS_TOKEN = tokenizer.convert_ids_to_tokens(EOS_TOKEN_ID)

# Add the "prompt" and "input_ids" columns to every example (6 worker processes).
dataset = load_dataset(DATASET_NAME, split="train")
dataset = dataset.map(preprocess_example, num_proc=6)

# Split dataset: hold out 500 examples for evaluation (fixed seed for reproducibility)
train_test_split = dataset.train_test_split(test_size=500, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Notebook display of the resulting split sizes.
len(train_dataset), len(test_dataset)

(489864, 500)

## Reward Function

In [6]:
def format_reward_func(completion: str) -> float:
    r"""Score how well a completion follows the required output layout.

    The expected layout is ``<think>...</think>``, a single newline, then
    ``<answer>...</answer>``. The opening ``<think>`` tag is part of the prompt,
    so it is prepended here before matching; a trailing EOS token is removed.

    Args:
        completion: Generated text, starting right after the prompt's ``<think>``.

    Returns:
        1.0 when the layout matches and the answer contains only digits,
        arithmetic operators, parentheses, dots and whitespace; 0.5 when the
        layout matches but the answer has other characters; 0.0 otherwise
        (including when any error occurs).
    """
    answer_charset = r"^[\d+\-*/().\s]+$"
    # <think> body may not contain nested think tags; <answer> may contain anything.
    layout = r"^<think>([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>\n<answer>([\s\S]*?)<\/answer>$"

    try:
        text = "<think>" + completion
        if text.endswith(EOS_TOKEN):
            text = text[: -len(EOS_TOKEN)]

        found = re.search(layout, text, re.DOTALL)
        if found is None or len(found.groups()) != 2:
            return 0.0

        answer_text = found.group(2).strip()
        if re.match(answer_charset, answer_text):
            return 1.0
        return 0.5
    except Exception:
        # Best-effort scoring: any failure counts as a malformed completion.
        return 0.0


def equation_reward_func(completion: str, nums: List[int], target: int) -> float:
    """Score whether the equation inside ``<answer>`` tags solves the task.

    The first ``<answer>...</answer>`` span on a single line is extracted. The
    equation earns the reward only if it uses every number in ``nums`` exactly
    once, consists solely of digits, ``+ - * /``, parentheses, dots and
    whitespace, and evaluates to ``target`` (absolute tolerance 1e-5).

    Args:
        completion: Generated text that should contain the answer tags.
        nums: Numbers that must each be used exactly once.
        target: Value the equation has to evaluate to.

    Returns:
        1.0 for a correct equation, 0.0 otherwise (including on any error such
        as a syntax error or division by zero).
    """
    try:
        match = re.search(r"<answer>(.*?)<\/answer>", completion)
        if match is None:
            return 0.0

        equation = match.group(1).strip()

        # Every provided number must appear exactly once (order-insensitive).
        used_numbers = [int(n) for n in re.findall(r"\d+", equation)]
        if sorted(used_numbers) != sorted(nums):
            return 0.0

        allowed_pattern = r"^[\d+\-*/().\s]+$"
        if not re.match(allowed_pattern, equation):
            return 0.0

        # The character whitelist alone still admits "**" and "//". Neither is
        # one of the allowed basic operations, and chained exponentiation such
        # as "54**26**71" would make eval() run (practically) forever.
        if "**" in equation or "//" in equation:
            return 0.0

        # NOTE(security): eval() runs on model-generated text. It is confined by
        # the whitelist above and by disabling builtins; do not relax either
        # check without replacing eval() with a real expression parser.
        result = eval(equation, {"__builtins__": None}, {})
        if abs(float(result) - float(target)) < 1e-5:
            return 1.0
        return 0.0
    except Exception:
        # Malformed equations (syntax errors, division by zero, ...) earn nothing.
        return 0.0
    

def compute_reward(completion: str, sample: Dict[str, Any]) -> Tuple[float, Dict[str, float]]:
    """Combine the format and equation rewards for one completion.

    Args:
        completion: Generated text to score.
        sample: Dataset example providing ``nums`` and ``target``.

    Returns:
        A pair ``(reward, metrics)`` where ``reward`` is the sum of both
        component rewards and ``metrics`` holds each component under the keys
        ``format_reward`` and ``equation_reward``.
    """
    available_numbers, goal = sample["nums"], sample["target"]

    fmt_score = format_reward_func(completion)
    eq_score = equation_reward_func(completion=completion, nums=available_numbers, target=goal)

    breakdown = {"format_reward": fmt_score, "equation_reward": eq_score}
    return fmt_score + eq_score, breakdown

## Episode Generation

In [7]:
def create_training_episodes(
    samples: List[Dict[str, Any]],
    all_generations: List[List[int]],
    all_finish_reasons: List[str],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Turn sampled generations into training episodes with GRPO advantages.

    Generations are expected in prompt order, ``GENERATIONS_PER_SAMPLE``
    consecutive entries per sample. Each group's rewards are normalized by the
    group mean and standard deviation, and the resulting advantage is repeated
    for every token of the corresponding response.

    Args:
        samples: Prompts that were sampled from; each provides ``input_ids``
            plus the fields read by ``compute_reward``.
        all_generations: Response token ids, one list per generation.
        all_finish_reasons: Finish reason of each generation, aligned with
            ``all_generations``.

    Returns:
        ``(episodes, stats)``. ``episodes`` holds the parallel lists
        ``all_query_token_ids``, ``all_response_token_ids`` and
        ``all_advantages``; ``stats`` holds per-response lists for rewards,
        response lengths, non-stop flags and each reward component.
    """
    assert len(all_generations) == len(all_finish_reasons)
    assert len(all_generations) == len(samples) * GENERATIONS_PER_SAMPLE

    query_ids_out, response_ids_out, advantages_out = [], [], []
    stats = {
        "response_lengths": [],
        "rewards": [],
        "non_stop_rate": [],
    }

    group_starts = range(0, len(all_generations), GENERATIONS_PER_SAMPLE)
    for sample, start in zip(samples, group_starts):
        members = slice(start, start + GENERATIONS_PER_SAMPLE)
        group_finish_reasons = all_finish_reasons[members]
        group_token_ids = all_generations[members]
        group_texts = tokenizer.batch_decode(group_token_ids, skip_special_tokens=False)

        group_rewards, group_breakdowns = [], []
        for text in group_texts:
            score, breakdown = compute_reward(text, sample)
            group_rewards.append(score)
            group_breakdowns.append(breakdown)

        # GRPO advantage: standardize rewards within the group
        # (small epsilon keeps the division finite when all rewards are equal).
        group_rewards = np.array(group_rewards)
        centered = group_rewards - group_rewards.mean()
        normalized = centered / (group_rewards.std() + 1e-4)

        # Every token of a response shares that response's advantage.
        token_advantages = []
        for adv, ids in zip(normalized, group_token_ids):
            token_advantages.append([adv] * len(ids))

        prompt_ids = sample["input_ids"]
        query_ids_out.extend([prompt_ids] * GENERATIONS_PER_SAMPLE)
        response_ids_out.extend(group_token_ids)
        advantages_out.extend(token_advantages)

        stats["rewards"].extend(group_rewards)
        stats["non_stop_rate"].extend([reason != "stop" for reason in group_finish_reasons])
        stats["response_lengths"].extend([len(ids) for ids in group_token_ids])
        for breakdown in group_breakdowns:
            for name, value in breakdown.items():
                stats.setdefault(f"reward_metrics/{name}", []).append(value)

    episodes = {
        "all_query_token_ids": query_ids_out,
        "all_response_token_ids": response_ids_out,
        "all_advantages": advantages_out,
    }
    return episodes, stats

## Policy Gradient Loss

In [8]:
def compute_pg_loss(
    policy_model: Union[DeepSpeedEngine, PreTrainedModel],
    reference_model: Union[DeepSpeedEngine, PreTrainedModel],
    batch: Dict[str, torch.Tensor],
    total_response_len: int,
) -> Tuple[torch.Tensor, Dict[str, float]]:
    """Compute the policy-gradient loss with a KL penalty against the reference model.

    Args:
        policy_model: Model being trained; gradients flow through its log-probs.
        reference_model: Frozen model used only for the KL penalty (no grad).
        batch: Micro-batch with ``input_ids``, ``attention_mask``, ``labels``,
            ``labels_mask`` and ``advantages``. Positions whose label is -100
            are excluded from the loss.
        total_response_len: Number of response tokens across the WHOLE
            iteration (all micro-batches); used as the normalizer so that the
            accumulated micro-batch losses form a per-token mean.

    Returns:
        ``(loss, metrics)``. ``loss`` is a scalar tensor. ``metrics`` holds
        ``policy_loss`` and ``kl_penalty`` (this micro-batch's contribution,
        normalized by ``total_response_len``) and ``entropy`` (mean negative
        log-prob of the sampled response tokens in this micro-batch).
    """
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    advantages = batch["advantages"]

    model_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "labels_mask": batch["labels_mask"],
    }

    # Token t predicts label t+1, so masks and advantages are shifted by one
    # to line up with the per-token log-probs.
    labels_mask = (labels[..., 1:] != -100).float()

    with torch.no_grad():
        ref_logps = compute_token_log_probs(reference_model, model_inputs, TEMPERATURE)

    logps = compute_token_log_probs(policy_model, model_inputs, TEMPERATURE)

    # KL penalty (k3 estimator)
    kl_penalty = torch.exp(ref_logps - logps) - (ref_logps - logps) - 1
    kl_penalty = kl_penalty * labels_mask

    # Monitoring only: mean negative log-prob over this micro-batch's response
    # tokens. Masked explicitly so the value does not depend on whether
    # compute_token_log_probs already zeroes non-response positions.
    entropy = -(logps * labels_mask).sum() / labels_mask.sum()

    # Policy gradient loss: -log_prob * advantage
    policy_loss = -logps * advantages[..., 1:]
    policy_loss = policy_loss * labels_mask

    loss = (policy_loss + KL_COEFFICIENT * kl_penalty).sum() / total_response_len

    metrics = {
        "policy_loss": policy_loss.sum().item() / total_response_len,
        "kl_penalty": kl_penalty.sum().item() / total_response_len,
        # Already a per-token mean; dividing by total_response_len again (as
        # before) shrank it by several orders of magnitude.
        "entropy": entropy.item(),
    }

    return loss, metrics

## Training

In [9]:
# Initialize main and reference models
# Load the policy (trainable) and reference (frozen) copies of the base model.
# `dtype` replaces the deprecated `torch_dtype` keyword.
policy_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map=0,
)
reference_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map=0,
)
policy_model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})


# Initialize DeepSpeed engines
policy_model, *_ = deepspeed.initialize(
    model=policy_model,
    config=deepspeed_config,
    model_parameters=policy_model.parameters(),
)
reference_model, *_ = deepspeed.initialize(
    model=reference_model,
    config=ref_deepspeed_config,
)

# The reference model is only needed during the training phase; keep it on
# the CPU until then.
reference_model.module.cpu()

# Free up GPU memory before initializing vLLM
gc.collect()
torch.cuda.empty_cache()
time.sleep(2)

############################################
# Initialize vLLM (Inference) engine
############################################

print("Initializing vLLM inference engine...")
inference_engine = LLM(
    model=MODEL_NAME,
    skip_tokenizer_init=False,
    gpu_memory_utilization=0.15,  # Reduced from 0.2 to leave more room
    enable_prefix_caching=True,
    swap_space=2,  # Increased swap space
    scheduling_policy="fcfs",
    dtype="bfloat16",  # Use string instead of torch.bfloat16
    max_model_len=2048,
    enable_sleep_mode=True,
)
print("vLLM inference engine initialized successfully")

# Wandb for logging
if USE_WANDB:
    if wandb is None:
        raise ImportError("wandb is not installed. Run: pip install wandb")
    wandb.init(
        project="r1-aha-moment",
        name=RUN_NAME,
        config={
            "model_name": MODEL_NAME,
            "learning_rate": LEARNING_RATE,
            "num_iterations": NUM_ITERATIONS,
            "episodes_per_iteration": EPISODES_PER_ITERATION,
            "rollouts_per_episode": GENERATIONS_PER_SAMPLE,
            "kl_coefficient": KL_COEFFICIENT,
            "temperature": TEMPERATURE,
        },
    )
    print("Wandb logging enabled")
else:
    print("Wandb logging disabled")

# Load checkpoint if it exists
begin_iter = 0
ckpt_path, ckpt_iter = find_last_checkpoint(EXP_DIR)
if ckpt_path is not None:
    print(f"Resuming from checkpoint {ckpt_path} at iteration {ckpt_iter}")
    # DeepSpeed returns a (load_path, client_state) tuple and signals failure
    # with load_path=None, so the tuple itself is never None.
    load_path, _client_state = policy_model.load_checkpoint(ckpt_path / "deepspeed")
    if load_path is None:
        raise RuntimeError(f"Failed to load checkpoint {ckpt_path}")
    begin_iter = ckpt_iter + 1
    # Make the inference engine generate with the restored policy weights.
    load_model_into_vllm(policy_model, inference_engine)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Before initializing optimizer states
MA 22.99 GB         Max_MA 28.74 GB         CA 34.49 GB         Max_CA 34 GB 
CPU Virtual Memory:  used = 47.06 GB, percent = 5.0%
After initializing optimizer states
MA 22.99 GB         Max_MA 34.49 GB         CA 45.99 GB         Max_CA 46 GB 
CPU Virtual Memory:  used = 46.91 GB, percent = 5.0%
After initializing ZeRO optimizer
MA 22.99 GB         Max_MA 22.99 GB         CA 45.99 GB         Max_CA 46 GB 
CPU Virtual Memory:  used = 46.99 GB, percent = 5.0%
begin bf16_optimizer
MA 22.99 GB         Max_MA 22.99 GB         CA 45.99 GB         Max_CA 46 GB 
CPU Virtual Memory:  used = 47.08 GB, percent = 5.0%
end bf16_ optimizer
MA 22.99 GB         Max_MA 22.99 GB         CA 45.99 GB         Max_CA 46 GB 
CPU Virtual Memory:  used = 47.02 GB, percent = 5.0%
Initializing vLLM inference engine...
INFO 02-17 22:05:35 [utils.py:261] non-default args: {'dtype': 'bfloat16', 'max_model_len': 2048, 'enable_prefix_caching': True, 'swap_space': 2, 'gpu_memory_u

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.82it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.96it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.93it/s]
[0;36m(EngineCore_DP0 pid=13850)[0;0m 


[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:05:46 [default_loader.py:291] Loading weights took 1.12 seconds
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:05:46 [gpu_model_runner.py:4130] Model loading took 5.79 GiB memory and 2.046550 seconds
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:05:53 [backends.py:812] Using cache directory: /root/.cache/vllm/torch_compile_cache/f20a54b745/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:05:53 [backends.py:872] Dynamo bytecode transform time: 5.78 s
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:05:58 [backends.py:267] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 0.925 s
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:05:58 [monitor.py:34] torch.compile takes 6.71 s in total
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:05:59 [gpu_worker.py:356] Available KV cache memory: 4.64 GiB
[0;36m(EngineCore_DP0 pid=138

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 18.51it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 20.45it/s]


[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:06:05 [gpu_model_runner.py:5063] Graph capturing finished in 5 secs, took 0.57 GiB
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:06:05 [core.py:272] init engine (profile, create kv cache, warmup model) took 18.28 seconds
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:06:06 [vllm.py:624] Asynchronous scheduling is enabled.
INFO 02-17 22:06:06 [llm.py:343] Supported tasks: ('generate',)
vLLM inference engine initialized successfully
Wandb logging disabled


In [10]:
# Main RL loop. Each iteration: (optionally) evaluate, sample episodes with
# vLLM, put vLLM to sleep, run one optimizer step over the episodes, wake vLLM
# and push the updated weights into it, then log and (periodically) checkpoint.
# Starts at `begin_iter` so that a run resumed from a checkpoint continues
# where it stopped instead of repeating (and overwriting) iterations from 0.
for iteration in trange(begin_iter, NUM_ITERATIONS):
    print(f"Iteration {iteration}/{NUM_ITERATIONS}")

    metrics = {}

    #########################################################
    # Evaluation
    #########################################################

    eval_stats = None
    if iteration % 25 == 0:
        print("Evaluating on eval set...")
        eval_episodes, eval_stats = evaluate_on_test_set(
            inference_engine=inference_engine,
            test_dataset=test_dataset,
            tokenizer=tokenizer,
            eos_token=EOS_TOKEN,
            eval_sampling_params=SamplingParams(
                temperature=0.3,
                max_tokens=1024,
                n=1,
                detokenize=False,
                stop_token_ids=[EOS_TOKEN_ID],
            ),
            reward_func=compute_reward,
        )
        eval_episode_table = dump_episodes(
            episodes=eval_episodes,
            episodes_stats=eval_stats,
            exp_dir=EXP_DIR,
            tokenizer=tokenizer,
            iteration=iteration,
            is_eval=True,
        )
        if USE_WANDB:
            wandb.log({"eval/episodes": eval_episode_table, "iteration": iteration})


    #########################################################
    # Generate Episodes
    #########################################################

    # Sample training batch: one prompt per group of GENERATIONS_PER_SAMPLE episodes
    num_samples = EPISODES_PER_ITERATION // GENERATIONS_PER_SAMPLE
    indices = np.random.choice(
        len(train_dataset), size=num_samples, replace=False
    )
    samples = train_dataset.select(indices)

    # Sample responses
    outputs = inference_engine.generate(
        prompts=[{"prompt_token_ids": ids} for ids in samples["input_ids"]],
        sampling_params=SamplingParams(
            n=GENERATIONS_PER_SAMPLE,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            top_k=TOP_K,
            max_tokens=MAX_RESPONSE_TOKENS,
            detokenize=False,
            stop_token_ids=[EOS_TOKEN_ID],
        )
    )
    # Flatten in prompt order, so generations of the same prompt stay adjacent
    # (create_training_episodes relies on this grouping).
    all_generations = [list(g.token_ids) for out in outputs for g in out.outputs]
    all_finish_reasons = [g.finish_reason for out in outputs for g in out.outputs]
    # Put the inference engine to sleep so its GPU memory is available for training.
    inference_engine.sleep(1)

    print(f"Generated {len(all_generations)} responses")
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(1)

    # Process responses and calculate rewards
    episodes, episodes_stats = create_training_episodes(
        samples,
        all_generations,
        all_finish_reasons,
    )
    for k, v in episodes_stats.items():
        metrics.setdefault(k, []).extend(v)

    episode_table = dump_episodes(
        episodes=episodes,
        episodes_stats=episodes_stats,
        exp_dir=EXP_DIR,
        tokenizer=tokenizer,
        iteration=iteration,
    )

    #########################################################
    # Training
    #########################################################

    # Prepare training batch
    model_inputs = prepare_model_inputs(
        query_token_ids=episodes["all_query_token_ids"],
        response_token_ids=episodes["all_response_token_ids"],
        advantages=episodes["all_advantages"],
        device="cuda"
    )

    # Calculate losses and update model
    policy_model.train()
    reference_model.module.cuda()
    reference_model.eval()

    # Normalizer shared by all micro-batches: number of response tokens in the iteration.
    total_response_len = (model_inputs["labels"] != -100).sum().item()

    for i in trange(0, EPISODES_PER_ITERATION, PER_DEVICE_BATCH_SIZE, desc="Gradient Accumulation"):
        batch = {
            k: v[i : i + PER_DEVICE_BATCH_SIZE]
            for k, v in model_inputs.items()
        }

        # Compute policy gradient loss
        loss, loss_metrics = compute_pg_loss(
            policy_model=policy_model,
            reference_model=reference_model,
            batch=batch,
            total_response_len=total_response_len,
        )

        # Track metrics
        metrics.setdefault("loss", []).append(loss.item())
        grad_norm = policy_model.get_global_grad_norm()
        if grad_norm is not None:
            grad_norm = grad_norm.item()
        metrics.setdefault("grad_norm", []).append(grad_norm)
        for k, v in loss_metrics.items():
            metrics.setdefault(k, []).append(v.item() if isinstance(v, torch.Tensor) else v)

        # Backpropagation and optimization step
        # (the loss is already normalized by total_response_len, hence no extra
        # scaling by the number of accumulation steps)
        policy_model.backward(loss, scale_wrt_gas=False)

        # Free memory: the reference model is no longer needed once the last
        # micro-batch has been processed, so move it off the GPU before the
        # optimizer step.
        del loss, loss_metrics
        if policy_model.is_gradient_accumulation_boundary():
            reference_model.module.cpu()

        policy_model.step()

    #########################################################
    # Update inference engine weights
    #########################################################

    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(1)

    inference_engine.wake_up()
    load_model_into_vllm(policy_model, inference_engine)

    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(1)


    #########################################################
    # Log metrics
    #########################################################

    # Metrics containing None entries (e.g. an unavailable grad norm) are skipped.
    train_metrics = {
        k: np.mean(v) for k, v in metrics.items() if None not in v
    }
    train_metrics["learning_rate"] = policy_model.get_lr()[0]
    logs = {
        "iteration": iteration,
        f"episodes/iter_{iteration:06d}": episode_table,
        **{f"train/{k}": v for k, v in train_metrics.items()},
    }
    if eval_stats is not None:
        eval_metrics = {k: np.mean(v) for k, v in eval_stats.items() if None not in v}
        logs.update({f"eval/{k}": v for k, v in eval_metrics.items()})

    if USE_WANDB:
        wandb.log(logs)

    selected_keys = [
        "train/kl_penalty",
        "train/rewards",
        "train/reward_metrics/format_reward",
        "train/reward_metrics/equation_reward",
        "eval/rewards",
        "eval/reward_metrics/format_reward",
        "eval/reward_metrics/equation_reward",
    ]
    selected_metrics = {k: logs[k] for k in selected_keys if k in logs}
    print(f"KEY METRICS: {selected_metrics}")

    # Checkpoint every 50 iterations: HF weights plus the DeepSpeed state used for resuming.
    if iteration % 50 == 0 and iteration != 0:
        policy_model.module.save_pretrained(
            str(EXP_DIR / "checkpoints" / f"ckpt_{iteration:06d}" / "hf_model")
        )
        policy_model.save_checkpoint(
            str(EXP_DIR / "checkpoints" / f"ckpt_{iteration:06d}" / "deepspeed")
        )

  0%|          | 0/1000 [00:00<?, ?it/s]

Iteration 0/1000
Evaluating on eval set...


Adding requests:   0%|          | 0/500 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/500 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:06:46 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:06:46 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:06:46 [gpu_worker.py:128] Sleep mode freed 6.04 GiB memory, 25.56 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:06:46 [abstract.py:306] It took 0.021288 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 143)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [54, 26, 71], create an equation that equals 99. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|▋         | 1/16 [00:02<00:38,  2.55s/it][A
Gradient Accumulation:  12%|█▎        | 2/16 [00:03<00:26,  1.87s/it][A
Gradient Accumulation:  19%|█▉        | 3/16 [00:05<00:21,  1.66s/it][A
Gradient Accumulation:  25%|██▌       | 4/16 [00:06<00:18,  1.55s/it][A
Gradient Accumulation:  31%|███▏      | 5/16 [00:08<00:16,  1.51s/it][A
Gradient Accumulation:  38%|███▊      | 6/16 [00:09<00:14,  1.48s/it][A
Gradient Accumulation:  44%|████▍     | 7/16 [00:11<00:13,  1.47s/it][A
Gradient Accumulation:  50%|█████     | 8/16 [00:12<00:11,  1.45s/it][A
Gradient Accumulation:  56%|█████▋    | 9/16 [00:13<00:10,  1.45s/it][A
Gradient Accumulation:  62%|██████▎   | 10/16 [00:15<00:08,  1.45s/it][A
Gradient Accumulation:  69%|██████▉   | 11/16 [00:16<00:07,  1.46s/it][A
Gradient Accumulation:  75%|███

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:07:17 [abstract.py:324] It took 0.017152 seconds to wake up tags {'weights', 'kv_cache'}.


  0%|          | 1/1000 [01:17<21:31:21, 77.56s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0), 'train/rewards': np.float64(0.1171875), 'train/reward_metrics/format_reward': np.float64(0.0859375), 'train/reward_metrics/equation_reward': np.float64(0.03125), 'eval/rewards': np.float64(0.261), 'eval/reward_metrics/format_reward': np.float64(0.251), 'eval/reward_metrics/equation_reward': np.float64(0.01)}
Iteration 1/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:07:38 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:07:38 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:07:38 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:07:38 [abstract.py:306] It took 0.015506 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 445)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [37, 13, 63, 38], create an equation that equals 49. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|▋         | 1/16 [00:01<00:22,  1.53s/it][A
Gradient Accumulation:  12%|█▎        | 2/16 [00:03<00:21,  1.57s/it][A
Gradient Accumulation:  19%|█▉        | 3/16 [00:04<00:20,  1.57s/it][A
Gradient Accumulation:  25%|██▌       | 4/16 [00:06<00:17,  1.50s/it][A
Gradient Accumulation:  31%|███▏      | 5/16 [00:07<00:16,  1.46s/it][A
Gradient Accumulation:  38%|███▊      | 6/16 [00:08<00:14,  1.45s/it][A
Gradient Accumulation:  44%|████▍     | 7/16 [00:10<00:13,  1.45s/it][A
Gradient Accumulation:  50%|█████     | 8/16 [00:11<00:11,  1.44s/it][A
Gradient Accumulation:  56%|█████▋    | 9/16 [00:13<00:09,  1.43s/it][A
Gradient Accumulation:  62%|██████▎   | 10/16 [00:14<00:08,  1.44s/it][A
Gradient Accumulation:  69%|██████▉   | 11/16 [00:16<00:07,  1.45s/it][A
Gradient Accumulation:  75%|███

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:08:08 [abstract.py:324] It took 0.014575 seconds to wake up tags {'weights', 'kv_cache'}.


  0%|          | 2/1000 [02:08<17:12:06, 62.05s/it]

KEY METRICS: {'train/kl_penalty': np.float64(1.977932577881987e-05), 'train/rewards': np.float64(0.1015625), 'train/reward_metrics/format_reward': np.float64(0.0859375), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 2/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:08:29 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:08:29 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:08:29 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:08:29 [abstract.py:306] It took 0.013956 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 142)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [80, 78, 74], create an equation that equals 84. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:20,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.40s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.41s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.41s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.41s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.43s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:08:59 [abstract.py:324] It took 0.017155 seconds to wake up tags {'weights', 'kv_cache'}.


  0%|          | 3/1000 [02:58<15:38:20, 56.47s/it]

KEY METRICS: {'train/kl_penalty': np.float64(1.921026830179283e-05), 'train/rewards': np.float64(0.0703125), 'train/reward_metrics/format_reward': np.float64(0.0703125), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 3/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:09:19 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:09:19 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:09:19 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:09:19 [abstract.py:306] It took 0.012964 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 158)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [29, 58, 19], create an equation that equals 38. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.41s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.42s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.42s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.43s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.43s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.42s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:10,  1.43s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.44s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.45s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:09:49 [abstract.py:324] It took 0.017581 seconds to wake up tags {'weights', 'kv_cache'}.


  0%|          | 4/1000 [03:48<14:54:20, 53.88s/it]

KEY METRICS: {'train/kl_penalty': np.float64(2.4703943839111153e-05), 'train/rewards': np.float64(0.09375), 'train/reward_metrics/format_reward': np.float64(0.09375), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 4/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:09 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:09 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:09 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:09 [abstract.py:306] It took 0.014358 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 39)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [33, 9, 36, 1], create an equation that equals 30. You can use basic arithmetic operations (+, -, *, /) and each number can o


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.41s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.42s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.43s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.42s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.41s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.41s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:38 [abstract.py:324] It took 0.018221 seconds to wake up tags {'weights', 'kv_cache'}.


  0%|          | 5/1000 [04:38<14:28:54, 52.40s/it]

KEY METRICS: {'train/kl_penalty': np.float64(2.238489519888493e-05), 'train/rewards': np.float64(0.078125), 'train/reward_metrics/format_reward': np.float64(0.0625), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 5/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:59 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:59 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:59 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:10:59 [abstract.py:306] It took 0.013601 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 295)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [92, 94, 45], create an equation that equals 43. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.41s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.40s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.40s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.40s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.40s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.40s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.41s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.41s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.40s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:06,  1.39s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:11:29 [abstract.py:324] It took 0.016796 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|          | 6/1000 [05:28<14:13:27, 51.52s/it]

KEY METRICS: {'train/kl_penalty': np.float64(2.5043618405488524e-05), 'train/rewards': np.float64(0.1875), 'train/reward_metrics/format_reward': np.float64(0.1875), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 6/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:11:48 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:11:48 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:11:48 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:11:48 [abstract.py:306] It took 0.013153 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 378)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [6, 4, 17], create an equation that equals 98. You can use basic arithmetic operations (+, -, *, /) and each number can only


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.41s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.41s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.42s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.41s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.41s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.42s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.43s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:12:19 [abstract.py:324] It took 0.017747 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|          | 7/1000 [06:18<14:04:28, 51.03s/it]

KEY METRICS: {'train/kl_penalty': np.float64(3.733292164919895e-05), 'train/rewards': np.float64(0.1015625), 'train/reward_metrics/format_reward': np.float64(0.1015625), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 7/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:12:38 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:12:38 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:12:38 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:12:38 [abstract.py:306] It took 0.012679 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 918)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [11, 20, 2], create an equation that equals 11. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.41s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.41s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.41s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.41s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:13:08 [abstract.py:324] It took 0.015907 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|          | 8/1000 [07:08<13:59:18, 50.76s/it]

KEY METRICS: {'train/kl_penalty': np.float64(5.0783006035804945e-05), 'train/rewards': np.float64(0.2265625), 'train/reward_metrics/format_reward': np.float64(0.1953125), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 8/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:13:29 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:13:29 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:13:29 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:13:29 [abstract.py:306] It took 0.013071 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 136)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [56, 67, 35], create an equation that equals 24. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.42s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.43s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.43s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.42s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.41s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.41s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.41s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.41s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:13:59 [abstract.py:324] It took 0.016480 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|          | 9/1000 [07:59<14:00:10, 50.87s/it]

KEY METRICS: {'train/kl_penalty': np.float64(6.0337664749352435e-05), 'train/rewards': np.float64(0.171875), 'train/reward_metrics/format_reward': np.float64(0.171875), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 9/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:14:20 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:14:20 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:14:20 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:14:20 [abstract.py:306] It took 0.013153 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 269)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [65, 84, 55], create an equation that equals 36. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.42s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.43s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.41s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.41s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.41s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:14:49 [abstract.py:324] It took 0.014294 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|          | 10/1000 [08:49<13:54:19, 50.57s/it]

KEY METRICS: {'train/kl_penalty': np.float64(9.954686882655077e-05), 'train/rewards': np.float64(0.265625), 'train/reward_metrics/format_reward': np.float64(0.25), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 10/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:09 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:09 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:09 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:09 [abstract.py:306] It took 0.015474 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 248)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [88, 87, 77, 85], create an equation that equals 30. You can use basic arithmetic operations (+, -, *, /) and each number c


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:20,  1.39s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.41s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.41s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.41s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.41s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.42s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:39 [abstract.py:324] It took 0.016972 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|          | 11/1000 [09:38<13:48:52, 50.29s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.00014807881837845382), 'train/rewards': np.float64(0.265625), 'train/reward_metrics/format_reward': np.float64(0.25), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 11/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:59 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:59 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:59 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:15:59 [abstract.py:306] It took 0.014636 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 349)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [86, 18, 5, 37], create an equation that equals 97. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.42s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.42s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.41s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.41s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.41s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:16:29 [abstract.py:324] It took 0.015327 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|          | 12/1000 [10:29<13:48:09, 50.29s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.00020405880856246662), 'train/rewards': np.float64(0.4453125), 'train/reward_metrics/format_reward': np.float64(0.4140625), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 12/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:16:49 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:16:49 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:16:49 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:16:49 [abstract.py:306] It took 0.014496 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 279)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [67, 73, 9, 41], create an equation that equals 38. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.41s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.42s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.42s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.42s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:17:19 [abstract.py:324] It took 0.015779 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|‚ñè         | 13/1000 [11:19<13:48:51, 50.39s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.00025828411660502415), 'train/rewards': np.float64(0.40625), 'train/reward_metrics/format_reward': np.float64(0.390625), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 13/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:17:40 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:17:40 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:17:40 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:17:40 [abstract.py:306] It took 0.012992 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 85)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [54, 61, 30, 47], create an equation that equals 15. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.41s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.41s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.42s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.42s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.43s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:18:10 [abstract.py:324] It took 0.013911 seconds to wake up tags {'weights', 'kv_cache'}.


  1%|‚ñè         | 14/1000 [12:10<13:50:09, 50.52s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0003364141111140574), 'train/rewards': np.float64(0.296875), 'train/reward_metrics/format_reward': np.float64(0.296875), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 14/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:18:31 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:18:31 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:18:31 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:18:31 [abstract.py:306] It took 0.014418 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 140)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [12, 24, 21], create an equation that equals 57. You can use basic arithmetic operations (+, -, *, /) and each number can o


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.41s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.41s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.41s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.41s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.41s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.41s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.41s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:19:00 [abstract.py:324] It took 0.016943 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 15/1000 [13:00<13:48:27, 50.46s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.00040336188908406785), 'train/rewards': np.float64(0.453125), 'train/reward_metrics/format_reward': np.float64(0.4375), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 15/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:19:21 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:19:21 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:19:21 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:19:21 [abstract.py:306] It took 0.015790 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 272)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [17, 3, 62, 1], create an equation that equals 15. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.44s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.43s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.43s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.42s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.42s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.40s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.40s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.40s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:06,  1.39s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:19:51 [abstract.py:324] It took 0.019233 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 16/1000 [13:51<13:45:52, 50.36s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0004238373863935722), 'train/rewards': np.float64(0.5078125), 'train/reward_metrics/format_reward': np.float64(0.4921875), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 16/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:20:11 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:20:11 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:20:11 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:20:11 [abstract.py:306] It took 0.017131 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 179)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [12, 3, 26], create an equation that equals 66. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.42s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:20,  1.44s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.44s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.44s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.44s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.44s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:10<00:12,  1.44s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.43s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.43s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:20:41 [abstract.py:324] It took 0.017791 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 17/1000 [14:41<13:44:37, 50.33s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0004888636792204602), 'train/rewards': np.float64(0.40625), 'train/reward_metrics/format_reward': np.float64(0.390625), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 17/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:00 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:00 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:00 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:00 [abstract.py:306] It took 0.014015 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 163)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [12, 21, 29], create an equation that equals 96. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:20,  1.39s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.37s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:17,  1.35s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.34s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.34s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:13,  1.35s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.35s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.35s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.36s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:13<00:08,  1.35s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.35s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:31 [abstract.py:324] It took 0.016131 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 18/1000 [15:31<13:44:21, 50.37s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0005007411438006605), 'train/rewards': np.float64(0.453125), 'train/reward_metrics/format_reward': np.float64(0.4375), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 18/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:52 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:52 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:52 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:21:52 [abstract.py:306] It took 0.013641 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 120)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [8, 95, 81], create an equation that equals 22. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.42s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:20,  1.43s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.43s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.43s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.42s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.43s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.42s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.41s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.41s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.41s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:22:22 [abstract.py:324] It took 0.014000 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 19/1000 [16:22<13:44:47, 50.45s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0008487646130548008), 'train/rewards': np.float64(0.546875), 'train/reward_metrics/format_reward': np.float64(0.546875), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 19/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:22:43 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:22:43 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:22:43 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:22:43 [abstract.py:306] It took 0.018015 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 107)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [83, 60, 65], create an equation that equals 88. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.43s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:20,  1.49s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:19,  1.46s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.46s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:16,  1.46s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.47s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:10<00:13,  1.45s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.44s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:13<00:10,  1.44s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.43s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:23:13 [abstract.py:324] It took 0.013571 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 20/1000 [17:15<13:55:02, 51.13s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0007778998369453575), 'train/rewards': np.float64(0.6015625), 'train/reward_metrics/format_reward': np.float64(0.5703125), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 20/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:23:35 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:23:35 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:23:35 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:23:35 [abstract.py:306] It took 0.012468 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 178)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [20, 30, 42, 12], create an equation that equals 80. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:20,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.40s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.41s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.41s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.40s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.41s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.43s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:24:05 [abstract.py:324] It took 0.014354 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 21/1000 [18:06<13:53:49, 51.10s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0007712350640049319), 'train/rewards': np.float64(0.6484375), 'train/reward_metrics/format_reward': np.float64(0.6328125), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 21/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:24:26 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:24:26 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:24:26 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:24:26 [abstract.py:306] It took 0.016279 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 125)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [5, 6, 4, 31], create an equation that equals 95. You can use basic arithmetic operations (+, -, *, /) and each number can o


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.45s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:20,  1.44s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.44s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.44s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.44s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.43s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:10<00:12,  1.43s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.43s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.43s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.41s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:24:57 [abstract.py:324] It took 0.018819 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 22/1000 [18:58<13:58:49, 51.46s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.000988462471494057), 'train/rewards': np.float64(0.6796875), 'train/reward_metrics/format_reward': np.float64(0.6171875), 'train/reward_metrics/equation_reward': np.float64(0.0625)}
Iteration 22/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:25:19 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:25:19 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:25:19 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:25:19 [abstract.py:306] It took 0.014513 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 177)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [9, 6, 96, 13], create an equation that equals 45. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.43s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:20,  1.43s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.43s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.40s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.41s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.41s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.41s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.40s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.40s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.40s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:25:48 [abstract.py:324] It took 0.013346 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 23/1000 [19:49<13:54:24, 51.24s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0011948022017738628), 'train/rewards': np.float64(0.7265625), 'train/reward_metrics/format_reward': np.float64(0.6953125), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 23/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:06 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:06 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:06 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:06 [abstract.py:306] It took 0.019718 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 38)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [57, 56, 71], create an equation that equals 72. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.17s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.19s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.21s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.19s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.21s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.22s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.23s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.22s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.20s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.20s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.20s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:33 [abstract.py:324] It took 0.015562 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñè         | 24/1000 [20:33<13:19:52, 49.17s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0011389734654113706), 'train/rewards': np.float64(0.765625), 'train/reward_metrics/format_reward': np.float64(0.734375), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 24/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:54 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:54 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:54 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:26:54 [abstract.py:306] It took 0.015678 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 95)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [2, 92, 15, 5], create an equation that equals 70. You can use basic arithmetic operations (+, -, *, /) and each number can 


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:20,  1.35s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:18,  1.35s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:17,  1.34s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.37s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.36s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:13,  1.36s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.37s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.37s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.36s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:13<00:08,  1.35s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.36s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:27:24 [abstract.py:324] It took 0.017061 seconds to wake up tags {'weights', 'kv_cache'}.


  2%|‚ñé         | 25/1000 [21:26<13:38:26, 50.37s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0013846411917402745), 'train/rewards': np.float64(0.78125), 'train/reward_metrics/format_reward': np.float64(0.75), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 25/1000
Evaluating on eval set...


Adding requests:   0%|          | 0/500 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/500 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:02 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:02 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:02 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.71 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:02 [abstract.py:306] It took 0.032118 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.0, Response Length: 435)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [72, 33, 66, 37], create an equation that equals 68. You can use basic arithmetic operations (+, -, *, /) and each number c


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.20s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.21s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.20s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.20s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.20s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.20s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.21s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.21s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.20s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.19s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.22s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:30 [abstract.py:324] It took 0.014592 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 26/1000 [22:32<14:51:44, 54.93s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0016859913123354782), 'train/rewards': np.float64(0.8671875), 'train/reward_metrics/format_reward': np.float64(0.8203125), 'train/reward_metrics/equation_reward': np.float64(0.046875), 'eval/rewards': np.float64(0.813), 'eval/reward_metrics/format_reward': np.float64(0.731), 'eval/reward_metrics/equation_reward': np.float64(0.082)}
Iteration 26/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:48 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:48 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:48 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:28:48 [abstract.py:306] It took 0.016507 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 156)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [45, 26, 58], create an equation that equals 13. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.14s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.15s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.15s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:13,  1.16s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.14s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:06<00:11,  1.15s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.14s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.15s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.16s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:06,  1.16s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.17s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:29:14 [abstract.py:324] It took 0.017685 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 27/1000 [23:16<13:56:08, 51.56s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0010289950725521132), 'train/rewards': np.float64(0.8046875), 'train/reward_metrics/format_reward': np.float64(0.7890625), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 27/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:29:34 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:29:34 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:29:34 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:29:34 [abstract.py:306] It took 0.016304 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 303)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [55, 49, 76], create an equation that equals 70. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.29s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.27s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.28s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.26s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.27s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.28s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.29s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.28s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:09,  1.30s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.28s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.28s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:30:03 [abstract.py:324] It took 0.017583 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 28/1000 [24:04<13:41:29, 50.71s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0009888735382976872), 'train/rewards': np.float64(0.9609375), 'train/reward_metrics/format_reward': np.float64(0.8984375), 'train/reward_metrics/equation_reward': np.float64(0.0625)}
Iteration 28/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:30:25 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:30:25 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:30:25 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:30:25 [abstract.py:306] It took 0.015416 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 138)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [88, 67, 97, 10], create an equation that equals 91. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.43s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.43s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.43s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.43s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.42s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.42s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.42s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.42s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:30:56 [abstract.py:324] It took 0.013599 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 29/1000 [24:55<13:42:22, 50.82s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0010179790530177799), 'train/rewards': np.float64(0.890625), 'train/reward_metrics/format_reward': np.float64(0.875), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 29/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:31:14 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:31:14 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:31:14 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:31:14 [abstract.py:306] It took 0.014393 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 133)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [86, 58, 65, 74], create an equation that equals 84. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.23s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.25s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.27s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:14,  1.25s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.26s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.28s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.27s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.26s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.27s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.28s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.30s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:31:43 [abstract.py:324] It took 0.017445 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 30/1000 [25:43<13:28:07, 49.99s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.002063491218872479), 'train/rewards': np.float64(0.9375), 'train/reward_metrics/format_reward': np.float64(0.90625), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 30/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:02 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:02 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:02 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:02 [abstract.py:306] It took 0.015096 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 116)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [22, 56, 47, 33], create an equation that equals 47. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.30s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.28s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.28s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.28s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.28s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.27s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.27s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.27s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.26s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.25s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.25s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:31 [abstract.py:324] It took 0.015165 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 31/1000 [26:31<13:17:20, 49.37s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.001318160390328332), 'train/rewards': np.float64(0.984375), 'train/reward_metrics/format_reward': np.float64(0.90625), 'train/reward_metrics/equation_reward': np.float64(0.078125)}
Iteration 31/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:52 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:52 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:52 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:32:52 [abstract.py:306] It took 0.014764 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 79)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [14, 39, 17, 87], create an equation that equals 17. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.40s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.39s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:17,  1.37s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.36s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:15,  1.38s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:13,  1.37s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.37s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.37s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.38s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:13<00:08,  1.38s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:06,  1.37s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:33:22 [abstract.py:324] It took 0.014664 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 32/1000 [27:23<13:28:20, 50.10s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0023634123613519163), 'train/rewards': np.float64(0.859375), 'train/reward_metrics/format_reward': np.float64(0.859375), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 32/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:33:40 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:33:40 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:33:40 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:33:40 [abstract.py:306] It took 0.015433 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 81)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [3, 12, 9, 42], create an equation that equals 27. You can use basic arithmetic operations (+, -, *, /) and each number can o


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.30s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:18,  1.29s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.28s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.29s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.29s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.27s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.27s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.27s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.26s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.26s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.26s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:34:08 [abstract.py:324] It took 0.014630 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 33/1000 [28:07<12:58:26, 48.30s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.001534763692774581), 'train/rewards': np.float64(0.984375), 'train/reward_metrics/format_reward': np.float64(0.96875), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 33/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:34:27 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:34:27 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:34:27 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:34:27 [abstract.py:306] It took 0.015431 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 62)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [91, 8, 66], create an equation that equals 17. You can use basic arithmetic operations (+, -, *, /) and each number can only


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:20,  1.35s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.36s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:17,  1.38s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.37s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:15,  1.38s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:13,  1.37s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.37s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.36s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.37s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:13<00:08,  1.37s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:06,  1.36s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:34:57 [abstract.py:324] It took 0.014534 seconds to wake up tags {'weights', 'kv_cache'}.


  3%|‚ñé         | 34/1000 [28:57<13:02:47, 48.62s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.008768196149924638), 'train/rewards': np.float64(0.9921875), 'train/reward_metrics/format_reward': np.float64(0.9453125), 'train/reward_metrics/equation_reward': np.float64(0.046875)}
Iteration 34/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:14 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:14 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:14 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:14 [abstract.py:306] It took 0.015270 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 137)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [9, 27, 68], create an equation that equals 71. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.16s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.19s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.18s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.17s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:13,  1.20s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.22s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.23s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.21s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.19s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.21s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:05,  1.20s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:41 [abstract.py:324] It took 0.013912 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñé         | 35/1000 [29:42<12:45:48, 47.62s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.002066671063205624), 'train/rewards': np.float64(0.9765625), 'train/reward_metrics/format_reward': np.float64(0.9609375), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 35/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:59 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:59 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:59 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:35:59 [abstract.py:306] It took 0.015292 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 52)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [17, 12, 6, 55], create an equation that equals 59. You can use basic arithmetic operations (+, -, *, /) and each number can 


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.24s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.25s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.24s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.24s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.22s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.20s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.20s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.21s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.24s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.25s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.27s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:36:26 [abstract.py:324] It took 0.018730 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñé         | 36/1000 [30:24<12:18:13, 45.95s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.002704659166298098), 'train/rewards': np.float64(0.953125), 'train/reward_metrics/format_reward': np.float64(0.953125), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 36/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:36:44 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:36:44 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:36:44 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:36:44 [abstract.py:306] It took 0.014932 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 0.5, Response Length: 153)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [84, 83, 83], create an equation that equals 84. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.42s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:20,  1.43s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.41s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.40s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.40s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.40s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.41s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.41s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.41s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.41s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.41s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:37:15 [abstract.py:324] It took 0.017271 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñé         | 37/1000 [31:15<12:40:47, 47.40s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0031215252206462235), 'train/rewards': np.float64(0.9921875), 'train/reward_metrics/format_reward': np.float64(0.9609375), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 37/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:37:31 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:37:31 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:37:31 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:37:31 [abstract.py:306] It took 0.015243 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 120)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [23, 70, 97, 74], create an equation that equals 70. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.17s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.16s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.17s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.18s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.17s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:11,  1.18s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.19s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.17s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.18s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:07,  1.18s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.17s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:37:58 [abstract.py:324] It took 0.016883 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 38/1000 [31:58<12:22:01, 46.28s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0024771817954488623), 'train/rewards': np.float64(0.9453125), 'train/reward_metrics/format_reward': np.float64(0.9453125), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 38/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:38:17 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:38:17 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:38:17 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:38:17 [abstract.py:306] It took 0.016265 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 2.0, Response Length: 317)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [21, 60, 57], create an equation that equals 96. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.31s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:18,  1.32s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:17,  1.32s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.31s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.31s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:13,  1.31s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:11,  1.31s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.31s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:09,  1.31s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:13<00:07,  1.29s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.29s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:38:45 [abstract.py:324] It took 0.013531 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 39/1000 [32:43<12:14:37, 45.87s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0035497758546958473), 'train/rewards': np.float64(1.046875), 'train/reward_metrics/format_reward': np.float64(0.953125), 'train/reward_metrics/equation_reward': np.float64(0.09375)}
Iteration 39/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:00 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:00 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:00 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:00 [abstract.py:306] It took 0.015946 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 134)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [28, 69, 33, 46], create an equation that equals 18. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.17s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.24s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.25s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.23s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.23s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.22s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.23s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.22s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.23s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.23s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.22s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:28 [abstract.py:324] It took 0.018136 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 40/1000 [33:28<12:06:36, 45.41s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.001682495250359516), 'train/rewards': np.float64(1.0390625), 'train/reward_metrics/format_reward': np.float64(0.9921875), 'train/reward_metrics/equation_reward': np.float64(0.046875)}
Iteration 40/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:45 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:45 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:45 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:39:45 [abstract.py:306] It took 0.015283 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 89)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [74, 10, 1, 72], create an equation that equals 66. You can use basic arithmetic operations (+, -, *, /) and each number can 


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.19s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.18s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.18s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.20s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:13,  1.20s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.20s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.19s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.19s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.17s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:06,  1.16s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:05,  1.19s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:40:12 [abstract.py:324] It took 0.018059 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 41/1000 [34:14<12:11:39, 45.78s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.002195639024907464), 'train/rewards': np.float64(0.984375), 'train/reward_metrics/format_reward': np.float64(0.96875), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 41/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:40:35 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:40:35 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:40:35 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:40:35 [abstract.py:306] It took 0.017507 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 157)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [42, 33, 32], create an equation that equals 41. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:21,  1.42s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.42s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:18,  1.42s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:17,  1.43s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.43s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.43s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.43s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.43s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.43s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.43s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.44s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:41:05 [abstract.py:324] It took 0.015357 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 42/1000 [35:04<12:30:02, 46.98s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.001803872019974227), 'train/rewards': np.float64(0.9921875), 'train/reward_metrics/format_reward': np.float64(0.9453125), 'train/reward_metrics/equation_reward': np.float64(0.046875)}
Iteration 42/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:41:21 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:41:21 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:41:21 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:41:21 [abstract.py:306] It took 0.014144 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 142)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [24, 4, 27, 56], create an equation that equals 63. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.21s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.23s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.21s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.21s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.22s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.22s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.22s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.22s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.22s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.22s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.23s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:41:48 [abstract.py:324] It took 0.013217 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 43/1000 [35:47<12:12:00, 45.89s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0018933917942796363), 'train/rewards': np.float64(0.9921875), 'train/reward_metrics/format_reward': np.float64(0.9921875), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 43/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:04 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:04 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:04 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:04 [abstract.py:306] It took 0.013420 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 101)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [88, 82, 3], create an equation that equals 18. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.20s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.21s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.20s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.20s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.21s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.21s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.21s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.21s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.21s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.21s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.20s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:31 [abstract.py:324] It took 0.013987 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 44/1000 [36:32<12:06:58, 45.63s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.009941178565760617), 'train/rewards': np.float64(0.984375), 'train/reward_metrics/format_reward': np.float64(0.984375), 'train/reward_metrics/equation_reward': np.float64(0.0)}
Iteration 44/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:48 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:48 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:48 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:42:48 [abstract.py:306] It took 0.014450 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 223)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [59, 38, 75, 62], create an equation that equals 40. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.21s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.21s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.21s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.21s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.21s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.21s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.21s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.21s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.21s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.21s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.21s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:43:16 [abstract.py:324] It took 0.014681 seconds to wake up tags {'weights', 'kv_cache'}.


  4%|‚ñç         | 45/1000 [37:14<11:48:46, 44.53s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0021161761676881854), 'train/rewards': np.float64(1.015625), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 45/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:43:31 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:43:31 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:43:31 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:43:31 [abstract.py:306] It took 0.015389 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 148)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [74, 14, 32, 31], create an equation that equals 25. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.18s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.19s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.18s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.18s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.17s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:11,  1.16s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.17s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.17s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.17s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:07,  1.17s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.17s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:43:58 [abstract.py:324] It took 0.015913 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñç         | 46/1000 [37:58<11:41:40, 44.13s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.007193468998271033), 'train/rewards': np.float64(1.0), 'train/reward_metrics/format_reward': np.float64(0.984375), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 46/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:14 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:14 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:14 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:14 [abstract.py:306] It took 0.014388 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 153)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [99, 80, 13, 73], create an equation that equals 67. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.23s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.23s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.23s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.22s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.22s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.22s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.21s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.20s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.21s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.20s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.21s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:41 [abstract.py:324] It took 0.014645 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñç         | 47/1000 [38:39<11:30:33, 43.48s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0029583274584700577), 'train/rewards': np.float64(1.0078125), 'train/reward_metrics/format_reward': np.float64(0.9453125), 'train/reward_metrics/equation_reward': np.float64(0.0625)}
Iteration 47/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:57 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:57 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:57 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:44:57 [abstract.py:306] It took 0.016297 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 118)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [46, 81, 73, 13], create an equation that equals 58. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.27s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.27s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.27s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.29s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.30s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:13,  1.30s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:11,  1.31s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.30s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:09,  1.29s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.27s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.28s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:45:25 [abstract.py:324] It took 0.019031 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñç         | 48/1000 [39:26<11:45:36, 44.47s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.002635383872833925), 'train/rewards': np.float64(1.0), 'train/reward_metrics/format_reward': np.float64(0.953125), 'train/reward_metrics/equation_reward': np.float64(0.046875)}
Iteration 48/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:45:46 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:45:46 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:45:46 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:45:46 [abstract.py:306] It took 0.016504 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 146)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [45, 33, 5, 95], create an equation that equals 61. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.33s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:18,  1.33s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:17,  1.32s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.31s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.31s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:13,  1.30s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:11,  1.31s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.31s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:09,  1.31s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:13<00:07,  1.33s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.33s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:46:14 [abstract.py:324] It took 0.018612 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñç         | 49/1000 [40:15<12:02:57, 45.61s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0022555187263178424), 'train/rewards': np.float64(1.03125), 'train/reward_metrics/format_reward': np.float64(0.96875), 'train/reward_metrics/equation_reward': np.float64(0.0625)}
Iteration 49/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:46:32 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:46:32 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:46:32 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:46:32 [abstract.py:306] It took 0.016627 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 151)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [47, 9, 20, 76], create an equation that equals 57. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.28s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.28s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.28s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.26s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.26s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.25s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.25s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:09,  1.24s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.25s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.26s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.26s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:47:00 [abstract.py:324] It took 0.019033 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñå         | 50/1000 [40:59<11:54:50, 45.15s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0025891708527321735), 'train/rewards': np.float64(1.1015625), 'train/reward_metrics/format_reward': np.float64(0.9921875), 'train/reward_metrics/equation_reward': np.float64(0.109375)}
Iteration 50/1000
Evaluating on eval set...


Adding requests:   0%|          | 0/500 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/500 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:47:31 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:47:31 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:47:31 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:47:31 [abstract.py:306] It took 0.024369 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 89)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [72, 59, 70], create an equation that equals 61. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.24s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.24s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.27s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.27s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.28s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.28s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.27s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.27s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.25s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.25s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.24s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:47:58 [abstract.py:324] It took 0.019137 seconds to wake up tags {'weights', 'kv_cache'}.
KEY METRICS: {'train/kl_penalty': np.float64(0.003285149316887095), 'train/rewards': np.float64(1.03125), 'train/reward_metrics/format_reward': np.float64(0.984375), 'train/reward_metrics/equation_reward': np.float64(0.046875), 'eval/rewards': np.float64(0.988), 'eval/reward_metrics/format_reward': np.float64(0.862), 'eval/reward_metrics/equation_reward': np.float64(0.126)}


  5%|‚ñå         | 51/1000 [43:41<21:08:41, 80.21s/it]

Iteration 51/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:01 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:01 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:01 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 54.01 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:01 [abstract.py:306] It took 0.016879 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 92)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [23, 72, 87, 79], create an equation that equals 71. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:20,  1.37s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:19,  1.37s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:04<00:17,  1.38s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:16,  1.41s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:07<00:15,  1.43s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:08<00:14,  1.44s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:12,  1.44s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:11<00:11,  1.44s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:12<00:09,  1.43s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:14<00:08,  1.43s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:15<00:07,  1.42s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:32 [abstract.py:324] It took 0.014621 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñå         | 52/1000 [44:32<18:49:45, 71.50s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0025780542271688902), 'train/rewards': np.float64(0.96875), 'train/reward_metrics/format_reward': np.float64(0.953125), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 52/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:49 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:49 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:49 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:50:49 [abstract.py:306] It took 0.017364 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 110)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [85, 81, 76, 98], create an equation that equals 65. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.21s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.21s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.22s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.21s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.21s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.20s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.20s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.20s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.21s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.21s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.21s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:51:15 [abstract.py:324] It took 0.017080 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñå         | 53/1000 [45:18<16:49:53, 63.99s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.00314965483234714), 'train/rewards': np.float64(1.1171875), 'train/reward_metrics/format_reward': np.float64(0.9921875), 'train/reward_metrics/equation_reward': np.float64(0.125)}
Iteration 53/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:51:37 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:51:37 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:51:37 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:51:37 [abstract.py:306] It took 0.016994 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 217)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [95, 62, 35, 26], create an equation that equals 57. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.30s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:18,  1.31s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.30s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.30s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.30s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.30s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:11,  1.30s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.31s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:09,  1.30s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.29s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.30s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:52:05 [abstract.py:324] It took 0.014686 seconds to wake up tags {'weights', 'kv_cache'}.


  5%|‚ñå         | 54/1000 [46:02<15:14:50, 58.02s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.003153892824403459), 'train/rewards': np.float64(1.03125), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.03125)}
Iteration 54/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:52:18 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:52:18 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:52:18 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:52:18 [abstract.py:306] It took 0.016293 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 88)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [97, 9, 16, 57], create an equation that equals 65. You can use basic arithmetic operations (+, -, *, /) and each number can 


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.17s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.18s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.19s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.21s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.21s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.21s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.22s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.22s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.22s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.22s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.22s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:52:46 [abstract.py:324] It took 0.013078 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 55/1000 [46:44<13:57:21, 53.17s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.003781886414302488), 'train/rewards': np.float64(1.109375), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.109375)}
Iteration 55/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:02 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:02 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:02 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:02 [abstract.py:306] It took 0.014474 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 110)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [37, 56, 37], create an equation that equals 55. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.25s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.25s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.25s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.24s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.24s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.24s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.25s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.24s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.24s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.24s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.24s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:29 [abstract.py:324] It took 0.015834 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 56/1000 [47:31<13:27:08, 51.30s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0038480390550077914), 'train/rewards': np.float64(1.0625), 'train/reward_metrics/format_reward': np.float64(0.984375), 'train/reward_metrics/equation_reward': np.float64(0.078125)}
Iteration 56/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:49 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:49 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:49 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:53:49 [abstract.py:306] It took 0.017626 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 90)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [48, 40, 72, 80], create an equation that equals 90. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:19,  1.28s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.28s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.28s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.29s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:14,  1.30s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:13,  1.31s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:09<00:11,  1.30s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:10<00:10,  1.31s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:09,  1.29s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.29s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:14<00:06,  1.28s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:54:17 [abstract.py:324] It took 0.018198 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 57/1000 [48:14<12:47:49, 48.85s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0042914562771333604), 'train/rewards': np.float64(1.015625), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 57/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:54:31 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:54:31 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:54:31 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:54:31 [abstract.py:306] It took 0.016105 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 162)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [12, 35, 61, 86], create an equation that equals 94. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.15s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:15,  1.13s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:14,  1.15s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:13,  1.15s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.13s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:06<00:11,  1.14s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.17s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.18s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.19s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:07,  1.17s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.18s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:54:57 [abstract.py:324] It took 0.018896 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 58/1000 [48:56<12:14:10, 46.76s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.004728435551720199), 'train/rewards': np.float64(1.09375), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.09375)}
Iteration 58/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:55:14 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:55:14 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:55:14 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:55:14 [abstract.py:306] It took 0.018255 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 118)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [34, 43, 12, 29], create an equation that equals 87. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.26s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.23s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.23s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.22s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.22s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.23s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.24s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.25s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.25s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.24s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.23s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:55:42 [abstract.py:324] It took 0.020608 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 59/1000 [49:44<12:20:11, 47.20s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.004639963298960957), 'train/rewards': np.float64(1.015625), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 59/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:00 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:00 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:00 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:00 [abstract.py:306] It took 0.013647 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 155)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [23, 58, 69], create an equation that equals 61. You can use basic arithmetic operations (+, -, *, /) and each number can on


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.21s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.24s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.20s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.18s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.18s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:11,  1.17s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.18s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.17s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.16s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:06,  1.16s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.16s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:27 [abstract.py:324] It took 0.015511 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 60/1000 [50:26<11:54:07, 45.58s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.004890213677900738), 'train/rewards': np.float64(1.015625), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.015625)}
Iteration 60/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:42 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:42 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:42 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:56:42 [abstract.py:306] It took 0.015507 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 80)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [56, 61, 62, 60], create an equation that equals 33. You can use basic arithmetic operations (+, -, *, /) and each number can


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.21s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.16s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.17s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:13,  1.16s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.15s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:11,  1.17s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.16s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.18s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.19s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:07,  1.19s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.20s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:57:09 [abstract.py:324] It took 0.015260 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 61/1000 [51:08<11:34:31, 44.38s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.005190480172570917), 'train/rewards': np.float64(1.0390625), 'train/reward_metrics/format_reward': np.float64(0.9921875), 'train/reward_metrics/equation_reward': np.float64(0.046875)}
Iteration 61/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:57:24 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:57:24 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:57:24 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:57:24 [abstract.py:306] It took 0.016583 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 108)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [5, 81, 7, 16], create an equation that equals 95. You can use basic arithmetic operations (+, -, *, /) and each number can 


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.14s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:15,  1.14s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:14,  1.15s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:13,  1.15s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.14s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:06<00:11,  1.16s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.16s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.16s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.16s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:06,  1.15s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.15s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:57:51 [abstract.py:324] It took 0.014299 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñå         | 62/1000 [51:50<11:21:13, 43.58s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.00581240358155437), 'train/rewards': np.float64(1.0390625), 'train/reward_metrics/format_reward': np.float64(0.9921875), 'train/reward_metrics/equation_reward': np.float64(0.046875)}
Iteration 62/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:07 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:07 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:07 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:07 [abstract.py:306] It took 0.017484 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 170)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [73, 73, 51, 32], create an equation that equals 82. You can use basic arithmetic operations (+, -, *, /) and each number ca


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.21s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.25s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.25s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:05<00:15,  1.26s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.26s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.25s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.25s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.24s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:11<00:08,  1.25s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.24s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.25s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:35 [abstract.py:324] It took 0.015236 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñã         | 63/1000 [52:32<11:16:27, 43.32s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.005176370242494035), 'train/rewards': np.float64(1.046875), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.046875)}
Iteration 63/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:48 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:48 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:48 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:58:48 [abstract.py:306] It took 0.016942 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 54)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [21, 86, 3], create an equation that equals 79. You can use basic arithmetic operations (+, -, *, /) and each number can only


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:16,  1.10s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:15,  1.10s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:14,  1.11s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:13,  1.11s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.11s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:06<00:11,  1.12s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:07<00:10,  1.12s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:08<00:08,  1.12s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:07,  1.12s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:06,  1.16s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.19s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:59:14 [abstract.py:324] It took 0.013521 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñã         | 64/1000 [53:13<11:03:22, 42.52s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.006427200717335187), 'train/rewards': np.float64(1.0625), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.0625)}
Iteration 64/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:59:29 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:59:29 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:59:29 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:59:29 [abstract.py:306] It took 0.016311 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 123)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [81, 17, 8], create an equation that equals 55. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.21s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.19s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.18s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.19s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.18s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:11,  1.18s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.17s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.17s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.19s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:07,  1.18s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.17s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 22:59:56 [abstract.py:324] It took 0.018278 seconds to wake up tags {'weights', 'kv_cache'}.


  6%|‚ñã         | 65/1000 [53:54<10:58:09, 42.23s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0059199767705251476), 'train/rewards': np.float64(1.0625), 'train/reward_metrics/format_reward': np.float64(0.984375), 'train/reward_metrics/equation_reward': np.float64(0.078125)}
Iteration 65/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:10 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:10 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:10 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:10 [abstract.py:306] It took 0.017910 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 85)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [25, 38, 79], create an equation that equals 92. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.18s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.20s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:15,  1.18s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.20s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.21s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:12,  1.22s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:11,  1.22s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.22s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.21s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.20s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:05,  1.20s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:37 [abstract.py:324] It took 0.013518 seconds to wake up tags {'weights', 'kv_cache'}.


  7%|‚ñã         | 66/1000 [54:35<10:50:55, 41.81s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.006309249548227888), 'train/rewards': np.float64(1.109375), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.109375)}
Iteration 66/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:51 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:51 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:51 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:00:51 [abstract.py:306] It took 0.015556 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 69)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [98, 23, 19], create an equation that equals 94. You can use basic arithmetic operations (+, -, *, /) and each number can onl


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:17,  1.18s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:16,  1.16s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:14,  1.15s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:13,  1.16s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:05<00:12,  1.15s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:06<00:11,  1.15s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.15s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.17s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.17s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:11<00:07,  1.18s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:12<00:05,  1.19s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:01:18 [abstract.py:324] It took 0.014143 seconds to wake up tags {'weights', 'kv_cache'}.


  7%|‚ñã         | 67/1000 [55:17<10:47:33, 41.64s/it]

KEY METRICS: {'train/kl_penalty': np.float64(0.0073803989424904935), 'train/rewards': np.float64(1.171875), 'train/reward_metrics/format_reward': np.float64(1.0), 'train/reward_metrics/equation_reward': np.float64(0.171875)}
Iteration 67/1000


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:01:33 [block_pool.py:452] Successfully reset prefix cache
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:01:33 [cumem.py:213] CuMemAllocator: sleep freed 4.64 GiB memory in total, of which 0.00 GiB is backed up in CPU and the rest 4.64 GiB is discarded directly.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:01:33 [gpu_worker.py:128] Sleep mode freed 10.39 GiB memory, 48.7 GiB memory is still in use.
[0;36m(EngineCore_DP0 pid=13850)[0;0m INFO 02-17 23:01:33 [abstract.py:306] It took 0.016537 seconds to fall asleep.
Generated 64 responses
########## Example 1 (Reward: 1.0, Response Length: 152)
#### Query:
`<|im_start|>system
You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.<|im_end|>
<|im_start|>user
Using the numbers [4, 80, 8, 19], create an equation that equals 13. You can use basic arithmetic operations (+, -, *, /) and each number can 


Gradient Accumulation:   0%|          | 0/16 [00:00<?, ?it/s][A
Gradient Accumulation:   6%|‚ñã         | 1/16 [00:01<00:18,  1.25s/it][A
Gradient Accumulation:  12%|‚ñà‚ñé        | 2/16 [00:02<00:17,  1.25s/it][A
Gradient Accumulation:  19%|‚ñà‚ñâ        | 3/16 [00:03<00:16,  1.24s/it][A
Gradient Accumulation:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:04<00:14,  1.21s/it][A
Gradient Accumulation:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:06<00:13,  1.20s/it][A
Gradient Accumulation:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:07<00:11,  1.19s/it][A
Gradient Accumulation:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:08<00:10,  1.19s/it][A
Gradient Accumulation:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:09<00:09,  1.20s/it][A
Gradient Accumulation:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:10<00:08,  1.19s/it][A
Gradient Accumulation:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:12<00:07,  1.20s/it][A
Gradient Accumulation:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:13<00:06,  1.20s/it][A
Gradient Accumulation:  75%|‚ñà‚ñà‚ñà

KeyboardInterrupt: 

ERROR 02-17 23:01:58 [core_client.py:605] Engine core proc EngineCore_DP0 died unexpectedly, shutting down client.


## Results

In [None]:
import json
import glob as glob_module
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def load_episodes(episode_dir: Path) -> Dict[int, List[Dict]]:
    """Read every ``eps_*.json`` file in ``episode_dir`` into a dictionary.

    Args:
        episode_dir: Directory that is searched (non-recursively) for files
            matching ``eps_*.json``.

    Returns:
        A dict whose keys are the integer found after the last underscore of
        each file stem (``eps_000010.json`` -> ``10``) and whose values are
        the parsed JSON content of the corresponding file.

    Raises:
        ValueError: If the text after the last underscore is not an integer.
    """
    loaded: Dict[int, List[Dict]] = {}
    for json_path in sorted(episode_dir.glob("eps_*.json")):
        # Everything after the final "_" in the stem is the iteration number.
        _, _, suffix = json_path.stem.rpartition("_")
        step = int(suffix)
        with json_path.open() as handle:
            loaded[step] = json.load(handle)
    return loaded

# Load the rollouts stored under the experiment directory.
train_episodes = load_episodes(EXP_DIR / "episodes")
eval_episodes = load_episodes(EXP_DIR / "eval_episodes")

# Sort the iteration numbers once; the previews below reuse the sorted lists.
train_iters = sorted(train_episodes)
eval_iters = sorted(eval_episodes)

print(f"Loaded train episodes from {len(train_iters)} iterations")
print(f"Loaded eval episodes from {len(eval_iters)} iterations")
# Preview the first and last five iteration numbers of each set.
print(f"Train iterations: {train_iters[:5]} ... {train_iters[-5:]}")
print(f"Eval iterations: {eval_iters[:5]} ... {eval_iters[-5:]}")

### How did performance increase on the countdown task?

We track two reward components across training:
- **Format reward** (0 or 0.5 or 1.0): Does the model produce valid `<think>...</think>\n<answer>...</answer>` formatting?
- **Equation reward** (0 or 1.0): Does the equation in the answer actually evaluate to the target using all numbers exactly once?

The total reward is the sum of both (max 2.0). We plot these over training iterations for both the training rollouts and the held-out eval set.

In [None]:
def compute_episode_metrics(episodes_by_iter: Dict[int, List[Dict]]) -> Dict[str, List]:
    """Summarise saved episodes as one set of aggregate values per iteration.

    Args:
        episodes_by_iter: Mapping from iteration number to the list of episode
            dicts for that iteration. Each episode is read through its
            ``"reward"``, ``"response"`` and ``"query"`` keys.

    Returns:
        A dict of parallel lists, ordered by ascending iteration:
        ``"iteration"``, ``"mean_reward"`` (mean of the stored rewards),
        ``"format_rate"`` and ``"equation_rate"`` (means of the re-computed
        reward components) and ``"mean_response_len"`` (mean response length
        in characters).
    """
    numbers_re = re.compile(r"numbers \[([^\]]+)\]")
    target_re = re.compile(r"equals (\d+)")

    def rescore_equation(episode: Dict) -> float:
        # The numbers and the target are not stored separately, so they are
        # parsed back out of the prompt text; unparsable prompts score 0.0.
        prompt = episode["query"]
        found_nums = numbers_re.search(prompt)
        found_target = target_re.search(prompt)
        if not (found_nums and found_target):
            return 0.0
        operands = [int(piece.strip()) for piece in found_nums.group(1).split(",")]
        goal = int(found_target.group(1))
        return equation_reward_func(episode["response"], operands, goal)

    summary = {"iteration": [], "mean_reward": [], "format_rate": [], "equation_rate": [], "mean_response_len": []}

    for step in sorted(episodes_by_iter):
        batch = episodes_by_iter[step]
        stored_rewards = [episode["reward"] for episode in batch]
        texts = [episode["response"] for episode in batch]

        # Both reward components are re-derived from the saved text.
        format_scores = [format_reward_func(text) for text in texts]
        equation_scores = [rescore_equation(episode) for episode in batch]

        summary["iteration"].append(step)
        summary["mean_reward"].append(np.mean(stored_rewards))
        summary["format_rate"].append(np.mean(format_scores))
        summary["equation_rate"].append(np.mean(equation_scores))
        summary["mean_response_len"].append(np.mean([len(text) for text in texts]))

    return summary

def _finish_panel(ax, xlabel, ylabel, title):
    """Apply the labels, title, legend and light grid shared by all three panels."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


train_metrics = compute_episode_metrics(train_episodes)
eval_metrics = compute_episode_metrics(eval_episodes)

# Eval curves and the summary table are only produced when eval data was loaded.
has_eval = bool(eval_metrics["iteration"])

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
reward_ax, component_ax, length_ax = axes

# Panel 1: total reward
reward_ax.plot(train_metrics["iteration"], train_metrics["mean_reward"], alpha=0.4, label="Train")
if has_eval:
    reward_ax.plot(eval_metrics["iteration"], eval_metrics["mean_reward"], "o-", label="Eval", markersize=3)
_finish_panel(reward_ax, "Iteration", "Mean Reward", "Total Reward over Training")

# Panel 2: format vs. equation reward, from eval when available, otherwise train
source = eval_metrics if has_eval else train_metrics
component_ax.plot(source["iteration"], source["format_rate"], "o-", label="Format Reward", markersize=3)
component_ax.plot(source["iteration"], source["equation_rate"], "s-", label="Equation Reward", markersize=3)
_finish_panel(
    component_ax,
    "Iteration",
    "Mean Reward Component",
    "Reward Components (Eval)" if has_eval else "Reward Components (Train)",
)

# Panel 3: response length in characters
length_ax.plot(train_metrics["iteration"], train_metrics["mean_response_len"], alpha=0.4, label="Train")
if has_eval:
    length_ax.plot(eval_metrics["iteration"], eval_metrics["mean_response_len"], "o-", label="Eval", markersize=3)
_finish_panel(length_ax, "Iteration", "Mean Response Length (chars)", "Response Length over Training")

plt.tight_layout()
plt.show()

# Summary table: first vs. last eval iteration
if has_eval:
    first_iter = eval_metrics["iteration"][0]
    last_iter = eval_metrics["iteration"][-1]
    first_col = "Iter " + str(first_iter)
    last_col = "Iter " + str(last_iter)
    print(f"\n{'Metric':<25} {first_col:<15} {last_col:<15} {'Change':<15}")
    print("-" * 70)
    for key in ("mean_reward", "format_rate", "equation_rate"):
        v0 = eval_metrics[key][0]
        v1 = eval_metrics[key][-1]
        print(f"{key:<25} {v0:<15.4f} {v1:<15.4f} {v1-v0:+.4f}")

### Do we see things like the "aha moment" in the rollouts?

The "aha moment" from DeepSeek-R1 refers to the model spontaneously developing self-verification and correction behaviors during RL training -- without being explicitly taught to do so. We look for rollouts where the model:

- **Re-checks its own work** ("Wait, let me verify...", "Hmm, that doesn't work...")
- **Backtracks and tries again** ("No, that's wrong. Let me try a different approach...")
- **Self-verifies the answer** ("Let me check: 3 + 5 = 8. Yes, that's correct!")

We search through training rollouts for these patterns, focusing on high-reward episodes where the model got the right answer after showing signs of self-correction.

In [None]:
# Regexes for phrases that hint at self-correction ("aha moment") in a response.
# Several entries overlap (e.g. "[Bb]ut wait" with "[Ww]ait", or "[Ll]et me re"
# with the "reconsider"/"re-evaluate" alternatives), so a single phrase can
# count towards more than one pattern.
AHA_PATTERNS = [
    r"[Ww]ait",
    r"[Hh]mm",
    r"[Aa]ctually",
    r"[Ll]et me (try|check|verify|reconsider|re-?evaluate|think)",
    r"[Nn]o,?\s*(that|this)('s|\s+is)?\s*(not|wrong|incorrect)",
    r"[Tt]hat('s|\s+is)?\s*(not|wrong|incorrect)",
    r"[Dd]oesn'?t\s*(work|equal|add up|give)",
    r"[Ll]et me re",
    r"[Ii] (made a mistake|was wrong|need to)",
    r"[Oo]n second thought",
    r"[Bb]ut wait",
    r"[Ss]o the answer",
]


def find_aha_episodes(episodes_by_iter: Dict[int, List[Dict]], min_reward: float = 1.5) -> List[Dict]:
    """Collect high-reward episodes whose response matches an ``AHA_PATTERNS`` regex.

    Args:
        episodes_by_iter: Mapping from iteration number to that iteration's
            episode dicts (read via ``"reward"``, ``"response"``, ``"query"``).
        min_reward: Episodes with a reward below this value are ignored.

    Returns:
        One dict per qualifying episode, in ascending iteration order, with
        keys ``iteration``, ``query``, ``response``, ``reward``, ``patterns``
        (the matching regex strings, in ``AHA_PATTERNS`` order) and
        ``num_patterns`` (how many matched).
    """
    hits = []
    for step in sorted(episodes_by_iter):
        for episode in episodes_by_iter[step]:
            if episode["reward"] < min_reward:
                continue
            text = episode["response"]
            triggered = [pat for pat in AHA_PATTERNS if re.search(pat, text)]
            if not triggered:
                continue
            hits.append(
                {
                    "iteration": step,
                    "query": episode["query"],
                    "response": text,
                    "reward": episode["reward"],
                    "patterns": triggered,
                    "num_patterns": len(triggered),
                }
            )
    return hits

aha_results = find_aha_episodes(train_episodes)
print(f"Found {len(aha_results)} high-reward episodes with self-correction patterns\n")

# Report in which iterations the matching episodes occur.
if aha_results:
    # Bucket the matches by iteration number.
    aha_by_iter = {}
    for hit in aha_results:
        if hit["iteration"] not in aha_by_iter:
            aha_by_iter[hit["iteration"]] = []
        aha_by_iter[hit["iteration"]].append(hit)

    # Only the earliest iterations are listed; the rest are summarised in one line.
    PREVIEW_LIMIT = 20
    ordered_iters = sorted(aha_by_iter)

    print("Iterations with aha-moment episodes (count):")
    for it in ordered_iters[:PREVIEW_LIMIT]:
        print(f"  Iter {it:>5d}: {len(aha_by_iter[it])} episodes")

    hidden = len(ordered_iters) - PREVIEW_LIMIT
    if hidden > 0:
        print(f"  ... and {hidden} more iterations")

In [None]:
# Display the best "aha moment" examples (most self-correction patterns matched)
def display_rollout(ep: Dict, title: str = ""):
    """Render one rollout as an HTML card with highlighted phrases and tags.

    The response text is HTML-escaped before it is embedded, so literal
    ``<think>`` / ``<answer>`` tags are shown as text rather than being
    interpreted as markup. Phrases matching ``AHA_PATTERNS`` get a highlighted
    background, and the four think/answer tags are set in bold.

    Args:
        ep: Episode dict. ``"response"`` and ``"reward"`` are required;
            ``"num_patterns"`` is optional and shown as ``?`` when missing.
        title: Text placed at the start of the card header (inserted as-is).

    Returns:
        None. The card is shown through ``display(HTML(...))``.
    """
    # Imported locally because the name ``html`` is not imported at file level.
    from html import escape

    # quote=False keeps apostrophes intact so patterns such as "doesn'?t"
    # still match the escaped text.
    escaped = escape(ep["response"], quote=False)

    # Highlight self-correction phrases in a single pass over one combined
    # regex. Applying the patterns one after another would let a later
    # pattern match inside text already wrapped by an earlier one.
    highlighted = escaped
    if AHA_PATTERNS:
        combined = re.compile("|".join(f"(?:{pattern})" for pattern in AHA_PATTERNS))
        highlighted = combined.sub(
            lambda m: f'<span style="background-color: #fff3cd; font-weight: bold;">{m.group(0)}</span>',
            escaped,
        )

    # Highlight tags (they exist in escaped form because of the escape above)
    for tag in ("think", "/think", "answer", "/answer"):
        token = f"&lt;{tag}&gt;"
        highlighted = highlighted.replace(token, f"<b>{token}</b>")

    card = f"""
    <div style="border: 1px solid #ddd; border-radius: 8px; padding: 16px; margin: 12px 0; background: #fafafa;">
        <div style="font-weight: bold; font-size: 14px; color: #333; margin-bottom: 8px;">
            {title} | Reward: {ep['reward']} | Patterns: {ep.get('num_patterns', '?')}
        </div>
        <div style="font-family: monospace; white-space: pre-wrap; font-size: 12px; line-height: 1.5; color: #555;">
{highlighted}
        </div>
    </div>
    """
    display(HTML(card))

if not aha_results:
    print("No aha-moment episodes found. This may be expected if training hasn't run yet.")
else:
    # Rank by the number of matched patterns, highest first. sorted() is
    # stable, so episodes with equal counts keep their original order.
    best_aha = sorted(aha_results, key=lambda hit: -hit["num_patterns"])

    print("Top 3 'aha moment' examples (most self-correction patterns):\n")
    for rank, hit in enumerate(best_aha[:3], start=1):
        display_rollout(hit, title=f"Example {rank} (Iteration {hit['iteration']})")

### How do rollouts (model responses) compare from the beginning to the end of training?

We compare model responses from early vs. late training iterations side-by-side. This shows how the model evolves from producing unstructured or incorrect outputs to generating well-formatted, correct solutions with reasoning.

In [None]:
def display_rollout_simple(response: str, reward: float, title: str = ""):
    """Render one rollout as an HTML card without pattern highlighting.

    The response is HTML-escaped before being embedded, so literal
    ``<think>`` / ``<answer>`` tags and any other markup produced by the
    model are shown as text instead of being interpreted as HTML.

    Args:
        response: Raw response text of the rollout.
        reward: Reward value shown in the card header.
        title: Text placed at the start of the card header (inserted as-is).

    Returns:
        None. The card is shown through ``display(HTML(...))``.
    """
    # Imported locally because the name ``html`` is not imported at file level.
    from html import escape

    # quote=False: only &, < and > need escaping inside element content.
    safe_response = escape(response, quote=False)

    card = f"""
    <div style="border: 1px solid #ddd; border-radius: 8px; padding: 16px; margin: 8px 0; background: #fafafa;">
        <div style="font-weight: bold; font-size: 13px; color: #333; margin-bottom: 8px;">
            {title} | Reward: {reward}
        </div>
        <div style="font-family: monospace; white-space: pre-wrap; font-size: 12px; line-height: 1.5; color: #555;">
{safe_response}
        </div>
    </div>
    """
    display(HTML(card))

def _percent(flags):
    """Return the share of truthy entries in ``flags`` as a percentage."""
    return 100 * np.mean(flags)


# Prefer the eval episodes for the comparison; use train episodes when no
# eval episodes were loaded.
comparison_source = eval_episodes or train_episodes
sorted_iters = sorted(comparison_source)

if len(sorted_iters) < 2:
    print("Not enough iterations to compare. Need at least 2 saved checkpoints.")
else:
    early_iter, late_iter = sorted_iters[0], sorted_iters[-1]
    early_eps = comparison_source[early_iter]
    late_eps = comparison_source[late_iter]

    NUM_EXAMPLES = 3

    # Pair episodes by position; zip stops at the shorter of the two slices.
    paired = zip(early_eps[:NUM_EXAMPLES], late_eps[:NUM_EXAMPLES])
    for number, (early_ep, late_ep) in enumerate(paired, start=1):
        display(HTML(f"<h4 style='margin-top: 24px;'>Comparison {number}</h4>"))
        display_rollout_simple(
            early_ep["response"],
            early_ep["reward"],
            title=f"EARLY (Iteration {early_iter})",
        )
        display_rollout_simple(
            late_ep["response"],
            late_ep["reward"],
            title=f"LATE (Iteration {late_iter})",
        )

    # Summary statistics over ALL episodes of the two iterations
    early_rewards = [e["reward"] for e in early_eps]
    late_rewards = [e["reward"] for e in late_eps]
    early_lens = [len(e["response"]) for e in early_eps]
    late_lens = [len(e["response"]) for e in late_eps]

    divider = "=" * 60
    early_col = "Iter " + str(early_iter)
    late_col = "Iter " + str(late_iter)

    print(f"\n{divider}")
    print(f"{'Statistic':<30} {early_col:<15} {late_col:<15}")
    print(f"{divider}")
    print(f"{'Mean reward':<30} {np.mean(early_rewards):<15.3f} {np.mean(late_rewards):<15.3f}")
    print(f"{'Reward > 0 (%)':<30} {_percent([r > 0 for r in early_rewards]):<15.1f} {_percent([r > 0 for r in late_rewards]):<15.1f}")
    print(f"{'Perfect reward (2.0) (%)':<30} {_percent([r == 2.0 for r in early_rewards]):<15.1f} {_percent([r == 2.0 for r in late_rewards]):<15.1f}")
    print(f"{'Mean response length (chars)':<30} {np.mean(early_lens):<15.0f} {np.mean(late_lens):<15.0f}")

### Tracing the GRPO policy gradient for a single example

To understand how GRPO works concretely, let's walk through the full computation for a single training prompt. We'll:

1. **Start with one prompt** and its group of `GENERATIONS_PER_SAMPLE` (4) responses
2. **Score each response** with the reward function
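3. **Compute GRPO advantages** by normalizing rewards within the group (subtract the group mean, then divide by the group standard deviation plus a small epsilon so the division stays well-defined when all rewards are equal)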
4. **Show how the advantage maps to the policy gradient**: responses with above-average reward get positive advantage (reinforced), below-average get negative advantage (discouraged)

This is the core mechanism: GRPO doesn't need a learned value function -- it just compares responses within a group to decide which to reinforce.

In [None]:
# Pick a training iteration that has some variance in rewards (not all 0s or all 2s)
def find_good_example_group(
    episodes_by_iter: Dict[int, List[Dict]],
    group_size: Union[int, None] = None,
) -> Tuple[int, int]:
    """Find the first group of consecutive episodes with mixed rewards.

    Each iteration's episode list is scanned in consecutive, non-overlapping
    chunks of ``group_size`` episodes; a trailing incomplete chunk is ignored.
    A chunk qualifies when its rewards are not all equal and its best reward
    is at least 1.5.

    Args:
        episodes_by_iter: Mapping from iteration number to that iteration's
            episode dicts (only ``"reward"`` is read).
        group_size: Number of episodes per group. Defaults to the
            module-level ``GENERATIONS_PER_SAMPLE`` when omitted.

    Returns:
        ``(iteration, group_start)`` of the first qualifying group, searching
        iterations in ascending order. When no group qualifies, the smallest
        iteration number and index 0 are returned as a fallback.

    Raises:
        ValueError: If ``episodes_by_iter`` is empty.
    """
    if not episodes_by_iter:
        raise ValueError("episodes_by_iter is empty; there is no group to pick from")
    if group_size is None:
        # Resolved at call time so the current notebook-level setting is used.
        group_size = GENERATIONS_PER_SAMPLE

    ordered_iters = sorted(episodes_by_iter)
    for iteration in ordered_iters:
        eps = episodes_by_iter[iteration]
        for group_start in range(0, len(eps) - group_size + 1, group_size):
            rewards = [e["reward"] for e in eps[group_start:group_start + group_size]]
            # Want mixed rewards: at least one success and one failure
            if min(rewards) < max(rewards) and max(rewards) >= 1.5:
                return iteration, group_start

    # Fallback: just use the first group from the first iteration
    return ordered_iters[0], 0

def _advantage_direction(adv):
    """Label the sign of one advantage value for the Step 2 printout."""
    if adv > 0:
        return "REINFORCE (increase probability)"
    if adv < 0:
        return "DISCOURAGE (decrease probability)"
    return "NEUTRAL (no gradient signal)"


def _advantage_effect(adv):
    """Describe what the sign of one advantage value means in the Step 3 printout."""
    if adv > 0:
        return "every token gets pushed UP in probability"
    if adv < 0:
        return "every token gets pushed DOWN in probability"
    return "no gradient signal"


if not train_episodes:
    print("No training episodes available. Run training first.")
else:
    banner = "=" * 70

    example_iter, example_start = find_good_example_group(train_episodes)
    example_end = example_start + GENERATIONS_PER_SAMPLE
    group = train_episodes[example_iter][example_start:example_end]

    print(f"Example from iteration {example_iter}, episodes {example_start}-{example_end - 1}")
    print(f"Query (shared across all {GENERATIONS_PER_SAMPLE} responses):")
    print(f"  {group[0]['query'][:200]}...")
    print()

    ##############################
    # Step 1: Show the raw rewards
    ##############################
    print(banner)
    print("STEP 1: Raw rewards for each response in the group")
    print(banner)
    rewards_raw = [member["reward"] for member in group]
    for number, member in enumerate(group, start=1):
        # Only the first 150 characters are shown, flattened onto one line.
        preview = member["response"][:150].replace("\n", " ")
        print(f"  Response {number}: reward = {member['reward']:.1f}  |  '{preview}...'")

    ##############################
    # Step 2: GRPO advantage computation
    ##############################
    print(f"\n{banner}")
    print("STEP 2: GRPO advantage computation")
    print(banner)
    rewards = np.array(rewards_raw)
    mean_r = rewards.mean()
    std_r = rewards.std()
    # The 1e-4 in the denominator avoids dividing by zero when all rewards are equal.
    advantages = (rewards - mean_r) / (std_r + 1e-4)

    print(f"  Group rewards:     {rewards}")
    print(f"  Mean reward:       {mean_r:.4f}")
    print(f"  Std reward:        {std_r:.4f}")
    print(f"  Advantages:        {advantages}")
    print()
    for number, adv in enumerate(advantages, start=1):
        print(f"  Response {number}: advantage = {adv:+.4f} --> {_advantage_direction(adv)}")

    ##############################
    # Step 3: How this becomes a gradient
    ##############################
    print(f"\n{banner}")
    print("STEP 3: Policy gradient for each response")
    print(banner)
    explanation = (
        "",
        "For each token t in response i, the policy gradient contribution is:",
        "",
        "  grad_contribution(t) = -advantage_i * d/d(theta) log p(t | context)",
        "",
        "Since advantage is CONSTANT across all tokens in a response (GRPO),",
        "the entire response is reinforced or discouraged uniformly:",
        "",
    )
    for line in explanation:
        print(line)

    for number, (member, adv) in enumerate(zip(group, advantages), start=1):
        # Whitespace-separated word count, used as a rough size indicator.
        word_count = len(member["response"].split())
        print(f"  Response {number} ({word_count:>3d} words): adv={adv:+.4f} => {_advantage_effect(adv)}")

    print(f"\nThe KL penalty (coefficient={KL_COEFFICIENT}) then regularizes")
    print("the update to prevent the policy from drifting too far from the reference model.")