Changes from all commits
48 commits
46e64c6
feat: add Erdős Discovery environment + entropic adaptive-β advantage…
Mar 31, 2026
bb43f23
docs: add PUCT buffer utility, Erdős discovery README, update main RE…
Mar 31, 2026
224d76e
feat: add TTT-Discover run script and config for Erdős GRPO training
Mar 31, 2026
c2f9e75
feat: add debug config and SLURM launcher for Erdős TTT-Discover
Mar 31, 2026
a6c6246
feat: add inline mode for ErdosDiscoveryEnvironment, simplify debug l…
Mar 31, 2026
05c7e95
fix: run_discover.py config loading, setup/grpo_train signatures, deb…
Mar 31, 2026
b634fec
fix: v0.5.0 container compatibility - handle missing register_omegaco…
Apr 1, 2026
6c58bd6
fix: register mul/div OmegaConf resolvers for v0.5.0 compat
Apr 1, 2026
760fa9f
fix: sync step method for Ray actor event loop compatibility
Apr 1, 2026
7e0d70f
fix: observations must include content key for rollout engine
Apr 1, 2026
c37caa6
fix: disable CPU offload for debug (1.5B fits on GPU)
Apr 1, 2026
fb2dab5
docs: add LESSONS_LEARNED.md for Erdős TTT-Discover on NeMo RL
Apr 1, 2026
7daed16
feat: add 120B Nemotron Super launch config for Erdős TTT-Discover
Apr 1, 2026
2dd29f2
fix: inherit from grpo_superv3.yaml for correct Megatron+NemotronH co…
Apr 1, 2026
2cbb9be
fix: set async_engine false for 120B to avoid engine core crash
Apr 1, 2026
d265e72
fix: batch size 504 (8×63) divisible by DP=12
Apr 1, 2026
dcb07dc
fix: version-agnostic setup() unpacking for super-v3 container compat
Apr 1, 2026
081d5b5
fix: explicit setup→grpo_train wiring for super-v3 container (11 retu…
Apr 1, 2026
def9089
fix: reduce seq length to 4096 to avoid OOM during training
Apr 1, 2026
c3b0971
feat: port exact reference TTT-Discover env + prompts
Apr 1, 2026
eca88d8
feat: 8k seq len, wandb logging, erdos/ metrics with max reward + val…
Apr 1, 2026
a777cbc
fix: use ThreadPoolExecutor timeout instead of signal.alarm (broken i…
Apr 1, 2026
8527a86
feat: add timestamped progress logging to reward computation
Apr 1, 2026
c48826b
fix: remove stale _Timeout reference that crashed env actor on bad mo…
Apr 1, 2026
2b07c29
fix: use print() instead of logger.info() for Ray actor visibility
Apr 1, 2026
5904671
feat: prominent step-level logging with max_reward, best_C5, global_b…
Apr 1, 2026
da2ce18
fix: disable validation to prevent max_val_samples None crash at step 5
Apr 1, 2026
24e9aa0
fix: disable checkpointing (async writer crashes at step 10)
Apr 2, 2026
b9afea8
feat: scale to 10 nodes + 16k seq len (CP=2), save outputs to JSONL p…
Apr 2, 2026
a4c5ea0
fix: fully disable checkpointing with null checkpoint_must_save_by
Apr 2, 2026
8ece7e1
feat: debug config for step 10 hang repro (Qwen 1.5B, 1 node, 16k, 15…
Apr 2, 2026
745468e
fix: ensure checkpointing config exists for both v0.5.0 and super-v3 …
Apr 2, 2026
d1b79b5
fix: inject checkpointing into master_config returned by setup() (not…
Apr 2, 2026
f699f02
debug: print setup() return types to fix unpacking order
Apr 2, 2026
f6899a8
fix: correct v0.5.0 setup() unpacking order (clusters at [2], not dat…
Apr 2, 2026
f5590fd
fix: debug config back to 4k (16k OOMs on 1 node with 1.5B + LoRA)
Apr 2, 2026
7a7bd71
120B at 4k context, 8 nodes, 50 steps, checkpointing fully disabled
Apr 2, 2026
7de56f3
fix: use multiprocessing.Process + kill() for hard sandbox timeout (t…
Apr 2, 2026
ab974aa
fix: sandbox timeout 1000s matching paper, not 120s
Apr 2, 2026
1cf501f
fix: clean subprocess sandbox - BaseException for alarm, SIGTERM befo…
Apr 2, 2026
ee698b9
feat: stateful PUCT sampler integrated into RL env + dataset
Apr 3, 2026
bbed0f0
config: 8 nodes, 16k seq, CP=2, copy erdos_ref_puct_sampler to container
Apr 3, 2026
0adfa3a
cleanup: config naming, PUCT log dir, remove stale puct_buffer copy
Apr 3, 2026
67051f2
fix: point config to instruct model (no Base)
Apr 3, 2026
edbbca8
script to convert ds to nemo rl/sft format
Apr 3, 2026
3c5c16b
cleanup: remove debug configs, shim copies, scratch files, and Gym su…
Apr 7, 2026
47d4b0b
cleanup: remove unused puct_buffer.py and ray.sub.bak
Apr 7, 2026
a0de921
restore Gym submodule to match main (avoid merge conflict)
Apr 7, 2026
2 changes: 1 addition & 1 deletion 3rdparty/Gym-workspace/Gym
Submodule Gym updated 791 files
2 changes: 2 additions & 0 deletions README.md
@@ -104,6 +104,8 @@ For detailed information on backend selection, configuration, and examples, see
- ✅ **Environment Support and Isolation** - Support for multi-environment training and dependency isolation between components.
- ✅ **Worker Isolation** - Process isolation between RL Actors (no worries about global state).
- ✅ **Learning Algorithms** - GRPO/GSPO/DAPO, SFT(with LoRA), DPO, and On-policy distillation.
- ✅ **Advantage Estimators** - Group-relative (GRPO), multi-reward (GDPO), Reinforce++, and [Entropic Adaptive-β](nemo_rl/algorithms/entropic_advantage_estimator.py) (LOO entropic weighting from [TTT-Discover](https://arxiv.org/abs/2601.16175)).
- ✅ **PUCT Buffer** - [Tree-structured state selection](nemo_rl/utils/puct_buffer.py) for iterative optimization environments (exploration/exploitation via Upper Confidence bounds).
- ✅ **Multi-Turn RL** - Multi-turn generation and training for RL with tool use, games, etc.
- ✅ **Advanced Parallelism with DTensor** - PyTorch FSDP2, TP, CP, and SP for efficient training (through NeMo AutoModel).
- ✅ **Larger Model Support with Longer Sequences** - Performant parallelisms with Megatron Core (TP/PP/CP/SP/EP/FSDP) (through NeMo Megatron Bridge).
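The Entropic Adaptive-β bullet above describes a leave-one-out baseline with entropic reward weighting. A minimal sketch under assumed semantics (β found by bisection so that the softmax weight entropy hits a target such as ln 2; function names are hypothetical, and the actual implementation is `nemo_rl/algorithms/entropic_advantage_estimator.py`):

```python
import numpy as np

def entropic_weights(rewards, target_entropy=np.log(2), iters=50):
    """Bisect beta so that softmax(beta * r) has (approximately) the target entropy.

    Hypothetical sketch of the adaptive-beta idea; larger beta gives sharper
    weights and therefore lower entropy.
    """
    r = np.asarray(rewards, dtype=np.float64)
    r = r - r.max()  # numerical stability
    lo, hi = 0.0, 1e6
    w = np.full_like(r, 1.0 / len(r))
    for _ in range(iters):
        beta = 0.5 * (lo + hi)
        w = np.exp(beta * r)
        w /= w.sum()
        ent = -(w * np.log(w + 1e-12)).sum()
        if ent > target_entropy:
            lo = beta  # weights too flat: sharpen
        else:
            hi = beta  # weights too sharp: soften
    return w

def loo_entropic_advantages(rewards, target_entropy=np.log(2)):
    """Leave-one-out: each sample's advantage is its reward minus the
    entropically weighted mean of the other samples' rewards."""
    r = np.asarray(rewards, dtype=np.float64)
    adv = np.empty_like(r)
    for i in range(len(r)):
        others = np.delete(r, i)
        w = entropic_weights(others, target_entropy)
        adv[i] = r[i] - float(w @ others)
    return adv
```

With this weighting, high-reward rollouts in a group contribute more to the baseline than in plain GRPO mean-centering, which is the behavior the config's `gamma: 0.6931471805599453` (ln 2) would control.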
98 changes: 98 additions & 0 deletions examples/configs/grpo_erdos_discover.yaml
@@ -0,0 +1,98 @@
# TTT-Discover Erdős — Nemotron-3-Super-120B, 16k seq, 8 nodes, CP=2
defaults: "grpo_superv3.yaml"

grpo:
  num_prompts_per_step: 8
  num_generations_per_prompt: 63
  max_num_steps: 50
  max_rollout_turns: 1
  remove_constant_reward_groups: true
  val_period: 0
  val_at_start: false
  val_at_end: false
  adv_estimator:
    name: entropic_adaptive_beta
    gamma: 0.6931471805599453  # ln(2)

loss_fn:
  kl_penalty_coef: 0.1
  ratio_clip: 0.2
  token_level_loss: false

policy:
  model_name: "/home/shared/models/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"
  tokenizer:
    name: "/home/shared/models/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"
    chat_template_kwargs: null
  max_total_sequence_length: 16384
  train_global_batch_size: 504
  train_micro_batch_size: 1
  logprob_batch_size: 1

  generation:
    colocated:
      enabled: false
      resources:
        num_nodes: 2
        gpus_per_node: 8
    max_new_tokens: 15360
    vllm_cfg:
      async_engine: false
      tensor_parallel_size: 8
      gpu_memory_utilization: 0.85
      max_model_len: 16384

  megatron_cfg:
    tensor_model_parallel_size: 4
    pipeline_model_parallel_size: 1
    context_parallel_size: 2
    expert_model_parallel_size: 8
    sequence_parallel: true
    activation_checkpointing: true
    empty_unused_memory_level: 2
    optimizer_cpu_offload: true
    optimizer:
      optimizer_cpu_offload: true
      optimizer_offload_fraction: 1.0

  dynamic_batching:
    enabled: false

  lora_cfg:
    enabled: false

  optimizer:
    lr: 4.0e-5

data:
  shuffle: false
  max_input_seq_length: 16384

env:
  erdos_discovery:
    num_initial_states: 8  # matches num_prompts_per_step
    puct_seed_batch_size: 8  # matches num_prompts_per_step
    sandbox_timeout: 1000
    should_use_nemo_gym: false

cluster:
  gpus_per_node: 8
  num_nodes: 8

logger:
  log_dir: "results/erdos-120b-16k"
  wandb_enabled: true
  wandb:
    project: "ttt-discover-erdos"
    name: "nemotron-120b-16k-8node-puct"
  tensorboard_enabled: false
  mlflow_enabled: false
  swanlab_enabled: false

checkpointing:
  enabled: false
  checkpoint_dir: "results/erdos-120b-16k"
  save_period: 999999
  checkpoint_must_save_by: null
  model_save_format: "safetensors"
  save_consolidated: false
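The `num_initial_states` and `puct_seed_batch_size` knobs above feed the stateful PUCT sampler added in commit ee698b9, which selects which previously seen states to re-expand each step. A minimal sketch of PUCT-style selection with hypothetical field names (the real implementation is `nemo_rl/utils/puct_buffer.py`):

```python
import math

class PuctBuffer:
    """Pick the next state to expand, trading off exploitation (mean reward)
    against exploration (an upper-confidence bonus on visit counts)."""

    def __init__(self, c_puct=1.0):
        self.c_puct = c_puct
        self.entries = []  # each: {"state", "prior", "visits", "value_sum"}

    def add(self, state, prior=1.0):
        self.entries.append({"state": state, "prior": prior,
                             "visits": 0, "value_sum": 0.0})

    def select(self):
        total = sum(e["visits"] for e in self.entries) + 1

        def score(e):
            # Q-term: mean reward observed from expanding this state.
            q = e["value_sum"] / e["visits"] if e["visits"] else 0.0
            # U-term: PUCT exploration bonus, large for rarely visited states.
            u = self.c_puct * e["prior"] * math.sqrt(total) / (1 + e["visits"])
            return q + u

        return max(self.entries, key=score)

    def update(self, entry, reward):
        entry["visits"] += 1
        entry["value_sum"] += reward
```

In this config the buffer would be seeded with 8 initial states and queried 8 times per step to supply prompts, with rollout rewards fed back through `update` so promising constructions are revisited more often.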