From 184e44db472cf6ef0e577e2a3dc08870a2370b47 Mon Sep 17 00:00:00 2001 From: N!no Date: Thu, 25 Sep 2025 21:45:20 -0400 Subject: [PATCH 1/5] early --- README.md | 45 +++++++++++++++++++++++++++++-- configs/grpo_che_config.yaml | 1 + configs/grpo_he_config.yaml | 1 + configs/magrpo_che_config.yaml | 1 + configs/magrpo_he_config.yaml | 1 + configs/mt_grpo_che_config.yaml | 1 + configs/mt_grpo_he_config.yaml | 1 + configs/mt_magrpo_che_config.yaml | 1 + configs/mt_magrpo_he_config.yaml | 1 + train_grpo.py | 1 + train_magrpo.py | 1 + 11 files changed, 53 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 84acb79..7a413f9 100644 --- a/README.md +++ b/README.md @@ -61,14 +61,55 @@ python LLM_Collaboration_with_MARL/train_magrpo.py \ ``` ## Multi-Turn Settings +### Joint Actions & Early Termination + +- Joint actions (magrpo.joint_mode): + - `cross` (default): Form joint actions by Cartesian product of each agent's K generations (reuses sequences; no extra generation). + - `aligned`: Join index‑aligned generations. + +- Early termination (magrpo.termination_threshold / grpo.termination_threshold): + - At each node (branch, turn), compute the mean immediate reward across the sibling joint actions at that node. + - If the mean exceeds the threshold, that branch stops expanding at this turn; training backpropagates from the truncated subtree. Other branches continue. + +Illustrative example (threshold = -0.2, 2 agents, K=2 → 4 joint actions per node): + +``` +Turn 1 (root) (a,b,c,d): [-1.5, -1.5, -1.0, -1.0] +mean rewards = -1.25 ≤ -0.2 → continue expanding all branches + + a (-1.5) + ↳ Turn 2 children (e,f,g,h): [-1, -1, -1, -1] + mean rewards = -1.0 ≤ -0.2 → continue + + b (-1.5) + ↳ Turn 2 children (i,j,k,l): [0.0, 0.0, 0.0, -0.2] + mean rewards = -0.05 > -0.2 → TERMINATE branch b here (no further children) + + c (-1.0) + ↳ Turn 2 children: [...] + + d (-1.0) + ↳ Turn 2 children: [...] +``` + +Notes: +- Termination is per‑branch; other branches continue normally. +- The same rule applies at deeper turns. +- For GRPO (single agent), the same threshold logic applies (one agent → one set of siblings per node). + ### 2+Turn Prompt Composition -To save memory usage, 2+ turn prompts **include the previous response without the original first‑turn problem prompt by default**. You can add the original prompt to match the concept of observation-action history in MARL. +By default, multi-turn prompts include both the original first‑turn problem prompt and the previous response. 
+ +- external.original_prompt: true (default) +- external.previous_response: true (default) + +To exclude the original prompt but keep the previous response (shorter context): ```bash python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/mt_magrpo_he_config.yaml \ - --override magrpo.external_original_prompt=True magrpo.external_previous_response=True + --override external.original_prompt=False external.previous_response=True ``` ### External Modes diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 8b21935..4761df5 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -43,6 +43,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index c9d8bf7..4d8dff9 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -43,6 +43,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 5ace7f6..72534f7 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -44,6 +44,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index a7e638c..21c3654 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -42,6 +42,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml index dae8305..9803d06 100644 --- a/configs/mt_grpo_che_config.yaml +++ b/configs/mt_grpo_che_config.yaml @@ -44,6 +44,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml index 0157d7c..e9e7f3e 100644 --- a/configs/mt_grpo_he_config.yaml +++ b/configs/mt_grpo_he_config.yaml @@ -44,6 +44,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml index 64139cb..5064553 100644 --- a/configs/mt_magrpo_che_config.yaml +++ b/configs/mt_magrpo_che_config.yaml @@ -45,6 +45,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml index f0aa070..932790d 100644 --- a/configs/mt_magrpo_he_config.yaml +++ b/configs/mt_magrpo_he_config.yaml @@ -43,6 +43,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/train_grpo.py b/train_grpo.py index 1f7e0a5..c8b24eb 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -353,6 +353,7 @@ def _resolver(prompt: str): num_turns=num_turns, discount=grpo_config.get("discount", 0.9), joint_mode=grpo_config.get("joint_mode", "cross"), + termination_threshold=grpo_config.get("termination_threshold", None), ) formatter = get_formatter(dataset_type) diff --git a/train_magrpo.py b/train_magrpo.py index dc1b6f6..58316b8 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -402,6 +402,7 @@ def _resolver(prompt: str): num_turns=num_turns, 
discount=magrpo_config.get("discount", 0.9), joint_mode=magrpo_config.get("joint_mode", "cross"), + termination_threshold=magrpo_config.get("termination_threshold", None), ) # Get appropriate formatters and functions based on dataset type, agent count, and training mode From 1ab001c624530e8883007ff800c26054743d8b78 Mon Sep 17 00:00:00 2001 From: N!no Date: Thu, 25 Sep 2025 21:54:26 -0400 Subject: [PATCH 2/5] change default num_gen --- configs/grpo_che_config.yaml | 2 +- configs/grpo_he_config.yaml | 2 +- configs/magrpo_che_config.yaml | 2 +- configs/magrpo_he_config.yaml | 2 +- configs/mt_grpo_che_config.yaml | 2 +- configs/mt_grpo_he_config.yaml | 2 +- configs/mt_magrpo_che_config.yaml | 2 +- configs/mt_magrpo_he_config.yaml | 2 +- train_grpo.py | 2 +- train_magrpo.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 4761df5..87b1495 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -37,7 +37,7 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 4d8dff9..24fcd49 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -37,7 +37,7 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 72534f7..4d34d83 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -37,7 +37,7 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 21c3654..7b46546 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -37,7 +37,7 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross num_agents: 2 diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml index 9803d06..150bc2f 100644 --- a/configs/mt_grpo_che_config.yaml +++ b/configs/mt_grpo_che_config.yaml @@ -38,7 +38,7 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml index e9e7f3e..b0de70f 100644 --- a/configs/mt_grpo_he_config.yaml +++ b/configs/mt_grpo_he_config.yaml @@ -38,7 +38,7 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml index 5064553..4e0e472 100644 --- a/configs/mt_magrpo_che_config.yaml +++ b/configs/mt_magrpo_che_config.yaml @@ -38,7 +38,7 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml index 932790d..35f96f9 100644 --- a/configs/mt_magrpo_he_config.yaml +++ b/configs/mt_magrpo_he_config.yaml @@ -38,7 +38,7 @@ magrpo: 
learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross num_agents: 2 diff --git a/train_grpo.py b/train_grpo.py index c8b24eb..7b3a974 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -345,7 +345,7 @@ def _resolver(prompt: str): learning_rate=grpo_config.get("learning_rate", 1e-5), logging_steps=grpo_config.get("logging_steps", 50), save_steps=grpo_config.get("save_steps", 200), - num_generations=grpo_config.get("num_generations", 4), + num_generations=grpo_config.get("num_generations", 3), max_new_tokens=grpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, diff --git a/train_magrpo.py b/train_magrpo.py index 58316b8..4ddb4d6 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -394,7 +394,7 @@ def _resolver(prompt: str): learning_rate=magrpo_config.get("learning_rate", 1e-5), logging_steps=magrpo_config.get("logging_steps", 50), save_steps=magrpo_config.get("save_steps", 200), - num_generations=magrpo_config.get("num_generations", 4), + num_generations=magrpo_config.get("num_generations", 3), max_new_tokens=magrpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, From ee51791c076a296ac4c3f2d4b74f5597fb6b9d7c Mon Sep 17 00:00:00 2001 From: N!no Date: Fri, 26 Sep 2025 09:54:45 -0400 Subject: [PATCH 3/5] allow verbose --- configs/grpo_che_config.yaml | 5 ++-- configs/grpo_he_config.yaml | 5 ++-- configs/magrpo_che_config.yaml | 5 ++-- configs/magrpo_he_config.yaml | 5 ++-- configs/mt_grpo_che_config.yaml | 5 ++-- configs/mt_grpo_he_config.yaml | 5 ++-- configs/mt_magrpo_che_config.yaml | 5 ++-- configs/mt_magrpo_he_config.yaml | 5 ++-- external/__init__.py | 11 +++++++++ rewards/code_rewards.py | 11 +++++++++ train_grpo.py | 22 +++++++++++++---- train_magrpo.py | 39 ++++++++++++++++++++++--------- 12 files changed, 92 insertions(+), 31 deletions(-) diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 87b1495..d462462 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" save_final_model: false + verbose: false # external external: @@ -37,9 +38,9 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 24fcd49..7e01a4e 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" save_final_model: false + verbose: false # external external: @@ -37,9 +38,9 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 4d34d83..9cf6f8b 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" save_final_model: false + verbose: false # external external: @@ -37,11 +38,11 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 - joint_mode: cross + 
joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 7b46546..ebbc9a6 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" save_final_model: false + verbose: false # external external: @@ -37,9 +38,9 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml index 150bc2f..fea700d 100644 --- a/configs/mt_grpo_che_config.yaml +++ b/configs/mt_grpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" save_final_model: false + verbose: false # external external: @@ -38,9 +39,9 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml index b0de70f..7c3e222 100644 --- a/configs/mt_grpo_he_config.yaml +++ b/configs/mt_grpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" save_final_model: false + verbose: false # external external: @@ -38,9 +39,9 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml index 4e0e472..a38e1ad 100644 --- a/configs/mt_magrpo_che_config.yaml +++ b/configs/mt_magrpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" save_final_model: false + verbose: false # external external: @@ -38,11 +39,11 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 - joint_mode: cross + joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml index 35f96f9..a2d7803 100644 --- a/configs/mt_magrpo_he_config.yaml +++ b/configs/mt_magrpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" save_final_model: false + verbose: false # external external: @@ -38,9 +39,9 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/external/__init__.py b/external/__init__.py index 2b994ee..a2252e1 100644 --- a/external/__init__.py +++ b/external/__init__.py @@ -6,6 +6,10 @@ from . import level_passed from . import passed from . import plain +import builtins + +# Verbose toggle for external previews +VERBOSE = True # ----------------------------- # Context resolver API @@ -59,6 +63,13 @@ def get_external_transition( Returns: A list/tuple of full prompts for each agent to use in the next turn. 
""" + # Local print override + if not VERBOSE: + def print(*args, **kwargs): # type: ignore + return None + else: + print = builtins.print # type: ignore + if int(num_agents) not in (1, 2): raise ValueError( f"External transition supports 1 or 2 agents, got {num_agents}." diff --git a/rewards/code_rewards.py b/rewards/code_rewards.py index 45c7858..bd9c79c 100644 --- a/rewards/code_rewards.py +++ b/rewards/code_rewards.py @@ -1,6 +1,10 @@ import re import signal from typing import List +import builtins + +# Verbose toggle (can be set by training scripts) +VERBOSE = True from rewards.code_utils import ( TimeoutException, @@ -43,6 +47,13 @@ def execution_reward_aux( Maximum reward: 4.0 (updated from 3.5) """ + # Local print override based on VERBOSE + if not VERBOSE: + def print(*args, **kwargs): # type: ignore + return None + else: + print = builtins.print # type: ignore + rewards = [] TEST_TIMEOUT = 10 # Timeout per individual test diff --git a/train_grpo.py b/train_grpo.py index 7b3a974..ec9d7cc 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -187,9 +187,11 @@ def main(): num_turns = grpo_config.get("num_turns", 1) is_multi_turn = num_turns > 1 - print(f"Multi-turn GRPO enabled: num_turns={num_turns}") if is_multi_turn else print( - f"Single-turn GRPO: num_turns={num_turns}" - ) + output_verbose = config.get("output.verbose", True) + if output_verbose: + print(f"Multi-turn GRPO enabled: num_turns={num_turns}") if is_multi_turn else print( + f"Single-turn GRPO: num_turns={num_turns}" + ) slurm_job_id = os.environ.get("SLURM_JOB_ID", "no_job_id") # Use different output directory prefix for multi-turn for clarity @@ -345,7 +347,7 @@ def _resolver(prompt: str): learning_rate=grpo_config.get("learning_rate", 1e-5), logging_steps=grpo_config.get("logging_steps", 50), save_steps=grpo_config.get("save_steps", 200), - num_generations=grpo_config.get("num_generations", 3), + num_generations=grpo_config.get("num_generations", 4), max_new_tokens=grpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, @@ -407,6 +409,18 @@ def _resolver(prompt: str): }, } + # Propagate verbosity to reward/external modules + try: + import rewards.code_rewards as code_rewards + code_rewards.VERBOSE = bool(output_verbose) + except Exception: + pass + try: + import external as external_mod + external_mod.VERBOSE = bool(output_verbose) + except Exception: + pass + reward_processor = None # Optional scale if config.get("reward_processor.enabled", False): diff --git a/train_magrpo.py b/train_magrpo.py index 4ddb4d6..a0fdb13 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -232,9 +232,11 @@ def main(): num_turns = magrpo_config.get("num_turns", 1) is_multi_turn = num_turns > 1 - print(f"Multi-turn training enabled: num_turns={num_turns}") if is_multi_turn else print( - f"Single-turn training: num_turns={num_turns}" - ) + output_verbose = config.get("output.verbose", True) + if output_verbose: + print(f"Multi-turn training enabled: num_turns={num_turns}") if is_multi_turn else print( + f"Single-turn training: num_turns={num_turns}" + ) slurm_job_id = os.environ.get("SLURM_JOB_ID", "no_job_id") @@ -260,9 +262,10 @@ def main(): print(f"Error loading dataset: {e}") return - print(f"\nUsing model: {model_name}") - print(f"Model type: {model_config.type}") - print(f"Max context window: {model_config.max_length} tokens") + if output_verbose: + print(f"\nUsing model: {model_name}") + print(f"Model type: {model_config.type}") + print(f"Max context window: {model_config.max_length} tokens") tokenizer = 
AutoTokenizer.from_pretrained( model_name, **model_config.tokenizer_kwargs @@ -277,11 +280,13 @@ def main(): # Add special tokens if needed (e.g., FIM tokens for StarCoder) if model_config.special_tokens: - print("Adding special tokens...") + if output_verbose: + print("Adding special tokens...") tokenizer.add_special_tokens(model_config.special_tokens) - print( - f"Special tokens added: {model_config.special_tokens.get('additional_special_tokens', [])}" - ) + if output_verbose: + print( + f"Special tokens added: {model_config.special_tokens.get('additional_special_tokens', [])}" + ) temperature = magrpo_config.get("temperature", model_config.temperature) top_p = magrpo_config.get("top_p", model_config.top_p) @@ -394,7 +399,7 @@ def _resolver(prompt: str): learning_rate=magrpo_config.get("learning_rate", 1e-5), logging_steps=magrpo_config.get("logging_steps", 50), save_steps=magrpo_config.get("save_steps", 200), - num_generations=magrpo_config.get("num_generations", 3), + num_generations=magrpo_config.get("num_generations", 4), max_new_tokens=magrpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, @@ -455,6 +460,18 @@ def _resolver(prompt: str): }, } + # Propagate verbosity to reward/external modules + try: + import rewards.code_rewards as code_rewards + code_rewards.VERBOSE = bool(output_verbose) + except Exception: + pass + try: + import external as external_mod + external_mod.VERBOSE = bool(output_verbose) + except Exception: + pass + # Get num_agents from magrpo config (where it belongs for MAGRPO training) num_agents = magrpo_config.get("num_agents", 2) agents = [ From 54c1ed03e3ca32a38b6d6e6e2c7755b2903e58d3 Mon Sep 17 00:00:00 2001 From: N!no Date: Fri, 26 Sep 2025 10:38:29 -0400 Subject: [PATCH 4/5] rm the single-turn yaml --- README.md | 46 +++++------------------- configs/grpo_che_config.yaml | 13 +++---- configs/grpo_he_config.yaml | 13 +++---- configs/magrpo_che_config.yaml | 9 ++--- configs/magrpo_he_config.yaml | 9 ++--- configs/mt_grpo_che_config.yaml | 57 ------------------------------ configs/mt_grpo_he_config.yaml | 57 ------------------------------ configs/mt_magrpo_che_config.yaml | 58 ------------------------------- configs/mt_magrpo_he_config.yaml | 56 ----------------------------- 9 files changed, 32 insertions(+), 286 deletions(-) delete mode 100644 configs/mt_grpo_che_config.yaml delete mode 100644 configs/mt_grpo_he_config.yaml delete mode 100644 configs/mt_magrpo_che_config.yaml delete mode 100644 configs/mt_magrpo_he_config.yaml diff --git a/README.md b/README.md index 7a413f9..f8b61e7 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,33 @@ # LLM Collaboration with MARL -This repository contains training scripts and configurations for the paper "LLM Collaboration with Multi‑Agent Reinforcement Learning". -- [Benchmarks](#benchmarks) -- [Training Scripts](#training-scripts) - - [Default Configs](#default-configs) - - [Parameter Overrides](#parameter-overrides) -- [Multi-Turn Settings](#multi-turn-settings) - - [2+Turn Prompt Composition](#2turn-prompt-composition) - - [External Modes](#external-modes) - - [Sandbox Tests](#sandbox-tests) +Training scripts and configs for _"LLM Collaboration with Multi‑Agent Reinforcement Learning"_. 
## Benchmarks -- HumanEval (HE): 164 problems on split `test` -- CoopHumanEval (CHE): 82 problems on split `test` +- MBPP: 427 problems on split `sanitized` +- HumanEval: 164 problems on split `test` +- CoopHumanEval: 82 problems on split `test` ## Training Scripts ### Default Configs ```bash -# Single-agent HumanEval (GRPO) python LLM_Collaboration_with_MARL/train_grpo.py \ --config LLM_Collaboration_with_MARL/configs/grpo_he_config.yaml -# Multi-agent CoopHumanEval (MAGRPO) python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/magrpo_che_config.yaml - -# Multi-turn HumanEval (MT-MAGRPO) -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/mt_magrpo_he_config.yaml ``` ### Parameter Overrides -You can override any configuration parameter using `--override`: +You can always override any configuration parameter using `--override`: ```bash -# Change model -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \ - --override model_name='bigcode/starcoder2-3b' - -# Modify training params -python LLM_Collaboration_with_MARL/train_grpo.py \ - --config LLM_Collaboration_with_MARL/configs/grpo_che_config.yaml \ - --override grpo.num_train_epochs=20 grpo.learning_rate=3e-5 - -# Multi-turn override example -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/mt_magrpo_che_config.yaml \ - --override dataset.train_split='test[16:]' dataset.eval_split='test[:16]' \ - magrpo.num_turns=2 - -# Enable code-level training metrics (expensive; default is off) python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \ - --override magrpo.log_code_levels=true + --override model.name='bigcode/starcoder2-3b' magrpo.num_turns=1 ``` ## Multi-Turn Settings @@ -97,7 +67,7 @@ Notes: - The same rule applies at deeper turns. - For GRPO (single agent), the same threshold logic applies (one agent → one set of siblings per node). -### 2+Turn Prompt Composition +### 2+Turn Prompt By default, multi-turn prompts include both the original first‑turn problem prompt and the previous response. @@ -130,7 +100,7 @@ python LLM_Collaboration_with_MARL/train_magrpo.py \ --override external.mode='level_feedback' ``` -### Sandbox Tests +#### Sandbox Tests The external modes obtain `entry_point` and tests via an internal resolver registered by the training script. **By default, sandbox executes only the first assert (`sandbox_slice=1`).** Use all eval tests by setting `external.sandbox_slice` to `0`, `None`, or `'all'`. A negative value uses the last N asserts. Note: `external.sandbox_slice` only affects analysis-based modes (`level_feedback`, `level_passed`, `passed`), and it has no effect on `expert_edits`. 
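To make the slicing rule concrete, here is a minimal sketch of how a `sandbox_slice` value could select which asserts to execute; the helper name and signature are illustrative, not the repository's actual implementation:

```python
from typing import List, Optional, Union

def select_asserts(asserts: List[str],
                   sandbox_slice: Optional[Union[int, str]] = 1) -> List[str]:
    # 0, None, or 'all' -> run every eval assert
    if sandbox_slice in (0, None, "all"):
        return asserts
    n = int(sandbox_slice)
    # negative N -> last N asserts; positive N -> first N (default 1 = first assert only)
    return asserts[n:] if n < 0 else asserts[:n]
```

For example, `select_asserts(tests, -2)` would keep only the last two asserts.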
diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index d462462..1d773b0 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,9 +33,10 @@ external: # grpo grpo: - num_train_epochs: 16 + num_turns: 2 + num_train_epochs: 8 per_device_train_batch_size: 1 - learning_rate: 1.0e-5 + learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 num_generations: 4 @@ -52,5 +53,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "grpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" - tags: ["grpo", "coophumaneval", "single-agent"] + dir: "output" + tags: ["grpo", "coophumaneval"] diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 7e01a4e..421e2c0 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,9 +33,10 @@ external: # grpo grpo: - num_train_epochs: 8 + num_turns: 2 + num_train_epochs: 6 per_device_train_batch_size: 1 - learning_rate: 1.0e-5 + learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 num_generations: 4 @@ -52,5 +53,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "grpo_humaneval" - dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" - tags: ["grpo", "humaneval", "single-agent"] + dir: "output" + tags: ["grpo", "humaneval"] diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 9cf6f8b..f6f6c7e 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,7 +33,8 @@ external: # magrpo magrpo: - num_train_epochs: 16 + num_turns: 2 + num_train_epochs: 8 per_device_train_batch_size: 1 learning_rate: 2.0e-5 logging_steps: 50 @@ -53,5 +54,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "magrpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" + dir: "output" tags: ["magrpo", "coophumaneval", "multi-agent"] diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index ebbc9a6..029dcfe 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,7 +33,8 @@ external: # magrpo magrpo: - num_train_epochs: 8 + num_turns: 2 + num_train_epochs: 6 per_device_train_batch_size: 1 learning_rate: 2.0e-5 logging_steps: 50 @@ -51,5 +52,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "magrpo_humaneval" - dir: 
"../../../work/hdd/bepg/sliu30/output_st_magrpo" + dir: "output" tags: ["magrpo", "humaneval", "multi-agent"] diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml deleted file mode 100644 index fea700d..0000000 --- a/configs/mt_grpo_che_config.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "CoMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "test[16:]" - eval_split: "test[:16]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - -# grpo -grpo: - num_turns: 2 - num_train_epochs: 8 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - joint_mode: aligned - temperature: 0.8 - top_p: 0.95 - discount: 0.9 - termination_threshold: -0.1 - reward_shift: -2.1 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_grpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - tags: ["mt_grpo", "coophumaneval", "single-agent", "multi-turn"] diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml deleted file mode 100644 index 7c3e222..0000000 --- a/configs/mt_grpo_he_config.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - -# grpo -grpo: - num_turns: 2 - num_train_epochs: 6 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - joint_mode: aligned - temperature: 0.8 - top_p: 0.95 - discount: 0.9 - termination_threshold: -0.1 - reward_shift: -2.1 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_grpo_humaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - tags: ["mt_grpo", "humaneval", "single-agent", "multi-turn"] diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml deleted file mode 100644 index a38e1ad..0000000 --- a/configs/mt_magrpo_che_config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "CoMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "test[16:]" - eval_split: "test[:16]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - 
-# magrpo -magrpo: - num_turns: 2 - num_train_epochs: 8 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - temperature: 0.8 - top_p: 0.95 - joint_mode: aligned - num_agents: 2 - discount: 0.9 - termination_threshold: -0.2 - reward_shift: -4 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_magrpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - tags: ["mt_magrpo", "coophumaneval", "multi-agent", "multi-turn"] diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml deleted file mode 100644 index a2d7803..0000000 --- a/configs/mt_magrpo_he_config.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - -# magrpo -magrpo: - num_turns: 2 - num_train_epochs: 6 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - joint_mode: aligned - num_agents: 2 - discount: 0.9 - termination_threshold: -0.2 - reward_shift: -4 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_magrpo_humaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - tags: ["mt_magrpo", "humaneval", "multi-agent", "multi-turn"] From ebee2f7a1ba7eb13f3788c15ee9e3e5d51e268a6 Mon Sep 17 00:00:00 2001 From: N!no Date: Fri, 26 Sep 2025 11:45:34 -0400 Subject: [PATCH 5/5] Update README.md --- README.md | 79 +++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index f8b61e7..8277afa 100644 --- a/README.md +++ b/README.md @@ -29,84 +29,37 @@ python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \ --override model.name='bigcode/starcoder2-3b' magrpo.num_turns=1 ``` -## Multi-Turn Settings -### Joint Actions & Early Termination +## Settings -- Joint actions (magrpo.joint_mode): - - `cross` (default): Form joint actions by Cartesian product of each agent's K generations (reuses sequences; no extra generation). - - `aligned`: Join index‑aligned generations. +### Joint Action Modes -- Early termination (magrpo.termination_threshold / grpo.termination_threshold): - - At each node (branch, turn), compute the mean immediate reward across the sibling joint actions at that node. - - If the mean exceeds the threshold, that branch stops expanding at this turn; training backpropagates from the truncated subtree. Other branches continue. +`magrpo.joint_mode` determine how to combine each agent's K generations into joint actions at each turn. 2 modes are supported: if set 'align' by default, each agent's k-th generation is paired with the other agents' k-th generations to form a joint action; if set 'cross', all combinations of the agents' K generations are used to form joint actions (K^N joint actions for N agents). 
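As a rough sketch of the two modes (an assumed helper for illustration, not the trainer's actual API), aligned joining zips index-matched generations while cross joining takes the Cartesian product:

```python
from itertools import product
from typing import List, Tuple

def form_joint_actions(generations: List[List[str]],
                       joint_mode: str = "aligned") -> List[Tuple[str, ...]]:
    # generations[i] holds agent i's K generations for the current turn
    if joint_mode == "aligned":
        # pair the k-th generation of every agent -> K joint actions
        return list(zip(*generations))
    if joint_mode == "cross":
        # all combinations across agents -> K^N joint actions for N agents
        return list(product(*generations))
    raise ValueError(f"unknown joint_mode: {joint_mode}")
```

With 2 agents and K=2 generations each, 'aligned' yields 2 joint actions and 'cross' yields 4.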
-Illustrative example (threshold = -0.2, 2 agents, K=2 → 4 joint actions per node): +Since the number of samples will also grow exponentially with the number of turns, aligned joint will be **more flexible** (\#samples could not be a perfect power) and hence faster to train in wall time. However, using cross joint will be more sample efficient (much lower VRAM compare to 'align' when num_generations=K^N), it also performs better since the value estimation is more accurate. -``` -Turn 1 (root) (a,b,c,d): [-1.5, -1.5, -1.0, -1.0] -mean rewards = -1.25 ≤ -0.2 → continue expanding all branches - - a (-1.5) - ↳ Turn 2 children (e,f,g,h): [-1, -1, -1, -1] - mean rewards = -1.0 ≤ -0.2 → continue - - b (-1.5) - ↳ Turn 2 children (i,j,k,l): [0.0, 0.0, 0.0, -0.2] - mean rewards = -0.05 > -0.2 → TERMINATE branch b here (no further children) - - c (-1.0) - ↳ Turn 2 children: [...] +### Number of Turns - d (-1.0) - ↳ Turn 2 children: [...] -``` - -Notes: -- Termination is per‑branch; other branches continue normally. -- The same rule applies at deeper turns. -- For GRPO (single agent), the same threshold logic applies (one agent → one set of siblings per node). +`magrpo.num_turns` determines the number of turns (`magrpo.num_turns=2` by default). The number of samples at each turn will grow exponentially with the number of turns: K^TN at turn T if cross joint, K^N if aligned joint. -### 2+Turn Prompt +### Early Termination -By default, multi-turn prompts include both the original first‑turn problem prompt and the previous response. +`magrpo.termination_threshold` is used to incentive agents to find high-reward solutions quickly, instead of expanding the full Monte Carlo tree. -- external.original_prompt: true (default) -- external.previous_response: true (default) +At each node (branch, turn), compute the mean immediate **reward across the sibling** joint actions at that node. If the mean exceeds the threshold, that branch stops expanding at this turn; training backpropagates from the truncated subtree. Other branches continue. -To exclude the original prompt but keep the previous response (shorter context): +### Multi-Turn Prompt -```bash -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/mt_magrpo_he_config.yaml \ - --override external.original_prompt=False external.previous_response=True -``` +`external.original_prompt` and `external.previous_response` both default as `true`. 2+ turn prompts include both the original first‑turn problem prompt and the previous response by default to preserve full context; you can shorten the context by setting either to `false` (for example, keep only the previous response to reduce tokens while retaining the most recent interaction). ### External Modes -Multi-turn training supports external transition modes for 2nd+ turns, set via `external.mode`: - -- `level_feedback` **(default)**: Detailed diagnostics (impl found, syntax with line/col, per-test pass/fail errors, aux usage). - - Requires `external.expert_model` in config when using `expert_edits` (e.g., `deepseek-coder`, Claude, etc.). This parameter is ignored for other modes (`level_feedback`, `level_passed`, `passed`, `plain`). -- Requires corrsponding API keys in env vars. -- `level_passed`: Binary passed signals (impl found, syntax, tests summary, aux usage). -- `passed`: A binary signal — "All levels passed" or "Not all levels passed". -- `plain`: No signals or diagnostics. +`external.mode` is set to be 'level_feedback' by default. 
This adds information from the external transition to the agents' prompts in the following turns; 'level_feedback' attaches test-driven diagnostics, while the alternatives are 'expert_edits' (an LLM proposes edits), 'level_passed'/'passed' (binary outcomes), and 'plain' (no signals).

The setting specific to 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests are included in the feedback. By default, the sandbox executes only the first assert (`sandbox_slice=1`). Use all eval tests by setting `external.sandbox_slice` to 0, None, or 'all'; negative values use the last N asserts. `external.sandbox_slice` only affects the analysis-based modes ('level_feedback', 'level_passed', 'passed') and has no effect on 'expert_edits'.

The setting specific to 'expert_edits' is `external.expert_edits_model`, which selects the LLM used to propose edits. It defaults to DeepSeek-Coder; you can switch to Claude-3 or GPT-4 once the corresponding API keys/tokens are set in your environment variables.

### Output

`output.save_final_model` is set to `false` by default because of the large storage required by multiple LLMs. `output.verbose` enables debug printing on the cluster when set to `true`; it defaults to `false`, in which case only a tqdm bar shows training progress. You can also turn on `magrpo.log_code_levels` to log level rewards during training, but it slows training down considerably.
\ No newline at end of file
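For example, a debugging run that turns both switches on could use the usual override mechanism (flag names as they appear in the configs above; adjust paths to your setup):

```bash
python LLM_Collaboration_with_MARL/train_magrpo.py \
    --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \
    --override output.verbose=true magrpo.log_code_levels=true
```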