From 184e44db472cf6ef0e577e2a3dc08870a2370b47 Mon Sep 17 00:00:00 2001 From: N!no Date: Thu, 25 Sep 2025 21:45:20 -0400 Subject: [PATCH 1/5] early --- README.md | 45 +++++++++++++++++++++++++++++-- configs/grpo_che_config.yaml | 1 + configs/grpo_he_config.yaml | 1 + configs/magrpo_che_config.yaml | 1 + configs/magrpo_he_config.yaml | 1 + configs/mt_grpo_che_config.yaml | 1 + configs/mt_grpo_he_config.yaml | 1 + configs/mt_magrpo_che_config.yaml | 1 + configs/mt_magrpo_he_config.yaml | 1 + train_grpo.py | 1 + train_magrpo.py | 1 + 11 files changed, 53 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 84acb79..7a413f9 100644 --- a/README.md +++ b/README.md @@ -61,14 +61,55 @@ python LLM_Collaboration_with_MARL/train_magrpo.py \ ``` ## Multi-Turn Settings +### Joint Actions & Early Termination + +- Joint actions (magrpo.joint_mode): + - `cross` (default): Form joint actions by Cartesian product of each agent's K generations (reuses sequences; no extra generation). + - `aligned`: Join index‑aligned generations. + +- Early termination (magrpo.termination_threshold / grpo.termination_threshold): + - At each node (branch, turn), compute the mean immediate reward across the sibling joint actions at that node. + - If the mean exceeds the threshold, that branch stops expanding at this turn; training backpropagates from the truncated subtree. Other branches continue. + +Illustrative example (threshold = -0.2, 2 agents, K=2 → 4 joint actions per node): + +``` +Turn 1 (root) (a,b,c,d): [-1.5, -1.5, -1.0, -1.0] +mean rewards = -1.25 ≤ -0.2 → continue expanding all branches + + a (-1.5) + ↳ Turn 2 children (e,f,g,h): [-1, -1, -1, -1] + mean rewards = -1.0 ≤ -0.2 → continue + + b (-1.5) + ↳ Turn 2 children (i,j,k,l): [0.0, 0.0, 0.0, -0.2] + mean rewards = -0.05 > -0.2 → TERMINATE branch b here (no further children) + + c (-1.0) + ↳ Turn 2 children: [...] + + d (-1.0) + ↳ Turn 2 children: [...] +``` + +Notes: +- Termination is per‑branch; other branches continue normally. +- The same rule applies at deeper turns. +- For GRPO (single agent), the same threshold logic applies (one agent → one set of siblings per node). + ### 2+Turn Prompt Composition -To save memory usage, 2+ turn prompts **include the previous response without the original first‑turn problem prompt by default**. You can add the original prompt to match the concept of observation-action history in MARL. +By default, multi-turn prompts include both the original first‑turn problem prompt and the previous response. 
+ +- external.original_prompt: true (default) +- external.previous_response: true (default) + +To exclude the original prompt but keep the previous response (shorter context): ```bash python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/mt_magrpo_he_config.yaml \ - --override magrpo.external_original_prompt=True magrpo.external_previous_response=True + --override external.original_prompt=False external.previous_response=True ``` ### External Modes diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 8b21935..4761df5 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -43,6 +43,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index c9d8bf7..4d8dff9 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -43,6 +43,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 5ace7f6..72534f7 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -44,6 +44,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index a7e638c..21c3654 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -42,6 +42,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml index dae8305..9803d06 100644 --- a/configs/mt_grpo_che_config.yaml +++ b/configs/mt_grpo_che_config.yaml @@ -44,6 +44,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml index 0157d7c..e9e7f3e 100644 --- a/configs/mt_grpo_he_config.yaml +++ b/configs/mt_grpo_he_config.yaml @@ -44,6 +44,7 @@ grpo: temperature: 0.8 top_p: 0.95 discount: 0.9 + termination_threshold: -0.1 reward_shift: -2.1 # wandb diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml index 64139cb..5064553 100644 --- a/configs/mt_magrpo_che_config.yaml +++ b/configs/mt_magrpo_che_config.yaml @@ -45,6 +45,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml index f0aa070..932790d 100644 --- a/configs/mt_magrpo_he_config.yaml +++ b/configs/mt_magrpo_he_config.yaml @@ -43,6 +43,7 @@ magrpo: joint_mode: cross num_agents: 2 discount: 0.9 + termination_threshold: -0.2 reward_shift: -4 # wandb diff --git a/train_grpo.py b/train_grpo.py index 1f7e0a5..c8b24eb 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -353,6 +353,7 @@ def _resolver(prompt: str): num_turns=num_turns, discount=grpo_config.get("discount", 0.9), joint_mode=grpo_config.get("joint_mode", "cross"), + termination_threshold=grpo_config.get("termination_threshold", None), ) formatter = get_formatter(dataset_type) diff --git a/train_magrpo.py b/train_magrpo.py index dc1b6f6..58316b8 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -402,6 +402,7 @@ def _resolver(prompt: str): num_turns=num_turns, 
discount=magrpo_config.get("discount", 0.9), joint_mode=magrpo_config.get("joint_mode", "cross"), + termination_threshold=magrpo_config.get("termination_threshold", None), ) # Get appropriate formatters and functions based on dataset type, agent count, and training mode From 1ab001c624530e8883007ff800c26054743d8b78 Mon Sep 17 00:00:00 2001 From: N!no Date: Thu, 25 Sep 2025 21:54:26 -0400 Subject: [PATCH 2/5] change default num_gen --- configs/grpo_che_config.yaml | 2 +- configs/grpo_he_config.yaml | 2 +- configs/magrpo_che_config.yaml | 2 +- configs/magrpo_he_config.yaml | 2 +- configs/mt_grpo_che_config.yaml | 2 +- configs/mt_grpo_he_config.yaml | 2 +- configs/mt_magrpo_che_config.yaml | 2 +- configs/mt_magrpo_he_config.yaml | 2 +- train_grpo.py | 2 +- train_magrpo.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 4761df5..87b1495 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -37,7 +37,7 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 4d8dff9..24fcd49 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -37,7 +37,7 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 72534f7..4d34d83 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -37,7 +37,7 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 21c3654..7b46546 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -37,7 +37,7 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross num_agents: 2 diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml index 9803d06..150bc2f 100644 --- a/configs/mt_grpo_che_config.yaml +++ b/configs/mt_grpo_che_config.yaml @@ -38,7 +38,7 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml index e9e7f3e..b0de70f 100644 --- a/configs/mt_grpo_he_config.yaml +++ b/configs/mt_grpo_he_config.yaml @@ -38,7 +38,7 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross temperature: 0.8 diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml index 5064553..4e0e472 100644 --- a/configs/mt_magrpo_che_config.yaml +++ b/configs/mt_magrpo_che_config.yaml @@ -38,7 +38,7 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml index 932790d..35f96f9 100644 --- a/configs/mt_magrpo_he_config.yaml +++ b/configs/mt_magrpo_he_config.yaml @@ -38,7 +38,7 @@ magrpo: 
learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 4 + num_generations: 3 max_new_tokens: 256 joint_mode: cross num_agents: 2 diff --git a/train_grpo.py b/train_grpo.py index c8b24eb..7b3a974 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -345,7 +345,7 @@ def _resolver(prompt: str): learning_rate=grpo_config.get("learning_rate", 1e-5), logging_steps=grpo_config.get("logging_steps", 50), save_steps=grpo_config.get("save_steps", 200), - num_generations=grpo_config.get("num_generations", 4), + num_generations=grpo_config.get("num_generations", 3), max_new_tokens=grpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, diff --git a/train_magrpo.py b/train_magrpo.py index 58316b8..4ddb4d6 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -394,7 +394,7 @@ def _resolver(prompt: str): learning_rate=magrpo_config.get("learning_rate", 1e-5), logging_steps=magrpo_config.get("logging_steps", 50), save_steps=magrpo_config.get("save_steps", 200), - num_generations=magrpo_config.get("num_generations", 4), + num_generations=magrpo_config.get("num_generations", 3), max_new_tokens=magrpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, From ee51791c076a296ac4c3f2d4b74f5597fb6b9d7c Mon Sep 17 00:00:00 2001 From: N!no Date: Fri, 26 Sep 2025 09:54:45 -0400 Subject: [PATCH 3/5] allow verbose --- configs/grpo_che_config.yaml | 5 ++-- configs/grpo_he_config.yaml | 5 ++-- configs/magrpo_che_config.yaml | 5 ++-- configs/magrpo_he_config.yaml | 5 ++-- configs/mt_grpo_che_config.yaml | 5 ++-- configs/mt_grpo_he_config.yaml | 5 ++-- configs/mt_magrpo_che_config.yaml | 5 ++-- configs/mt_magrpo_he_config.yaml | 5 ++-- external/__init__.py | 11 +++++++++ rewards/code_rewards.py | 11 +++++++++ train_grpo.py | 22 +++++++++++++---- train_magrpo.py | 39 ++++++++++++++++++++++--------- 12 files changed, 92 insertions(+), 31 deletions(-) diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index 87b1495..d462462 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" save_final_model: false + verbose: false # external external: @@ -37,9 +38,9 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 24fcd49..7e01a4e 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" save_final_model: false + verbose: false # external external: @@ -37,9 +38,9 @@ grpo: learning_rate: 1.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 4d34d83..9cf6f8b 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" save_final_model: false + verbose: false # external external: @@ -37,11 +38,11 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 - joint_mode: cross + 
joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index 7b46546..ebbc9a6 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" save_final_model: false + verbose: false # external external: @@ -37,9 +38,9 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml index 150bc2f..fea700d 100644 --- a/configs/mt_grpo_che_config.yaml +++ b/configs/mt_grpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" save_final_model: false + verbose: false # external external: @@ -38,9 +39,9 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml index b0de70f..7c3e222 100644 --- a/configs/mt_grpo_he_config.yaml +++ b/configs/mt_grpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" save_final_model: false + verbose: false # external external: @@ -38,9 +39,9 @@ grpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned temperature: 0.8 top_p: 0.95 discount: 0.9 diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml index 4e0e472..a38e1ad 100644 --- a/configs/mt_magrpo_che_config.yaml +++ b/configs/mt_magrpo_che_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" save_final_model: false + verbose: false # external external: @@ -38,11 +39,11 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 temperature: 0.8 top_p: 0.95 - joint_mode: cross + joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml index 35f96f9..a2d7803 100644 --- a/configs/mt_magrpo_he_config.yaml +++ b/configs/mt_magrpo_he_config.yaml @@ -22,6 +22,7 @@ dataset: output: base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" save_final_model: false + verbose: false # external external: @@ -38,9 +39,9 @@ magrpo: learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 - num_generations: 3 + num_generations: 4 max_new_tokens: 256 - joint_mode: cross + joint_mode: aligned num_agents: 2 discount: 0.9 termination_threshold: -0.2 diff --git a/external/__init__.py b/external/__init__.py index 2b994ee..a2252e1 100644 --- a/external/__init__.py +++ b/external/__init__.py @@ -6,6 +6,10 @@ from . import level_passed from . import passed from . import plain +import builtins + +# Verbose toggle for external previews +VERBOSE = True # ----------------------------- # Context resolver API @@ -59,6 +63,13 @@ def get_external_transition( Returns: A list/tuple of full prompts for each agent to use in the next turn. 
""" + # Local print override + if not VERBOSE: + def print(*args, **kwargs): # type: ignore + return None + else: + print = builtins.print # type: ignore + if int(num_agents) not in (1, 2): raise ValueError( f"External transition supports 1 or 2 agents, got {num_agents}." diff --git a/rewards/code_rewards.py b/rewards/code_rewards.py index 45c7858..bd9c79c 100644 --- a/rewards/code_rewards.py +++ b/rewards/code_rewards.py @@ -1,6 +1,10 @@ import re import signal from typing import List +import builtins + +# Verbose toggle (can be set by training scripts) +VERBOSE = True from rewards.code_utils import ( TimeoutException, @@ -43,6 +47,13 @@ def execution_reward_aux( Maximum reward: 4.0 (updated from 3.5) """ + # Local print override based on VERBOSE + if not VERBOSE: + def print(*args, **kwargs): # type: ignore + return None + else: + print = builtins.print # type: ignore + rewards = [] TEST_TIMEOUT = 10 # Timeout per individual test diff --git a/train_grpo.py b/train_grpo.py index 7b3a974..ec9d7cc 100644 --- a/train_grpo.py +++ b/train_grpo.py @@ -187,9 +187,11 @@ def main(): num_turns = grpo_config.get("num_turns", 1) is_multi_turn = num_turns > 1 - print(f"Multi-turn GRPO enabled: num_turns={num_turns}") if is_multi_turn else print( - f"Single-turn GRPO: num_turns={num_turns}" - ) + output_verbose = config.get("output.verbose", True) + if output_verbose: + print(f"Multi-turn GRPO enabled: num_turns={num_turns}") if is_multi_turn else print( + f"Single-turn GRPO: num_turns={num_turns}" + ) slurm_job_id = os.environ.get("SLURM_JOB_ID", "no_job_id") # Use different output directory prefix for multi-turn for clarity @@ -345,7 +347,7 @@ def _resolver(prompt: str): learning_rate=grpo_config.get("learning_rate", 1e-5), logging_steps=grpo_config.get("logging_steps", 50), save_steps=grpo_config.get("save_steps", 200), - num_generations=grpo_config.get("num_generations", 3), + num_generations=grpo_config.get("num_generations", 4), max_new_tokens=grpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, @@ -407,6 +409,18 @@ def _resolver(prompt: str): }, } + # Propagate verbosity to reward/external modules + try: + import rewards.code_rewards as code_rewards + code_rewards.VERBOSE = bool(output_verbose) + except Exception: + pass + try: + import external as external_mod + external_mod.VERBOSE = bool(output_verbose) + except Exception: + pass + reward_processor = None # Optional scale if config.get("reward_processor.enabled", False): diff --git a/train_magrpo.py b/train_magrpo.py index 4ddb4d6..a0fdb13 100644 --- a/train_magrpo.py +++ b/train_magrpo.py @@ -232,9 +232,11 @@ def main(): num_turns = magrpo_config.get("num_turns", 1) is_multi_turn = num_turns > 1 - print(f"Multi-turn training enabled: num_turns={num_turns}") if is_multi_turn else print( - f"Single-turn training: num_turns={num_turns}" - ) + output_verbose = config.get("output.verbose", True) + if output_verbose: + print(f"Multi-turn training enabled: num_turns={num_turns}") if is_multi_turn else print( + f"Single-turn training: num_turns={num_turns}" + ) slurm_job_id = os.environ.get("SLURM_JOB_ID", "no_job_id") @@ -260,9 +262,10 @@ def main(): print(f"Error loading dataset: {e}") return - print(f"\nUsing model: {model_name}") - print(f"Model type: {model_config.type}") - print(f"Max context window: {model_config.max_length} tokens") + if output_verbose: + print(f"\nUsing model: {model_name}") + print(f"Model type: {model_config.type}") + print(f"Max context window: {model_config.max_length} tokens") tokenizer = 
AutoTokenizer.from_pretrained( model_name, **model_config.tokenizer_kwargs @@ -277,11 +280,13 @@ def main(): # Add special tokens if needed (e.g., FIM tokens for StarCoder) if model_config.special_tokens: - print("Adding special tokens...") + if output_verbose: + print("Adding special tokens...") tokenizer.add_special_tokens(model_config.special_tokens) - print( - f"Special tokens added: {model_config.special_tokens.get('additional_special_tokens', [])}" - ) + if output_verbose: + print( + f"Special tokens added: {model_config.special_tokens.get('additional_special_tokens', [])}" + ) temperature = magrpo_config.get("temperature", model_config.temperature) top_p = magrpo_config.get("top_p", model_config.top_p) @@ -394,7 +399,7 @@ def _resolver(prompt: str): learning_rate=magrpo_config.get("learning_rate", 1e-5), logging_steps=magrpo_config.get("logging_steps", 50), save_steps=magrpo_config.get("save_steps", 200), - num_generations=magrpo_config.get("num_generations", 3), + num_generations=magrpo_config.get("num_generations", 4), max_new_tokens=magrpo_config.get("max_new_tokens", 256), temperature=temperature, top_p=top_p, @@ -455,6 +460,18 @@ def _resolver(prompt: str): }, } + # Propagate verbosity to reward/external modules + try: + import rewards.code_rewards as code_rewards + code_rewards.VERBOSE = bool(output_verbose) + except Exception: + pass + try: + import external as external_mod + external_mod.VERBOSE = bool(output_verbose) + except Exception: + pass + # Get num_agents from magrpo config (where it belongs for MAGRPO training) num_agents = magrpo_config.get("num_agents", 2) agents = [ From 54c1ed03e3ca32a38b6d6e6e2c7755b2903e58d3 Mon Sep 17 00:00:00 2001 From: N!no Date: Fri, 26 Sep 2025 10:38:29 -0400 Subject: [PATCH 4/5] rm the single-turn yaml --- README.md | 46 +++++------------------- configs/grpo_che_config.yaml | 13 +++---- configs/grpo_he_config.yaml | 13 +++---- configs/magrpo_che_config.yaml | 9 ++--- configs/magrpo_he_config.yaml | 9 ++--- configs/mt_grpo_che_config.yaml | 57 ------------------------------ configs/mt_grpo_he_config.yaml | 57 ------------------------------ configs/mt_magrpo_che_config.yaml | 58 ------------------------------- configs/mt_magrpo_he_config.yaml | 56 ----------------------------- 9 files changed, 32 insertions(+), 286 deletions(-) delete mode 100644 configs/mt_grpo_che_config.yaml delete mode 100644 configs/mt_grpo_he_config.yaml delete mode 100644 configs/mt_magrpo_che_config.yaml delete mode 100644 configs/mt_magrpo_he_config.yaml diff --git a/README.md b/README.md index 7a413f9..f8b61e7 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,33 @@ # LLM Collaboration with MARL -This repository contains training scripts and configurations for the paper "LLM Collaboration with Multi‑Agent Reinforcement Learning". -- [Benchmarks](#benchmarks) -- [Training Scripts](#training-scripts) - - [Default Configs](#default-configs) - - [Parameter Overrides](#parameter-overrides) -- [Multi-Turn Settings](#multi-turn-settings) - - [2+Turn Prompt Composition](#2turn-prompt-composition) - - [External Modes](#external-modes) - - [Sandbox Tests](#sandbox-tests) +Training scripts and configs for _"LLM Collaboration with Multi‑Agent Reinforcement Learning"_. 
## Benchmarks -- HumanEval (HE): 164 problems on split `test` -- CoopHumanEval (CHE): 82 problems on split `test` +- MBPP: 427 problems on split `sanitized` +- HumanEval: 164 problems on split `test` +- CoopHumanEval: 82 problems on split `test` ## Training Scripts ### Default Configs ```bash -# Single-agent HumanEval (GRPO) python LLM_Collaboration_with_MARL/train_grpo.py \ --config LLM_Collaboration_with_MARL/configs/grpo_he_config.yaml -# Multi-agent CoopHumanEval (MAGRPO) python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/magrpo_che_config.yaml - -# Multi-turn HumanEval (MT-MAGRPO) -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/mt_magrpo_he_config.yaml ``` ### Parameter Overrides -You can override any configuration parameter using `--override`: +You can always override any configuration parameter using `--override`: ```bash -# Change model -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \ - --override model_name='bigcode/starcoder2-3b' - -# Modify training params -python LLM_Collaboration_with_MARL/train_grpo.py \ - --config LLM_Collaboration_with_MARL/configs/grpo_che_config.yaml \ - --override grpo.num_train_epochs=20 grpo.learning_rate=3e-5 - -# Multi-turn override example -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/mt_magrpo_che_config.yaml \ - --override dataset.train_split='test[16:]' dataset.eval_split='test[:16]' \ - magrpo.num_turns=2 - -# Enable code-level training metrics (expensive; default is off) python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \ - --override magrpo.log_code_levels=true + --override model.name='bigcode/starcoder2-3b' magrpo.num_turns=1 ``` ## Multi-Turn Settings @@ -97,7 +67,7 @@ Notes: - The same rule applies at deeper turns. - For GRPO (single agent), the same threshold logic applies (one agent → one set of siblings per node). -### 2+Turn Prompt Composition +### 2+Turn Prompt By default, multi-turn prompts include both the original first‑turn problem prompt and the previous response. @@ -130,7 +100,7 @@ python LLM_Collaboration_with_MARL/train_magrpo.py \ --override external.mode='level_feedback' ``` -### Sandbox Tests +#### Sandbox Tests The external modes obtain `entry_point` and tests via an internal resolver registered by the training script. **By default, sandbox executes only the first assert (`sandbox_slice=1`).** Use all eval tests by setting `external.sandbox_slice` to `0`, `None`, or `'all'`. A negative value uses the last N asserts. Note: `external.sandbox_slice` only affects analysis-based modes (`level_feedback`, `level_passed`, `passed`), and it has no effect on `expert_edits`. 
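To make the slicing rule concrete, here is a minimal sketch of how a `sandbox_slice` value could select which asserts to execute; the helper name and signature are illustrative, not the repository's actual implementation:

```python
from typing import List, Optional, Union

def select_asserts(asserts: List[str],
                   sandbox_slice: Optional[Union[int, str]] = 1) -> List[str]:
    # 0, None, or 'all' -> run every eval assert
    if sandbox_slice in (0, None, "all"):
        return asserts
    n = int(sandbox_slice)
    # negative N -> last N asserts; positive N -> first N (default 1 = first assert only)
    return asserts[n:] if n < 0 else asserts[:n]
```

For example, `select_asserts(tests, -2)` would keep only the last two asserts.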
diff --git a/configs/grpo_che_config.yaml b/configs/grpo_che_config.yaml index d462462..1d773b0 100644 --- a/configs/grpo_che_config.yaml +++ b/configs/grpo_che_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,9 +33,10 @@ external: # grpo grpo: - num_train_epochs: 16 + num_turns: 2 + num_train_epochs: 8 per_device_train_batch_size: 1 - learning_rate: 1.0e-5 + learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 num_generations: 4 @@ -52,5 +53,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "grpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" - tags: ["grpo", "coophumaneval", "single-agent"] + dir: "output" + tags: ["grpo", "coophumaneval"] diff --git a/configs/grpo_he_config.yaml b/configs/grpo_he_config.yaml index 7e01a4e..421e2c0 100644 --- a/configs/grpo_he_config.yaml +++ b/configs/grpo_he_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,9 +33,10 @@ external: # grpo grpo: - num_train_epochs: 8 + num_turns: 2 + num_train_epochs: 6 per_device_train_batch_size: 1 - learning_rate: 1.0e-5 + learning_rate: 2.0e-5 logging_steps: 50 save_steps: 200 num_generations: 4 @@ -52,5 +53,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "grpo_humaneval" - dir: "../../../work/hdd/bepg/sliu30/output_st_grpo" - tags: ["grpo", "humaneval", "single-agent"] + dir: "output" + tags: ["grpo", "humaneval"] diff --git a/configs/magrpo_che_config.yaml b/configs/magrpo_che_config.yaml index 9cf6f8b..f6f6c7e 100644 --- a/configs/magrpo_che_config.yaml +++ b/configs/magrpo_che_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,7 +33,8 @@ external: # magrpo magrpo: - num_train_epochs: 16 + num_turns: 2 + num_train_epochs: 8 per_device_train_batch_size: 1 learning_rate: 2.0e-5 logging_steps: 50 @@ -53,5 +54,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "magrpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" + dir: "output" tags: ["magrpo", "coophumaneval", "multi-agent"] diff --git a/configs/magrpo_he_config.yaml b/configs/magrpo_he_config.yaml index ebbc9a6..029dcfe 100644 --- a/configs/magrpo_he_config.yaml +++ b/configs/magrpo_he_config.yaml @@ -9,7 +9,7 @@ model: trust_remote_code: true model_kwargs: trust_remote_code: true - torch_dtype: "auto" + torch_dtype: "bfloat16" # dataset dataset: @@ -20,7 +20,7 @@ dataset: # output output: - base_dir: "../../../work/hdd/bepg/sliu30/output_st_magrpo" + base_dir: "output" save_final_model: false verbose: false @@ -33,7 +33,8 @@ external: # magrpo magrpo: - num_train_epochs: 8 + num_turns: 2 + num_train_epochs: 6 per_device_train_batch_size: 1 learning_rate: 2.0e-5 logging_steps: 50 @@ -51,5 +52,5 @@ wandb: project: "mlrl" entity: "nu-llpr" name: "magrpo_humaneval" - dir: 
"../../../work/hdd/bepg/sliu30/output_st_magrpo" + dir: "output" tags: ["magrpo", "humaneval", "multi-agent"] diff --git a/configs/mt_grpo_che_config.yaml b/configs/mt_grpo_che_config.yaml deleted file mode 100644 index fea700d..0000000 --- a/configs/mt_grpo_che_config.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "CoMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "test[16:]" - eval_split: "test[:16]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - -# grpo -grpo: - num_turns: 2 - num_train_epochs: 8 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - joint_mode: aligned - temperature: 0.8 - top_p: 0.95 - discount: 0.9 - termination_threshold: -0.1 - reward_shift: -2.1 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_grpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - tags: ["mt_grpo", "coophumaneval", "single-agent", "multi-turn"] diff --git a/configs/mt_grpo_he_config.yaml b/configs/mt_grpo_he_config.yaml deleted file mode 100644 index 7c3e222..0000000 --- a/configs/mt_grpo_he_config.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - -# grpo -grpo: - num_turns: 2 - num_train_epochs: 6 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - joint_mode: aligned - temperature: 0.8 - top_p: 0.95 - discount: 0.9 - termination_threshold: -0.1 - reward_shift: -2.1 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_grpo_humaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_grpo" - tags: ["mt_grpo", "humaneval", "single-agent", "multi-turn"] diff --git a/configs/mt_magrpo_che_config.yaml b/configs/mt_magrpo_che_config.yaml deleted file mode 100644 index a38e1ad..0000000 --- a/configs/mt_magrpo_che_config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "CoMLRL/CoopHumanEval" - type: "coophumaneval" - train_split: "test[16:]" - eval_split: "test[:16]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - 
-# magrpo -magrpo: - num_turns: 2 - num_train_epochs: 8 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - temperature: 0.8 - top_p: 0.95 - joint_mode: aligned - num_agents: 2 - discount: 0.9 - termination_threshold: -0.2 - reward_shift: -4 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_magrpo_coophumaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - tags: ["mt_magrpo", "coophumaneval", "multi-agent", "multi-turn"] diff --git a/configs/mt_magrpo_he_config.yaml b/configs/mt_magrpo_he_config.yaml deleted file mode 100644 index a2d7803..0000000 --- a/configs/mt_magrpo_he_config.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# model -model: - name: "Qwen/Qwen2.5-Coder-3B" - type: "qwen" - temperature: 0.7 - top_p: 0.9 - max_length: 2048 - tokenizer_kwargs: - trust_remote_code: true - model_kwargs: - trust_remote_code: true - torch_dtype: "bfloat16" - -# dataset -dataset: - name: "openai/openai_humaneval" - type: "humaneval" - train_split: "test[33:163]" - eval_split: "test[:32]" - -# output -output: - base_dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - save_final_model: false - verbose: false - -# external -external: - mode: "level_feedback" - sandbox_slice: 1 - original_prompt: true - previous_response: true - -# magrpo -magrpo: - num_turns: 2 - num_train_epochs: 6 - per_device_train_batch_size: 1 - learning_rate: 2.0e-5 - logging_steps: 50 - save_steps: 200 - num_generations: 4 - max_new_tokens: 256 - joint_mode: aligned - num_agents: 2 - discount: 0.9 - termination_threshold: -0.2 - reward_shift: -4 - -# wandb -wandb: - project: "mlrl" - entity: "nu-llpr" - name: "mt_magrpo_humaneval" - dir: "../../../work/hdd/bepg/sliu30/output_mt_magrpo" - tags: ["mt_magrpo", "humaneval", "multi-agent", "multi-turn"] From ebee2f7a1ba7eb13f3788c15ee9e3e5d51e268a6 Mon Sep 17 00:00:00 2001 From: N!no Date: Fri, 26 Sep 2025 11:45:34 -0400 Subject: [PATCH 5/5] Update README.md --- README.md | 79 +++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index f8b61e7..8277afa 100644 --- a/README.md +++ b/README.md @@ -29,84 +29,37 @@ python LLM_Collaboration_with_MARL/train_magrpo.py \ --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \ --override model.name='bigcode/starcoder2-3b' magrpo.num_turns=1 ``` -## Multi-Turn Settings -### Joint Actions & Early Termination +## Settings -- Joint actions (magrpo.joint_mode): - - `cross` (default): Form joint actions by Cartesian product of each agent's K generations (reuses sequences; no extra generation). - - `aligned`: Join index‑aligned generations. +### Joint Action Modes -- Early termination (magrpo.termination_threshold / grpo.termination_threshold): - - At each node (branch, turn), compute the mean immediate reward across the sibling joint actions at that node. - - If the mean exceeds the threshold, that branch stops expanding at this turn; training backpropagates from the truncated subtree. Other branches continue. +`magrpo.joint_mode` determine how to combine each agent's K generations into joint actions at each turn. 2 modes are supported: if set 'align' by default, each agent's k-th generation is paired with the other agents' k-th generations to form a joint action; if set 'cross', all combinations of the agents' K generations are used to form joint actions (K^N joint actions for N agents). 
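As a rough sketch of the two modes (an assumed helper for illustration, not the trainer's actual API), aligned joining zips index-matched generations while cross joining takes the Cartesian product:

```python
from itertools import product
from typing import List, Tuple

def form_joint_actions(generations: List[List[str]],
                       joint_mode: str = "aligned") -> List[Tuple[str, ...]]:
    # generations[i] holds agent i's K generations for the current turn
    if joint_mode == "aligned":
        # pair the k-th generation of every agent -> K joint actions
        return list(zip(*generations))
    if joint_mode == "cross":
        # all combinations across agents -> K^N joint actions for N agents
        return list(product(*generations))
    raise ValueError(f"unknown joint_mode: {joint_mode}")
```

With 2 agents and K=2 generations each, 'aligned' yields 2 joint actions and 'cross' yields 4.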
-Illustrative example (threshold = -0.2, 2 agents, K=2 → 4 joint actions per node): +Since the number of samples will also grow exponentially with the number of turns, aligned joint will be **more flexible** (\#samples could not be a perfect power) and hence faster to train in wall time. However, using cross joint will be more sample efficient (much lower VRAM compare to 'align' when num_generations=K^N), it also performs better since the value estimation is more accurate. -``` -Turn 1 (root) (a,b,c,d): [-1.5, -1.5, -1.0, -1.0] -mean rewards = -1.25 ≤ -0.2 → continue expanding all branches - - a (-1.5) - ↳ Turn 2 children (e,f,g,h): [-1, -1, -1, -1] - mean rewards = -1.0 ≤ -0.2 → continue - - b (-1.5) - ↳ Turn 2 children (i,j,k,l): [0.0, 0.0, 0.0, -0.2] - mean rewards = -0.05 > -0.2 → TERMINATE branch b here (no further children) - - c (-1.0) - ↳ Turn 2 children: [...] +### Number of Turns - d (-1.0) - ↳ Turn 2 children: [...] -``` - -Notes: -- Termination is per‑branch; other branches continue normally. -- The same rule applies at deeper turns. -- For GRPO (single agent), the same threshold logic applies (one agent → one set of siblings per node). +`magrpo.num_turns` determines the number of turns (`magrpo.num_turns=2` by default). The number of samples at each turn will grow exponentially with the number of turns: K^TN at turn T if cross joint, K^N if aligned joint. -### 2+Turn Prompt +### Early Termination -By default, multi-turn prompts include both the original first‑turn problem prompt and the previous response. +`magrpo.termination_threshold` is used to incentive agents to find high-reward solutions quickly, instead of expanding the full Monte Carlo tree. -- external.original_prompt: true (default) -- external.previous_response: true (default) +At each node (branch, turn), compute the mean immediate **reward across the sibling** joint actions at that node. If the mean exceeds the threshold, that branch stops expanding at this turn; training backpropagates from the truncated subtree. Other branches continue. -To exclude the original prompt but keep the previous response (shorter context): +### Multi-Turn Prompt -```bash -python LLM_Collaboration_with_MARL/train_magrpo.py \ - --config LLM_Collaboration_with_MARL/configs/mt_magrpo_he_config.yaml \ - --override external.original_prompt=False external.previous_response=True -``` +`external.original_prompt` and `external.previous_response` both default as `true`. 2+ turn prompts include both the original first‑turn problem prompt and the previous response by default to preserve full context; you can shorten the context by setting either to `false` (for example, keep only the previous response to reduce tokens while retaining the most recent interaction). ### External Modes -Multi-turn training supports external transition modes for 2nd+ turns, set via `external.mode`: - -- `level_feedback` **(default)**: Detailed diagnostics (impl found, syntax with line/col, per-test pass/fail errors, aux usage). - - Requires `external.expert_model` in config when using `expert_edits` (e.g., `deepseek-coder`, Claude, etc.). This parameter is ignored for other modes (`level_feedback`, `level_passed`, `passed`, `plain`). -- Requires corrsponding API keys in env vars. -- `level_passed`: Binary passed signals (impl found, syntax, tests summary, aux usage). -- `passed`: A binary signal — "All levels passed" or "Not all levels passed". -- `plain`: No signals or diagnostics. +`external.mode` is set to be 'level_feedback' by default. 
This adds information from the external transition to the agents' prompts in the following turns; 'level_feedback' attaches test-driven diagnostics, while the alternatives are 'expert_edits' (an LLM proposes edits), 'level_passed'/'passed' (binary outcomes), and 'plain' (no signals).

The setting specific to 'level_feedback' is `external.sandbox_slice`, which controls how many eval tests are included in the feedback. By default, the sandbox executes only the first assert (`sandbox_slice=1`). Use all eval tests by setting `external.sandbox_slice` to 0, None, or 'all'; negative values use the last N asserts. `external.sandbox_slice` only affects the analysis-based modes ('level_feedback', 'level_passed', 'passed') and has no effect on 'expert_edits'.

The setting specific to 'expert_edits' is `external.expert_edits_model`, which selects the LLM used to propose edits. It defaults to DeepSeek-Coder; you can switch to Claude-3 or GPT-4 once the corresponding API keys/tokens are set in your environment variables.

### Output

`output.save_final_model` is set to `false` by default because of the large storage required by multiple LLMs. `output.verbose` enables debug printing on the cluster when set to `true`; it defaults to `false`, in which case only a tqdm bar shows training progress. You can also turn on `magrpo.log_code_levels` to log level rewards during training, but it slows training down considerably.
\ No newline at end of file
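For example, a debugging run that turns both switches on could use the usual override mechanism (flag names as they appear in the configs above; adjust paths to your setup):

```bash
python LLM_Collaboration_with_MARL/train_magrpo.py \
    --config LLM_Collaboration_with_MARL/configs/magrpo_he_config.yaml \
    --override output.verbose=true magrpo.log_code_levels=true
```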