Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions dev/run_qwen3_5_localbackend_yes_no_maybe.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def _format_int_list(values: list[int]) -> str:
parser.add_argument(
"--enable-thinking", action=argparse.BooleanOptionalAction, default=False
)
parser.add_argument(
"--rollout-weights-mode",
choices=("lora", "merged"),
default=None,
)
parser.add_argument("--trainer-gpu-ids", type=int, nargs="+")
parser.add_argument("--inference-gpu-ids", type=int, nargs="+")
args = parser.parse_args()
Expand Down Expand Up @@ -98,6 +103,8 @@ def _format_int_list(values: list[int]) -> str:
f"INFERENCE_GPU_IDS={_format_int_list(args.inference_gpu_ids)}",
]
)
if args.rollout_weights_mode is not None:
env.append(f"ROLLOUT_WEIGHTS_MODE={args.rollout_weights_mode}")
env_block = " \\\n ".join(env)

run_script = textwrap.dedent(
Expand Down Expand Up @@ -143,6 +150,7 @@ def _format_int_list(values: list[int]) -> str:
print(f" load_in_4bit: {args.load_in_4bit}")
print(f" load_in_16bit: {args.load_in_16bit}")
print(f" enable_thinking: {args.enable_thinking}")
print(f" rollout_weights_mode: {args.rollout_weights_mode}")
print(f" trainer_gpu_ids: {args.trainer_gpu_ids}")
print(f" inference_gpu_ids: {args.inference_gpu_ids}")

Expand Down
6 changes: 6 additions & 0 deletions dev/yes-no-maybe-metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,12 @@ def build_internal_config() -> art.dev.InternalModelConfig:
result["trainer_gpu_ids"] = trainer_gpu_ids
result["inference_gpu_ids"] = inference_gpu_ids

rollout_weights_mode = os.environ.get("ROLLOUT_WEIGHTS_MODE")
if rollout_weights_mode is not None:
if rollout_weights_mode not in {"lora", "merged"}:
raise ValueError("ROLLOUT_WEIGHTS_MODE must be either 'lora' or 'merged'")
result["rollout_weights_mode"] = rollout_weights_mode

return result


Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ backend = [
"pytest>=8.4.1",
"nbmake>=1.5.5",
"gql<4",
"nvidia-cudnn-frontend<1.21 ; sys_platform == 'linux'",
"vllm @ https://github.com/vivekkalyan/vllm/releases/download/v0.17.0-art1/vllm-0.17.0%2Bart1-cp38-abi3-manylinux_2_31_x86_64.whl ; sys_platform == 'linux'",
]
megatron = [
Expand Down
5 changes: 5 additions & 0 deletions src/art/dev/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
from typing_extensions import TypedDict


class WeightTransferConfig(TypedDict):
    """Options for transferring updated model weights into the vLLM engine.

    Keys:
        backend: Transport used for the weight transfer; only "nccl" is
            currently supported.
    """

    backend: Literal["nccl"]


class EngineArgs(TypedDict, total=False):
model: str
served_model_name: str | list[str] | None
Expand Down Expand Up @@ -124,6 +128,7 @@ class EngineArgs(TypedDict, total=False):
calculate_kv_scales: bool | None

additional_config: dict[str, Any] | None
weight_transfer_config: WeightTransferConfig | None

disable_log_requests: (
bool # Deprecated in vLLM 0.13+, use enable_log_requests instead
Expand Down
39 changes: 29 additions & 10 deletions src/art/dev/get_model_config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
from .engine import EngineArgs
from .model import InitArgs, InternalModelConfig, PeftArgs, TrainerArgs
from .validate import is_dedicated_mode
from .validate import QWEN3_5_MOE_MODELS, is_dedicated_mode


def default_target_modules(base_model: str) -> list[str]:
    """Return the default LoRA target module names for *base_model*.

    Qwen3.5 MoE models additionally target the in_proj_qkv, in_proj_z and
    out_proj modules (between the attention and MLP projections); all other
    models get the standard attention + MLP projection set.
    """
    attention = ["q_proj", "k_proj", "v_proj", "o_proj"]
    mlp = ["gate_proj", "up_proj", "down_proj"]
    if base_model in QWEN3_5_MOE_MODELS:
        return attention + ["in_proj_qkv", "in_proj_z", "out_proj"] + mlp
    return attention + mlp


def get_model_config(
Expand All @@ -14,6 +39,7 @@ def get_model_config(
config = InternalModelConfig()

dedicated = is_dedicated_mode(config)
rollout_weights_mode = config.get("rollout_weights_mode", "lora")

if dedicated:
enable_sleep_mode = False
Expand Down Expand Up @@ -43,15 +69,7 @@ def get_model_config(
lora_alpha=16,
r=8,
random_state=3407,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
target_modules=default_target_modules(base_model),
use_gradient_checkpointing="unsloth",
)
peft_args.update(config.get("peft_args", {}))
Expand All @@ -78,6 +96,7 @@ def get_model_config(
init_args=init_args,
engine_args=engine_args,
peft_args=peft_args,
rollout_weights_mode=rollout_weights_mode,
tinker_args=config.get("tinker_args"),
trainer_args=trainer_args,
)
Expand Down
8 changes: 8 additions & 0 deletions src/art/dev/model.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from enum import Enum
from typing import Literal

from typing_extensions import Required, TypedDict

from .engine import EngineArgs

# How trained weights are exposed to vLLM for rollouts: "lora" loads the
# LoRA adapters into vLLM directly; "merged" pushes merged full weights.
RolloutWeightsMode = Literal["lora", "merged"]


# Vendored from transformers.training_args.OptimizerNames
class OptimizerNames(str, Enum):
Expand Down Expand Up @@ -120,6 +123,10 @@ class InternalModelConfig(TypedDict, total=False):
inference run on separate GPUs.
inference_gpu_ids: GPU IDs for vLLM inference (e.g., [1]). When set
with trainer_gpu_ids, enables dedicated mode.
rollout_weights_mode: How inference weights are applied in vLLM.
- "lora": load LoRA adapters into vLLM directly
- "merged": keep training LoRA adapters, but push merged weights
into vLLM for inference
"""

init_args: "InitArgs"
Expand All @@ -130,6 +137,7 @@ class InternalModelConfig(TypedDict, total=False):
trainer_args: "TrainerArgs"
trainer_gpu_ids: list[int]
inference_gpu_ids: list[int]
rollout_weights_mode: "RolloutWeightsMode"


class TinkerArgs(TypedDict, total=False):
Expand Down
32 changes: 31 additions & 1 deletion src/art/dev/validate.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,30 @@
"""Validation functions for model configuration."""

from .model import InternalModelConfig
from .model import InternalModelConfig, RolloutWeightsMode

# Qwen3.5 MoE base models. These currently require
# rollout_weights_mode="merged" (enforced in validate_dedicated_config)
# because direct LoRA inference for them is broken in the pinned vLLM build.
QWEN3_5_MOE_MODELS = {
    "Qwen/Qwen3.5-35B-A3B",
    # NOTE(review): the 397B variant is untested for now; it should work with
    # Megatron, but the merging logic still needs to be added to the Megatron
    # service.
    "Qwen/Qwen3.5-397B-A17B",
}


def is_dedicated_mode(config: InternalModelConfig) -> bool:
    """Return True when the config runs training and inference on separate GPUs
    (i.e. both trainer_gpu_ids and inference_gpu_ids are present)."""
    required_keys = ("trainer_gpu_ids", "inference_gpu_ids")
    return all(key in config for key in required_keys)


def _rollout_weights_mode(config: InternalModelConfig) -> RolloutWeightsMode:
mode = config.get("rollout_weights_mode", "lora")
if mode in {"lora", "merged"}:
return mode
raise ValueError("rollout_weights_mode must be either 'lora' or 'merged'")


def _is_qwen3_5_moe_model(config: InternalModelConfig) -> bool:
    """Return True when engine_args.model names a Qwen3.5 MoE base model."""
    engine_args = config.get("engine_args", {})
    return engine_args.get("model") in QWEN3_5_MOE_MODELS


def validate_dedicated_config(config: InternalModelConfig) -> None:
"""Validate dedicated mode GPU configuration.

Expand All @@ -16,12 +33,19 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
"""
has_trainer = "trainer_gpu_ids" in config
has_inference = "inference_gpu_ids" in config
rollout_weights_mode = _rollout_weights_mode(config)

if has_trainer != has_inference:
raise ValueError(
"trainer_gpu_ids and inference_gpu_ids must both be set or both unset"
)

if rollout_weights_mode == "merged" and not has_trainer:
raise ValueError(
"rollout_weights_mode='merged' requires dedicated mode "
"(set both trainer_gpu_ids and inference_gpu_ids)"
)

if not has_trainer:
return

Expand Down Expand Up @@ -65,3 +89,9 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
"enable_sleep_mode is incompatible with dedicated mode "
"(dedicated mode runs vLLM on a separate GPU, sleep/wake is not needed)"
)

if _is_qwen3_5_moe_model(config) and rollout_weights_mode == "lora":
raise ValueError(
"Qwen3.5-MoE models require rollout_weights_mode='merged' with the "
"current vLLM version because direct LoRA inference is currently broken"
)
11 changes: 2 additions & 9 deletions src/art/megatron/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from vllm.v1.engine.async_llm import AsyncLLM

from .. import dev, types
from ..dev.get_model_config import default_target_modules
from ..local.checkpoints import get_last_checkpoint_dir
from ..preprocessing.pack import DiskPackedTensors
from ..preprocessing.tokenize import SFTBatch
Expand Down Expand Up @@ -66,15 +67,7 @@ def _default_lora_adapter_config(self) -> LoraConfig:
return LoraConfig(
r=1,
lora_alpha=32,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
target_modules=default_target_modules(self.base_model),
bias="none",
)

Expand Down
Loading
Loading