From c32e09c2b19900551b089b7482b9dfb8b11b6e4a Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 3 Jul 2025 10:21:25 -0400 Subject: [PATCH 01/14] stuff --- fast_llm/engine/distributed/config.py | 15 + fast_llm/engine/distributed/distributed.py | 64 ++++- tests/conftest.py | 4 +- tests/models/distributed_test_model.py | 59 ++++ tests/models/test_checkpoint.py | 76 +++--- tests/models/test_match_megatron.py | 39 ++- tests/models/test_mb.py | 92 ------- tests/models/test_mb_seq_first.py | 50 ---- tests/models/test_model.py | 75 +++++ tests/models/test_ms.py | 45 --- tests/models/test_seq_first.py | 48 ---- tests/models/test_simple.py | 99 ------- tests/utils/distributed_configs.py | 304 +++++++++++++++++++++ tests/utils/run_test_script.py | 147 +++------- tests/utils/utils.py | 56 ++++ 15 files changed, 672 insertions(+), 501 deletions(-) create mode 100644 tests/models/distributed_test_model.py delete mode 100644 tests/models/test_mb.py delete mode 100644 tests/models/test_mb_seq_first.py create mode 100644 tests/models/test_model.py delete mode 100644 tests/models/test_ms.py delete mode 100644 tests/models/test_seq_first.py delete mode 100644 tests/models/test_simple.py create mode 100644 tests/utils/distributed_configs.py diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py index 7fd9fed13..9f006cdb1 100644 --- a/fast_llm/engine/distributed/config.py +++ b/fast_llm/engine/distributed/config.py @@ -79,6 +79,14 @@ def setup(self, group: "ProcessGroup|None"): Assert.eq(group.rank(), self.rank) self._group = group + def check_ranks_in_range(self, start, stop): + check_ranks_in_range(self.global_ranks, start, stop) + + +def check_ranks_in_range(global_ranks, start, stop): + Assert.geq(min(global_ranks), start) + Assert.lt(max(global_ranks), stop) + class DistributedDimNames: # A set of common distributed dim names packed into a singleton. 
@@ -348,6 +356,13 @@ def _get_global_ranks(self, size: int, stride: int) -> range: def _add_distributed_dim(self, distributed_dim: DistributedDim) -> None: Assert.eq(distributed_dim.global_ranks[distributed_dim.rank], self.rank, msg=distributed_dim) + + logger.info(f"Initializing group {distributed_dim}") + try: + distributed_dim.check_ranks_in_range(0, self.world_size) + except: + logger.info(str(self)) + raise if distributed_dim.name in self.distributed_dims: Assert.eq(distributed_dim, self.distributed_dims[distributed_dim.name]) else: diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index fbbf9b6a7..977318841 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -13,6 +13,7 @@ DistributedDim, DistributedDimNames, PhaseType, + check_ranks_in_range, ) from fast_llm.utils import Assert @@ -20,14 +21,34 @@ class ProcessGroupPool: - def __init__(self, rank: int | None = None, world_size: int | None = None, timeout: float = 60): + def __init__( + self, + rank: int | None = None, + world_size: int | None = None, + local_world_size: int | None = None, + timeout: float = 60, + use_cpu: bool = False, + ): self._rank = DistributedConfig.default_rank if rank is None else rank self._world_size = DistributedConfig.default_world_size if world_size is None else world_size + self._local_world_size = ( + DistributedConfig.default_local_world_size if local_world_size is None else local_world_size + ) self._timeout = timeout + self._use_cpu = use_cpu + + if self._use_cpu: + Assert.eq(self._world_size, 1) + self._device = torch.device("cpu") + else: + Assert.in_range_incl(self._local_world_size, 1, torch.cuda.device_count()) + torch.cuda.init() + self._device = torch.device(self._rank) + torch.cuda.set_device(self._device) if self._world_size > 1: - if rank == 0: + if self._rank == 0: logger.info("Initializing TCP store.") # We bypass `torch.distributed.init_process_group` which makes things way more complicated for no reason. # TODO: Allow other init methods? @@ -49,12 +70,21 @@ def rank(self): def world_size(self): return self._world_size + @property + def local_world_size(self): + return self._local_world_size + + @property + def device(self): + return self._device + def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> ProcessGroup | None: """ Get the requested process group from the pool, or create it if it doesn't exist. 
""" group_size = len(global_ranks) Assert.eq(global_ranks[group_rank], self._rank) + check_ranks_in_range(global_ranks, 0, self._world_size) if group_size == 1: return None @@ -85,6 +115,7 @@ def __enter__(self): global _default_pool assert _default_pool is None _default_pool = self + return self def __exit__(self, exc_type, exc_val, exc_tb): global _default_pool @@ -120,24 +151,22 @@ class Distributed[ConfigType: DistributedConfig](Configurable[ConfigType]): def __init__(self, config: DistributedConfig, use_cpu: bool = False): super().__init__(config) assert self._config.reference_config is None - self._use_cpu = use_cpu - - if self._use_cpu: - Assert.eq(self._config.world_size, 1) - self.device = torch.device("cpu") - else: - Assert.in_range_incl(self._config.local_world_size, 1, torch.cuda.device_count()) - torch.cuda.init() - self.device = torch.device(self._config.local_rank) - torch.cuda.set_device(self.device) self._local_pool = _default_pool is None if self._local_pool: - self._pool = ProcessGroupPool(self._config.rank, self._config.world_size, self._config.timeout) + self._pool = ProcessGroupPool( + self._config.rank, + self._config.world_size, + self._config.local_world_size, + self._config.timeout, + use_cpu, + ) else: self._pool = _default_pool - Assert.eq(self._pool._world_size, self._config.world_size) - Assert.eq(self._pool._rank, self._config.rank) + Assert.geq(self._pool.world_size, self._config.world_size) + Assert.eq(self._pool.rank, self._config.rank) + Assert.geq(self._pool.local_world_size, self._config.local_world_size) + Assert.eq(self._pool.device.type, "cpu" if use_cpu else "cuda") self.world_group = self.add_group(self._config.distributed_dims[DistributedDimNames.world]) self.data_group = self.add_group(self._config.distributed_dims[DistributedDimNames.data]) @@ -188,11 +217,16 @@ def __init__(self, config: DistributedConfig, use_cpu: bool = False): self.set_step(0, PhaseType.training) + @property + def device(self): + return self._pool.device + def add_group(self, distributed_dim: DistributedDim) -> ProcessGroup | None: """ Add a process group from its definition. 
""" self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") + distributed_dim.check_ranks_in_range(0, self._config.world_size) group = self._pool.get_process_group(distributed_dim.global_ranks, distributed_dim.rank) distributed_dim.setup(group) return group diff --git a/tests/conftest.py b/tests/conftest.py index 27ea5f63d..4c9161ea6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,8 +15,8 @@ # Make fixtures available globally without import from tests.utils.run_test_script import ( # isort: skip - run_distributed_script_for_all_models, - run_test_script, + compare_results_for_all_models, + run_distributed_script, run_test_script_base_path, run_test_script_for_all_models, ) diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py new file mode 100644 index 000000000..ad19eeafa --- /dev/null +++ b/tests/models/distributed_test_model.py @@ -0,0 +1,59 @@ +import logging + +import torch + +from fast_llm.cli import fast_llm_main_wrapper +from fast_llm.core.distributed import allreduce_scalar, safe_barrier +from fast_llm.engine.distributed.config import DistributedConfig +from fast_llm.engine.distributed.distributed import ProcessGroupPool +from tests.utils.distributed_configs import DISTRIBUTED_TESTING_CONFIGS +from tests.utils.run_test_script import do_run_test_script_for_all_models, parse_run_distributed_script +from tests.utils.utils import DistributedSubtestContext + +logger = logging.getLogger(__name__) + + +def main(args: list[str] | None = None) -> None: + base_path, model_testing_config = parse_run_distributed_script(args) + + with ProcessGroupPool(timeout=20) as pool: + failures = [] + world_size = DistributedConfig.default_world_size + rank = DistributedConfig.default_rank + group = pool.get_process_group(range(world_size), rank) + + for name, config in DISTRIBUTED_TESTING_CONFIGS.items(): + if config.num_gpus > world_size: + logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {config.num_gpus} > {world_size})"})") + if DistributedConfig.default_rank < config.num_gpus: + logger.info(f"Running {name}") + with DistributedSubtestContext(base_path / name, rank) as subtest: + do_run_test_script_for_all_models(config, model_testing_config, base_path) + assert subtest._capture_manager._global_capturing is None + success = subtest.success + else: + # Worker is not needed for this one, skip. + success = True + + # Barrier so `allreduce_scalar` doesn't go crazy in case of desync. + safe_barrier(group, name) + success = ( + success if group is None else allreduce_scalar(success, dtype=torch.int64, group=group) == world_size + ) + logger.warning(f"{name} {"PASSED" if success else "FAILED"})") + if not success: + failures.append(name) + if rank == 0: + (base_path / name / "pytest_success").write_text(str(int(success))) + + # Final barrier to ensure everything is done before torchrun potentially kills workers. + safe_barrier(group, "testing end") + # Let pytest know how things went. + # These should already be reported above, we repeat for convenience. 
+ if failures: + raise RuntimeError(f"The following subtests failed: {", ".join(failures)}") + + +if __name__ == "__main__": + with fast_llm_main_wrapper(): + main() diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 63a25747f..8392494e4 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -20,7 +20,9 @@ from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName from fast_llm.utils import Assert from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor +from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup +from tests.utils.run_test_script import ARTIFACT_PATH _WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" @@ -34,46 +36,53 @@ @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_config): # A baseline config (single-gpu, bf16, flash-attn). - run_test_script_for_all_models(_CHECKPOINT_AND_EVAL_ARGS) - - -def _prepare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): - shutil.copytree(compare_path, test_path) - shutil.rmtree(test_path / "checkpoint" / "2") - assert (test_path / "checkpoint" / "1" / "ok").is_file() - # TODO: Eval - shutil.rmtree(test_path / "runs") + run_test_script_for_all_models( + distributed_testing_config=DistributedTestingConfig( + name="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + ), + ) -def _compare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): - for artifact in ["init", "train_1"]: - path = f"runs/0/artifacts/0/tensor_logs_{artifact}.pt" - if not (test_path / path).is_file(): - shutil.copy(compare_path / path, test_path / path) +@pytest.fixture(scope="module") +def prepare_resume(run_test_script_base_path: pathlib.Path): + def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): + resume_from_path = run_test_script_base_path / distributed_testing_config.compare + self_path = run_test_script_base_path / distributed_testing_config.name + shutil.copytree(resume_from_path, self_path) + shutil.rmtree(self_path / "checkpoint" / "2") + assert (self_path / "checkpoint" / "1" / "ok").is_file() + # TODO: Eval + shutil.rmtree(self_path / "runs") + for artifact in ["init", "train_1"]: + path = f"{ARTIFACT_PATH}/0/tensor_logs_{artifact}.pt" + shutil.copy(resume_from_path / path, self_path / path) + + return do_prepare_resume @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) -def test_resume(run_test_script_for_all_models): - # Resume from iteration=1 and compare outputs with the baseline run. - run_test_script_for_all_models( - _CHECKPOINT_AND_EVAL_ARGS, - compare=f"test_checkpoint_and_eval", - prepare_fn=_prepare_resume_fn, - compare_fn=_compare_resume_fn, +def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): + distributed_testing_config = DistributedTestingConfig( + name="resume", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS ) + prepare_resume(distributed_testing_config) + + # Resume from iteration=1 and compare outputs with the baseline run. 
+    run_test_script_for_all_models(distributed_testing_config)
+    compare_results_for_all_models(distributed_testing_config)
+
 
 @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"])
 @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint)
-def test_resume_frozen(run_test_script_for_all_models):
-    # Resume with frozen mlp. No comparison.
-    run_test_script_for_all_models(
-        _CHECKPOINT_AND_EVAL_ARGS + ["model.base_model.transformer.mlp_lr_scale=0."],
-        compare="test_checkpoint_and_eval",
-        prepare_fn=_prepare_resume_fn,
-        do_compare=False,
+def test_resume_frozen(run_test_script_for_all_models, prepare_resume):
+    distributed_testing_config = DistributedTestingConfig(
+        name="resume_frozen", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS
     )
+    prepare_resume(distributed_testing_config)
+    # Resume with frozen mlp. No comparison.
+    run_test_script_for_all_models(distributed_testing_config)
 
 
 def do_get_convert_path(
@@ -343,15 +352,18 @@ def load_and_save_parallel_base_path(run_test_script_base_path):
     ]
 )
 @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed)
-def test_save_and_load_in_parallel(run_distributed_script_for_all_models, load_and_save_parallel_base_path):
+def test_save_and_load_in_parallel(run_distributed_script, load_and_save_parallel_base_path, model_testing_config):
     # Save and load checkpoints to and from various distributed configurations.
     # Combined in a single test to mitigate process creation overhead.
     # TODO: Test beyond 2 gpu configs?
     import tests.models.distributed_test_checkpoint
 
-    run_distributed_script_for_all_models(
-        [tests.models.distributed_test_checkpoint.__file__],
-        base_path=load_and_save_parallel_base_path,
+    run_distributed_script(
+        [
+            tests.models.distributed_test_checkpoint.__file__,
+            str(load_and_save_parallel_base_path),
+            model_testing_config.name,
+        ],
         num_gpus=2,
     )
diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py
index 7645de9e1..5d974172d 100644
--- a/tests/models/test_match_megatron.py
+++ b/tests/models/test_match_megatron.py
@@ -1,18 +1,36 @@
+import os
+
 import pytest
 
 from tests.utils.compare_tensor_logs import CompareConfig
-from tests.utils.dataset import DATASET_PREFIX
+from tests.utils.dataset import DATASET_PREFIX, get_test_dataset
+from tests.utils.distributed_configs import DistributedTestingConfig
 from tests.utils.model_configs import ModelTestingGroup
 
 
 @pytest.mark.model_testing_group(ModelTestingGroup.megatron)
-def test_megatron(run_test_script_for_all_models, model_testing_config):
-    run_test_script_for_all_models([], is_megatron=True)
+def test_megatron(run_distributed_script, model_testing_config, run_test_script_base_path):
+    path = run_test_script_base_path / "megatron"
+    env = os.environ.copy()
+    # Prevent Megatron from complaining.
+ env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + env["NVTE_FLASH_ATTN"] = "0" + get_test_dataset() + run_distributed_script( + [ + "Megatron-LM/pretrain_gpt.py", + *model_testing_config.megatron_args, + f"--structured-logs-dir={path}", + f"--data-cache-path={path}", + ], + num_gpus=1, + env=env, + ) @pytest.mark.depends_on(on=["test_megatron[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.megatron) -def test_match_megatron(run_test_script_for_all_models, model_testing_config): +def test_match_megatron(run_test_script_for_all_models, model_testing_config, compare_results_for_all_models): assert model_testing_config.megatron_args is not None ignore_tensors = [ @@ -25,13 +43,18 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config): if model_testing_config.name == "mixtral": ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) - run_test_script_for_all_models( - [ + distributed_testing_config = DistributedTestingConfig( + name="match_megatron", + compare="megatron", + config_args=[ "model.distributed.training_dtype=fp32", "data.datasets={}", f"data.path={DATASET_PREFIX}", "model.base_model.use_megatron_initialization=True", ], - compare="test_megatron", - config=CompareConfig(ignore_tensors=ignore_tensors), + num_gpus=1, + compare_config=CompareConfig(ignore_tensors=ignore_tensors), ) + + run_test_script_for_all_models(distributed_testing_config) + compare_results_for_all_models(distributed_testing_config) diff --git a/tests/models/test_mb.py b/tests/models/test_mb.py deleted file mode 100644 index 781de6e85..000000000 --- a/tests/models/test_mb.py +++ /dev/null @@ -1,92 +0,0 @@ -import pytest - -from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_df4(run_test_script_for_all_models): - # Depth-first gradient accumulation baseline. - run_test_script_for_all_models(["batch.depth_first_micro_batches=4"]) - - -@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_df4_z3(run_test_script_for_all_models): - # Gradient accumulation with ZeRO-3. - run_test_script_for_all_models( - ["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], - num_gpus=2, - compare="test_model_df4", - config=CompareConfig(ignore_duplicates=["Global gradient"]), - ) - - -@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"], scope="session") -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_bf4(run_test_script_for_all_models): - # Breadth-first gradient accumulation baseline. - run_test_script_for_all_models(["batch.breadth_first_micro_batches=4"], compare="test_model_df4") - - -@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]", "test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_bf2_df2(run_test_script_for_all_models): - # Mixed gradient accumulation baseline. - run_test_script_for_all_models( - ["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], compare="test_model_df4" - ) - - -@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_pp2s2_bf4(run_test_script_for_all_models): - # Pipeline-parallel without tied weights. 
- run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=2", - ], - num_gpus=2, - compare="test_model_df4", - ) - - -@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_pp2s1_bf4(run_test_script_for_all_models): - # Pipeline-parallel with tied weights. - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=1", - ], - num_gpus=2, - compare="test_model_df4", - config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), - ) - - -@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_tp2_pp2s2_bf4(run_test_script_for_all_models): - # Simple 3d parallelism - # TODO: Test fails - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.distributed.tensor_parallel=2", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=1", - ], - num_gpus=8, - compare="test_model_df4", - ) diff --git a/tests/models/test_mb_seq_first.py b/tests/models/test_mb_seq_first.py deleted file mode 100644 index 5a8db0b98..000000000 --- a/tests/models/test_mb_seq_first.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest - -from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_df4_sf(run_test_script_for_all_models): - # Sequence-first gradient accumulation baseline. - run_test_script_for_all_models(["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"]) - - -@pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_sp2_df4(run_test_script_for_all_models): - # Sequence-tensor-parallel with gradient accumulation. - # TODO: Compiled cross-entropy broken for this config - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.base_model.sequence_first=True", - "model.distributed.tensor_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "run.torch_dynamo_enable=False", - ], - num_gpus=4, - compare="test_model_df4_sf", - ) - - -@pytest.mark.skip(reason="Test is broken.") -@pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_sp2_pp2s1(run_test_script_for_all_models): - # 3d-parallel with sequence-tensor-parallel. 
- # TODO: Compiled cross-entropy broken for this config - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.base_model.sequence_first=True", - "model.distributed.tensor_parallel=2", - "model.distributed.pipeline_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "run.torch_dynamo_enable=False", - ], - num_gpus=8, - compare="test_model_df4_sf", - config=CompareConfig(ignore_duplicates=["layers.0.word_embeddings_weight"]), - ) diff --git a/tests/models/test_model.py b/tests/models/test_model.py new file mode 100644 index 000000000..7e853a24a --- /dev/null +++ b/tests/models/test_model.py @@ -0,0 +1,75 @@ +import pytest +import torch + +from tests.utils.distributed_configs import ( + DISTRIBUTED_TESTING_CONFIGS, + SIMPLE_TESTING_CONFIG, + SINGLE_GPU_TESTING_CONFIGS, +) +from tests.utils.model_configs import ModelTestingGroup +from tests.utils.run_test_script import ARTIFACT_PATH +from tests.utils.utils import report_subtest + + +@pytest.mark.model_testing_group(ModelTestingGroup.basic) +def test_model_simple(run_test_script_for_all_models): + # A simple config to prevent unnecessary testing and creation of dependency group + run_test_script_for_all_models(SIMPLE_TESTING_CONFIG) + + +@pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.basic) +# Parametrize with config name so it shows in test name. +@pytest.mark.parametrize("config_name", SINGLE_GPU_TESTING_CONFIGS) +def test_and_compare_model( + run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path +): + # We can expect tests to respect the ordering of `SINGLE_GPU_TESTING_CONFIGS`, so compare should have run already. + config = SINGLE_GPU_TESTING_CONFIGS[config_name] + if config.compare is not None: + for artifact in ["init", "train_1"]: + path = run_test_script_base_path / config.compare / ARTIFACT_PATH / "0" / f"tensor_logs_{artifact}.pt" + if not path.is_file(): + # Dependency likely failed, skipping this test because it will most likely fail for the same reason. + # We still need to fail because we can't confirm the failure. + pytest.fail(f"Compared test {config.compare} failed or did not run ({path} not found).", pytrace=False) + # A baseline config (single-gpu, bf16, flash-attn). + # Also tests for multiple data loaders. + run_test_script_for_all_models(config) + + if config.compare is not None: + compare_results_for_all_models(config) + + +@pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) +def test_run_model_distributed(run_distributed_script, model_testing_config, run_test_script_base_path): + import tests.models.distributed_test_model + + run_distributed_script( + [ + tests.models.distributed_test_model.__file__, + str(run_test_script_base_path), + model_testing_config.name, + ], + num_gpus=torch.cuda.device_count(), + ) + + +# We don't want to depend on `test_model_distributed` because we still want to run this in cas of failure. 
+# This should still run after `test_model_distributed` +@pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) +@pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)[:1]) +def test_model_distributed( + run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path +): + config = DISTRIBUTED_TESTING_CONFIGS[config_name] + report_subtest(run_test_script_base_path / config.name, config.num_gpus) + if config.compare is not None: + for artifact in ["init", "train_1"]: + if not ( + run_test_script_base_path / config.compare / ARTIFACT_PATH / f"tensor_logs_{artifact}.pt" + ).is_file(): + pytest.fail(f"Compared test {config.compare} failed or did not run.", pytrace=False) + compare_results_for_all_models(config) diff --git a/tests/models/test_ms.py b/tests/models/test_ms.py deleted file mode 100644 index b97f84e5d..000000000 --- a/tests/models/test_ms.py +++ /dev/null @@ -1,45 +0,0 @@ -import pytest - -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_ms256(run_test_script_for_all_models): - # Micro-sequence baseline - run_test_script_for_all_models(["batch.micro_sequence_length=256"]) - - -@pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_pp2s2_ms256(run_test_script_for_all_models): - # Sequence-pipeline-parallel - run_test_script_for_all_models( - [ - "batch.micro_sequence_length=256", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=2", - ], - num_gpus=2, - compare="test_model_ms256", - ) - - -@pytest.mark.skip -@pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script_for_all_models): - # TODO: Handle this case. - # Sequence-3d-parallel - run_test_script_for_all_models( - [ - "batch.micro_sequence_length=256", - "model.distributed.pipeline_parallel=2", - "model.distributed.tensor_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "model.distributed.sequence_data_parallel=2", - "model.multi_stage.layers_per_stage=2", - ], - num_gpus=8, - compare="test_model_ms256", - ) diff --git a/tests/models/test_seq_first.py b/tests/models/test_seq_first.py deleted file mode 100644 index 66b044df3..000000000 --- a/tests/models/test_seq_first.py +++ /dev/null @@ -1,48 +0,0 @@ -import pytest - -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_sf(run_test_script_for_all_models): - # Sequence-first baseline. - run_test_script_for_all_models(["model.base_model.sequence_first=True"]) - - -@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_sp2(run_test_script_for_all_models): - # Sequence-tensor-parallel. 
- run_test_script_for_all_models( - ["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], - num_gpus=2, - compare="test_model_sf", - ) - - -@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_sdp2(run_test_script_for_all_models): - # Sequence-data-parallel - run_test_script_for_all_models( - ["model.distributed.sequence_data_parallel=2"], - num_gpus=2, - compare="test_model_sf", - ) - - -@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_sp2_ce4(run_test_script_for_all_models): - # Sequence-tensor-parallel with cross-entropy splits. - run_test_script_for_all_models( - [ - "model.distributed.tensor_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "model.base_model.parallel_embeddings=False", - "model.base_model.cross_entropy_splits=4", - ], - num_gpus=2, - compare="test_model_sf", - ) diff --git a/tests/models/test_simple.py b/tests/models/test_simple.py deleted file mode 100644 index 4616942c6..000000000 --- a/tests/models/test_simple.py +++ /dev/null @@ -1,99 +0,0 @@ -import pytest - -from tests.utils.model_configs import ModelTestingGroup - - -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_safe(run_test_script_for_all_models): - # The safest possible config, identical to the one in test_match_megatron except for the initialization. - run_test_script_for_all_models( - [ - "model.distributed.training_dtype=fp32", - "run.torch_dynamo_enable=False", - "schedule.data_overlap=False", - "model.base_model.transformer.dropless_moe=False", - ], - ) - - -@pytest.mark.depends_on(on=["test_model_safe[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model(run_test_script_for_all_models): - # A baseline config (single-gpu, bf16, flash-attn). - # Also tests for multiple data loaders. - run_test_script_for_all_models(["training.num_workers=2"], compare="test_model_safe") - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2(run_test_script_for_all_models): - # Simple data-parallel. - run_test_script_for_all_models([], num_gpus=2, compare="test_model") - - -@pytest.mark.skip(reason="Flaky") -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_timeout(run_test_script_for_all_models): - # Test sampling timeout - # TODO: Find a better way to test this - run_test_script_for_all_models( - [ - # Use a short timeout - "model.distributed.timeout=4", - # Make a dataset that would timeout under the distributed timeout - 'data.datasets.training={"type":"test_slow"}', - "data.datasets.training.type=test_slow", - "data.datasets.training.sleep=6", - # Use a bigger timeout for the dataset. - "training.timeout=10", - # Remove testing clutter. - "model.multi_stage.debug_param_init=0", - "model.multi_stage.debug_layer_outputs=0", - "model.multi_stage.debug_layer_gradients=0", - "model.multi_stage.debug_all_param_gradients=0", - ], - num_gpus=2, - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_tp2(run_test_script_for_all_models): - # Simple tensor-parallel. 
- run_test_script_for_all_models( - ["model.distributed.tensor_parallel=2"], - num_gpus=2, - compare="test_model", - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_ce4(run_test_script_for_all_models): - # Cross-entropy splits. - run_test_script_for_all_models( - ["model.base_model.cross_entropy_splits=4"], - compare="test_model", - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_z2(run_test_script_for_all_models): - # Data-parallel with zero stage 2. - run_test_script_for_all_models( - ["model.multi_stage.zero_stage=2"], - num_gpus=2, - compare="test_model", - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_z3(run_test_script_for_all_models): - # Data-parallel with zero stage 3. - run_test_script_for_all_models( - ["model.multi_stage.zero_stage=3"], - num_gpus=2, - compare="test_model", - ) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py new file mode 100644 index 000000000..8bbd08d51 --- /dev/null +++ b/tests/utils/distributed_configs.py @@ -0,0 +1,304 @@ +import dataclasses +import logging + +from tests.utils.compare_tensor_logs import CompareConfig + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass(kw_only=True) +class DistributedTestingConfig: + name: str + compare: str | None = None + config_args: list[str] + num_gpus: int = 1 + compare_config: CompareConfig | None = None + + +# Baseline (also tests data-parallel workers) +SIMPLE_TESTING_CONFIG = DistributedTestingConfig( + name="simple", + compare=None, + config_args=["training.num_workers=2"], + num_gpus=1, +) + +_SINGLE_GPU_TESTING_CONFIGS = [ + # Sequence-first baseline + DistributedTestingConfig( + name="sf", + compare=None, + config_args=["model.base_model.sequence_first=True"], + num_gpus=1, + ), + # Cross-entropy splits. + DistributedTestingConfig( + name="ce4", + compare=None, + config_args=["model.base_model.cross_entropy_splits=4"], + num_gpus=1, + ), + # Micro-sequence baseline + DistributedTestingConfig( + name="ms", + compare=None, + config_args=["batch.micro_sequence_length=256"], + num_gpus=1, + ), + # Gradient accumulation baseline. + DistributedTestingConfig( + name="df4", + compare=None, + config_args=["batch.depth_first_micro_batches=4"], + num_gpus=1, + ), + # Breadth-first gradient accumulation. + DistributedTestingConfig( + name="bf4", + compare="df4", + config_args=["batch.breadth_first_micro_batches=4"], + num_gpus=1, + ), + # Mixed gradient accumulation. + DistributedTestingConfig( + name="bf2_df2", + compare="df4", + config_args=["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], + num_gpus=1, + ), + # Sequence-first gradient accumulation baseline. 
+ DistributedTestingConfig( + name="df4_sf", + compare=None, + config_args=[], + num_gpus=1, + ), +] + +SINGLE_GPU_TESTING_CONFIGS = {config.name: config for config in _SINGLE_GPU_TESTING_CONFIGS} + + +_DISTRIBUTED_TESTING_CONFIGS = [ + # ===== Data-parallel configs + # Simple + DistributedTestingConfig( + name="dp2", + compare="simple", + config_args=[], + num_gpus=2, + ), + # Zero stage 2 + DistributedTestingConfig( + name="dp2_z2", + compare="simple", + config_args=["model.multi_stage.zero_stage=2"], + num_gpus=2, + ), + # Zero stage 3 + DistributedTestingConfig( + name="dp2_z3", + compare="simple", + config_args=["model.multi_stage.zero_stage=3"], + num_gpus=2, + ), + # Depth-first micro-batches + DistributedTestingConfig( + name="dp2_df4_z3", + compare="df4", + config_args=["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], + num_gpus=2, + ), + # Sequence-data-parallel + DistributedTestingConfig( + name="sdp2", + compare="sf", + config_args=["model.distributed.sequence_data_parallel=2"], + num_gpus=2, + ), + # ===== Tensor-parallel configs + # Simple tensor-parallel + DistributedTestingConfig( + name="tp2", + compare="simple", + config_args=["model.distributed.tensor_parallel=2"], + num_gpus=2, + ), + # Simple sequence-tensor-parallel + DistributedTestingConfig( + name="stp2", + compare="simple", + config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], + num_gpus=2, + ), + # Cross-entropy splits + DistributedTestingConfig( + name="sp2_ce4", + compare="sf", + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.base_model.parallel_embeddings=False", + "model.base_model.cross_entropy_splits=4", + ], + num_gpus=2, + ), + # ===== 2d configs (Data + Tensor) + # Simple + DistributedTestingConfig( + name="dp2_sp2", + compare="sf", + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + ], + num_gpus=4, + ), + # Depth-first micro-batches, tensor-parallel + DistributedTestingConfig( + name="tp2_df4", + compare="df4", + config_args=[ + "batch.depth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + ], + num_gpus=4, + ), + # Breadth-first micro-batches + DistributedTestingConfig( + name="sdp2_sp2_bf4", + compare="df4_sf", + config_args=[ + "model.distributed.sequence_data_parallel=2", + "batch.breadth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + ], + num_gpus=4, + ), + # Sequence-data-parallel + DistributedTestingConfig( + name="sdp2_sp2", + compare="sf", + config_args=["model.distributed.tensor_parallel=2"], + num_gpus=4, + ), + # ===== Pipeline-parallel configs + # Simple [mb] + DistributedTestingConfig( + name="pp2s2_bf4", + compare="df4", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=2, + ), + # Tied weights on different ranks + DistributedTestingConfig( + name="pp2s1_bf4", + compare="df4", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=1", + ], + num_gpus=2, + compare_config=CompareConfig( + ignore_duplicates=[ + "layers.0.word_embeddings_weight", + "layers.0.position_embeddings_weight", + ] + ), + ), + # Micro-sequence [ms] + DistributedTestingConfig( + name="pp2s2_ms", + compare="ms", + config_args=[ + 
"batch.micro_sequence_length=256", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=2, + ), + # ===== Data + Pipeline + # Simple + DistributedTestingConfig( + name="dp2_pp2s2", + compare="df4", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=4, + ), + # ===== Tensor + Pipeline + # Simple [sf, mb] + DistributedTestingConfig( + name="sp2_pp2s1", + compare="df4_sf", + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=4, + compare_config=CompareConfig( + ignore_duplicates=[ + "layers.0.word_embeddings_weight", + "layers.0.position_embeddings_weight", + ] + ), + ), + # ===== Data + Tensor + Pipeline + # Simple + DistributedTestingConfig( + name="dp2_stp2_pp2s2", + compare="mb", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=8, + ), + # Tied weights on different ranks + DistributedTestingConfig( + name="dp2_tp2_pp2s1_bf4", + compare="mb", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=1", + ], + num_gpus=8, + compare_config=CompareConfig( + ignore_duplicates=[ + "layers.0.word_embeddings_weight", + "layers.0.position_embeddings_weight", + ] + ), + ), + # Micro-sequence + DistributedTestingConfig( + name="dp2s2_stp2_pp2s2_ms256", + compare="ms", + config_args=[ + "batch.micro_sequence_length=256", + "model.distributed.pipeline_parallel=2", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.distributed.sequence_data_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=8, + ), +] + +DISTRIBUTED_TESTING_CONFIGS = {config.name: config for config in _DISTRIBUTED_TESTING_CONFIGS} diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index ab08ad734..6c0b561dd 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -7,13 +7,12 @@ import typing import pytest -import torch -from fast_llm.engine.config_utils.runnable import RunnableConfig from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.utils import Assert -from tests.utils.compare_tensor_logs import CompareConfig, compare_tensor_logs +from tests.utils.compare_tensor_logs import compare_tensor_logs from tests.utils.dataset import get_test_dataset +from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig if typing.TYPE_CHECKING: @@ -22,7 +21,7 @@ # FIXME: figure out correct import of megatron modules without this hack sys.path.append(os.getcwd()) -_ARTIFACT_PATH = "runs/0/artifacts" +ARTIFACT_PATH = "runs/0/artifacts" def do_run_distributed_script( @@ -48,96 +47,14 @@ def do_run_distributed_script( raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") -def do_run_test_script( - path: pathlib.Path, - args: list[str], - num_gpus: int = 1, - *, - model_type: str, - is_megatron: bool = False, - compare_path: pathlib.Path | None = None, - config: 
CompareConfig | None = None, - prepare_fn=None, - compare_fn=None, - do_compare: bool = True, - rendezvous_port: int, - torchrun_port: int, -): - is_parallel = DistributedConfig.default_world_size > 1 - if is_parallel: - Assert.eq(num_gpus, DistributedConfig.default_world_size) - local_rank = DistributedConfig.default_rank - - if torch.cuda.device_count() < num_gpus: - pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})") - env = os.environ.copy() - if is_megatron: - assert num_gpus == 1 - # Prevent Megatron from complaining. - env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" - env["NVTE_FLASH_ATTN"] = "0" - else: - env = None - if local_rank == 0 and prepare_fn is not None: - prepare_fn(path, None if compare_path is None else compare_path) - if is_megatron: - args = ["Megatron-LM/pretrain_gpt.py", *args, f"--structured-logs-dir={path}", f"--data-cache-path={path}"] - else: - args = ["--no-python", "fast-llm", "train", model_type, *args, f"run.experiment_dir={path}"] - get_test_dataset() - if (num_gpus == 1 or is_parallel) and not is_megatron: - print(" ".join(args[1:])) - RunnableConfig.parse_and_run(args[2:]) - else: - do_run_distributed_script( - args, rendezvous_port=rendezvous_port, torchrun_port=torchrun_port, num_gpus=num_gpus, env=env - ) - if local_rank == 0 and compare_path is not None and do_compare: - if compare_fn is not None: - compare_fn(path, compare_path) - compare_tensor_logs( - compare_path / _ARTIFACT_PATH, - path / _ARTIFACT_PATH, - config, - ) - - -def do_run_test_script_for_all_models( - extra_args: list[str], - num_gpus: int = 1, - *, - is_megatron: bool = False, - compare: str | None = None, - config: CompareConfig | None = None, - prepare_fn=None, - compare_fn=None, - do_compare: bool = True, - rendezvous_port: int, - torchrun_port: int, - test_name: str, - base_path: pathlib.Path, +@pytest.fixture(scope="session") +def run_distributed_script( + worker_resources: "WorkerResources", + run_test_script_base_path: pathlib.Path, model_testing_config: ModelTestingConfig, ): - do_run_test_script( - base_path / test_name, - (model_testing_config.megatron_args if is_megatron else model_testing_config.config_args) + extra_args, - num_gpus, - model_type=model_testing_config.model_type, - is_megatron=is_megatron, - compare_path=None if compare is None else base_path / compare, - config=config, - prepare_fn=prepare_fn, - compare_fn=compare_fn, - do_compare=do_compare, - rendezvous_port=rendezvous_port, - torchrun_port=torchrun_port, - ) - - -@pytest.fixture(scope="session") -def run_test_script(worker_resources: "WorkerResources"): return functools.partial( - do_run_test_script, + do_run_distributed_script, rendezvous_port=worker_resources.rendezvous_port, torchrun_port=worker_resources.torchrun_port, ) @@ -148,18 +65,34 @@ def run_test_script_base_path(model_testing_config, result_path, request): return result_path / "models" / model_testing_config.name +def do_run_test_script_for_all_models( + distributed_testing_config: DistributedTestingConfig, + model_testing_config: ModelTestingConfig, + base_path: pathlib.Path, +): + Assert.leq(distributed_testing_config.num_gpus, DistributedConfig.default_world_size) + get_test_dataset() + args = [ + "fast-llm", + "train", + model_testing_config.model_type, + *model_testing_config.config_args, + *distributed_testing_config.config_args, + f"model.distributed.world_size={distributed_testing_config.num_gpus}", + f"model.distributed.local_world_size={distributed_testing_config.num_gpus}", + 
f"run.experiment_dir={base_path/distributed_testing_config.name}", + ] + print(" ".join(args)) + model_testing_config.trainer_config_class.parse_and_run(args[3:]) + + @pytest.fixture(scope="function") def run_test_script_for_all_models( - worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, model_testing_config: ModelTestingConfig, - request: pytest.FixtureRequest, ): return functools.partial( do_run_test_script_for_all_models, - rendezvous_port=worker_resources.rendezvous_port, - torchrun_port=worker_resources.torchrun_port, - test_name=request.node.originalname, base_path=run_test_script_base_path, model_testing_config=model_testing_config, ) @@ -174,22 +107,16 @@ def parse_run_distributed_script(args: list[str] | None = None): @pytest.fixture(scope="session") -def run_distributed_script_for_all_models( +def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, - model_testing_config: ModelTestingConfig, - request: pytest.FixtureRequest, ): - def do_run_distributed_script_for_all_models(args: list[str], num_gpus=2, base_path: pathlib.Path | None = None): - do_run_distributed_script( - args - + [ - str(run_test_script_base_path if base_path is None else base_path), - model_testing_config.name, - ], - worker_resources.rendezvous_port, - worker_resources.torchrun_port, - num_gpus, + def do_compare_results_for_all_models(distributed_testing_config: DistributedTestingConfig): + assert distributed_testing_config.compare is not None + compare_tensor_logs( + run_test_script_base_path / distributed_testing_config.compare / ARTIFACT_PATH, + run_test_script_base_path / distributed_testing_config.name / ARTIFACT_PATH, + distributed_testing_config.compare_config, ) - return do_run_distributed_script_for_all_models + return do_compare_results_for_all_models diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 1ea7717f5..0ca596aed 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,13 +1,22 @@ +import logging import pathlib +import sys +import traceback +import typing +import _pytest.capture import pytest import torch from fast_llm.engine.base_model.base_model import BaseModel, Layer +from fast_llm.engine.config_utils.logging import configure_logging from fast_llm.engine.config_utils.tensor_space import TensorSpace from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.multi_stage.config import FastLLMModelConfig, StageConfig from fast_llm.engine.multi_stage.stage import Stage +from fast_llm.utils import header + +logger = logging.getLogger(__name__) requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") @@ -47,3 +56,50 @@ def get_stage(base_model: BaseModel | list[Layer], distributed: Distributed): stage.restore_parameters() stage.reset_gradients() return stage + + +class DistributedSubtestContext: + def __init__(self, path: pathlib.Path, rank: int) -> None: + self._path = path + self._rank = rank + self._capture_manager = _pytest.capture.CaptureManager("fd") + self.success = False + + def __enter__(self) -> typing.Self: + self._capture_manager.start_global_capturing() + # Logging is set to log to the old stdout, so we need to reconfigure. 
+ configure_logging() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + self._capture_manager.suspend_global_capture() + out, err = self._capture_manager.read_global_capture() + self._path.mkdir(parents=True, exist_ok=True) + self._path.joinpath(f"pytest_stdout_{self._rank}").write_text(out) + self._path.joinpath(f"pytest_stderr_{self._rank}").write_text(err) + if exc_type is None: + self.success = True + else: + self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) + return True + finally: + self._capture_manager.stop_global_capturing() + configure_logging() + + +def report_subtest(path: pathlib.Path, world_size: int): + try: + success = bool(int(path.joinpath("pytest_success").read_text())) + except OSError: + success = False + if not success: + for rank in range(world_size): + for fd, file_ in (("stdout", sys.stdout), ("stderr", sys.stdout), ("traceback", sys.stderr)): + print(header(f"{fd} rank {rank}", 80), file=file_) + file_path = path / f"pytest_{fd}_{rank}" + try: + print(file_path.read_text(), file=file_) + except OSError: + print(f"<<< not found {file_path}>>>", file=file_) + raise RuntimeError(f"test {path.name} failed") From 98593d4d7daee70dc69a09ff42defb20e0b76526 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 4 Jul 2025 10:03:19 -0400 Subject: [PATCH 02/14] misc --- fast_llm/engine/config_utils/run.py | 26 ++-- fast_llm/engine/config_utils/runnable.py | 1 + fast_llm/engine/distributed/config.py | 5 - fast_llm/engine/distributed/distributed.py | 9 +- fast_llm/logging.py | 13 +- fast_llm/utils.py | 66 ++++++++++ tests/conftest.py | 34 +---- tests/models/distributed_test_checkpoint.py | 2 +- tests/models/distributed_test_model.py | 40 +++--- tests/models/test_model.py | 60 ++++----- tests/utils/compare_tensor_logs.py | 6 +- tests/utils/distributed_configs.py | 58 +++++---- tests/utils/run_test_script.py | 18 ++- tests/utils/utils.py | 132 ++++++++++++++------ 14 files changed, 289 insertions(+), 181 deletions(-) diff --git a/fast_llm/engine/config_utils/run.py b/fast_llm/engine/config_utils/run.py index 126e0ae8c..f8cfa8c5b 100644 --- a/fast_llm/engine/config_utils/run.py +++ b/fast_llm/engine/config_utils/run.py @@ -10,7 +10,7 @@ from fast_llm.engine.config_utils.logging import TensorLogs, TensorLogsConfig, configure_logging from fast_llm.engine.config_utils.runnable import RunnableConfig from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.utils import Assert, log +from fast_llm.utils import log if typing.TYPE_CHECKING: from fast_llm.engine.distributed.distributed import Distributed @@ -82,12 +82,14 @@ def _show( if is_main_rank(): return super()._show(verbose, log_fn=log_fn, title=title, width=width, fill_char=fill_char) - def configure_logging(self, directory: pathlib.Path | str | None = None) -> None: + def configure_logging( + self, directory: pathlib.Path | str | None = None, distributed: DistributedConfig | None = None + ) -> None: configure_logging( log_timestamps=self.run.log_timestamps, enable_all_loggers=self.run.enable_all_loggers, - rank=DistributedConfig.default_rank, - world_size=DistributedConfig.default_world_size, + rank=DistributedConfig.default_rank if distributed is None else distributed.rank, + world_size=DistributedConfig.default_world_size if distributed is None else distributed.world_size, directory=directory, ) @@ -131,17 +133,13 @@ def __init__( distributed: "Distributed", ): self._config = config.run - self._distributed_config = distributed.config - 
Assert.eq(self._distributed_config.world_size, DistributedConfig.default_world_size) - Assert.eq(self._distributed_config.local_world_size, DistributedConfig.default_local_world_size) - Assert.eq(self._distributed_config.rank, DistributedConfig.default_rank) self._distributed = distributed # TODO: Main rank should contain the last pipeline stage so it calculates loss - self._is_main_rank = self._distributed_config.rank == _MAIN_RANK - self._is_model_parallel_main_rank = self._distributed_config.data_rank == 0 + self._is_main_rank = self._distributed.config.rank == _MAIN_RANK + self._is_model_parallel_main_rank = self._distributed.config.data_rank == 0 self._is_pipeline_parallel_main_rank = ( - self._distributed_config.data_rank == 0 and self._distributed_config.tensor_rank == 0 + self._distributed.config.data_rank == 0 and self._distributed.config.tensor_rank == 0 ) config_dict = config.to_dict() config_dict_verbose = config.to_dict(verbose=FieldVerboseLevel.performance) @@ -160,14 +158,14 @@ def __init__( # Make sure all the workers agree on the run. This also acts as a barrier. self.index = self.broadcast_int(run) run_dir = self._experiment_directory / "runs" / str(self.index) - self._artifact_dir = run_dir / "artifacts" / str(self._distributed_config.rank) + self._artifact_dir = run_dir / "artifacts" / str(self._distributed.config.rank) log_dir = run_dir / "logs" else: self._experiment_directory, self._artifact_dir, log_dir = None, None, None self.index = None - if self._config.structured_logs: - config.configure_logging(log_dir) + # Finalize logging configuration. + config.configure_logging(log_dir) self._experiment_name = self._config.experiment_name or ( "default" if self._experiment_directory is None else self._experiment_directory.name diff --git a/fast_llm/engine/config_utils/runnable.py b/fast_llm/engine/config_utils/runnable.py index bcdebb856..051163084 100644 --- a/fast_llm/engine/config_utils/runnable.py +++ b/fast_llm/engine/config_utils/runnable.py @@ -29,6 +29,7 @@ def parse_and_run(cls, args: list[str] | None = None) -> None: with NoAutoValidate(): config: "RunnableConfig" = cls._from_parsed_args(parsed, unparsed) try: + # Configure logging so validation errors are logged properly. 
config.configure_logging() config.validate() if not parsed.do_run: diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py index 9f006cdb1..6f2e2ab95 100644 --- a/fast_llm/engine/distributed/config.py +++ b/fast_llm/engine/distributed/config.py @@ -387,10 +387,5 @@ def _from_dict( strict: bool = True, flat: bool = False, ) -> typing.Self: - # TODO v0.3: Remove backward compatibility fix - if "sequence_first" in default and strict: - del default["sequence_first"] - if "separate_init_generators" in default and strict: - del default["separate_init_generators"] cls._handle_renamed_field(default, "distributed_timeout", "timeout") return super()._from_dict(default, strict, flat) diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index 977318841..ce9f660f2 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -1,5 +1,6 @@ import datetime import logging +import time import typing import torch @@ -97,10 +98,12 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro return group prefix = ( - f"range_{global_ranks.start}_{global_ranks.start}_{global_ranks.step}" + f"range_{global_ranks.start}_{global_ranks.stop}_{global_ranks.step}" if isinstance(global_ranks, range) else f"ranks_{"_".join(str(rank) for rank in global_ranks)}" ) + logger.info(f"Creating process group {prefix} (rank = {group_rank}, size = {group_size})") + time.sleep(0.1) group = torch.distributed.ProcessGroupNCCL( torch.distributed.PrefixStore(prefix + "/", self.store), @@ -108,6 +111,8 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro group_size, datetime.timedelta(seconds=self._timeout), ) + logger.info(f"Barrier process group {prefix} (rank = {group_rank}, size = {group_size})") + logger.info(f"Done process group {prefix} (rank = {group_rank}, size = {group_size})") self._process_groups[global_ranks] = group return group @@ -225,7 +230,7 @@ def add_group(self, distributed_dim: DistributedDim) -> ProcessGroup | None: """ Add a process group from its definition. 
""" - self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") + # self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") distributed_dim.check_ranks_in_range(0, self._config.world_size) group = self._pool.get_process_group(distributed_dim.global_ranks, distributed_dim.rank) distributed_dim.setup(group) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index f574aa381..4f77be7fa 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -10,7 +10,7 @@ from fast_llm.engine.config_utils.logging import TensorLogs from fast_llm.engine.distributed.config import PhaseType from fast_llm.tensor import TensorMeta -from fast_llm.utils import format_number, log +from fast_llm.utils import format_number, get_and_reset_memory_usage_mib, log if typing.TYPE_CHECKING: from fast_llm.core.distributed import ProcessGroup @@ -329,7 +329,7 @@ def log_generator[ _global_max_reserved = 0 -def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, int] | None = None) -> dict[str, float]: +def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, float] | None = None) -> dict[str, float]: global _global_max_allocated, _global_max_reserved max_allocated = torch.cuda.max_memory_allocated() / 2**20 max_reserved = torch.cuda.max_memory_reserved() / 2**20 @@ -355,12 +355,13 @@ def log_memory_usage[ header: str | None = None, log_fn: type[BaseException] | typing.Callable[[str], T] = logger.info, reset_stats: bool = True, - stats: dict[str, int] | None = None, + report: dict[str, float] | None = None, relative_to: dict[str, int] | None = None, ) -> T: - if stats is None: - stats = get_memory_usage_mib(reset_stats, relative_to) - formatted = _MEMORY_METRIC_FORMAT.format(**stats) + if report is None: + get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) + report = get_memory_usage_mib(reset_stats, relative_to) + formatted = _MEMORY_METRIC_FORMAT.format(**report) if header is not None: formatted = f"{header}: {formatted}" return log(formatted, log_fn=log_fn) diff --git a/fast_llm/utils.py b/fast_llm/utils.py index 7bbdd6979..bd2f8ef7b 100644 --- a/fast_llm/utils.py +++ b/fast_llm/utils.py @@ -1,3 +1,4 @@ +import gc import itertools import logging import math @@ -392,3 +393,68 @@ def enabled(self) -> bool: @property def interrupted(self): return self._interrupted + + +_global_max_allocated = 0 +_global_max_reserved = 0 + + +def get_and_reset_memory_usage_mib( + *, + relative_to: dict[str, int] | None = None, + clear_cache: bool = False, + global_stats: bool = False, + reset_stats: bool = True, + reset_global_stats: bool = False, +) -> dict[str, float]: + global _global_max_allocated, _global_max_reserved + import torch + + if clear_cache: + # Free memory for more accurate reporting, and to reduce OOM risk with lots of workers. + # Cublas workspace can unnecessarily keep 100s of MBs of reserved memory. + torch._C._cuda_clearCublasWorkspaces() + # Lots of tensors tend to stay allocated until the next garbage collection. + # Collect only if the remaining memory is significant enough since it's costly. + if torch.cuda.memory_allocated() > 1e7: + gc.collect() + try: + # Actually free the memory. + torch.cuda.empty_cache() + except RuntimeError: + # Happens if cuda is broken. + return {} + report = { + # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. 
+ "max_memory_reserved": max(torch.cuda.max_memory_reserved() / 2**20, _global_max_reserved), + # Actual memory usage from the test. + "max_memory_allocated": max(torch.cuda.max_memory_allocated() / 2**20, _global_max_allocated), + "memory_reserved": torch.cuda.memory_reserved() / 2**20, + "memory_allocated": torch.cuda.memory_allocated() / 2**20, + } + max_allocated = torch.cuda.max_memory_allocated() / 2**20 + max_reserved = torch.cuda.max_memory_reserved() / 2**20 + if global_stats: + report |= { + "max_memory_reserved": max(max_reserved, _global_max_reserved), + "max_memory_allocated": max(max_allocated, _global_max_allocated), + } + else: + report |= { + "max_allocated": max_allocated, + "max_reserved": max_reserved, + "global_max_reserved": _global_max_reserved, + } + + if relative_to: + report = {key: value - relative_to.get(key, 0) for key, value in report.items()} + if reset_global_stats: + torch.cuda.reset_peak_memory_stats() + _global_max_reserved = 0 + _global_max_allocated = 0 + elif reset_stats: + torch.cuda.reset_peak_memory_stats() + _global_max_allocated = max(max_allocated, _global_max_allocated) + _global_max_reserved = max(max_reserved, _global_max_reserved) + + return report diff --git a/tests/conftest.py b/tests/conftest.py index 4c9161ea6..0eb7826f2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,4 @@ import dataclasses -import gc import json import logging import math @@ -11,6 +10,7 @@ import xdist.scheduler import fast_llm.logging +from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager # Make fixtures available globally without import @@ -33,6 +33,7 @@ def pytest_addoption(parser): group.addoption("--skip-slow", action="store_true") group.addoption("--show-skipped", action="store_true") group.addoption("--show-gpu-memory", type=int, default=10) + group.addoption("--no-distributed-capture", dest="distributed_capture", action="store_false") group.addoption("--models", nargs="*") group.addoption( "--run-extra-slow", @@ -187,37 +188,12 @@ def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): # Measure GPU memory usage. (TODO: This excludes child processes) if call.when == "call" and torch.cuda.is_available(): - # Free memory for more accurate reporting, and to reduce OOM risk with lots of workers. - # Cublas workspace can unnecessarily keep 100s of MBs of reserved memory. - torch._C._cuda_clearCublasWorkspaces() - # Lots of tensors tend to stay allocated until the next garbage collection. - # Collect only if the remaining memory is significant enough since it's costly. - if torch.cuda.memory_allocated() > 1e7: - gc.collect() - try: - # Actually free the memory. - torch.cuda.empty_cache() - except RuntimeError: - # Happens if the test broke cuda. - return + report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) + report["duration"] = call.duration item.add_report_section( call.when, "resource usage", - json.dumps( - { - "duration": call.duration, - # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. - "max_memory_reserved": max( - torch.cuda.max_memory_reserved() / 2**20, fast_llm.logging._global_max_reserved - ), - # Actual memory usage from the test. 
- "max_memory_allocated": max( - torch.cuda.max_memory_allocated() / 2**20, fast_llm.logging._global_max_allocated - ), - "memory_reserved": torch.cuda.memory_reserved() / 2**20, - "memory_allocated": torch.cuda.memory_allocated() / 2**20, - } - ), + json.dumps(report), ) torch.cuda.reset_peak_memory_stats() # Reset global stats for next test. diff --git a/tests/models/distributed_test_checkpoint.py b/tests/models/distributed_test_checkpoint.py index 9e706ebee..05a0bf443 100644 --- a/tests/models/distributed_test_checkpoint.py +++ b/tests/models/distributed_test_checkpoint.py @@ -47,7 +47,7 @@ def _test_load_and_save_parallel( def main(args: list[str] | None = None) -> None: - base_path, model_testing_config = parse_run_distributed_script(args) + base_path, model_testing_config, _ = parse_run_distributed_script(args) with ProcessGroupPool(timeout=20): for pretrained_format, pretrained_path in ( diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py index ad19eeafa..36f13ec2a 100644 --- a/tests/models/distributed_test_model.py +++ b/tests/models/distributed_test_model.py @@ -1,9 +1,7 @@ import logging -import torch - from fast_llm.cli import fast_llm_main_wrapper -from fast_llm.core.distributed import allreduce_scalar, safe_barrier +from fast_llm.core.distributed import safe_barrier from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.engine.distributed.distributed import ProcessGroupPool from tests.utils.distributed_configs import DISTRIBUTED_TESTING_CONFIGS @@ -14,37 +12,29 @@ def main(args: list[str] | None = None) -> None: - base_path, model_testing_config = parse_run_distributed_script(args) + base_path, model_testing_config, do_capture = parse_run_distributed_script(args) + + if do_capture: + logger.warning( + "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." + ) - with ProcessGroupPool(timeout=20) as pool: + # TODO: Why are barriers needed? + with ProcessGroupPool(timeout=60) as pool: failures = [] world_size = DistributedConfig.default_world_size rank = DistributedConfig.default_rank group = pool.get_process_group(range(world_size), rank) for name, config in DISTRIBUTED_TESTING_CONFIGS.items(): - if config.num_gpus > world_size: - logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {config.num_gpus} > {world_size})"})") - if DistributedConfig.default_rank < config.num_gpus: - logger.info(f"Running {name}") - with DistributedSubtestContext(base_path / name, rank) as subtest: + if world_size < config.num_gpus: + logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {world_size} < {config.num_gpus})"})") + continue + with DistributedSubtestContext(base_path, name, group, config.num_gpus, enabled=do_capture) as subtest: + if rank < config.num_gpus: do_run_test_script_for_all_models(config, model_testing_config, base_path) - assert subtest._capture_manager._global_capturing is None - success = subtest.success - else: - # Worker is not needed for this one, skip. - success = True - - # Barrier so `allreduce_scalar` doesn't go crazy in case of desync. 
- safe_barrier(group, name) - success = ( - success if group is None else allreduce_scalar(success, dtype=torch.int64, group=group) == world_size - ) - logger.warning(f"{name} {"PASSED" if success else "FAILED"})") - if not success: + if not subtest.success: failures.append(name) - if rank == 0: - (base_path / name / "pytest_success").write_text(str(int(success))) # Final barrier to ensure everything is done before torchrun potentially kills workers. safe_barrier(group, "testing end") diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 7e853a24a..2aeff95cc 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -1,3 +1,5 @@ +import logging + import pytest import torch @@ -7,14 +9,16 @@ SINGLE_GPU_TESTING_CONFIGS, ) from tests.utils.model_configs import ModelTestingGroup -from tests.utils.run_test_script import ARTIFACT_PATH -from tests.utils.utils import report_subtest +from tests.utils.utils import check_subtest_success, set_subtest_success + +logger = logging.getLogger(__name__) @pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_simple(run_test_script_for_all_models): +def test_model_simple(run_test_script_for_all_models, run_test_script_base_path): # A simple config to prevent unnecessary testing and creation of dependency group run_test_script_for_all_models(SIMPLE_TESTING_CONFIG) + set_subtest_success(run_test_script_base_path / SIMPLE_TESTING_CONFIG.name) @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @@ -27,49 +31,49 @@ def test_and_compare_model( # We can expect tests to respect the ordering of `SINGLE_GPU_TESTING_CONFIGS`, so compare should have run already. config = SINGLE_GPU_TESTING_CONFIGS[config_name] if config.compare is not None: - for artifact in ["init", "train_1"]: - path = run_test_script_base_path / config.compare / ARTIFACT_PATH / "0" / f"tensor_logs_{artifact}.pt" - if not path.is_file(): - # Dependency likely failed, skipping this test because it will most likely fail for the same reason. - # We still need to fail because we can't confirm the failure. - pytest.fail(f"Compared test {config.compare} failed or did not run ({path} not found).", pytrace=False) + check_subtest_success(run_test_script_base_path / config.compare) # A baseline config (single-gpu, bf16, flash-attn). # Also tests for multiple data loaders. run_test_script_for_all_models(config) + set_subtest_success(run_test_script_base_path / config.name) if config.compare is not None: - compare_results_for_all_models(config) + compare_results_for_all_models(config, ("init", "train_1", "train_2")) @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_run_model_distributed(run_distributed_script, model_testing_config, run_test_script_base_path): +@pytest.mark.model_testing_group( + ModelTestingGroup.distributed, +) +def test_run_model_distributed(run_distributed_script, model_testing_config, run_test_script_base_path, request): import tests.models.distributed_test_model - run_distributed_script( - [ - tests.models.distributed_test_model.__file__, - str(run_test_script_base_path), - model_testing_config.name, - ], - num_gpus=torch.cuda.device_count(), - ) + script = [tests.models.distributed_test_model.__file__, str(run_test_script_base_path), model_testing_config.name] + if not request.config.getoption("distributed_capture"): + logger.warning( + "Capturing output and forwarding to associated tests. 
Run with `--no-distributed-capture` to disable." + ) + script.append("--no-capture") + run_distributed_script(script, num_gpus=torch.cuda.device_count()) # We don't want to depend on `test_model_distributed` because we still want to run this in cas of failure. # This should still run after `test_model_distributed` @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.distributed) -@pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)[:1]) +@pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)) def test_model_distributed( - run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path + run_test_script_for_all_models, + compare_results_for_all_models, + config_name, + run_test_script_base_path, + report_subtest, ): config = DISTRIBUTED_TESTING_CONFIGS[config_name] + if torch.cuda.device_count() < config.num_gpus: + pytest.skip(f"Not enough GPUs: {torch.cuda.device_count()} < {config.num_gpus}") report_subtest(run_test_script_base_path / config.name, config.num_gpus) if config.compare is not None: - for artifact in ["init", "train_1"]: - if not ( - run_test_script_base_path / config.compare / ARTIFACT_PATH / f"tensor_logs_{artifact}.pt" - ).is_file(): - pytest.fail(f"Compared test {config.compare} failed or did not run.", pytrace=False) - compare_results_for_all_models(config) + if not check_subtest_success(run_test_script_base_path / config.compare): + pytest.fail(f"Test {config.compare} failed", pytrace=False) + compare_results_for_all_models(config, ("init", "train_1", "train_2")) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index e34fd6007..96acf9658 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -59,7 +59,7 @@ def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: C if tensor_ref["shape"] != tensor_test["shape"]: errors.append( "\n".join( - [f">>>> [{step}] Incompatible shape for tensor {name}: {tensor_ref['shape']}!={tensor_test['shape']}"] + [f">>>> [{step}] Incompatible shape for tensor {name}: {tensor_test['shape']}!={tensor_ref['shape']}"] ) ) return @@ -67,7 +67,7 @@ def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: C errors.append( "\n".join( [ - f">>>> [{step}] Incompatible sampling rate for tensor {name}: {tensor_ref['step']}!={tensor_test['step']}" + f">>>> [{step}] Incompatible sampling rate for tensor {name}: {tensor_test['step']}!={tensor_ref['step']}" ] ) ) @@ -101,8 +101,8 @@ def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: C if tensor_errors: tensor_errors.extend( [ - f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: config.show_samples].tolist()), f" Test samples: " + "".join(f"{x:12.4e}" for x in samples_test[: config.show_samples].tolist()), + f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: config.show_samples].tolist()), ] ) errors.append("\n".join([f">>>> [{step}] Excessive diff for tensor {name}:"] + tensor_errors)) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index 8bbd08d51..c38939eae 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -70,7 +70,7 @@ class DistributedTestingConfig: DistributedTestingConfig( name="df4_sf", compare=None, - config_args=[], + config_args=["batch.depth_first_micro_batches=4", 
"model.base_model.sequence_first=True"], num_gpus=1, ), ] @@ -103,10 +103,15 @@ class DistributedTestingConfig: ), # Depth-first micro-batches DistributedTestingConfig( - name="dp2_df4_z3", + name="dp2_z3_df4", compare="df4", config_args=["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], num_gpus=2, + compare_config=CompareConfig( + ignore_duplicates=[ + "Global gradient", + ] + ), ), # Sequence-data-parallel DistributedTestingConfig( @@ -126,13 +131,13 @@ class DistributedTestingConfig: # Simple sequence-tensor-parallel DistributedTestingConfig( name="stp2", - compare="simple", + compare="sf", config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, ), # Cross-entropy splits DistributedTestingConfig( - name="sp2_ce4", + name="stp2_ce4", compare="sf", config_args=[ "model.distributed.tensor_parallel=2", @@ -145,7 +150,7 @@ class DistributedTestingConfig: # ===== 2d configs (Data + Tensor) # Simple DistributedTestingConfig( - name="dp2_sp2", + name="dp2_stp2", compare="sf", config_args=[ "model.distributed.tensor_parallel=2", @@ -158,28 +163,32 @@ class DistributedTestingConfig: name="tp2_df4", compare="df4", config_args=[ - "batch.depth_first_micro_batches=4", "model.distributed.tensor_parallel=2", + "batch.depth_first_micro_batches=4", ], num_gpus=4, ), # Breadth-first micro-batches DistributedTestingConfig( - name="sdp2_sp2_bf4", + name="sdp2_stp2_bf4", compare="df4_sf", config_args=[ "model.distributed.sequence_data_parallel=2", - "batch.breadth_first_micro_batches=4", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "batch.breadth_first_micro_batches=4", ], num_gpus=4, ), # Sequence-data-parallel DistributedTestingConfig( - name="sdp2_sp2", + name="sdp2_stp2", compare="sf", - config_args=["model.distributed.tensor_parallel=2"], + config_args=[ + "model.distributed.sequence_data_parallel=2", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + ], num_gpus=4, ), # ===== Pipeline-parallel configs @@ -188,9 +197,9 @@ class DistributedTestingConfig: name="pp2s2_bf4", compare="df4", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=2, ), @@ -199,9 +208,9 @@ class DistributedTestingConfig: name="pp2s1_bf4", compare="df4", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=1", + "batch.breadth_first_micro_batches=4", ], num_gpus=2, compare_config=CompareConfig( @@ -216,34 +225,35 @@ class DistributedTestingConfig: name="pp2s2_ms", compare="ms", config_args=[ - "batch.micro_sequence_length=256", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.micro_sequence_length=256", ], num_gpus=2, ), - # ===== Data + Pipeline + # ===== 2d configs (Data + Pipeline) # Simple DistributedTestingConfig( - name="dp2_pp2s2", + name="dp2_pp2s2_bf4", compare="df4", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=4, ), - # ===== Tensor + Pipeline + # ===== 2d configs (Tensor + Pipeline) # Simple [sf, mb] DistributedTestingConfig( - name="sp2_pp2s1", + name="stp2_pp2s1_bf4", compare="df4_sf", config_args=[ "model.distributed.tensor_parallel=2", 
"model.distributed.sequence_tensor_parallel=True", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=4, compare_config=CompareConfig( @@ -259,10 +269,10 @@ class DistributedTestingConfig: name="dp2_stp2_pp2s2", compare="mb", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.tensor_parallel=2", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=8, ), @@ -271,11 +281,11 @@ class DistributedTestingConfig: name="dp2_tp2_pp2s1_bf4", compare="mb", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=1", + "batch.breadth_first_micro_batches=4", ], num_gpus=8, compare_config=CompareConfig( @@ -287,15 +297,15 @@ class DistributedTestingConfig: ), # Micro-sequence DistributedTestingConfig( - name="dp2s2_stp2_pp2s2_ms256", + name="sdp2_stp2_pp2s2_ms", compare="ms", config_args=[ - "batch.micro_sequence_length=256", - "model.distributed.pipeline_parallel=2", + "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", - "model.distributed.sequence_data_parallel=2", + "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.micro_sequence_length=256", ], num_gpus=8, ), diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 6c0b561dd..61bc75074 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -102,8 +102,10 @@ def parse_run_distributed_script(args: list[str] | None = None): parser = argparse.ArgumentParser() parser.add_argument("base_path", type=pathlib.Path) parser.add_argument("model_testing_config", type=str) + parser.add_argument("--no-distributed-capture", dest="distributed_capture", action="store_false") + parsed = parser.parse_args(args) - return parsed.base_path, MODEL_CONFIGS[parsed.model_testing_config] + return parsed.base_path, MODEL_CONFIGS[parsed.model_testing_config], parsed.distributed_capture @pytest.fixture(scope="session") @@ -111,12 +113,16 @@ def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, ): - def do_compare_results_for_all_models(distributed_testing_config: DistributedTestingConfig): - assert distributed_testing_config.compare is not None + def do_compare_results_for_all_models(config: DistributedTestingConfig, artifacts: typing.Iterable[str]): + assert config.compare is not None + compare_path = run_test_script_base_path / config.compare / ARTIFACT_PATH + for artifact in artifacts: + if not (artifact_path := compare_path / "0" / f"tensor_logs_{artifact}.pt").is_file(): + pytest.fail(f"Missing artifact {artifact_path} from {config.compare}.", pytrace=False) compare_tensor_logs( - run_test_script_base_path / distributed_testing_config.compare / ARTIFACT_PATH, - run_test_script_base_path / distributed_testing_config.name / ARTIFACT_PATH, - distributed_testing_config.compare_config, + compare_path, + run_test_script_base_path / config.name / ARTIFACT_PATH, + config.compare_config, ) return do_compare_results_for_all_models diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 0ca596aed..49151cbe8 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,20 +1,21 @@ import 
logging import pathlib import sys +import time import traceback import typing -import _pytest.capture import pytest import torch +from fast_llm.core.distributed import ProcessGroup, allreduce_scalar, safe_barrier from fast_llm.engine.base_model.base_model import BaseModel, Layer from fast_llm.engine.config_utils.logging import configure_logging from fast_llm.engine.config_utils.tensor_space import TensorSpace from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.multi_stage.config import FastLLMModelConfig, StageConfig from fast_llm.engine.multi_stage.stage import Stage -from fast_llm.utils import header +from fast_llm.utils import get_and_reset_memory_usage_mib, header logger = logging.getLogger(__name__) @@ -59,47 +60,102 @@ def get_stage(base_model: BaseModel | list[Layer], distributed: Distributed): class DistributedSubtestContext: - def __init__(self, path: pathlib.Path, rank: int) -> None: - self._path = path - self._rank = rank - self._capture_manager = _pytest.capture.CaptureManager("fd") + def __init__( + self, base_path: pathlib.Path, name: str, group: ProcessGroup | None, num_gpus: int, enabled: bool = True + ) -> None: + self._path = base_path / name + self._name = name + self._group = group + self._rank = 0 if group is None else group.rank() + self._rank_enabled = self._rank < num_gpus + self._enabled = enabled and self._rank_enabled self.success = False def __enter__(self) -> typing.Self: - self._capture_manager.start_global_capturing() - # Logging is set to log to the old stdout, so we need to reconfigure. - configure_logging() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - try: - self._capture_manager.suspend_global_capture() - out, err = self._capture_manager.read_global_capture() + if self._enabled: + self._sys_stdout = sys.stdout + self._sys_stderr = sys.stderr self._path.mkdir(parents=True, exist_ok=True) - self._path.joinpath(f"pytest_stdout_{self._rank}").write_text(out) - self._path.joinpath(f"pytest_stderr_{self._rank}").write_text(err) - if exc_type is None: - self.success = True - else: - self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) - return True - finally: - self._capture_manager.stop_global_capturing() + sys.stdout = self._path.joinpath(f"pytest_stdout_{self._rank}").open("w") + sys.stderr = self._path.joinpath(f"pytest_stderr_{self._rank}").open("w") + # Logging is set to log to the old stdout, so we need to reconfigure. configure_logging() + self._start = time.perf_counter() + return self - -def report_subtest(path: pathlib.Path, world_size: int): + def __exit__(self, exc_type, exc_val, exc_tb): + if self._enabled: + try: + stdout_handle = sys.stdout + stderr_handle = sys.stderr + sys.stdout = self._sys_stdout + sys.stderr = self._sys_stderr + stdout_handle.close() + stderr_handle.close() + finally: + configure_logging() + + if exc_type is None: + self.success = True + else: + self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) + + if self._group is not None: + # Barrier so `allreduce_scalar` doesn't go crazy in case of desync. + safe_barrier(self._group, self._name) + self.success = allreduce_scalar(self.success, dtype=torch.int64, group=self._group) == self._group.size() + + if self._rank_enabled: + # Free resources to limit memory usage. 
+ report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) + report["duration"] = time.perf_counter() - self._start + + self._path.joinpath(f"pytest_report_{self._rank}").write_text(traceback.format_exc()) + + logger.warning(f"{self._name} {"PASSED" if self.success else "FAILED"})") + if self._rank == 0: + set_subtest_success(self._path, self.success) + + return True + + +def set_subtest_success(path: pathlib.Path, success: bool = True): + path.joinpath("pytest_success").write_text(str(int(success))) + + +def check_subtest_success(path: pathlib, fail: bool = True) -> bool: + if not path.is_dir(): + if fail: + pytest.fail(f"Test {path.name} did not run", pytrace=False) + else: + return False try: - success = bool(int(path.joinpath("pytest_success").read_text())) + return bool(int(path.joinpath("pytest_success").read_text())) except OSError: - success = False - if not success: - for rank in range(world_size): - for fd, file_ in (("stdout", sys.stdout), ("stderr", sys.stdout), ("traceback", sys.stderr)): - print(header(f"{fd} rank {rank}", 80), file=file_) - file_path = path / f"pytest_{fd}_{rank}" - try: - print(file_path.read_text(), file=file_) - except OSError: - print(f"<<< not found {file_path}>>>", file=file_) - raise RuntimeError(f"test {path.name} failed") + return False + + +@pytest.fixture(scope="session") +def report_subtest(request): + verbose = request.config.getoption("verbose") + do_capture = request.config.getoption("distributed_capture") + + def do_report_subtest(path: pathlib.Path, world_size: int) -> None: + success = check_subtest_success(path) + if not do_capture: + logger.warning("Distributed capture is disabled. See distributed test for run output.") + elif verbose > 1 or not success: + for rank in range(world_size): + for fd, file_ in (("stdout", sys.stdout), ("stderr", sys.stdout), ("traceback", sys.stderr)): + print(header(f"{fd} rank {rank}", 80), file=file_) + file_path = path / f"pytest_{fd}_{rank}" + try: + print(file_path.read_text(), file=file_) + except OSError: + print(f"<<< not found {file_path}>>>", file=file_) + else: + print("Set verbose > 1 to show run output.") + if not success: + raise RuntimeError(f"test {path.name} failed") + + return do_report_subtest From 718a09a6eb7a141dd8822303ad70307aaaddfa1d Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 4 Jul 2025 11:12:29 -0400 Subject: [PATCH 03/14] fixes --- fast_llm/engine/distributed/distributed.py | 7 +------ fast_llm/engine/evaluation/evaluator.py | 5 +++-- fast_llm/engine/training/trainer.py | 6 +++--- fast_llm/logging.py | 21 --------------------- 4 files changed, 7 insertions(+), 32 deletions(-) diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index ce9f660f2..f53f25afc 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -1,6 +1,5 @@ import datetime import logging -import time import typing import torch @@ -102,8 +101,6 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro if isinstance(global_ranks, range) else f"ranks_{"_".join(str(rank) for rank in global_ranks)}" ) - logger.info(f"Creating process group {prefix} (rank = {group_rank}, size = {group_size})") - time.sleep(0.1) group = torch.distributed.ProcessGroupNCCL( torch.distributed.PrefixStore(prefix + "/", self.store), @@ -111,8 +108,6 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro group_size, 
datetime.timedelta(seconds=self._timeout), ) - logger.info(f"Barrier process group {prefix} (rank = {group_rank}, size = {group_size})") - logger.info(f"Done process group {prefix} (rank = {group_rank}, size = {group_size})") self._process_groups[global_ranks] = group return group @@ -230,7 +225,7 @@ def add_group(self, distributed_dim: DistributedDim) -> ProcessGroup | None: """ Add a process group from its definition. """ - # self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") + self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") distributed_dim.check_ranks_in_range(0, self._config.world_size) group = self._pool.get_process_group(distributed_dim.global_ranks, distributed_dim.rank) distributed_dim.setup(group) diff --git a/fast_llm/engine/evaluation/evaluator.py b/fast_llm/engine/evaluation/evaluator.py index 78aad230f..3fee32baf 100644 --- a/fast_llm/engine/evaluation/evaluator.py +++ b/fast_llm/engine/evaluation/evaluator.py @@ -17,7 +17,8 @@ from fast_llm.engine.schedule.schedule import Schedule from fast_llm.engine.training.config import WandbConfig from fast_llm.engine.training.wandb import Wandb -from fast_llm.logging import format_metrics, get_memory_usage_mib +from fast_llm.logging import format_metrics +from fast_llm.utils import get_and_reset_memory_usage_mib # from fast_llm.engine.training.lm_eval.evaluator import simple_evaluate as lm_eval_simple_evaluate @@ -226,7 +227,7 @@ def _evaluate_loss( / self._schedule._distributed.world_size / time_per_iteration ), - **get_memory_usage_mib(), + **get_and_reset_memory_usage_mib(), } return metrics diff --git a/fast_llm/engine/training/trainer.py b/fast_llm/engine/training/trainer.py index 766398d01..64408bb06 100644 --- a/fast_llm/engine/training/trainer.py +++ b/fast_llm/engine/training/trainer.py @@ -36,8 +36,8 @@ TrainingEvaluatorConfig, ) from fast_llm.engine.training.wandb import Wandb -from fast_llm.logging import format_metrics, get_memory_usage_mib, log_memory_usage -from fast_llm.utils import Assert, Interrupter +from fast_llm.logging import format_metrics, log_memory_usage +from fast_llm.utils import Assert, Interrupter, get_and_reset_memory_usage_mib logger = logging.getLogger(__name__) @@ -422,7 +422,7 @@ def _train(self) -> tuple[bool, dict[PhaseType, dict[str, typing.Any]]]: ), "run": self._run.index, **train_metrics, - **get_memory_usage_mib(), + **get_and_reset_memory_usage_mib(), } formatted_metrics = format_metrics(metrics[metrics_key], self._loss_defs, PhaseType.training) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index 4f77be7fa..41fd4d99b 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -329,26 +329,6 @@ def log_generator[ _global_max_reserved = 0 -def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, float] | None = None) -> dict[str, float]: - global _global_max_allocated, _global_max_reserved - max_allocated = torch.cuda.max_memory_allocated() / 2**20 - max_reserved = torch.cuda.max_memory_reserved() / 2**20 - _global_max_allocated = max(max_allocated, _global_max_allocated) - _global_max_reserved = max(max_reserved, _global_max_reserved) - out = { - "allocated": torch.cuda.memory_allocated() / 2**20, - "max_allocated": max_allocated, - "reserved": torch.cuda.memory_reserved() / 2**20, - "max_reserved": max_reserved, - "global_max_reserved": _global_max_reserved, - } - if relative_to: - out = {key: value - relative_to.get(key, 0) for key, value in out.items()} 
- if reset_stats: - torch.cuda.reset_peak_memory_stats() - return out - - def log_memory_usage[ T ]( @@ -360,7 +340,6 @@ def log_memory_usage[ ) -> T: if report is None: get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) - report = get_memory_usage_mib(reset_stats, relative_to) formatted = _MEMORY_METRIC_FORMAT.format(**report) if header is not None: formatted = f"{header}: {formatted}" From 1625c58a4407239f081b5556a5f1d4fc866cd4f0 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 4 Jul 2025 12:00:02 -0400 Subject: [PATCH 04/14] misc --- fast_llm/logging.py | 2 +- fast_llm/utils.py | 12 ++++-------- tests/conftest.py | 25 ++++++++++--------------- tests/utils/utils.py | 39 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 51 insertions(+), 27 deletions(-) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index 41fd4d99b..385a8b960 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -339,7 +339,7 @@ def log_memory_usage[ relative_to: dict[str, int] | None = None, ) -> T: if report is None: - get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) + report = get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) formatted = _MEMORY_METRIC_FORMAT.format(**report) if header is not None: formatted = f"{header}: {formatted}" diff --git a/fast_llm/utils.py b/fast_llm/utils.py index bd2f8ef7b..821ec5874 100644 --- a/fast_llm/utils.py +++ b/fast_llm/utils.py @@ -425,19 +425,15 @@ def get_and_reset_memory_usage_mib( # Happens if cuda is broken. return {} report = { - # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. - "max_memory_reserved": max(torch.cuda.max_memory_reserved() / 2**20, _global_max_reserved), - # Actual memory usage from the test. 
- "max_memory_allocated": max(torch.cuda.max_memory_allocated() / 2**20, _global_max_allocated), - "memory_reserved": torch.cuda.memory_reserved() / 2**20, - "memory_allocated": torch.cuda.memory_allocated() / 2**20, + "reserved": torch.cuda.memory_reserved() / 2**20, + "allocated": torch.cuda.memory_allocated() / 2**20, } max_allocated = torch.cuda.max_memory_allocated() / 2**20 max_reserved = torch.cuda.max_memory_reserved() / 2**20 if global_stats: report |= { - "max_memory_reserved": max(max_reserved, _global_max_reserved), - "max_memory_allocated": max(max_allocated, _global_max_allocated), + "max_reserved": max(max_reserved, _global_max_reserved), + "max_allocated": max(max_allocated, _global_max_allocated), } else: report |= { diff --git a/tests/conftest.py b/tests/conftest.py index 0eb7826f2..ef6fff695 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,6 @@ import torch import xdist.scheduler -import fast_llm.logging from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager @@ -22,7 +21,7 @@ ) from tests.utils.model_configs import model_testing_config, ModelTestingConfig, testing_group_enabled # isort: skip -from tests.utils.utils import result_path, TEST_RESULTS_PATH # isort: skip +from tests.utils.utils import result_path, TEST_RESULTS_PATH, format_resource_report, report_subtest # isort: skip manager: DependencyManager | None = None @@ -190,15 +189,18 @@ def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): if call.when == "call" and torch.cuda.is_available(): report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) report["duration"] = call.duration + if hasattr(item, "fast_llm_resource_report"): + report_ = getattr(item, "fast_llm_resource_report") + report = { + key: max(report[key] for report in (report, report_) if key in report) + for key in set(report_) | set(report) + } + item.add_report_section( call.when, "resource usage", json.dumps(report), ) - torch.cuda.reset_peak_memory_stats() - # Reset global stats for next test. 
- fast_llm.logging._global_max_reserved = 0 - fast_llm.logging._global_max_allocated = 0 @pytest.hookimpl @@ -218,18 +220,11 @@ def pytest_terminal_summary(terminalreporter): terminalreporter.write_sep("=", "Highest gpu memory usage", bold=True) sorted_nodeids = sorted( resource_reports.keys(), - key=lambda nodeid: resource_reports[nodeid]["max_memory_reserved"], + key=lambda nodeid: resource_reports[nodeid]["max_reserved"], reverse=True, ) for nodeid in sorted_nodeids[: terminalreporter.config.getoption("--show-gpu-memory")]: - terminalreporter.write_line( - f"{nodeid}:\n " - f"Max Reserved {resource_reports[nodeid]["max_memory_reserved"]:.0f} MiB | " - f"Max Allocated {resource_reports[nodeid]["max_memory_allocated"]:.0f} MiB | " - f"End Reserved {resource_reports[nodeid]["memory_reserved"]:.0f} MiB | " - f"End Allocated {resource_reports[nodeid]["memory_allocated"]:.0f} MiB | " - f"Duration {resource_reports[nodeid]["duration"]:.2f}" - ) + terminalreporter.write_line(format_resource_report(nodeid, resource_reports[nodeid])) def pytest_runtest_call(item: pytest.Function): diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 49151cbe8..600b4aecb 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,4 +1,6 @@ +import json import logging +import math import pathlib import sys import time @@ -110,7 +112,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) report["duration"] = time.perf_counter() - self._start - self._path.joinpath(f"pytest_report_{self._rank}").write_text(traceback.format_exc()) + json.dump(report, self._path.joinpath(f"pytest_report_{self._rank}").open("w")) logger.warning(f"{self._name} {"PASSED" if self.success else "FAILED"})") if self._rank == 0: @@ -135,8 +137,22 @@ def check_subtest_success(path: pathlib, fail: bool = True) -> bool: return False -@pytest.fixture(scope="session") -def report_subtest(request): +def format_resource_report(title: str, report: dict[str, float]) -> str: + return "".join( + [ + f"{title}:\n ", + f"Max Reserved: {report.get("max_reserved", math.nan):.0f} MiB", + f"| Max Allocated: {report.get("max_allocated", math.nan):.0f} MiB".ljust(26), + f"| End Reserved: {report.get("reserved", math.nan):.0f} MiB".ljust(25), + f"| End Allocated: {report.get("allocated", math.nan):.0f} MiB".ljust(26), + f"| Duration: {report.get("duration", math.nan):.2f}".ljust(18), + f"| GPUs: {report["gpus"]:.0f}" if "gpus" in report else "", + ] + ) + + +@pytest.fixture(scope="function") +def report_subtest(request: pytest.FixtureRequest): verbose = request.config.getoption("verbose") do_capture = request.config.getoption("distributed_capture") @@ -155,6 +171,23 @@ def do_report_subtest(path: pathlib.Path, world_size: int) -> None: print(f"<<< not found {file_path}>>>", file=file_) else: print("Set verbose > 1 to show run output.") + + reports = {} + for rank in range(world_size): + try: + reports[f"rank_{rank}"] = json.load(path.joinpath(f"pytest_report_{rank}").open("r")) + except OSError: + reports[rank] = {} + keys = {key for report in reports.values() for key in report} + report = {key: max(report[key] for report in reports.values() if key in report) for key in keys} + report["gpus"] = world_size + reports["global"] = report + + print(header(f"Resource usage", 80), file=sys.stderr) + for name, report in reports.items(): + print(format_resource_report(name, report), file=sys.stderr) + setattr(request.node, "fast_llm_resource_report", report) 
+ if not success: raise RuntimeError(f"test {path.name} failed") From 808feccf89d19ac61c99bd3da23228343263d90c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 7 Jul 2025 19:02:03 -0400 Subject: [PATCH 05/14] misc --- tests/conftest.py | 6 + tests/models/distributed_test_checkpoint.py | 141 ++++++----------- tests/models/distributed_test_model.py | 2 + tests/models/test_checkpoint.py | 165 +++++++++----------- tests/models/test_model.py | 5 +- tests/utils/compare_tensor_logs.py | 15 +- tests/utils/run_test_script.py | 13 +- tests/utils/save_load_configs.py | 143 +++++++++++++++++ tests/utils/utils.py | 1 + 9 files changed, 290 insertions(+), 201 deletions(-) create mode 100644 tests/utils/save_load_configs.py diff --git a/tests/conftest.py b/tests/conftest.py index ef6fff695..960f0c7a4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,12 @@ from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager +from tests.utils.save_load_configs import ( # isort: skip + distributed_save_load_config, + distributed_save_load_config_non_pp, + get_convert_path, +) + # Make fixtures available globally without import from tests.utils.run_test_script import ( # isort: skip compare_results_for_all_models, diff --git a/tests/models/distributed_test_checkpoint.py b/tests/models/distributed_test_checkpoint.py index 05a0bf443..51687c6d8 100644 --- a/tests/models/distributed_test_checkpoint.py +++ b/tests/models/distributed_test_checkpoint.py @@ -1,134 +1,87 @@ import gc import logging -import pathlib -import typing import torch from fast_llm.cli import fast_llm_main_wrapper +from fast_llm.config import NoAutoValidate +from fast_llm.core.distributed import safe_barrier from fast_llm.engine.checkpoint.config import ( - CheckpointFormat, CheckpointLoadConfig, CheckpointSaveConfig, DistributedCheckpointFormat, FastLLMCheckpointFormat, ) +from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.engine.distributed.distributed import ProcessGroupPool from fast_llm.engine.multi_stage.config import StageMode -from fast_llm.utils import header -from tests.models.test_checkpoint import do_get_convert_path +from fast_llm.utils import Assert, header from tests.utils.model_configs import ModelTestingConfig from tests.utils.run_test_script import parse_run_distributed_script +from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig +from tests.utils.utils import DistributedSubtestContext logger = logging.getLogger(__name__) def _test_load_and_save_parallel( model_testing_config: ModelTestingConfig, - pretrained_path: pathlib.Path, - pretrained_format: type[CheckpointFormat], - distributed_config: dict[str, typing.Any], - save_path: pathlib.Path, + config: DistributedSaveLoadConfig, ): - logger.info(header(save_path.name)) - logger.info(f"Loading {pretrained_format.name} checkpoint from {pretrained_path}") + logger.info(header(config.name)) + logger.info(f"Loading {config.load_format} checkpoint from {config.load_path}") + with NoAutoValidate(): + load_config = CheckpointLoadConfig(path=config.load_path, format=config.load_format) + load_config.setup(model_testing_config.model_config_class) + load_config.validate() model = model_testing_config.model_class.from_pretrained( - CheckpointLoadConfig(path=pretrained_path, format=pretrained_format), + load_config, # The world size and rank are already set through environment variable. 
- {"distributed": distributed_config}, + {"distributed": config.distributed}, mode=StageMode.inference, ) for save_format in (DistributedCheckpointFormat, FastLLMCheckpointFormat): - logger.info(f"Loading {save_format.name} checkpoint to {save_path / save_format.name}") - model.save_checkpoint(CheckpointSaveConfig(path=save_path / save_format.name, format=save_format)) + logger.info(f"Loading {save_format.name} checkpoint to {config.save_path / save_format.name}") + model.save_checkpoint(CheckpointSaveConfig(path=config.save_path / save_format.name, format=save_format)) del model gc.collect() torch.cuda.empty_cache() def main(args: list[str] | None = None) -> None: - base_path, model_testing_config, _ = parse_run_distributed_script(args) + base_path, model_testing_config, do_capture = parse_run_distributed_script(args) - with ProcessGroupPool(timeout=20): - for pretrained_format, pretrained_path in ( - ( - DistributedCheckpointFormat, - do_get_convert_path( - DistributedCheckpointFormat, model_testing_config.checkpoint_format, base_path=base_path.parent - ), - ), - ( - FastLLMCheckpointFormat, - do_get_convert_path( - FastLLMCheckpointFormat, model_testing_config.checkpoint_format, base_path=base_path.parent - ), - ), - ( - model_testing_config.checkpoint_format, - do_get_convert_path( - model_testing_config.checkpoint_format, DistributedCheckpointFormat, base_path=base_path.parent - ), - ), - ): - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_dp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={"tensor_parallel": 2}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_tp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={"tensor_parallel": 2, "sequence_tensor_parallel": True}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_stp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={"pipeline_parallel": 2}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_pp2", - ) - - dist = DistributedCheckpointFormat.name - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_dp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={"tensor_parallel": 2, "sequence_tensor_parallel": True}, - save_path=base_path / "load_pretrained_dp2_in_stp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_stp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={}, - save_path=base_path / "load_pretrained_stp2_in_dp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_tp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={"tensor_parallel": 2, "sequence_tensor_parallel": True}, - save_path=base_path / "load_pretrained_tp2_in_pp2", - ) - 
_test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_pp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={"tensor_parallel": 2}, - save_path=base_path / "load_pretrained_pp2_in_tp2", + if do_capture: + logger.warning( + "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." ) + with ProcessGroupPool(timeout=20) as pool: + failures = [] + world_size = DistributedConfig.default_world_size + rank = DistributedConfig.default_rank + group = pool.get_process_group(range(world_size), rank) + + for config in DISTRIBUTED_SAVE_LOAD_CONFIGS.values(): + config = config.resolve(base_path, model_testing_config) + Assert.eq(world_size, config.num_gpus) + with DistributedSubtestContext(base_path, config.name, group, world_size, enabled=do_capture) as subtest: + _test_load_and_save_parallel( + model_testing_config=model_testing_config, + config=config, + ) + if not subtest.success: + failures.append(config.name) + + # Final barrier to ensure everything is done before torchrun potentially kills workers. + safe_barrier(group, "testing end") + # Let pytest know how things went. + # These should already be reported above, we repeat for convenience. + if failures: + raise RuntimeError(f"The following subtests failed: {", ".join(failures)}") + else: + logger.warning("All tests passed") + if __name__ == "__main__": with fast_llm_main_wrapper(): diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py index 36f13ec2a..933b215e7 100644 --- a/tests/models/distributed_test_model.py +++ b/tests/models/distributed_test_model.py @@ -42,6 +42,8 @@ def main(args: list[str] | None = None) -> None: # These should already be reported above, we repeat for convenience. 
if failures: raise RuntimeError(f"The following subtests failed: {", ".join(failures)}") + else: + logger.warning("All tests passed") if __name__ == "__main__": diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 8392494e4..3b615d748 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -1,4 +1,4 @@ -import functools +import logging import pathlib import shutil @@ -22,7 +22,9 @@ from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup -from tests.utils.run_test_script import ARTIFACT_PATH +from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig + +logger = logging.getLogger(__name__) _WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" @@ -46,16 +48,12 @@ def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_confi @pytest.fixture(scope="module") def prepare_resume(run_test_script_base_path: pathlib.Path): def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): - resume_from_path = run_test_script_base_path / distributed_testing_config.compare self_path = run_test_script_base_path / distributed_testing_config.name - shutil.copytree(resume_from_path, self_path) + shutil.copytree(run_test_script_base_path / distributed_testing_config.compare, self_path) shutil.rmtree(self_path / "checkpoint" / "2") assert (self_path / "checkpoint" / "1" / "ok").is_file() # TODO: Eval shutil.rmtree(self_path / "runs") - for artifact in ["init", "train_1"]: - path = f"{ARTIFACT_PATH}/0/tensor_logs_{artifact}.pt" - shutil.copy(resume_from_path / path, self_path / path) return do_prepare_resume @@ -64,40 +62,25 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): distributed_testing_config = DistributedTestingConfig( - name="resume", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + name="resume", compare="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS ) - prepare_resume(distributed_testing_config) - # Resume from iteration=1 and compare outputs with the baseline run. run_test_script_for_all_models(distributed_testing_config) - compare_results_for_all_models("distributed_testing_config") + compare_results_for_all_models(distributed_testing_config, ("train_2",)) @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume_frozen(run_test_script_for_all_models, prepare_resume): distributed_testing_config = DistributedTestingConfig( - name="resume_frozen", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + name="resume_frozen", compare="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS ) prepare_resume(distributed_testing_config) # Resume with frozen mlp. No comparison. 
run_test_script_for_all_models(distributed_testing_config) -def do_get_convert_path( - to: type[CheckpointFormat] | None = None, from_: type[CheckpointFormat] | None = None, *, base_path: pathlib.Path -) -> pathlib.Path: - if to is None or from_ is None: - return base_path / "test_checkpoint_and_eval" / "checkpoint" / "2" - return base_path / "test_convert_model" / f"{to.name}_from_{from_.name}" - - -@pytest.fixture(scope="module") -def get_convert_path(run_test_script_base_path): - return functools.partial(do_get_convert_path, base_path=run_test_script_base_path) - - @pytest.fixture(scope="module") def run_conversion(model_testing_config: ModelTestingConfig, get_convert_path): def do_run_conversion( @@ -155,9 +138,12 @@ def test_conversion(model_testing_config, run_conversion, get_convert_path): def _compare_safetensor_files( - reference_path: pathlib.Path, *other_paths: pathlib.Path, expected_keys: set[str] | None = None + reference: pathlib.Path | dict[str, torch.Tensor], + *other_paths: pathlib.Path, + expected_keys: set[str] | None = None, ): - reference = safetensors.torch.load_file(reference_path) + if isinstance(reference, pathlib.Path): + reference = safetensors.torch.load_file(reference) if expected_keys is None: expected_keys = set(reference.keys()) else: @@ -341,91 +327,82 @@ def test_huggingface_model(model_testing_config, get_convert_path): raise ValueError(f"Comparison failed ({len(errors)} errors)") -@pytest.fixture(scope="module") -def load_and_save_parallel_base_path(run_test_script_base_path): - return run_test_script_base_path / "test_load_and_save_parallel" - - -@pytest.mark.depends_on( - on=[ - "test_load_pretrained[{model_testing_config}]", - ] -) +@pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) -def test_save_and_load_in_parallel(run_distributed_script, load_and_save_parallel_base_path, model_testing_config): +def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_path, model_testing_config, request): # Save and load checkpoints to and from various distributed configurations. # Combined in a single test to mitigate process creation overhead. # TODO: Test beyond 2 gpu configs? import tests.models.distributed_test_checkpoint - run_distributed_script( - [ - tests.models.distributed_test_checkpoint.__file__, - str(load_and_save_parallel_base_path), - model_testing_config.name, - ], - num_gpus=2, - ) + script = [ + tests.models.distributed_test_checkpoint.__file__, + str(run_test_script_base_path), + model_testing_config.name, + ] + if request.config.getoption("distributed_capture"): + logger.warning( + "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." 
+ ) + else: + script.append("--no-distributed-capture") + run_distributed_script(script, num_gpus=2) @pytest.fixture(scope="module") -def parallel_checkpoint_names(model_testing_config): - names = [] - for format_ in (DistributedCheckpointFormat, FastLLMCheckpointFormat, model_testing_config.checkpoint_format): - names.extend( - [ - f"load_pretrained_{format_.name}_in_dp2", - f"load_pretrained_{format_.name}_in_tp2", - f"load_pretrained_{format_.name}_in_stp2", - f"load_pretrained_{format_.name}_in_pp2", - ] - ) - - names.extend( - [ - "load_pretrained_dp2_in_stp2", - "load_pretrained_stp2_in_dp2", - "load_pretrained_tp2_in_pp2", - "load_pretrained_pp2_in_tp2", - ] - ) - return names +def reference_distributed_shard(get_convert_path) -> torch.Tensor: + # Load the file in a fixture (on cpu) so it's not loaded from disk each time. + return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] -@pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) +@pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_parallel_checkpoint_in_single_gpu( - load_and_save_parallel_base_path, get_convert_path, load_and_compare_checkpoints, parallel_checkpoint_names + distributed_save_load_config: DistributedSaveLoadConfig, + run_test_script_base_path, + model_testing_config, + load_and_compare_checkpoints, + reference_distributed_shard, + report_subtest, ): - # Test single-gpu loading of multi-gpu distributed checkpoints. - reference_shard = safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors", device="cuda")[ - _WEIGHT_SHARD_SAVE_NAME - ] + distributed_save_load_config = distributed_save_load_config.resolve( + base_path=run_test_script_base_path, model_testing_config=model_testing_config + ) + report_subtest(distributed_save_load_config.save_path, distributed_save_load_config.num_gpus) + load_and_compare_checkpoints( + DistributedCheckpointFormat, + distributed_save_load_config.save_path / DistributedCheckpointFormat.name, + None, + reference_distributed_shard.to(device="cuda"), + ) - for name in parallel_checkpoint_names: - load_and_compare_checkpoints( - DistributedCheckpointFormat, - load_and_save_parallel_base_path / name / DistributedCheckpointFormat.name, - None, - reference_shard, - ) + +@pytest.fixture(scope="module") +def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: + # Load the file in a fixture (on cpu) so it's not loaded from disk each time. 
+ return safetensors.torch.load_file( + get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" + ) @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) -def test_parallel_checkpoint_consistency(model_testing_config, load_and_save_parallel_base_path, get_convert_path): +def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_base_path): # Check the consistency of the checkpoints saved in `test_save_and_load_in_parallel` - checkpoint_formats = (DistributedCheckpointFormat, FastLLMCheckpointFormat, model_testing_config.checkpoint_format) # Compare Distributed checkpoints for config in ("dp2", "tp2", "stp2", "pp2"): for rank in range(2): _compare_safetensor_files( *[ - load_and_save_parallel_base_path - / f"load_pretrained_{format_.name}_in_{config}" - / DistributedCheckpointFormat.name - / f"rank_{rank}.safetensors" - for format_ in checkpoint_formats + DISTRIBUTED_SAVE_LOAD_CONFIGS[f"load_{format_}_in_{config}"] + .resolve(base_path=run_test_script_base_path, model_testing_config=model_testing_config) + .save_path + / f"{DistributedCheckpointFormat.name}/rank_{rank}.safetensors" + for format_ in ( + DistributedCheckpointFormat.name, + FastLLMCheckpointFormat.name, + "{checkpoint_format}", + ) ] ) @@ -433,15 +410,15 @@ def test_parallel_checkpoint_consistency(model_testing_config, load_and_save_par @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_multi_gpu_fast_llm_checkpoint( - model_testing_config, load_and_save_parallel_base_path, get_convert_path, parallel_checkpoint_names + model_testing_config, distributed_save_load_config_non_pp, run_test_script_base_path, reference_fast_llm_shard ): # Fast-LLM checkpoints are independent of the distributed configuration that saved it. # TODO: Check pipeline-parallel checkpoints (two files). + distributed_save_load_config_non_pp = distributed_save_load_config_non_pp.resolve( + base_path=run_test_script_base_path, model_testing_config=model_testing_config + ) + _compare_safetensor_files( - get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors", - *[ - load_and_save_parallel_base_path / name / FastLLMCheckpointFormat.name / f"model_0.safetensors" - for name in parallel_checkpoint_names - if "in_pp2" not in name - ], + reference_fast_llm_shard, + distributed_save_load_config_non_pp.save_path / f"{FastLLMCheckpointFormat.name}/model_0.safetensors", ) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 2aeff95cc..b1579c3f4 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -49,11 +49,12 @@ def test_run_model_distributed(run_distributed_script, model_testing_config, run import tests.models.distributed_test_model script = [tests.models.distributed_test_model.__file__, str(run_test_script_base_path), model_testing_config.name] - if not request.config.getoption("distributed_capture"): + if request.config.getoption("distributed_capture"): logger.warning( "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." 
) - script.append("--no-capture") + else: + script.append("--no-distributed-capture") run_distributed_script(script, num_gpus=torch.cuda.device_count()) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index 96acf9658..f22859dfd 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -1,6 +1,7 @@ import argparse import dataclasses import pathlib +import typing import warnings import torch @@ -20,13 +21,17 @@ class CompareConfig: ignore_duplicates: list[str] = dataclasses.field(default_factory=list) -def extract_tensor_logs(artifact_path: pathlib.Path, errors, config: CompareConfig): +def extract_tensor_logs( + artifact_path: pathlib.Path, errors, config: CompareConfig, artifacts: typing.Sequence[str] | None = None +): tensor_logs = {} ignore_keys = set() for rank_path in sorted(artifact_path.iterdir()): for p in rank_path.iterdir(): if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": step_name = p.stem[len(_TENSOR_LOG_PREFIX) :] + if artifacts is not None and step_name not in artifacts: + continue step_logs = torch.load(p) if step_name not in tensor_logs: tensor_logs[step_name] = {} @@ -112,14 +117,15 @@ def compare_tensor_logs_base( artifact_path_ref: pathlib.Path, artifact_path_test: pathlib.Path, config: CompareConfig | None = None, + artifacts: typing.Sequence[str] | None = None, ): errors = [] if config is None: config = CompareConfig() - logs_ref = extract_tensor_logs(artifact_path_ref, errors, config=config) - logs_test = extract_tensor_logs(artifact_path_test, errors, config=config) + logs_ref = extract_tensor_logs(artifact_path_ref, errors, config=config, artifacts=artifacts) + logs_test = extract_tensor_logs(artifact_path_test, errors, config=config, artifacts=artifacts) for step_key in sorted(compare_dict_keys(logs_ref, logs_test, errors, "Logged steps")): step_logs_ref = logs_ref[step_key] @@ -144,9 +150,10 @@ def compare_tensor_logs( artifact_path_ref: pathlib.Path, artifact_path_test: pathlib.Path, config: CompareConfig | None = None, + artifacts: typing.Sequence[str] | None = None, ): print(f'Comparing tensor logs in "{artifact_path_test}" with reference logs "{artifact_path_ref}"') - errors = compare_tensor_logs_base(artifact_path_ref, artifact_path_test, config) + errors = compare_tensor_logs_base(artifact_path_ref, artifact_path_test, config, artifacts) if errors: for error in errors: print(error) diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 61bc75074..602afeb23 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -29,7 +29,7 @@ def do_run_distributed_script( rendezvous_port: int, torchrun_port: int, num_gpus: int, - timeout: float = 120, + timeout: float = 240, env: dict[str, str | None] = None, ): command = [ @@ -113,16 +113,15 @@ def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, ): - def do_compare_results_for_all_models(config: DistributedTestingConfig, artifacts: typing.Iterable[str]): + def do_compare_results_for_all_models( + config: DistributedTestingConfig, artifacts: typing.Iterable[str] | None = None + ): assert config.compare is not None - compare_path = run_test_script_base_path / config.compare / ARTIFACT_PATH - for artifact in artifacts: - if not (artifact_path := compare_path / "0" / f"tensor_logs_{artifact}.pt").is_file(): - pytest.fail(f"Missing artifact {artifact_path} from {config.compare}.", pytrace=False) compare_tensor_logs( - 
compare_path, + run_test_script_base_path / config.compare / ARTIFACT_PATH, run_test_script_base_path / config.name / ARTIFACT_PATH, config.compare_config, + artifacts, ) return do_compare_results_for_all_models diff --git a/tests/utils/save_load_configs.py b/tests/utils/save_load_configs.py new file mode 100644 index 000000000..f5a15020e --- /dev/null +++ b/tests/utils/save_load_configs.py @@ -0,0 +1,143 @@ +import dataclasses +import functools +import pathlib +import typing + +import pytest + +from fast_llm.engine.checkpoint.config import CheckpointFormat, DistributedCheckpointFormat, FastLLMCheckpointFormat +from tests.utils.model_configs import ModelTestingConfig + + +@dataclasses.dataclass(kw_only=True) +class DistributedSaveLoadConfig: + load_path: pathlib.Path | str + load_format: str + save_path: pathlib.Path | str + distributed: dict[str, typing.Any] + num_gpus: int = 2 + + def resolve(self, base_path: pathlib.Path, model_testing_config: ModelTestingConfig) -> typing.Self: + return dataclasses.replace( + self, + load_path=base_path + / str(self.load_path).format(checkpoint_format=model_testing_config.checkpoint_format.name), + load_format=self.load_format.format(checkpoint_format=model_testing_config.checkpoint_format.name), + save_path=base_path + / str(self.save_path).format(checkpoint_format=model_testing_config.checkpoint_format.name), + ) + + @property + def name(self) -> str: + return pathlib.Path(self.save_path).name + + +def do_get_convert_path( + to: type[CheckpointFormat] | str | None = None, + from_: type[CheckpointFormat] | str | None = None, + *, + base_path: pathlib.Path, +) -> pathlib.Path: + if to is None or from_ is None: + return base_path / "checkpoint_and_eval" / "checkpoint" / "2" + return ( + base_path + / "convert_model" + / f"{to.name if isinstance(to,type) else to}_from_{from_.name if isinstance(from_,type) else from_}" + ) + + +@pytest.fixture(scope="module") +def get_convert_path(run_test_script_base_path): + return functools.partial(do_get_convert_path, base_path=run_test_script_base_path) + + +_DISTRIBUTED_SAVE_LOAD_CONFIGS = [] + + +for pretrained_format, pretrained_path in ( + ( + DistributedCheckpointFormat.name, + do_get_convert_path(DistributedCheckpointFormat.name, "{checkpoint_format}", base_path=pathlib.Path()), + ), + ( + FastLLMCheckpointFormat.name, + do_get_convert_path(FastLLMCheckpointFormat.name, "{checkpoint_format}", base_path=pathlib.Path()), + ), + ( + "{checkpoint_format}", + do_get_convert_path("{checkpoint_format}", DistributedCheckpointFormat.name, base_path=pathlib.Path()), + ), +): + _DISTRIBUTED_SAVE_LOAD_CONFIGS.extend( + [ + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_dp2", + distributed={}, + ), + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_tp2", + distributed={"tensor_parallel": 2}, + ), + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_stp2", + distributed={"tensor_parallel": 2, "sequence_tensor_parallel": True}, + ), + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_pp2", + distributed={"pipeline_parallel": 2}, + ), + ] + ) + +_DISTRIBUTED_SAVE_LOAD_CONFIGS.extend( + [ + DistributedSaveLoadConfig( + 
load_path=f"load_{DistributedCheckpointFormat.name}_in_dp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_dp2_in_stp2", + distributed={"tensor_parallel": 2, "sequence_tensor_parallel": True}, + ), + DistributedSaveLoadConfig( + load_path=f"load_{DistributedCheckpointFormat.name}_in_stp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_stp2_in_dp2", + distributed={}, + ), + DistributedSaveLoadConfig( + load_path=f"load_{DistributedCheckpointFormat.name}_in_tp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_tp2_in_pp2", + distributed={"pipeline_parallel": 2}, + ), + DistributedSaveLoadConfig( + load_path=f"load_{DistributedCheckpointFormat.name}_in_pp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_pp2_in_tp2", + distributed={"tensor_parallel": 2}, + ), + ] +) + +# TODO: Name isn't formated. +DISTRIBUTED_SAVE_LOAD_CONFIGS: dict[str, DistributedSaveLoadConfig] = { + config.name: config for config in _DISTRIBUTED_SAVE_LOAD_CONFIGS +} + + +@pytest.fixture(scope="module", params=DISTRIBUTED_SAVE_LOAD_CONFIGS) +def distributed_save_load_config(request): + return DISTRIBUTED_SAVE_LOAD_CONFIGS[request.param] + + +@pytest.fixture(scope="module", params=[name for name in DISTRIBUTED_SAVE_LOAD_CONFIGS if "pp2" not in name]) +def distributed_save_load_config_non_pp(request): + return DISTRIBUTED_SAVE_LOAD_CONFIGS[request.param] diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 600b4aecb..54efe0966 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -100,6 +100,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is None: self.success = True else: + self._path.mkdir(parents=True, exist_ok=True) self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) if self._group is not None: From 5ae8388d0f62bedef1e6ee7bbd00d69adef8197e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 8 Jul 2025 12:38:05 -0400 Subject: [PATCH 06/14] stuff --- .github/workflows/ci.yaml | 2 +- fast_llm/engine/distributed/config.py | 1 - fast_llm/engine/distributed/distributed.py | 2 +- tests/models/test_checkpoint.py | 28 +++++++++++++++------- tests/models/test_model.py | 6 ++++- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0bca2dd8d..ca7ea749d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,7 +33,7 @@ jobs: MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]" - name: Run tests - run: pytest . + run: pytest -v -ra . 
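+        # -v lists each test and -ra adds a summary of all non-passed outcomes (skips,
+        # failures, errors) to the report, which makes interleaved xdist output easier to triage.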
docker: name: Docker diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py index 6f2e2ab95..9ec63517c 100644 --- a/fast_llm/engine/distributed/config.py +++ b/fast_llm/engine/distributed/config.py @@ -357,7 +357,6 @@ def _get_global_ranks(self, size: int, stride: int) -> range: def _add_distributed_dim(self, distributed_dim: DistributedDim) -> None: Assert.eq(distributed_dim.global_ranks[distributed_dim.rank], self.rank, msg=distributed_dim) - logger.info(f"Initializing group {distributed_dim}") try: distributed_dim.check_ranks_in_range(0, self.world_size) except: diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index f53f25afc..200074ee9 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -37,6 +37,7 @@ def __init__( ) self._timeout = timeout self._use_cpu = use_cpu + self._process_groups = {} if self._use_cpu: Assert.eq(self._world_size, 1) @@ -60,7 +61,6 @@ def __init__( timeout=datetime.timedelta(seconds=timeout), ) ) - self._process_groups = {} @property def rank(self): diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 3b615d748..0781ee549 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -23,6 +23,7 @@ from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig +from tests.utils.utils import requires_cuda logger = logging.getLogger(__name__) @@ -58,6 +59,7 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): return do_prepare_resume +# @requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): @@ -70,6 +72,7 @@ def test_resume(run_test_script_for_all_models, compare_results_for_all_models, compare_results_for_all_models(distributed_testing_config, ("train_2",)) +# @requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume_frozen(run_test_script_for_all_models, prepare_resume): @@ -101,6 +104,7 @@ def do_run_conversion( return do_run_conversion +# @requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_conversion(model_testing_config, run_conversion, get_convert_path): @@ -156,6 +160,7 @@ def _compare_safetensor_files( Assert.all_equal(reference[key], other[key]) +# @requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_converted_round_trip(model_testing_config, get_convert_path): @@ -203,6 +208,7 @@ def do_load_and_compare_checkpoints( return do_load_and_compare_checkpoints +# @requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_pretrained( @@ -269,6 +275,7 @@ def test_load_pretrained( ) +# @requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def 
test_huggingface_model(model_testing_config, get_convert_path): @@ -327,6 +334,7 @@ def test_huggingface_model(model_testing_config, get_convert_path): raise ValueError(f"Comparison failed ({len(errors)} errors)") +# @requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_path, model_testing_config, request): @@ -355,6 +363,7 @@ def reference_distributed_shard(get_convert_path) -> torch.Tensor: return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] +# @requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_parallel_checkpoint_in_single_gpu( @@ -377,14 +386,7 @@ def test_load_parallel_checkpoint_in_single_gpu( ) -@pytest.fixture(scope="module") -def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: - # Load the file in a fixture (on cpu) so it's not loaded from disk each time. - return safetensors.torch.load_file( - get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" - ) - - +@requires_cuda @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_base_path): @@ -407,6 +409,16 @@ def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_b ) +@pytest.fixture(scope="module") +def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: + # Load the file in a fixture (on cpu) so it's not loaded from disk each time. + path = get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" + if not path.is_file(): + pytest.skip(f"Reference model failed or did not run.") + return safetensors.torch.load_file(path) + + +# @requires_cuda @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_multi_gpu_fast_llm_checkpoint( diff --git a/tests/models/test_model.py b/tests/models/test_model.py index b1579c3f4..91670b253 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -9,11 +9,12 @@ SINGLE_GPU_TESTING_CONFIGS, ) from tests.utils.model_configs import ModelTestingGroup -from tests.utils.utils import check_subtest_success, set_subtest_success +from tests.utils.utils import check_subtest_success, requires_cuda, set_subtest_success logger = logging.getLogger(__name__) +@requires_cuda @pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_simple(run_test_script_for_all_models, run_test_script_base_path): # A simple config to prevent unnecessary testing and creation of dependency group @@ -21,6 +22,7 @@ def test_model_simple(run_test_script_for_all_models, run_test_script_base_path) set_subtest_success(run_test_script_base_path / SIMPLE_TESTING_CONFIG.name) +@requires_cuda @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.basic) # Parametrize with config name so it shows in test name. 
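# `requires_cuda`, imported from `tests.utils.utils` above, is not shown in this patch; it is
# presumably the usual conditional-skip marker, along the lines of:
#
#     requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")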
@@ -41,6 +43,7 @@ def test_and_compare_model( compare_results_for_all_models(config, ("init", "train_1", "train_2")) +@requires_cuda @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group( ModelTestingGroup.distributed, @@ -60,6 +63,7 @@ def test_run_model_distributed(run_distributed_script, model_testing_config, run # We don't want to depend on `test_model_distributed` because we still want to run this in cas of failure. # This should still run after `test_model_distributed` +@requires_cuda @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.distributed) @pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)) From 4626d7fe460abe9378247f9d0f7548f68bccf6dc Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 8 Jul 2025 15:31:27 -0400 Subject: [PATCH 07/14] stuff --- tests/conftest.py | 35 ++++++++++++++------------- tests/models/test_checkpoint.py | 43 +++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 960f0c7a4..e9011979a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,12 +6,25 @@ import shutil import pytest -import torch import xdist.scheduler from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager +if worker_name := os.environ.get("PYTEST_XDIST_WORKER"): + if gpus := os.environ.get("CUDA_VISIBLE_DEVICES"): + # We set the device through "CUDA_VISIBLE_DEVICES", and this needs to happen before importing torch. + assert worker_name.startswith("gw") + worker_id = int(worker_name[2:]) + gpus = [int(i) for i in gpus.split(",")] + num_gpus = len(gpus) + gpus = [gpus[(i + worker_id) % num_gpus] for i in range(num_gpus)] + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in gpus) + + +import torch # isort: skip + + from tests.utils.save_load_configs import ( # isort: skip distributed_save_load_config, distributed_save_load_config_non_pp, @@ -29,6 +42,7 @@ from tests.utils.model_configs import model_testing_config, ModelTestingConfig, testing_group_enabled # isort: skip from tests.utils.utils import result_path, TEST_RESULTS_PATH, format_resource_report, report_subtest # isort: skip +logger = logging.getLogger(__name__) manager: DependencyManager | None = None @@ -56,9 +70,6 @@ def pytest_addoption(parser): @dataclasses.dataclass class WorkerResources: - worker_id: int - gpu_id: int | None - num_gpus: int torchrun_port: int rendezvous_port: int @@ -92,17 +103,10 @@ def pytest_configure(config): num_gpus = torch.cuda.device_count() if num_gpus > 0 and is_parallel: # We spread workers across GPUs. - gpu_id = worker_id % num_gpus - # We set the device through "CUDA_VISIBLE_DEVICES", and this needs to happen before cuda initialization. - # The `device_count` call above doesn't initialize, but `mem_get_info` below does. - assert not torch.cuda.is_initialized() - # TODO: Support this? 
- assert "CUDA_VISIBLE_DEVICES" not in os.environ - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str((gpu_id + i) % num_gpus) for i in range(num_gpus)) + logger.warning(f"[Worker {worker_id}] Using GPUs {os.environ["CUDA_VISIBLE_DEVICES"]}") elif num_gpus > 0: - gpu_id = 0 - else: - gpu_id = None + if "CUDA_VISIBLE_DEVICES" not in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(num_gpus)) gpu_memory = torch.cuda.mem_get_info(0)[1] if num_gpus > 0 else 0 if num_gpus > 0: @@ -118,9 +122,6 @@ def pytest_configure(config): ) config.worker_resources = WorkerResources( - worker_id=worker_id, - gpu_id=gpu_id, - num_gpus=num_gpus, # Each worker needs its own set of ports for safe distributed run. Hopefully these are free. torchrun_port=TORCHRUN_DEFAULT_PORT + 2 * worker_id, rendezvous_port=TORCHRUN_DEFAULT_PORT + 2 * worker_id + 1, diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 0781ee549..4d70857ff 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -59,7 +59,7 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): return do_prepare_resume -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): @@ -72,7 +72,7 @@ def test_resume(run_test_script_for_all_models, compare_results_for_all_models, compare_results_for_all_models(distributed_testing_config, ("train_2",)) -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume_frozen(run_test_script_for_all_models, prepare_resume): @@ -104,7 +104,7 @@ def do_run_conversion( return do_run_conversion -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_conversion(model_testing_config, run_conversion, get_convert_path): @@ -160,7 +160,7 @@ def _compare_safetensor_files( Assert.all_equal(reference[key], other[key]) -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_converted_round_trip(model_testing_config, get_convert_path): @@ -208,7 +208,7 @@ def do_load_and_compare_checkpoints( return do_load_and_compare_checkpoints -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_pretrained( @@ -275,7 +275,7 @@ def test_load_pretrained( ) -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_huggingface_model(model_testing_config, get_convert_path): @@ -334,7 +334,7 @@ def test_huggingface_model(model_testing_config, get_convert_path): raise ValueError(f"Comparison failed ({len(errors)} errors)") -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_path, 
model_testing_config, request): @@ -358,12 +358,16 @@ def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_ @pytest.fixture(scope="module") -def reference_distributed_shard(get_convert_path) -> torch.Tensor: +def reference_distributed_shard(get_convert_path) -> torch.Tensor | None: # Load the file in a fixture (on cpu) so it's not loaded from disk each time. - return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] + try: + return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] + except OSError: + # The fixture may be evaluated even if the tests are to be skipped. + return None -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_parallel_checkpoint_in_single_gpu( @@ -374,6 +378,8 @@ def test_load_parallel_checkpoint_in_single_gpu( reference_distributed_shard, report_subtest, ): + # This should only happen when test is skipped (failed dependency). + assert reference_distributed_shard is not None distributed_save_load_config = distributed_save_load_config.resolve( base_path=run_test_script_base_path, model_testing_config=model_testing_config ) @@ -410,20 +416,25 @@ def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_b @pytest.fixture(scope="module") -def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: +def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor] | None: # Load the file in a fixture (on cpu) so it's not loaded from disk each time. - path = get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" - if not path.is_file(): - pytest.skip(f"Reference model failed or did not run.") - return safetensors.torch.load_file(path) + try: + return safetensors.torch.load_file( + get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" + ) + except OSError: + # The fixture may be evaluated even if the tests are to be skipped. + return None -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_multi_gpu_fast_llm_checkpoint( model_testing_config, distributed_save_load_config_non_pp, run_test_script_base_path, reference_fast_llm_shard ): + # This should only happen when test is skipped (failed dependency). + assert reference_fast_llm_shard is not None # Fast-LLM checkpoints are independent of the distributed configuration that saved it. # TODO: Check pipeline-parallel checkpoints (two files). 
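+    # One possible shape for the TODO above, assuming a pipeline-parallel save splits the
+    # Fast-LLM checkpoint into model_0.safetensors and model_1.safetensors (the file names and
+    # the reference path are assumptions, not taken from the current save logic):
+    #
+    #     for shard_name in ("model_0.safetensors", "model_1.safetensors"):
+    #         _compare_safetensor_files(pp_reference_path / shard_name, pp_save_path / shard_name)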
distributed_save_load_config_non_pp = distributed_save_load_config_non_pp.resolve( From 97d86b818b9d1acb7d6689f454fc805adf86ed4d Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 8 Jul 2025 15:40:24 -0400 Subject: [PATCH 08/14] stuff --- tests/models/test_checkpoint.py | 1 + tests/models/test_match_megatron.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 4d70857ff..ecd23649f 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -36,6 +36,7 @@ ] +@requires_cuda @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_config): # A baseline config (single-gpu, bf16, flash-attn). diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index 5d974172d..be5ddb608 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -6,8 +6,10 @@ from tests.utils.dataset import DATASET_PREFIX, get_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingGroup +from tests.utils.utils import requires_cuda +@requires_cuda @pytest.mark.model_testing_group(ModelTestingGroup.megatron) def test_megatron(run_distributed_script, model_testing_config, run_test_script_base_path): path = run_test_script_base_path / "megatron" @@ -28,6 +30,7 @@ def test_megatron(run_distributed_script, model_testing_config, run_test_script_ ) +@requires_cuda @pytest.mark.depends_on(on=["test_megatron[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.megatron) def test_match_megatron(run_test_script_for_all_models, model_testing_config, compare_results_for_all_models): From ea81ef68a847fdf28671dfebea34c3c928c207d8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 9 Jul 2025 13:04:07 -0400 Subject: [PATCH 09/14] Fp32 tests --- tests/models/test_checkpoint.py | 7 +- tests/models/test_match_megatron.py | 2 +- tests/models/test_model.py | 4 +- tests/utils/compare_tensor_logs.py | 282 +++++++++++++++------------- tests/utils/distributed_configs.py | 97 +++++++--- tests/utils/model_configs.py | 1 - tests/utils/run_test_script.py | 4 +- 7 files changed, 233 insertions(+), 164 deletions(-) diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index ecd23649f..6f30bd318 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -65,12 +65,15 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): distributed_testing_config = DistributedTestingConfig( - name="resume", compare="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + name="resume", + compare="checkpoint_and_eval", + config_args=_CHECKPOINT_AND_EVAL_ARGS, + compare_config=CompareConfig(sub_configs={(("init", "train_1"), None): CompareConfig(ignore_tensors=True)}), ) prepare_resume(distributed_testing_config) # Resume from iteration=1 and compare outputs with the baseline run. 
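+    # The sub_config above excludes the "init" and "train_1" artifacts from the comparison:
+    # a run resumed from the iteration-1 checkpoint only reproduces the baseline from "train_2" on.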
run_test_script_for_all_models(distributed_testing_config) - compare_results_for_all_models(distributed_testing_config, ("train_2",)) + compare_results_for_all_models(distributed_testing_config) @requires_cuda diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index be5ddb608..edc524e04 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -56,7 +56,7 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config, co "model.base_model.use_megatron_initialization=True", ], num_gpus=1, - compare_config=CompareConfig(ignore_tensors=ignore_tensors), + compare_config=CompareConfig(sub_configs={(None, ignore_tensors): CompareConfig(ignore_tensors=True)}), ) run_test_script_for_all_models(distributed_testing_config) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 91670b253..4a344cdc7 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -40,7 +40,7 @@ def test_and_compare_model( set_subtest_success(run_test_script_base_path / config.name) if config.compare is not None: - compare_results_for_all_models(config, ("init", "train_1", "train_2")) + compare_results_for_all_models(config) @requires_cuda @@ -81,4 +81,4 @@ def test_model_distributed( if config.compare is not None: if not check_subtest_success(run_test_script_base_path / config.compare): pytest.fail(f"Test {config.compare} failed", pytrace=False) - compare_results_for_all_models(config, ("init", "train_1", "train_2")) + compare_results_for_all_models(config) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index f22859dfd..59577e25a 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -9,157 +9,175 @@ _TENSOR_LOG_PREFIX = "tensor_logs_" +def _compare_pattern(pattern: typing.Iterable[str] | str | None, name: str): + # TODO: Regex? + return ( + True + if pattern is None + else pattern in name if isinstance(pattern, str) else any(pattern_ in name for pattern_ in pattern) + ) + + @dataclasses.dataclass() class CompareConfig: - rms_eps: float = 1e-3 - rms_rel_tolerance: float = 3e-2 - rms_abs_tolerance: float = 5e-3 - max_rel_tolerance: float = 1.5e-1 - max_abs_tolerance: float = 5e-2 + rms_eps: float = 1e-4 + rms_rel_tolerance: float = 3e-3 + rms_abs_tolerance: float = 5e-4 + max_rel_tolerance: float = 1.5e-2 + max_abs_tolerance: float = 5e-3 show_samples: int = 10 - ignore_tensors: list[str] = dataclasses.field(default_factory=list) - ignore_duplicates: list[str] = dataclasses.field(default_factory=list) - + ignore_tensors: bool = False + ignore_duplicates: bool = False + # Use a different config for specific step and/or tensor names. First match is used. 
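+    # For example, a config used elsewhere in this patch ignores duplicate logs of one tensor only:
+    #     CompareConfig(sub_configs={(None, "Global gradient"): CompareConfig(ignore_duplicates=True)})
+    # A `None` key matches everything; a string key matches any step or tensor name that contains it.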
+ sub_configs: dict[tuple[typing.Iterable[str] | str | None, typing.Iterable[str] | str | None], "CompareConfig"] = ( + dataclasses.field(default_factory=dict) + ) + + def _get_sub_config(self, step_name: str, tensor_name: str) -> typing.Self: + for (step_key, name_key), sub_config in self.sub_configs.items(): + if _compare_pattern(step_key, step_name) and _compare_pattern(name_key, tensor_name): + return sub_config._get_sub_config(step_name, tensor_name) + return self + + def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): + tensor_logs = {} + ignore_keys = set() + for rank_path in sorted(artifact_path.iterdir()): + for p in rank_path.iterdir(): + if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": + step_name = p.stem[len(_TENSOR_LOG_PREFIX) :] + for step_log in torch.load(p): + tensor_name = step_log["name"] + sub_config = self._get_sub_config(step_name, tensor_name) + if sub_config.ignore_tensors: + ignore_keys.add(f"{step_name}/{tensor_name}") + else: + if step_name not in tensor_logs: + tensor_logs[step_name] = {} + if ( + tensor_name in (tensor_step_logs := tensor_logs[step_name]) + and not self.ignore_duplicates + ): + errors.append(f"Duplicate tensor log in step {step_name}: {tensor_name}") + tensor_step_logs[tensor_name] = step_log + if ignore_keys: + warnings.warn(f"Ignoring keys in {artifact_path}: {ignore_keys}") + return tensor_logs + + def _compare_dict_keys(self, dict_ref, dict_test, errors, name): + keys_ref = set(dict_ref) + keys_test = set(dict_test) + if keys_ref != keys_test: + errors.append( + f">>>> {name} do not match. Missing = {keys_ref - keys_test}, extra = {keys_test - keys_ref}." + ) -def extract_tensor_logs( - artifact_path: pathlib.Path, errors, config: CompareConfig, artifacts: typing.Sequence[str] | None = None -): - tensor_logs = {} - ignore_keys = set() - for rank_path in sorted(artifact_path.iterdir()): - for p in rank_path.iterdir(): - if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": - step_name = p.stem[len(_TENSOR_LOG_PREFIX) :] - if artifacts is not None and step_name not in artifacts: - continue - step_logs = torch.load(p) - if step_name not in tensor_logs: - tensor_logs[step_name] = {} - for step_log in step_logs: - name = step_log["name"] - if any(ignore_name in name for ignore_name in config.ignore_tensors): - ignore_keys.add(name) - else: - if name in tensor_logs[step_name] and not any( - ignore_name in name for ignore_name in config.ignore_duplicates - ): - errors.append(f"Duplicate tensor log in step {step_name}: {name}") - tensor_logs[step_name][name] = step_log - if ignore_keys: - warnings.warn(f"Ignoring keys in {artifact_path}: {ignore_keys}") - return tensor_logs - - -def compare_dict_keys(dict_ref, dict_test, errors, name): - keys_ref = set(dict_ref) - keys_test = set(dict_test) - if keys_ref != keys_test: - errors.append(f">>>> {name} do not match. Missing = {keys_ref-keys_test}, extra = {keys_test-keys_ref}.") - - # Avoid set to preserve ordering. - return [key for key in dict_test if key in dict_ref] - - -def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: CompareConfig): - if tensor_ref["shape"] != tensor_test["shape"]: - errors.append( - "\n".join( - [f">>>> [{step}] Incompatible shape for tensor {name}: {tensor_test['shape']}!={tensor_ref['shape']}"] + # Avoid set to preserve ordering. 
+ return [key for key in dict_test if key in dict_ref] + + def _compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_name): + sub_config = self._get_sub_config(step_name, tensor_name) + if tensor_ref["shape"] != tensor_test["shape"]: + errors.append( + "\n".join( + [ + f">>>> [{step_name}] Incompatible shape for tensor {tensor_name}: {tensor_test['shape']}!={tensor_ref['shape']}" + ] + ) ) - ) - return - if tensor_ref["step"] != tensor_test["step"]: - errors.append( - "\n".join( - [ - f">>>> [{step}] Incompatible sampling rate for tensor {name}: {tensor_test['step']}!={tensor_ref['step']}" - ] + return + if tensor_ref["step"] != tensor_test["step"]: + errors.append( + "\n".join( + [ + f">>>> [{step_name}] Incompatible sampling rate for tensor {tensor_name}: {tensor_test['step']}!={tensor_ref['step']}" + ] + ) ) - ) - return - - samples_ref = tensor_ref["samples"].flatten().float() - samples_test = tensor_test["samples"].flatten().float() - scale_unreg = (samples_ref**2).mean() ** 0.5 - rms_scale = (scale_unreg**2 + config.rms_eps**2) ** 0.5 - rms = ((samples_ref - samples_test) ** 2).mean() ** 0.5 - max_diff = (samples_ref - samples_test).abs().max() - - tensor_errors = [] + return - if rms > config.rms_abs_tolerance: - tensor_errors.append(f" * RMS diff absolute = {rms} > {config.rms_abs_tolerance}") + samples_ref = tensor_ref["samples"].flatten().float() + samples_test = tensor_test["samples"].flatten().float() + scale_unreg = (samples_ref**2).mean() ** 0.5 + rms_scale = (scale_unreg**2 + sub_config.rms_eps**2) ** 0.5 + rms = ((samples_ref - samples_test) ** 2).mean() ** 0.5 + max_diff = (samples_ref - samples_test).abs().max() - if rms / rms_scale > config.rms_rel_tolerance: - tensor_errors.append( - f" * RMS diff scaled = {rms/rms_scale} > {config.rms_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" - ) + tensor_errors = [] - if max_diff > config.max_abs_tolerance: - tensor_errors.append(f" * Max diff absolute = {max_diff} > {config.max_abs_tolerance}") + if rms > sub_config.rms_abs_tolerance: + tensor_errors.append(f" * RMS diff absolute = {rms} > {sub_config.rms_abs_tolerance}") - if max_diff / rms_scale > config.max_rel_tolerance: - tensor_errors.append( - f" * Max diff scaled = {max_diff/rms_scale} > {config.max_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" - ) - - if tensor_errors: - tensor_errors.extend( - [ - f" Test samples: " + "".join(f"{x:12.4e}" for x in samples_test[: config.show_samples].tolist()), - f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: config.show_samples].tolist()), - ] - ) - errors.append("\n".join([f">>>> [{step}] Excessive diff for tensor {name}:"] + tensor_errors)) + if rms / rms_scale > sub_config.rms_rel_tolerance: + tensor_errors.append( + f" * RMS diff scaled = {rms / rms_scale} > {sub_config.rms_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" + ) + if max_diff > sub_config.max_abs_tolerance: + tensor_errors.append(f" * Max diff absolute = {max_diff} > {sub_config.max_abs_tolerance}") -def compare_tensor_logs_base( - artifact_path_ref: pathlib.Path, - artifact_path_test: pathlib.Path, - config: CompareConfig | None = None, - artifacts: typing.Sequence[str] | None = None, -): - errors = [] - - if config is None: - config = CompareConfig() - - logs_ref = extract_tensor_logs(artifact_path_ref, errors, config=config, artifacts=artifacts) - logs_test = extract_tensor_logs(artifact_path_test, errors, config=config, artifacts=artifacts) - - for step_key in 
sorted(compare_dict_keys(logs_ref, logs_test, errors, "Logged steps")): - step_logs_ref = logs_ref[step_key] - step_logs_test = logs_test[step_key] - - for tensor_key in compare_dict_keys( - step_logs_ref, step_logs_test, errors=errors, name=f"[{step_key}] Tensor keys" - ): - compare_logged_tensor( - step_logs_ref[tensor_key], - step_logs_test[tensor_key], - errors, - step_key, - tensor_key, - config, + if max_diff / rms_scale > sub_config.max_rel_tolerance: + tensor_errors.append( + f" * Max diff scaled = {max_diff / rms_scale} > {sub_config.max_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" ) - return errors + if tensor_errors: + tensor_errors.extend( + [ + f" Test samples: " + "".join(f"{x:12.4e}" for x in samples_test[: self.show_samples].tolist()), + f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: self.show_samples].tolist()), + ] + ) + errors.append("\n".join([f">>>> [{step_name}] Excessive diff for tensor {tensor_name}:"] + tensor_errors)) + + def _compare_tensor_logs( + self, + artifact_path_ref: pathlib.Path, + artifact_path_test: pathlib.Path, + ): + errors = [] + + logs_ref = self._extract_tensor_logs(artifact_path_ref, errors) + logs_test = self._extract_tensor_logs(artifact_path_test, errors) + + for step_key in sorted(self._compare_dict_keys(logs_ref, logs_test, errors, "Logged steps")): + step_logs_ref = logs_ref[step_key] + step_logs_test = logs_test[step_key] + + for tensor_key in self._compare_dict_keys( + step_logs_ref, step_logs_test, errors=errors, name=f"[{step_key}] Tensor keys" + ): + self._compare_tensors( + step_logs_ref[tensor_key], + step_logs_test[tensor_key], + errors, + step_key, + tensor_key, + ) + + return errors + + def compare_tensor_logs( + self, + artifact_path_ref: pathlib.Path, + artifact_path_test: pathlib.Path, + ): + print(f'Comparing tensor logs in "{artifact_path_test}" with reference logs "{artifact_path_ref}"') + errors = self._compare_tensor_logs(artifact_path_ref, artifact_path_test) + if errors: + for error in errors: + print(error) + raise ValueError(f"Comparison failed ({len(errors)} errors)") + else: + print("Comparison succeeded!") def compare_tensor_logs( + self, artifact_path_ref: pathlib.Path, artifact_path_test: pathlib.Path, - config: CompareConfig | None = None, - artifacts: typing.Sequence[str] | None = None, ): - print(f'Comparing tensor logs in "{artifact_path_test}" with reference logs "{artifact_path_ref}"') - errors = compare_tensor_logs_base(artifact_path_ref, artifact_path_test, config, artifacts) - if errors: - for error in errors: - print(error) - raise ValueError(f"Comparison failed ({len(errors)} errors)") - else: - print("Comparison succeeded!") + pass if __name__ == "__main__": @@ -167,4 +185,4 @@ def compare_tensor_logs( parser.add_argument("path_ref", type=pathlib.Path) parser.add_argument("path_test", type=pathlib.Path) args = parser.parse_args() - compare_tensor_logs(args.path_ref, args.path_test) + CompareConfig().compare_tensor_logs(args.path_ref, args.path_test) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index c38939eae..d81e1a33e 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -15,6 +15,43 @@ class DistributedTestingConfig: compare_config: CompareConfig | None = None +# TODO: Ajust +_default_compare = CompareConfig( + rms_eps=1e-4, + rms_rel_tolerance=3e-3, + rms_abs_tolerance=5e-4, + max_rel_tolerance=1.5e-2, + max_abs_tolerance=5e-3, +) +_pp_tied_weight_compare = dataclasses.replace( + 
_default_compare, + sub_configs={ + (None, ("layers.0.word_embeddings_weight", "layers.0.position_embeddings_weight")): CompareConfig( + ignore_duplicates=True + ) + }, +) + +_z3_accumulation_compare = dataclasses.replace( + _default_compare, sub_configs={(None, "Global gradient"): CompareConfig(ignore_duplicates=True)} +) +_bf16_compare = dataclasses.replace( + _default_compare, + rms_eps=1e-3, + rms_rel_tolerance=3e-2, + rms_abs_tolerance=5e-3, + max_rel_tolerance=1.5e-1, + max_abs_tolerance=5e-2, +) +_fp16_compare = dataclasses.replace( + _default_compare, + rms_eps=1e-3, + rms_rel_tolerance=3e-2, + rms_abs_tolerance=5e-3, + max_rel_tolerance=1.5e-1, + max_abs_tolerance=5e-2, +) + # Baseline (also tests data-parallel workers) SIMPLE_TESTING_CONFIG = DistributedTestingConfig( name="simple", @@ -24,12 +61,27 @@ class DistributedTestingConfig: ) _SINGLE_GPU_TESTING_CONFIGS = [ + DistributedTestingConfig( + name="bf16", + compare="simple", + config_args=["model.distributed.training_dtype=bf16"], + num_gpus=1, + compare_config=_bf16_compare, + ), + DistributedTestingConfig( + name="fp16", + compare="simple", + config_args=["model.distributed.training_dtype=bf16"], + num_gpus=1, + compare_config=_fp16_compare, + ), # Sequence-first baseline DistributedTestingConfig( name="sf", compare=None, config_args=["model.base_model.sequence_first=True"], num_gpus=1, + compare_config=_default_compare, ), # Cross-entropy splits. DistributedTestingConfig( @@ -58,6 +110,7 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.breadth_first_micro_batches=4"], num_gpus=1, + compare_config=_default_compare, ), # Mixed gradient accumulation. DistributedTestingConfig( @@ -65,6 +118,7 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], num_gpus=1, + compare_config=_default_compare, ), # Sequence-first gradient accumulation baseline. 
DistributedTestingConfig( @@ -86,6 +140,7 @@ class DistributedTestingConfig: compare="simple", config_args=[], num_gpus=2, + compare_config=_default_compare, ), # Zero stage 2 DistributedTestingConfig( @@ -93,6 +148,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=2"], num_gpus=2, + compare_config=_default_compare, ), # Zero stage 3 DistributedTestingConfig( @@ -100,6 +156,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=3"], num_gpus=2, + compare_config=_default_compare, ), # Depth-first micro-batches DistributedTestingConfig( @@ -107,11 +164,7 @@ class DistributedTestingConfig: compare="df4", config_args=["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], num_gpus=2, - compare_config=CompareConfig( - ignore_duplicates=[ - "Global gradient", - ] - ), + compare_config=_z3_accumulation_compare, ), # Sequence-data-parallel DistributedTestingConfig( @@ -119,6 +172,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.sequence_data_parallel=2"], num_gpus=2, + compare_config=_default_compare, ), # ===== Tensor-parallel configs # Simple tensor-parallel @@ -127,6 +181,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.distributed.tensor_parallel=2"], num_gpus=2, + compare_config=_default_compare, ), # Simple sequence-tensor-parallel DistributedTestingConfig( @@ -134,6 +189,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, + compare_config=_default_compare, ), # Cross-entropy splits DistributedTestingConfig( @@ -146,6 +202,7 @@ class DistributedTestingConfig: "model.base_model.cross_entropy_splits=4", ], num_gpus=2, + compare_config=_default_compare, ), # ===== 2d configs (Data + Tensor) # Simple @@ -157,6 +214,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, + compare_config=_default_compare, ), # Depth-first micro-batches, tensor-parallel DistributedTestingConfig( @@ -167,6 +225,7 @@ class DistributedTestingConfig: "batch.depth_first_micro_batches=4", ], num_gpus=4, + compare_config=_default_compare, ), # Breadth-first micro-batches DistributedTestingConfig( @@ -179,6 +238,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, + compare_config=_default_compare, ), # Sequence-data-parallel DistributedTestingConfig( @@ -190,6 +250,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, + compare_config=_default_compare, ), # ===== Pipeline-parallel configs # Simple [mb] @@ -202,6 +263,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=2, + compare_config=_default_compare, ), # Tied weights on different ranks DistributedTestingConfig( @@ -213,12 +275,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=2, - compare_config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), + compare_config=_pp_tied_weight_compare, ), # Micro-sequence [ms] DistributedTestingConfig( @@ -230,6 +287,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=2, + compare_config=_default_compare, ), # ===== 2d configs (Data + Pipeline) # Simple @@ -242,6 +300,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], 
num_gpus=4, + compare_config=_default_compare, ), # ===== 2d configs (Tensor + Pipeline) # Simple [sf, mb] @@ -256,12 +315,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, - compare_config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), + compare_config=_pp_tied_weight_compare, ), # ===== Data + Tensor + Pipeline # Simple @@ -275,6 +329,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=8, + compare_config=_default_compare, ), # Tied weights on different ranks DistributedTestingConfig( @@ -288,12 +343,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=8, - compare_config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), + compare_config=_pp_tied_weight_compare, ), # Micro-sequence DistributedTestingConfig( @@ -308,6 +358,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=8, + compare_config=_default_compare, ), ] diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 199d5b72c..cbe8539aa 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -144,7 +144,6 @@ def _update_and_add_testing_config( "model.multi_stage.debug_tensor_parallel=True", "model.distributed.reproducible_init=True", "model.distributed.timeout=20", - "model.distributed.training_dtype=bf16", "training.train_iters=2", "training.num_workers=0", "training.timeout=30", diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 602afeb23..6656e2bbe 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -10,7 +10,6 @@ from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.utils import Assert -from tests.utils.compare_tensor_logs import compare_tensor_logs from tests.utils.dataset import get_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig @@ -117,10 +116,9 @@ def do_compare_results_for_all_models( config: DistributedTestingConfig, artifacts: typing.Iterable[str] | None = None ): assert config.compare is not None - compare_tensor_logs( + config.compare_config.compare_tensor_logs( run_test_script_base_path / config.compare / ARTIFACT_PATH, run_test_script_base_path / config.name / ARTIFACT_PATH, - config.compare_config, artifacts, ) From 6a712229e0dfa9d2caa6832643f058a6ab478bb1 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 14 Jul 2025 14:02:21 -0400 Subject: [PATCH 10/14] stuff --- fast_llm/logging.py | 1 + tests/utils/compare_tensor_logs.py | 28 ++++-- tests/utils/dataset.py | 2 +- tests/utils/distributed_configs.py | 140 +++++++++++++++++------------ tests/utils/run_test_script.py | 11 +-- 5 files changed, 112 insertions(+), 70 deletions(-) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index 385a8b960..e8334de6e 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -137,6 +137,7 @@ def log_tensor[ ) -> (T | None): if level < 1: return + tensor = tensor.detach() save_stats = TensorLogs.config.save shape = tuple(tensor.shape) _, dtype = str(tensor.dtype).split("torch.") diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index 59577e25a..743fafea0 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -25,6 +25,8 
@@ class CompareConfig: rms_abs_tolerance: float = 5e-4 max_rel_tolerance: float = 1.5e-2 max_abs_tolerance: float = 5e-3 + # Test tensors are scaled by this amount (ex. gradient scaling). Unscale (divide) them before comparison. + scale: float = 1.0 show_samples: int = 10 ignore_tensors: bool = False ignore_duplicates: bool = False @@ -33,6 +35,20 @@ class CompareConfig: dataclasses.field(default_factory=dict) ) + def rescale(self, factor: float) -> typing.Self: + # Scale all tolerances by this factor. + if factor == 1.0: + return self + return dataclasses.replace( + self, + rms_eps=self.rms_eps * factor, + rms_rel_tolerance=self.rms_rel_tolerance * factor, + rms_abs_tolerance=self.rms_abs_tolerance * factor, + max_rel_tolerance=self.max_rel_tolerance * factor, + max_abs_tolerance=self.max_abs_tolerance * factor, + sub_configs={key: sub_config.rescale(factor) for key, sub_config in self.sub_configs.items()}, + ) + def _get_sub_config(self, step_name: str, tensor_name: str) -> typing.Self: for (step_key, name_key), sub_config in self.sub_configs.items(): if _compare_pattern(step_key, step_name) and _compare_pattern(name_key, tensor_name): @@ -56,7 +72,7 @@ def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): tensor_logs[step_name] = {} if ( tensor_name in (tensor_step_logs := tensor_logs[step_name]) - and not self.ignore_duplicates + and not sub_config.ignore_duplicates ): errors.append(f"Duplicate tensor log in step {step_name}: {tensor_name}") tensor_step_logs[tensor_name] = step_log @@ -98,6 +114,8 @@ def _compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_na samples_ref = tensor_ref["samples"].flatten().float() samples_test = tensor_test["samples"].flatten().float() + if sub_config.scale != 1.0: + samples_test = samples_test / sub_config.scale scale_unreg = (samples_ref**2).mean() ** 0.5 rms_scale = (scale_unreg**2 + sub_config.rms_eps**2) ** 0.5 rms = ((samples_ref - samples_test) ** 2).mean() ** 0.5 @@ -172,14 +190,6 @@ def compare_tensor_logs( print("Comparison succeeded!") -def compare_tensor_logs( - self, - artifact_path_ref: pathlib.Path, - artifact_path_test: pathlib.Path, -): - pass - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("path_ref", type=pathlib.Path) diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py index 2a12c4f7d..713211da6 100644 --- a/tests/utils/dataset.py +++ b/tests/utils/dataset.py @@ -15,7 +15,7 @@ DATASET_CACHE = TEST_RESULTS_PATH / "dataset" DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" -TEST_VOCAB_SIZE = 8192 +TEST_VOCAB_SIZE = 384 # Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" TEST_DATASET_TOKENS = 1000000 diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index d81e1a33e..f70a87956 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -1,3 +1,4 @@ +import copy import dataclasses import logging @@ -13,45 +14,70 @@ class DistributedTestingConfig: config_args: list[str] num_gpus: int = 1 compare_config: CompareConfig | None = None + # Scale the comparison thresholds for specific models. 
+ compare_factor: float = 1.0 + + +def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareConfig: + return CompareConfig( + rms_rel_tolerance=relative, + max_rel_tolerance=relative * 10, + rms_abs_tolerance=absolute, + max_abs_tolerance=absolute * 10, + rms_eps=absolute / 10, + **kwargs, + ) # TODO: Ajust -_default_compare = CompareConfig( - rms_eps=1e-4, - rms_rel_tolerance=3e-3, - rms_abs_tolerance=5e-4, - max_rel_tolerance=1.5e-2, - max_abs_tolerance=5e-3, -) -_pp_tied_weight_compare = dataclasses.replace( - _default_compare, +_compare_layer_match = get_config( sub_configs={ - (None, ("layers.0.word_embeddings_weight", "layers.0.position_embeddings_weight")): CompareConfig( - ignore_duplicates=True - ) - }, + ("init", None): get_config(), + ("train_1", "fw"): get_config(1e-3, 3e-5), + ("train_2", "fw"): get_config(1e-3, 1e-4), + ("train_1", "bw"): get_config(3e-3, 3e-6), + ("train_2", "bw"): get_config(3e-3, 1e-5), + ("train_1", "gradient"): get_config(3e-3, 1e-5), + ("train_2", "gradient"): get_config(3e-3, 3e-5), + } ) -_z3_accumulation_compare = dataclasses.replace( - _default_compare, sub_configs={(None, "Global gradient"): CompareConfig(ignore_duplicates=True)} -) -_bf16_compare = dataclasses.replace( - _default_compare, - rms_eps=1e-3, - rms_rel_tolerance=3e-2, - rms_abs_tolerance=5e-3, - max_rel_tolerance=1.5e-1, - max_abs_tolerance=5e-2, +_compare_layer_mismatch = copy.deepcopy(_compare_layer_match) +_pp_tied_weight_compare = copy.deepcopy(_compare_layer_match) +_z3_accumulation_compare = copy.deepcopy(_compare_layer_match) +for step in ("train_1", "train_2"): + _z3_accumulation_compare.sub_configs[(step, "gradient")].ignore_duplicates = True + for tensor in ("fw", "bw"): + _compare_layer_mismatch.sub_configs[(step, tensor)].ignore_tensors = True + _pp_tied_weight_compare.sub_configs[(step, tensor)].ignore_duplicates = True + + +_bf16_compare = get_config( + sub_configs={ + ("init", None): get_config(), + ("train_1", "fw"): get_config(1e-2, 1e-3), + ("train_2", "fw"): get_config(1e-2, 1e-3), + ("train_1", "bw"): get_config(1e-2, 1e-5), + ("train_2", "bw"): get_config(1e-2, 1e-5), + ("train_1", "gradient"): get_config(2e-2, 3e-5), + ("train_2", "gradient"): get_config(2e-2, 3e-5), + } ) -_fp16_compare = dataclasses.replace( - _default_compare, - rms_eps=1e-3, - rms_rel_tolerance=3e-2, - rms_abs_tolerance=5e-3, - max_rel_tolerance=1.5e-1, - max_abs_tolerance=5e-2, + +_fp16_compare = get_config( + sub_configs={ + ("init", None): get_config(), + # Saved gradient include the gradient scaling by 2**16 (default initial value) + ("train_1", "fw"): get_config(1e-3, 1e-4), + ("train_2", "fw"): get_config(1e-3, 1e-4), + ("train_1", "bw"): get_config(3e-3, 1e-5, scale=2**16), + ("train_2", "bw"): get_config(3e-3, 1e-5, scale=2**16), + ("train_1", "gradient"): get_config(3e-3, 1e-5, scale=2**16), + ("train_2", "gradient"): get_config(3e-3, 1e-5, scale=2**16), + } ) + # Baseline (also tests data-parallel workers) SIMPLE_TESTING_CONFIG = DistributedTestingConfig( name="simple", @@ -71,38 +97,41 @@ class DistributedTestingConfig: DistributedTestingConfig( name="fp16", compare="simple", - config_args=["model.distributed.training_dtype=bf16"], + config_args=["model.distributed.training_dtype=fp16"], num_gpus=1, compare_config=_fp16_compare, ), # Sequence-first baseline DistributedTestingConfig( name="sf", - compare=None, + compare="simple", config_args=["model.base_model.sequence_first=True"], num_gpus=1, - compare_config=_default_compare, + 
compare_config=_compare_layer_mismatch, ), # Cross-entropy splits. DistributedTestingConfig( name="ce4", - compare=None, + compare="simple", config_args=["model.base_model.cross_entropy_splits=4"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), # Micro-sequence baseline DistributedTestingConfig( name="ms", - compare=None, + compare="simple", config_args=["batch.micro_sequence_length=256"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), # Gradient accumulation baseline. DistributedTestingConfig( name="df4", - compare=None, + compare="simple", config_args=["batch.depth_first_micro_batches=4"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), # Breadth-first gradient accumulation. DistributedTestingConfig( @@ -110,7 +139,7 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.breadth_first_micro_batches=4"], num_gpus=1, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Mixed gradient accumulation. DistributedTestingConfig( @@ -118,14 +147,15 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], num_gpus=1, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Sequence-first gradient accumulation baseline. DistributedTestingConfig( name="df4_sf", - compare=None, + compare="simple", config_args=["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), ] @@ -140,7 +170,7 @@ class DistributedTestingConfig: compare="simple", config_args=[], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Zero stage 2 DistributedTestingConfig( @@ -148,7 +178,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=2"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Zero stage 3 DistributedTestingConfig( @@ -156,7 +186,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=3"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Depth-first micro-batches DistributedTestingConfig( @@ -172,7 +202,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.sequence_data_parallel=2"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== Tensor-parallel configs # Simple tensor-parallel @@ -181,7 +211,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.distributed.tensor_parallel=2"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Simple sequence-tensor-parallel DistributedTestingConfig( @@ -189,7 +219,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Cross-entropy splits DistributedTestingConfig( @@ -202,7 +232,7 @@ class DistributedTestingConfig: "model.base_model.cross_entropy_splits=4", ], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== 2d configs (Data + Tensor) # Simple @@ -214,7 +244,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Depth-first 
micro-batches, tensor-parallel DistributedTestingConfig( @@ -225,7 +255,7 @@ class DistributedTestingConfig: "batch.depth_first_micro_batches=4", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Breadth-first micro-batches DistributedTestingConfig( @@ -238,7 +268,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Sequence-data-parallel DistributedTestingConfig( @@ -250,7 +280,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== Pipeline-parallel configs # Simple [mb] @@ -263,7 +293,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Tied weights on different ranks DistributedTestingConfig( @@ -287,7 +317,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== 2d configs (Data + Pipeline) # Simple @@ -300,7 +330,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== 2d configs (Tensor + Pipeline) # Simple [sf, mb] @@ -329,7 +359,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=8, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Tied weights on different ranks DistributedTestingConfig( @@ -358,7 +388,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=8, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), ] diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 6656e2bbe..f4d4dfab0 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -2,6 +2,7 @@ import functools import os import pathlib +import pprint import subprocess import sys import typing @@ -111,15 +112,15 @@ def parse_run_distributed_script(args: list[str] | None = None): def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, + model_testing_config: ModelTestingConfig, ): - def do_compare_results_for_all_models( - config: DistributedTestingConfig, artifacts: typing.Iterable[str] | None = None - ): + def do_compare_results_for_all_models(config: DistributedTestingConfig): assert config.compare is not None - config.compare_config.compare_tensor_logs( + compare_config = config.compare_config.rescale(config.compare_factor) + pprint.pprint(compare_config) + compare_config.compare_tensor_logs( run_test_script_base_path / config.compare / ARTIFACT_PATH, run_test_script_base_path / config.name / ARTIFACT_PATH, - artifacts, ) return do_compare_results_for_all_models From 28f1a886cc9006362785c59b90931ec9080f6542 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 15 Jul 2025 16:10:12 -0400 Subject: [PATCH 11/14] fixes --- tests/models/test_checkpoint.py | 6 ++-- tests/models/test_match_megatron.py | 6 ++-- tests/utils/compare_tensor_logs.py | 4 +-- tests/utils/dataset.py | 12 +++++++- tests/utils/distributed_configs.py | 43 +++++++++++++---------------- tests/utils/model_configs.py | 22 ++++++++++----- tests/utils/run_test_script.py | 17 +++++++----- 7 
files changed, 62 insertions(+), 48 deletions(-) diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 6f30bd318..05acf23dc 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -19,7 +19,7 @@ from fast_llm.engine.checkpoint.convert import ConvertConfig from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName from fast_llm.utils import Assert -from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor +from tests.utils.compare_tensor_logs import CompareConfig from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig @@ -307,7 +307,6 @@ def test_huggingface_model(model_testing_config, get_convert_path): ) ) errors = [] - compare = CompareConfig() auto_model = ( transformers.AutoModel if model_testing_config.name in ("diffusion_llama", "dream") @@ -323,13 +322,12 @@ def test_huggingface_model(model_testing_config, get_convert_path): print(name) output = model(test_input) # TODO: Make a generic comparison util. - compare_logged_tensor( + CompareConfig().compare_tensors( {"samples": output_ref.logits, "shape": output_ref.logits.shape, "step": 0}, {"samples": output.logits, "shape": output.logits.shape, "step": 0}, errors, name, "logits", - compare, ) if errors: diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index edc524e04..c7fa623e5 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -3,7 +3,7 @@ import pytest from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.dataset import DATASET_PREFIX, get_test_dataset +from tests.utils.dataset import MODEL_DATASET_PREFIX, get_model_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingGroup from tests.utils.utils import requires_cuda @@ -17,7 +17,7 @@ def test_megatron(run_distributed_script, model_testing_config, run_test_script_ # Prevent Megatron from complaining. env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" env["NVTE_FLASH_ATTN"] = "0" - get_test_dataset() + get_model_test_dataset() run_distributed_script( [ "Megatron-LM/pretrain_gpt.py", @@ -52,7 +52,7 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config, co config_args=[ "model.distributed.training_dtype=fp32", "data.datasets={}", - f"data.path={DATASET_PREFIX}", + f"data.path={MODEL_DATASET_PREFIX}", "model.base_model.use_megatron_initialization=True", ], num_gpus=1, diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index 743fafea0..a1c17379a 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -91,7 +91,7 @@ def _compare_dict_keys(self, dict_ref, dict_test, errors, name): # Avoid set to preserve ordering. 
return [key for key in dict_test if key in dict_ref] - def _compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_name): + def compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_name): sub_config = self._get_sub_config(step_name, tensor_name) if tensor_ref["shape"] != tensor_test["shape"]: errors.append( @@ -165,7 +165,7 @@ def _compare_tensor_logs( for tensor_key in self._compare_dict_keys( step_logs_ref, step_logs_test, errors=errors, name=f"[{step_key}] Tensor keys" ): - self._compare_tensors( + self.compare_tensors( step_logs_ref[tensor_key], step_logs_test[tensor_key], errors, diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py index 713211da6..ad8385ae9 100644 --- a/tests/utils/dataset.py +++ b/tests/utils/dataset.py @@ -15,11 +15,14 @@ DATASET_CACHE = TEST_RESULTS_PATH / "dataset" DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" -TEST_VOCAB_SIZE = 384 +TEST_VOCAB_SIZE = 8192 # Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" TEST_DATASET_TOKENS = 1000000 +MODEL_DATASET_PREFIX = DATASET_CACHE / "common" / "model_dataset" +MODEL_TEST_VOCAB_SIZE = 384 + def get_test_dataset( prefix: pathlib.Path = DATASET_PREFIX, @@ -60,6 +63,13 @@ def get_test_dataset( ) +def get_model_test_dataset( + prefix: pathlib.Path = MODEL_DATASET_PREFIX, + vocab_size: int = MODEL_TEST_VOCAB_SIZE, +): + return get_test_dataset(prefix=prefix, vocab_size=vocab_size) + + def get_test_concatenated_memmap_dataset( path: pathlib.Path, num_files: int, diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index f70a87956..ef7d5d214 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -14,7 +14,7 @@ class DistributedTestingConfig: config_args: list[str] num_gpus: int = 1 compare_config: CompareConfig | None = None - # Scale the comparison thresholds for specific models. + # Scale the comparison thresholds for specific distributed configs. compare_factor: float = 1.0 @@ -33,34 +33,31 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon _compare_layer_match = get_config( sub_configs={ ("init", None): get_config(), - ("train_1", "fw"): get_config(1e-3, 3e-5), - ("train_2", "fw"): get_config(1e-3, 1e-4), - ("train_1", "bw"): get_config(3e-3, 3e-6), - ("train_2", "bw"): get_config(3e-3, 1e-5), - ("train_1", "gradient"): get_config(3e-3, 1e-5), - ("train_2", "gradient"): get_config(3e-3, 3e-5), + (None, "fw"): get_config(1e-3, 1e-4), + (None, "bw"): get_config(3e-3, 1e-5), + # Biases have higher absolute error. 
+ (None, "bias"): get_config(3e-3, 5e-5), + (None, "gradient"): get_config(3e-3, 3e-5), } ) _compare_layer_mismatch = copy.deepcopy(_compare_layer_match) _pp_tied_weight_compare = copy.deepcopy(_compare_layer_match) _z3_accumulation_compare = copy.deepcopy(_compare_layer_match) -for step in ("train_1", "train_2"): - _z3_accumulation_compare.sub_configs[(step, "gradient")].ignore_duplicates = True - for tensor in ("fw", "bw"): - _compare_layer_mismatch.sub_configs[(step, tensor)].ignore_tensors = True - _pp_tied_weight_compare.sub_configs[(step, tensor)].ignore_duplicates = True +_z3_accumulation_compare.sub_configs[(None, "gradient")].ignore_duplicates = True +_pp_tied_weight_compare.sub_configs[(None, "gradient")].ignore_duplicates = True +for tensor in ("fw", "bw"): + _compare_layer_mismatch.sub_configs[(None, tensor)].ignore_tensors = True + _pp_tied_weight_compare.sub_configs[(None, tensor)].ignore_duplicates = True _bf16_compare = get_config( sub_configs={ ("init", None): get_config(), - ("train_1", "fw"): get_config(1e-2, 1e-3), - ("train_2", "fw"): get_config(1e-2, 1e-3), - ("train_1", "bw"): get_config(1e-2, 1e-5), - ("train_2", "bw"): get_config(1e-2, 1e-5), - ("train_1", "gradient"): get_config(2e-2, 3e-5), - ("train_2", "gradient"): get_config(2e-2, 3e-5), + (None, "fw"): get_config(1e-2, 1e-3), + (None, "bw"): get_config(1e-2, 1e-5), + (None, "bias"): get_config(2e-2, 1e-4), + (None, "gradient"): get_config(2e-2, 3e-5), } ) @@ -68,12 +65,10 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon sub_configs={ ("init", None): get_config(), # Saved gradient include the gradient scaling by 2**16 (default initial value) - ("train_1", "fw"): get_config(1e-3, 1e-4), - ("train_2", "fw"): get_config(1e-3, 1e-4), - ("train_1", "bw"): get_config(3e-3, 1e-5, scale=2**16), - ("train_2", "bw"): get_config(3e-3, 1e-5, scale=2**16), - ("train_1", "gradient"): get_config(3e-3, 1e-5, scale=2**16), - ("train_2", "gradient"): get_config(3e-3, 1e-5, scale=2**16), + (None, "fw"): get_config(1e-3, 1e-4), + (None, "bw"): get_config(3e-3, 1e-5, scale=2**16), + (None, "bias"): get_config(3e-3, 1e-4, scale=2**16), + (None, "gradient"): get_config(3e-3, 5e-5, scale=2**16), } ) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index cbe8539aa..2c07fd0a1 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -20,7 +20,7 @@ Starcoder2GPTHuggingfaceCheckpointFormat, ) from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat -from tests.utils.dataset import DATASET_PREFIX, TEST_VOCAB_SIZE +from tests.utils.dataset import MODEL_DATASET_PREFIX, MODEL_TEST_VOCAB_SIZE _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) @@ -55,6 +55,8 @@ class ModelTestingConfig: megatron_args: list[str] | None checkpoint_format: type[CheckpointFormat] | None groups: dict[ModelTestingGroup, ModelTestingGroupAction] + # Scale the comparison thresholds for specific models. 
+ compare_factor: float = 1.0 @functools.cached_property def trainer_config_class(self) -> type[TrainerConfig]: @@ -96,6 +98,7 @@ def _update_and_add_testing_config( megatron_args: list[str] | None = ..., checkpoint_format: CheckpointFormat | None = ..., groups: dict[ModelTestingGroup, ModelTestingGroupAction], + compare_factor: float = ..., ): config = MODEL_CONFIGS[old_name] updates: dict[str, typing.Any] = { @@ -115,6 +118,8 @@ def _update_and_add_testing_config( updates["megatron_args"] = config.megatron_args + megatron_args if checkpoint_format is not ...: updates["checkpoint_format"] = checkpoint_format + if compare_factor is not ...: + updates["compare_factor"] = compare_factor MODEL_CONFIGS[new_name] = dataclasses.replace(config, **updates) @@ -136,7 +141,7 @@ def _update_and_add_testing_config( "model.base_model.transformer.num_attention_heads=8", "model.base_model.transformer.head_groups=8", "model.base_model.transformer.init_method_std=0.022", - f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", + f"model.base_model.vocab_size={MODEL_TEST_VOCAB_SIZE}", f"model.multi_stage.debug_param_init={_LOG_LEVEL}", f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", @@ -152,17 +157,17 @@ def _update_and_add_testing_config( "data.datasets.training.type=slice", "data.datasets.training.end=0.969", "data.datasets.training.dataset.type=memmap", - f"data.datasets.training.dataset.path={DATASET_PREFIX}", + f"data.datasets.training.dataset.path={MODEL_DATASET_PREFIX}", "data.datasets.validation.type=slice", "data.datasets.validation.begin=0.969", "data.datasets.validation.end=0.999", "data.datasets.validation.dataset.type=memmap", - f"data.datasets.validation.dataset.path={DATASET_PREFIX}", + f"data.datasets.validation.dataset.path={MODEL_DATASET_PREFIX}", "data.datasets.test.type=slice", "data.datasets.test.begin=0.999", "data.datasets.test.end=1", "data.datasets.test.dataset.type=memmap", - f"data.datasets.test.dataset.path={DATASET_PREFIX}", + f"data.datasets.test.dataset.path={MODEL_DATASET_PREFIX}", "optimizer.learning_rate.base=0.0001", ], megatron_args=[ @@ -189,8 +194,8 @@ def _update_and_add_testing_config( "--valid-num-workers=0", "--tokenizer-type=NullTokenizer", # Megatron messes with the vocab size, so we have to subtract 1. 
- f"--vocab-size={TEST_VOCAB_SIZE - 1}", - f"--data-path={DATASET_PREFIX}", + f"--vocab-size={MODEL_TEST_VOCAB_SIZE - 1}", + f"--data-path={MODEL_DATASET_PREFIX}", "--lr-decay-style=constant", # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) "--use-mcore-models", @@ -439,6 +444,7 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.normal, ModelTestingGroup.distributed: ModelTestingGroupAction.normal, }, + compare_factor=2.0, ) _update_and_add_testing_config( @@ -466,6 +472,7 @@ def _update_and_add_testing_config( # TODO: Fix and bring back to `testing_groups` ModelTestingGroup.distributed: ModelTestingGroupAction.broken, }, + compare_factor=10.0, ) @@ -487,6 +494,7 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, + compare_factor=10.0, ) diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index f4d4dfab0..c188ccd5c 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -6,12 +6,13 @@ import subprocess import sys import typing +import warnings import pytest from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.utils import Assert -from tests.utils.dataset import get_test_dataset +from tests.utils.dataset import get_model_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig @@ -71,7 +72,7 @@ def do_run_test_script_for_all_models( base_path: pathlib.Path, ): Assert.leq(distributed_testing_config.num_gpus, DistributedConfig.default_world_size) - get_test_dataset() + get_model_test_dataset() args = [ "fast-llm", "train", @@ -116,11 +117,13 @@ def compare_results_for_all_models( ): def do_compare_results_for_all_models(config: DistributedTestingConfig): assert config.compare is not None - compare_config = config.compare_config.rescale(config.compare_factor) + compare_config = config.compare_config.rescale(config.compare_factor * model_testing_config.compare_factor) pprint.pprint(compare_config) - compare_config.compare_tensor_logs( - run_test_script_base_path / config.compare / ARTIFACT_PATH, - run_test_script_base_path / config.name / ARTIFACT_PATH, - ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning, message="Ignoring keys in ") + compare_config.compare_tensor_logs( + run_test_script_base_path / config.compare / ARTIFACT_PATH, + run_test_script_base_path / config.name / ARTIFACT_PATH, + ) return do_compare_results_for_all_models From ca65becfd76eecccec5ffa87b4bec777f69b9ae0 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 15 Jul 2025 18:17:23 -0400 Subject: [PATCH 12/14] fixes --- fast_llm/engine/schedule/config.py | 6 ----- fast_llm/layers/language_model/head.py | 6 +++++ fast_llm/utils.py | 2 +- tests/data/test_concatenated_memmap.py | 36 ++++++++++++++------------ tests/models/test_match_megatron.py | 4 +-- tests/utils/compare_tensor_logs.py | 8 +----- tests/utils/distributed_configs.py | 20 +++++++++++--- tests/utils/run_test_script.py | 11 +++----- 8 files changed, 51 insertions(+), 42 deletions(-) diff --git a/fast_llm/engine/schedule/config.py b/fast_llm/engine/schedule/config.py index 141490ac3..272b7c6ae 100644 --- a/fast_llm/engine/schedule/config.py +++ b/fast_llm/engine/schedule/config.py @@ -1,6 +1,5 @@ 
import enum import functools -import warnings from fast_llm.config import Config, Field, FieldHint, check_field, config_class, test_field from fast_llm.engine.distributed.config import DistributedConfig @@ -105,11 +104,6 @@ def _validate(self) -> None: if self._distributed.pipeline_parallel > 1 and self.depth_first_micro_batches > 1: raise NotImplementedError("Depth-first pipeline parallelism not yet implemented") - if self.depth_first_micro_batches > 1 and self.breadth_first_micro_batches > 1: - warnings.warn( - "Mixing of breadth-first and depth-first gradient accumulation is not thoroughly tested." - " Use at your own risk." - ) super()._validate() diff --git a/fast_llm/layers/language_model/head.py b/fast_llm/layers/language_model/head.py index 69eebff39..25fc2b28d 100644 --- a/fast_llm/layers/language_model/head.py +++ b/fast_llm/layers/language_model/head.py @@ -6,6 +6,7 @@ from torch.distributed import all_reduce from fast_llm.config import Configurable +from fast_llm.core.ops import split_op from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.config_utils.tensor_space import DefaultDimNames, TensorDim, TensorSpace from fast_llm.engine.distributed.config import DistributedDimNames @@ -234,6 +235,11 @@ def _get_targets( lm_target = None targets = (dpo_target, lm_target, distillation_target, loss_mask) + if self._sequence_parallel_logits: + targets = [ + None if target is None else split_op(target, self._tensor_space.distributed.tensor_group, 0) + for target in targets + ] if not any(target is not None for target in targets): # Simplify so we don't have to check every time. targets = None diff --git a/fast_llm/utils.py b/fast_llm/utils.py index 821ec5874..472f5e9b7 100644 --- a/fast_llm/utils.py +++ b/fast_llm/utils.py @@ -145,7 +145,7 @@ def multiple(x, y): @staticmethod def rms_close(x, y, threshold): - rms = rms_diff(x, y).item() + rms = rms_diff(x, y).detach().item() assert rms <= threshold, f"Rms diff too big ({rms:.3e} > {threshold:.3e}) between tensors {x} and {y}" @staticmethod diff --git a/tests/data/test_concatenated_memmap.py b/tests/data/test_concatenated_memmap.py index 0ab7c7fe4..1cc22250d 100644 --- a/tests/data/test_concatenated_memmap.py +++ b/tests/data/test_concatenated_memmap.py @@ -1,3 +1,5 @@ +import pytest + from fast_llm.data.dataset.gpt.config import GPTConcatenatedMemmapConfig from tests.data.common import ( compare_indexed_dataset, @@ -42,10 +44,11 @@ def test_gpt_concatenated_memmap(): # Make sure dataset splitting works and check for unintended changes in behavior. 
_get_test_dataset_concatenated_memmap() # samples[9:18] - dataset = get_dataset_config( - {"type": "concatenated_memmap", "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP}, - GPTConcatenatedMemmapConfig, - ).build() + with pytest.warns(DeprecationWarning): + dataset = get_dataset_config( + {"type": "concatenated_memmap", "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP}, + GPTConcatenatedMemmapConfig, + ).build() compare_indexed_dataset( dataset, CONCATENATED_MEMMAP_DATASET_LENGTH, @@ -58,16 +61,17 @@ def test_gpt_concatenated_memmap(): def test_gpt_concatenated_memmap_data(): _get_test_dataset_concatenated_memmap() - get_test_data_and_compare_samples( - { - "datasets": { - "Training": { - "type": "concatenated_memmap", - "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP, + with pytest.warns(DeprecationWarning): + get_test_data_and_compare_samples( + { + "datasets": { + "Training": { + "type": "concatenated_memmap", + "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP, + } } - } - }, - 8, - sequence_length=5, - expected_samples=CONCATENATED_MEMMAP_SAMPLES, - ) + }, + 8, + sequence_length=5, + expected_samples=CONCATENATED_MEMMAP_SAMPLES, + ) diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index c7fa623e5..081f3fb1e 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -36,13 +36,13 @@ def test_megatron(run_distributed_script, model_testing_config, run_test_script_ def test_match_megatron(run_test_script_for_all_models, model_testing_config, compare_results_for_all_models): assert model_testing_config.megatron_args is not None - ignore_tensors = [ + ignore_tensors = ( ".self_attn.query_key_value.", ".self_attn.query.", ".self_attn.key_value.", ".mlp.layer_2.weight", ".mlp.experts.", - ] + ) if model_testing_config.name == "mixtral": ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index a1c17379a..51ee66d31 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -2,7 +2,6 @@ import dataclasses import pathlib import typing -import warnings import torch @@ -57,7 +56,6 @@ def _get_sub_config(self, step_name: str, tensor_name: str) -> typing.Self: def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): tensor_logs = {} - ignore_keys = set() for rank_path in sorted(artifact_path.iterdir()): for p in rank_path.iterdir(): if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": @@ -65,9 +63,7 @@ def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): for step_log in torch.load(p): tensor_name = step_log["name"] sub_config = self._get_sub_config(step_name, tensor_name) - if sub_config.ignore_tensors: - ignore_keys.add(f"{step_name}/{tensor_name}") - else: + if not sub_config.ignore_tensors: if step_name not in tensor_logs: tensor_logs[step_name] = {} if ( @@ -76,8 +72,6 @@ def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): ): errors.append(f"Duplicate tensor log in step {step_name}: {tensor_name}") tensor_step_logs[tensor_name] = step_log - if ignore_keys: - warnings.warn(f"Ignoring keys in {artifact_path}: {ignore_keys}") return tensor_logs def _compare_dict_keys(self, dict_ref, dict_test, errors, name): diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index ef7d5d214..d054c9889 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -44,8 +44,10 @@ def 
get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon _compare_layer_mismatch = copy.deepcopy(_compare_layer_match) _pp_tied_weight_compare = copy.deepcopy(_compare_layer_match) _z3_accumulation_compare = copy.deepcopy(_compare_layer_match) +_z3_accumulation_compare.sub_configs[(None, "bias")].ignore_duplicates = True _z3_accumulation_compare.sub_configs[(None, "gradient")].ignore_duplicates = True _pp_tied_weight_compare.sub_configs[(None, "gradient")].ignore_duplicates = True +_pp_tied_weight_compare.sub_configs[("init", None)].ignore_duplicates = True for tensor in ("fw", "bw"): _compare_layer_mismatch.sub_configs[(None, tensor)].ignore_tensors = True _pp_tied_weight_compare.sub_configs[(None, tensor)].ignore_duplicates = True @@ -55,8 +57,8 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon sub_configs={ ("init", None): get_config(), (None, "fw"): get_config(1e-2, 1e-3), - (None, "bw"): get_config(1e-2, 1e-5), - (None, "bias"): get_config(2e-2, 1e-4), + (None, "bw"): get_config(1.5e-2, 1e-5), + (None, "bias"): get_config(2e-2, 1e-3), (None, "gradient"): get_config(2e-2, 3e-5), } ) @@ -212,7 +214,11 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon DistributedTestingConfig( name="stp2", compare="sf", - config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", + ], num_gpus=2, compare_config=_compare_layer_match, ), @@ -223,6 +229,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon config_args=[ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.base_model.parallel_embeddings=False", "model.base_model.cross_entropy_splits=4", ], @@ -237,6 +244,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon config_args=[ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", ], num_gpus=4, compare_config=_compare_layer_match, @@ -260,6 +268,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "batch.breadth_first_micro_batches=4", ], num_gpus=4, @@ -273,6 +282,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", ], num_gpus=4, compare_config=_compare_layer_match, @@ -335,6 +345,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon config_args=[ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", "batch.breadth_first_micro_batches=4", @@ -349,6 +360,8 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon compare="mb", config_args=[ "model.distributed.tensor_parallel=2", + 
"model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", "batch.breadth_first_micro_batches=4", @@ -378,6 +391,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", "batch.micro_sequence_length=256", diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index c188ccd5c..b8f996a82 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -6,7 +6,6 @@ import subprocess import sys import typing -import warnings import pytest @@ -119,11 +118,9 @@ def do_compare_results_for_all_models(config: DistributedTestingConfig): assert config.compare is not None compare_config = config.compare_config.rescale(config.compare_factor * model_testing_config.compare_factor) pprint.pprint(compare_config) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning, message="Ignoring keys in ") - compare_config.compare_tensor_logs( - run_test_script_base_path / config.compare / ARTIFACT_PATH, - run_test_script_base_path / config.name / ARTIFACT_PATH, - ) + compare_config.compare_tensor_logs( + run_test_script_base_path / config.compare / ARTIFACT_PATH, + run_test_script_base_path / config.name / ARTIFACT_PATH, + ) return do_compare_results_for_all_models From a488e03dd4d77912b5729b59753e39d7791dd3e7 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 15 Jul 2025 23:05:49 -0400 Subject: [PATCH 13/14] fixes --- fast_llm/layers/ssm/discrete_mamba2.py | 3 +++ fast_llm/models/ssm/config.py | 6 ++++++ tests/models/distributed_test_model.py | 2 ++ tests/models/test_match_megatron.py | 2 +- tests/models/test_model.py | 11 ++++++++++- tests/utils/distributed_configs.py | 4 ++-- tests/utils/model_configs.py | 20 ++++++++++++-------- 7 files changed, 36 insertions(+), 12 deletions(-) diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py index 31e81e99b..b0aa96805 100644 --- a/fast_llm/layers/ssm/discrete_mamba2.py +++ b/fast_llm/layers/ssm/discrete_mamba2.py @@ -7,6 +7,7 @@ from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace from fast_llm.layers.common.linear import Linear from fast_llm.layers.ssm.config import SSMConfig, SSMDimNames +from fast_llm.layers.transformer.config import TransformerKwargs from fast_llm.tensor import ParameterMeta, init_ones_, init_uniform_, init_zeros_, kaiming_init_ from fast_llm.utils import get_lr_scale @@ -157,6 +158,8 @@ def forward(self, hidden_states, kwargs): outputs["hidden_states"]: (B, L, D). outputs["state"]: inference cache. """ + if kwargs[TransformerKwargs.sequence_first]: + raise NotImplementedError(f"Sequence-first not supported for SSMs.") assert _mamba_available input_ = hidden_states diff --git a/fast_llm/models/ssm/config.py b/fast_llm/models/ssm/config.py index 3c47ff0b2..ecd8908ee 100644 --- a/fast_llm/models/ssm/config.py +++ b/fast_llm/models/ssm/config.py @@ -197,6 +197,12 @@ def _validate(self): logger.warning( "HybridSSMModelConfig is being instantiated. This model is experimental and may not work as expected." 
) + if ( + self.base_model.sequence_first + or self.distributed.sequence_data_parallel > 1 + or self.distributed.sequence_tensor_parallel + ): + raise NotImplementedError(f"Sequence-first not supported for SSMs.") super()._validate() diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py index 933b215e7..564920bd5 100644 --- a/tests/models/distributed_test_model.py +++ b/tests/models/distributed_test_model.py @@ -27,6 +27,8 @@ def main(args: list[str] | None = None) -> None: group = pool.get_process_group(range(world_size), rank) for name, config in DISTRIBUTED_TESTING_CONFIGS.items(): + if model_testing_config.should_skip(config): + continue if world_size < config.num_gpus: logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {world_size} < {config.num_gpus})"})") continue diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index 081f3fb1e..30667cd17 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -44,7 +44,7 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config, co ".mlp.experts.", ) if model_testing_config.name == "mixtral": - ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) + ignore_tensors += (".mlp.experts.", ".mlp.layer_1.weight") distributed_testing_config = DistributedTestingConfig( name="match_megatron", diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 4a344cdc7..5c4897646 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -28,10 +28,16 @@ def test_model_simple(run_test_script_for_all_models, run_test_script_base_path) # Parametrize with config name so it shows in test name. @pytest.mark.parametrize("config_name", SINGLE_GPU_TESTING_CONFIGS) def test_and_compare_model( - run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path + run_test_script_for_all_models, + compare_results_for_all_models, + config_name, + run_test_script_base_path, + model_testing_config, ): # We can expect tests to respect the ordering of `SINGLE_GPU_TESTING_CONFIGS`, so compare should have run already. config = SINGLE_GPU_TESTING_CONFIGS[config_name] + if model_testing_config.should_skip(config): + pytest.skip(f"Configuration not supported.") if config.compare is not None: check_subtest_success(run_test_script_base_path / config.compare) # A baseline config (single-gpu, bf16, flash-attn). 
@@ -73,8 +79,11 @@ def test_model_distributed( config_name, run_test_script_base_path, report_subtest, + model_testing_config, ): config = DISTRIBUTED_TESTING_CONFIGS[config_name] + if model_testing_config.should_skip(config): + pytest.skip(f"Configuration not supported.") if torch.cuda.device_count() < config.num_gpus: pytest.skip(f"Not enough GPUs: {torch.cuda.device_count()} < {config.num_gpus}") report_subtest(run_test_script_base_path / config.name, config.num_gpus) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index d054c9889..c3064d987 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -59,7 +59,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon (None, "fw"): get_config(1e-2, 1e-3), (None, "bw"): get_config(1.5e-2, 1e-5), (None, "bias"): get_config(2e-2, 1e-3), - (None, "gradient"): get_config(2e-2, 3e-5), + (None, "gradient"): get_config(2e-2, 5e-5), } ) @@ -67,7 +67,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon sub_configs={ ("init", None): get_config(), # Saved gradient include the gradient scaling by 2**16 (default initial value) - (None, "fw"): get_config(1e-3, 1e-4), + (None, "fw"): get_config(1e-3, 3e-4), (None, "bw"): get_config(3e-3, 1e-5, scale=2**16), (None, "bias"): get_config(3e-3, 1e-4, scale=2**16), (None, "gradient"): get_config(3e-3, 5e-5, scale=2**16), diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 2c07fd0a1..f1890aff8 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -21,6 +21,7 @@ ) from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat from tests.utils.dataset import MODEL_DATASET_PREFIX, MODEL_TEST_VOCAB_SIZE +from tests.utils.distributed_configs import DistributedTestingConfig _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) @@ -57,6 +58,8 @@ class ModelTestingConfig: groups: dict[ModelTestingGroup, ModelTestingGroupAction] # Scale the comparison thresholds for specific models. compare_factor: float = 1.0 + # Option to skip specific distributed configuration with name containing any of the provided strings. 
+ skip_tests: tuple[str] = () @functools.cached_property def trainer_config_class(self) -> type[TrainerConfig]: @@ -88,6 +91,9 @@ def model_class(self): def base_model_config_class(self): return self.model_config_class.get_base_model_config_class() + def should_skip(self, distributed_config: DistributedTestingConfig) -> bool: + return any(key in distributed_config.name for key in self.skip_tests) + def _update_and_add_testing_config( old_name: str, @@ -96,9 +102,8 @@ def _update_and_add_testing_config( model_type: str | None = None, extra_args: list[str] | None = None, megatron_args: list[str] | None = ..., - checkpoint_format: CheckpointFormat | None = ..., groups: dict[ModelTestingGroup, ModelTestingGroupAction], - compare_factor: float = ..., + **kwargs, ): config = MODEL_CONFIGS[old_name] updates: dict[str, typing.Any] = { @@ -116,10 +121,7 @@ def _update_and_add_testing_config( updates["megatron_args"] = megatron_args else: updates["megatron_args"] = config.megatron_args + megatron_args - if checkpoint_format is not ...: - updates["checkpoint_format"] = checkpoint_format - if compare_factor is not ...: - updates["compare_factor"] = compare_factor + updates.update(kwargs) MODEL_CONFIGS[new_name] = dataclasses.replace(config, **updates) @@ -362,6 +364,7 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, + compare_factor=2.0, ) _update_and_add_testing_config( @@ -472,7 +475,9 @@ def _update_and_add_testing_config( # TODO: Fix and bring back to `testing_groups` ModelTestingGroup.distributed: ModelTestingGroupAction.broken, }, - compare_factor=10.0, + compare_factor=2.0, + # SSMs don't support sequence-first configurations. + skip_tests=("sf", "sdp", "stp", "ms"), ) @@ -494,7 +499,6 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, - compare_factor=10.0, ) From 5f0b87a130b0a8837e866fbfa4aac530c9aab921 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 16 Jul 2025 14:53:22 -0400 Subject: [PATCH 14/14] Parallel safe --- tests/conftest.py | 6 ++---- tests/utils/dataset.py | 12 ++++++------ tests/utils/utils.py | 11 ++++++++++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e9011979a..298117e1d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -95,10 +95,8 @@ def pytest_configure(config): else: worker_id = 0 - # TODO: Remove the whole `TEST_RESULTS_PATH` once `get_test_dataset` is parallel-safe. 
- model_result_path = TEST_RESULTS_PATH / "models" - if model_result_path.exists(): - shutil.rmtree(model_result_path) + if TEST_RESULTS_PATH.exists(): + shutil.rmtree(TEST_RESULTS_PATH) num_gpus = torch.cuda.device_count() if num_gpus > 0 and is_parallel: diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py index ad8385ae9..a4136c40e 100644 --- a/tests/utils/dataset.py +++ b/tests/utils/dataset.py @@ -7,20 +7,20 @@ from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset from fast_llm.data.dataset.gpt.sampled import GPTSample -from tests.utils.utils import TEST_RESULTS_PATH +from tests.utils.utils import SHARED_RESULT_PATH, TEST_RESULTS_PATH # TODO: Fixtures -TOKENIZER_PATH = TEST_RESULTS_PATH / "tokenizer" / "common" +TOKENIZER_PATH = SHARED_RESULT_PATH / "tokenizer" TOKENIZER_FILE = TOKENIZER_PATH / "tokenizer.json" -DATASET_CACHE = TEST_RESULTS_PATH / "dataset" -DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" -DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" +DATASET_CACHE = SHARED_RESULT_PATH / "dataset" +DATASET_PREFIX = DATASET_CACHE / "common_dataset" +DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset_sampling_cache" TEST_VOCAB_SIZE = 8192 # Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" TEST_DATASET_TOKENS = 1000000 -MODEL_DATASET_PREFIX = DATASET_CACHE / "common" / "model_dataset" +MODEL_DATASET_PREFIX = DATASET_CACHE / "model_dataset" MODEL_TEST_VOCAB_SIZE = 384 diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 54efe0966..25d5221d8 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,6 +1,7 @@ import json import logging import math +import os import pathlib import sys import time @@ -23,9 +24,17 @@ requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") - +# Directory for all test data and results. +# Cannot be a fixture because it's used outside testing environment (ex. distributed scripts). TEST_RESULTS_PATH = pathlib.Path("/tmp/fast_llm_tests") +# Directory for data that is shared between independent tests and may not be parallel-safe, +# ex. generated dataset and downloaded files. +if worker_name := os.environ.get("PYTEST_XDIST_WORKER"): + SHARED_RESULT_PATH = TEST_RESULTS_PATH / f"common_{worker_name}" +else: + SHARED_RESULT_PATH = TEST_RESULTS_PATH / "common" + @pytest.fixture(scope="session") def result_path():
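
The changes above all serve one pattern: each distributed run is compared against a single-GPU baseline using per-tensor tolerances that can be unscaled (to undo fp16 gradient scaling) and rescaled (multiplying a per-distributed-config factor by a per-model factor before the comparison runs). Below is a minimal, self-contained sketch of that pattern. The ToleranceSketch class, its field names, and the example numbers are illustrative assumptions for this sketch only; they are not the actual CompareConfig from tests/utils/compare_tensor_logs.py.

    # Illustrative sketch of the tolerance unscale/rescale idea used by the test
    # comparison utilities. Hypothetical class; simplified to two thresholds.
    import dataclasses

    import torch


    @dataclasses.dataclass
    class ToleranceSketch:
        rms_abs_tolerance: float = 1e-5
        max_abs_tolerance: float = 1e-4
        # Test tensors may carry a known scaling (ex. fp16 gradient scaling by 2**16);
        # divide it out before comparing.
        scale: float = 1.0

        def rescale(self, factor: float) -> "ToleranceSketch":
            # Loosen (or tighten) all thresholds by a multiplicative factor, as done
            # when combining a distributed-config factor with a model-level factor.
            return dataclasses.replace(
                self,
                rms_abs_tolerance=self.rms_abs_tolerance * factor,
                max_abs_tolerance=self.max_abs_tolerance * factor,
            )

        def check(self, reference: torch.Tensor, test: torch.Tensor) -> list[str]:
            # Returns a list of error messages; empty means the tensors match.
            errors = []
            test = test / self.scale
            diff = (reference - test).float()
            rms = diff.pow(2).mean().sqrt().item()
            max_abs = diff.abs().max().item()
            if rms > self.rms_abs_tolerance:
                errors.append(f"RMS diff {rms:.3e} > {self.rms_abs_tolerance:.3e}")
            if max_abs > self.max_abs_tolerance:
                errors.append(f"Max diff {max_abs:.3e} > {self.max_abs_tolerance:.3e}")
            return errors


    if __name__ == "__main__":
        ref = torch.randn(1024)
        # Pretend the test run logged gradients scaled by 2**16 (fp16 loss scaling).
        test = (ref + 1e-6 * torch.randn(1024)) * 2**16
        # e.g. compare_factor=2.0 for a model whose outputs are known to be noisier.
        config = ToleranceSketch(scale=2**16).rescale(2.0)
        print(config.check(ref, test) or "Comparison succeeded!")

In the fixture above, the two factors multiply (config.compare_factor * model_testing_config.compare_factor) before compare_tensor_logs runs, which keeps the baseline tolerances tight while letting noisier models or parallel layouts loosen them uniformly rather than per tensor.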