From c32e09c2b19900551b089b7482b9dfb8b11b6e4a Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 3 Jul 2025 10:21:25 -0400 Subject: [PATCH 01/14] stuff --- fast_llm/engine/distributed/config.py | 15 + fast_llm/engine/distributed/distributed.py | 64 ++++- tests/conftest.py | 4 +- tests/models/distributed_test_model.py | 59 ++++ tests/models/test_checkpoint.py | 76 +++--- tests/models/test_match_megatron.py | 39 ++- tests/models/test_mb.py | 92 ------- tests/models/test_mb_seq_first.py | 50 ---- tests/models/test_model.py | 75 +++++ tests/models/test_ms.py | 45 --- tests/models/test_seq_first.py | 48 ---- tests/models/test_simple.py | 99 ------- tests/utils/distributed_configs.py | 304 +++++++++++++++++++++ tests/utils/run_test_script.py | 147 +++------- tests/utils/utils.py | 56 ++++ 15 files changed, 672 insertions(+), 501 deletions(-) create mode 100644 tests/models/distributed_test_model.py delete mode 100644 tests/models/test_mb.py delete mode 100644 tests/models/test_mb_seq_first.py create mode 100644 tests/models/test_model.py delete mode 100644 tests/models/test_ms.py delete mode 100644 tests/models/test_seq_first.py delete mode 100644 tests/models/test_simple.py create mode 100644 tests/utils/distributed_configs.py diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py index 7fd9fed13..9f006cdb1 100644 --- a/fast_llm/engine/distributed/config.py +++ b/fast_llm/engine/distributed/config.py @@ -79,6 +79,14 @@ def setup(self, group: "ProcessGroup|None"): Assert.eq(group.rank(), self.rank) self._group = group + def check_ranks_in_range(self, start, stop): + check_ranks_in_range(self.global_ranks, start, stop) + + +def check_ranks_in_range(global_ranks, start, stop): + Assert.geq(min(global_ranks), start) + Assert.lt(max(global_ranks), stop) + class DistributedDimNames: # A set of common distributed dim names packed into a singleton. 
@@ -348,6 +356,13 @@ def _get_global_ranks(self, size: int, stride: int) -> range: def _add_distributed_dim(self, distributed_dim: DistributedDim) -> None: Assert.eq(distributed_dim.global_ranks[distributed_dim.rank], self.rank, msg=distributed_dim) + + logger.info(f"Initializing group {distributed_dim}") + try: + distributed_dim.check_ranks_in_range(0, self.world_size) + except: + logger.info(str(self)) + raise if distributed_dim.name in self.distributed_dims: Assert.eq(distributed_dim, self.distributed_dims[distributed_dim.name]) else: diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index fbbf9b6a7..977318841 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -13,6 +13,7 @@ DistributedDim, DistributedDimNames, PhaseType, + check_ranks_in_range, ) from fast_llm.utils import Assert @@ -20,14 +21,34 @@ class ProcessGroupPool: - def __init__(self, rank: int | None = None, world_size: int | None = None, timeout: float = 60): + def __init__( + self, + rank: int | None = None, + world_size: int | None = None, + local_world_size: int | None = None, + timeout: float = 60, + use_cpu: bool = False, + ): self._rank = DistributedConfig.default_rank if rank is None else rank self._world_size = DistributedConfig.default_world_size if world_size is None else world_size + self._local_world_size = ( + DistributedConfig.default_local_world_size if local_world_size is None else local_world_size + ) self._timeout = timeout + self._use_cpu = use_cpu + + if self._use_cpu: + Assert.eq(self._world_size, 1) + self._device = torch.device("cpu") + else: + Assert.in_range_incl(self._local_world_size, 1, torch.cuda.device_count()) + torch.cuda.init() + self._device = torch.device(self._rank) + torch.cuda.set_device(self._device) if self._world_size > 1: - if rank == 0: + if self._rank == 0: logger.info("Initializing TCP store.") # We bypass `torch.distributed.init_process_group` which makes things way more complicated for no reason. # TODO: Allow other init methods? @@ -49,12 +70,21 @@ def rank(self): def world_size(self): return self._world_size + @property + def local_world_size(self): + return self._local_world_size + + @property + def device(self): + return self._device + def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> ProcessGroup | None: """ Get the requested process group from the pool, or create it if it doesn't exist. 
""" group_size = len(global_ranks) Assert.eq(global_ranks[group_rank], self._rank) + check_ranks_in_range(global_ranks, 0, self._world_size) if group_size == 1: return None @@ -85,6 +115,7 @@ def __enter__(self): global _default_pool assert _default_pool is None _default_pool = self + return self def __exit__(self, exc_type, exc_val, exc_tb): global _default_pool @@ -120,24 +151,22 @@ class Distributed[ConfigType: DistributedConfig](Configurable[ConfigType]): def __init__(self, config: DistributedConfig, use_cpu: bool = False): super().__init__(config) assert self._config.reference_config is None - self._use_cpu = use_cpu - - if self._use_cpu: - Assert.eq(self._config.world_size, 1) - self.device = torch.device("cpu") - else: - Assert.in_range_incl(self._config.local_world_size, 1, torch.cuda.device_count()) - torch.cuda.init() - self.device = torch.device(self._config.local_rank) - torch.cuda.set_device(self.device) self._local_pool = _default_pool is None if self._local_pool: - self._pool = ProcessGroupPool(self._config.rank, self._config.world_size, self._config.timeout) + self._pool = ProcessGroupPool( + self._config.rank, + self._config.world_size, + self._config.local_world_size, + self._config.timeout, + use_cpu, + ) else: self._pool = _default_pool - Assert.eq(self._pool._world_size, self._config.world_size) - Assert.eq(self._pool._rank, self._config.rank) + Assert.geq(self._pool.world_size, self._config.world_size) + Assert.eq(self._pool.rank, self._config.rank) + Assert.geq(self._pool.local_world_size, self._config.local_world_size) + Assert.eq(self._pool.device.type, "cpu" if use_cpu else "cuda") self.world_group = self.add_group(self._config.distributed_dims[DistributedDimNames.world]) self.data_group = self.add_group(self._config.distributed_dims[DistributedDimNames.data]) @@ -188,11 +217,16 @@ def __init__(self, config: DistributedConfig, use_cpu: bool = False): self.set_step(0, PhaseType.training) + @property + def device(self): + return self._pool.device + def add_group(self, distributed_dim: DistributedDim) -> ProcessGroup | None: """ Add a process group from its definition. 
""" self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") + distributed_dim.check_ranks_in_range(0, self._config.world_size) group = self._pool.get_process_group(distributed_dim.global_ranks, distributed_dim.rank) distributed_dim.setup(group) return group diff --git a/tests/conftest.py b/tests/conftest.py index 27ea5f63d..4c9161ea6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,8 +15,8 @@ # Make fixtures available globally without import from tests.utils.run_test_script import ( # isort: skip - run_distributed_script_for_all_models, - run_test_script, + compare_results_for_all_models, + run_distributed_script, run_test_script_base_path, run_test_script_for_all_models, ) diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py new file mode 100644 index 000000000..ad19eeafa --- /dev/null +++ b/tests/models/distributed_test_model.py @@ -0,0 +1,59 @@ +import logging + +import torch + +from fast_llm.cli import fast_llm_main_wrapper +from fast_llm.core.distributed import allreduce_scalar, safe_barrier +from fast_llm.engine.distributed.config import DistributedConfig +from fast_llm.engine.distributed.distributed import ProcessGroupPool +from tests.utils.distributed_configs import DISTRIBUTED_TESTING_CONFIGS +from tests.utils.run_test_script import do_run_test_script_for_all_models, parse_run_distributed_script +from tests.utils.utils import DistributedSubtestContext + +logger = logging.getLogger(__name__) + + +def main(args: list[str] | None = None) -> None: + base_path, model_testing_config = parse_run_distributed_script(args) + + with ProcessGroupPool(timeout=20) as pool: + failures = [] + world_size = DistributedConfig.default_world_size + rank = DistributedConfig.default_rank + group = pool.get_process_group(range(world_size), rank) + + for name, config in DISTRIBUTED_TESTING_CONFIGS.items(): + if config.num_gpus > world_size: + logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {config.num_gpus} > {world_size})"})") + if DistributedConfig.default_rank < config.num_gpus: + logger.info(f"Running {name}") + with DistributedSubtestContext(base_path / name, rank) as subtest: + do_run_test_script_for_all_models(config, model_testing_config, base_path) + assert subtest._capture_manager._global_capturing is None + success = subtest.success + else: + # Worker is not needed for this one, skip. + success = True + + # Barrier so `allreduce_scalar` doesn't go crazy in case of desync. + safe_barrier(group, name) + success = ( + success if group is None else allreduce_scalar(success, dtype=torch.int64, group=group) == world_size + ) + logger.warning(f"{name} {"PASSED" if success else "FAILED"})") + if not success: + failures.append(name) + if rank == 0: + (base_path / name / "pytest_success").write_text(str(int(success))) + + # Final barrier to ensure everything is done before torchrun potentially kills workers. + safe_barrier(group, "testing end") + # Let pytest know how things went. + # These should already be reported above, we repeat for convenience. 
+ if failures: + raise RuntimeError(f"The following subtests failed: {", ".join(failures)}") + + +if __name__ == "__main__": + with fast_llm_main_wrapper(): + main() diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 63a25747f..8392494e4 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -20,7 +20,9 @@ from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName from fast_llm.utils import Assert from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor +from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup +from tests.utils.run_test_script import ARTIFACT_PATH _WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" @@ -34,46 +36,53 @@ @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_config): # A baseline config (single-gpu, bf16, flash-attn). - run_test_script_for_all_models(_CHECKPOINT_AND_EVAL_ARGS) - - -def _prepare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): - shutil.copytree(compare_path, test_path) - shutil.rmtree(test_path / "checkpoint" / "2") - assert (test_path / "checkpoint" / "1" / "ok").is_file() - # TODO: Eval - shutil.rmtree(test_path / "runs") + run_test_script_for_all_models( + distributed_testing_config=DistributedTestingConfig( + name="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + ), + ) -def _compare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): - for artifact in ["init", "train_1"]: - path = f"runs/0/artifacts/0/tensor_logs_{artifact}.pt" - if not (test_path / path).is_file(): - shutil.copy(compare_path / path, test_path / path) +@pytest.fixture(scope="module") +def prepare_resume(run_test_script_base_path: pathlib.Path): + def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): + resume_from_path = run_test_script_base_path / distributed_testing_config.compare + self_path = run_test_script_base_path / distributed_testing_config.name + shutil.copytree(resume_from_path, self_path) + shutil.rmtree(self_path / "checkpoint" / "2") + assert (self_path / "checkpoint" / "1" / "ok").is_file() + # TODO: Eval + shutil.rmtree(self_path / "runs") + for artifact in ["init", "train_1"]: + path = f"{ARTIFACT_PATH}/0/tensor_logs_{artifact}.pt" + shutil.copy(resume_from_path / path, self_path / path) + + return do_prepare_resume @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) -def test_resume(run_test_script_for_all_models): - # Resume from iteration=1 and compare outputs with the baseline run. - run_test_script_for_all_models( - _CHECKPOINT_AND_EVAL_ARGS, - compare=f"test_checkpoint_and_eval", - prepare_fn=_prepare_resume_fn, - compare_fn=_compare_resume_fn, +def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): + distributed_testing_config = DistributedTestingConfig( + name="resume", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS ) + prepare_resume(distributed_testing_config) + + # Resume from iteration=1 and compare outputs with the baseline run. 
+    run_test_script_for_all_models(distributed_testing_config)
+    compare_results_for_all_models(distributed_testing_config)
+
 
 @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"])
 @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint)
-def test_resume_frozen(run_test_script_for_all_models):
-    # Resume with frozen mlp. No comparison.
-    run_test_script_for_all_models(
-        _CHECKPOINT_AND_EVAL_ARGS + ["model.base_model.transformer.mlp_lr_scale=0."],
-        compare="test_checkpoint_and_eval",
-        prepare_fn=_prepare_resume_fn,
-        do_compare=False,
+def test_resume_frozen(run_test_script_for_all_models, prepare_resume):
+    distributed_testing_config = DistributedTestingConfig(
+        name="resume_frozen", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS
     )
+    prepare_resume(distributed_testing_config)
+    # Resume with frozen mlp. No comparison.
+    run_test_script_for_all_models(distributed_testing_config)
 
 
 def do_get_convert_path(
@@ -343,15 +352,18 @@ def load_and_save_parallel_base_path(run_test_script_base_path):
     ]
 )
 @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed)
-def test_save_and_load_in_parallel(run_distributed_script_for_all_models, load_and_save_parallel_base_path):
+def test_save_and_load_in_parallel(run_distributed_script, load_and_save_parallel_base_path, model_testing_config):
     # Save and load checkpoints to and from various distributed configurations.
     # Combined in a single test to mitigate process creation overhead.
     # TODO: Test beyond 2 gpu configs?
     import tests.models.distributed_test_checkpoint
 
-    run_distributed_script_for_all_models(
-        [tests.models.distributed_test_checkpoint.__file__],
-        base_path=load_and_save_parallel_base_path,
+    run_distributed_script(
+        [
+            tests.models.distributed_test_checkpoint.__file__,
+            str(load_and_save_parallel_base_path),
+            model_testing_config.name,
+        ],
         num_gpus=2,
     )
diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py
index 7645de9e1..5d974172d 100644
--- a/tests/models/test_match_megatron.py
+++ b/tests/models/test_match_megatron.py
@@ -1,18 +1,36 @@
+import os
+
 import pytest
 
 from tests.utils.compare_tensor_logs import CompareConfig
-from tests.utils.dataset import DATASET_PREFIX
+from tests.utils.dataset import DATASET_PREFIX, get_test_dataset
+from tests.utils.distributed_configs import DistributedTestingConfig
 from tests.utils.model_configs import ModelTestingGroup
 
 
 @pytest.mark.model_testing_group(ModelTestingGroup.megatron)
-def test_megatron(run_test_script_for_all_models, model_testing_config):
-    run_test_script_for_all_models([], is_megatron=True)
+def test_megatron(run_distributed_script, model_testing_config, run_test_script_base_path):
+    path = run_test_script_base_path / "megatron"
+    env = os.environ.copy()
+    # Prevent Megatron from complaining.
+ env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + env["NVTE_FLASH_ATTN"] = "0" + get_test_dataset() + run_distributed_script( + [ + "Megatron-LM/pretrain_gpt.py", + *model_testing_config.megatron_args, + f"--structured-logs-dir={path}", + f"--data-cache-path={path}", + ], + num_gpus=1, + env=env, + ) @pytest.mark.depends_on(on=["test_megatron[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.megatron) -def test_match_megatron(run_test_script_for_all_models, model_testing_config): +def test_match_megatron(run_test_script_for_all_models, model_testing_config, compare_results_for_all_models): assert model_testing_config.megatron_args is not None ignore_tensors = [ @@ -25,13 +43,18 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config): if model_testing_config.name == "mixtral": ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) - run_test_script_for_all_models( - [ + distributed_testing_config = DistributedTestingConfig( + name="match_megatron", + compare="megatron", + config_args=[ "model.distributed.training_dtype=fp32", "data.datasets={}", f"data.path={DATASET_PREFIX}", "model.base_model.use_megatron_initialization=True", ], - compare="test_megatron", - config=CompareConfig(ignore_tensors=ignore_tensors), + num_gpus=1, + compare_config=CompareConfig(ignore_tensors=ignore_tensors), ) + + run_test_script_for_all_models(distributed_testing_config) + compare_results_for_all_models(distributed_testing_config) diff --git a/tests/models/test_mb.py b/tests/models/test_mb.py deleted file mode 100644 index 781de6e85..000000000 --- a/tests/models/test_mb.py +++ /dev/null @@ -1,92 +0,0 @@ -import pytest - -from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_df4(run_test_script_for_all_models): - # Depth-first gradient accumulation baseline. - run_test_script_for_all_models(["batch.depth_first_micro_batches=4"]) - - -@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_df4_z3(run_test_script_for_all_models): - # Gradient accumulation with ZeRO-3. - run_test_script_for_all_models( - ["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], - num_gpus=2, - compare="test_model_df4", - config=CompareConfig(ignore_duplicates=["Global gradient"]), - ) - - -@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"], scope="session") -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_bf4(run_test_script_for_all_models): - # Breadth-first gradient accumulation baseline. - run_test_script_for_all_models(["batch.breadth_first_micro_batches=4"], compare="test_model_df4") - - -@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]", "test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_bf2_df2(run_test_script_for_all_models): - # Mixed gradient accumulation baseline. - run_test_script_for_all_models( - ["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], compare="test_model_df4" - ) - - -@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_pp2s2_bf4(run_test_script_for_all_models): - # Pipeline-parallel without tied weights. 
- run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=2", - ], - num_gpus=2, - compare="test_model_df4", - ) - - -@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_pp2s1_bf4(run_test_script_for_all_models): - # Pipeline-parallel with tied weights. - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=1", - ], - num_gpus=2, - compare="test_model_df4", - config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), - ) - - -@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_tp2_pp2s2_bf4(run_test_script_for_all_models): - # Simple 3d parallelism - # TODO: Test fails - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.distributed.tensor_parallel=2", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=1", - ], - num_gpus=8, - compare="test_model_df4", - ) diff --git a/tests/models/test_mb_seq_first.py b/tests/models/test_mb_seq_first.py deleted file mode 100644 index 5a8db0b98..000000000 --- a/tests/models/test_mb_seq_first.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest - -from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_df4_sf(run_test_script_for_all_models): - # Sequence-first gradient accumulation baseline. - run_test_script_for_all_models(["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"]) - - -@pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_sp2_df4(run_test_script_for_all_models): - # Sequence-tensor-parallel with gradient accumulation. - # TODO: Compiled cross-entropy broken for this config - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.base_model.sequence_first=True", - "model.distributed.tensor_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "run.torch_dynamo_enable=False", - ], - num_gpus=4, - compare="test_model_df4_sf", - ) - - -@pytest.mark.skip(reason="Test is broken.") -@pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_sp2_pp2s1(run_test_script_for_all_models): - # 3d-parallel with sequence-tensor-parallel. 
- # TODO: Compiled cross-entropy broken for this config - run_test_script_for_all_models( - [ - "batch.breadth_first_micro_batches=4", - "model.base_model.sequence_first=True", - "model.distributed.tensor_parallel=2", - "model.distributed.pipeline_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "run.torch_dynamo_enable=False", - ], - num_gpus=8, - compare="test_model_df4_sf", - config=CompareConfig(ignore_duplicates=["layers.0.word_embeddings_weight"]), - ) diff --git a/tests/models/test_model.py b/tests/models/test_model.py new file mode 100644 index 000000000..7e853a24a --- /dev/null +++ b/tests/models/test_model.py @@ -0,0 +1,75 @@ +import pytest +import torch + +from tests.utils.distributed_configs import ( + DISTRIBUTED_TESTING_CONFIGS, + SIMPLE_TESTING_CONFIG, + SINGLE_GPU_TESTING_CONFIGS, +) +from tests.utils.model_configs import ModelTestingGroup +from tests.utils.run_test_script import ARTIFACT_PATH +from tests.utils.utils import report_subtest + + +@pytest.mark.model_testing_group(ModelTestingGroup.basic) +def test_model_simple(run_test_script_for_all_models): + # A simple config to prevent unnecessary testing and creation of dependency group + run_test_script_for_all_models(SIMPLE_TESTING_CONFIG) + + +@pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.basic) +# Parametrize with config name so it shows in test name. +@pytest.mark.parametrize("config_name", SINGLE_GPU_TESTING_CONFIGS) +def test_and_compare_model( + run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path +): + # We can expect tests to respect the ordering of `SINGLE_GPU_TESTING_CONFIGS`, so compare should have run already. + config = SINGLE_GPU_TESTING_CONFIGS[config_name] + if config.compare is not None: + for artifact in ["init", "train_1"]: + path = run_test_script_base_path / config.compare / ARTIFACT_PATH / "0" / f"tensor_logs_{artifact}.pt" + if not path.is_file(): + # Dependency likely failed, skipping this test because it will most likely fail for the same reason. + # We still need to fail because we can't confirm the failure. + pytest.fail(f"Compared test {config.compare} failed or did not run ({path} not found).", pytrace=False) + # A baseline config (single-gpu, bf16, flash-attn). + # Also tests for multiple data loaders. + run_test_script_for_all_models(config) + + if config.compare is not None: + compare_results_for_all_models(config) + + +@pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) +def test_run_model_distributed(run_distributed_script, model_testing_config, run_test_script_base_path): + import tests.models.distributed_test_model + + run_distributed_script( + [ + tests.models.distributed_test_model.__file__, + str(run_test_script_base_path), + model_testing_config.name, + ], + num_gpus=torch.cuda.device_count(), + ) + + +# We don't want to depend on `test_model_distributed` because we still want to run this in cas of failure. 
+# This should still run after `test_model_distributed` +@pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) +@pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)[:1]) +def test_model_distributed( + run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path +): + config = DISTRIBUTED_TESTING_CONFIGS[config_name] + report_subtest(run_test_script_base_path / config.name, config.num_gpus) + if config.compare is not None: + for artifact in ["init", "train_1"]: + if not ( + run_test_script_base_path / config.compare / ARTIFACT_PATH / f"tensor_logs_{artifact}.pt" + ).is_file(): + pytest.fail(f"Compared test {config.compare} failed or did not run.", pytrace=False) + compare_results_for_all_models(config) diff --git a/tests/models/test_ms.py b/tests/models/test_ms.py deleted file mode 100644 index b97f84e5d..000000000 --- a/tests/models/test_ms.py +++ /dev/null @@ -1,45 +0,0 @@ -import pytest - -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_ms256(run_test_script_for_all_models): - # Micro-sequence baseline - run_test_script_for_all_models(["batch.micro_sequence_length=256"]) - - -@pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_pp2s2_ms256(run_test_script_for_all_models): - # Sequence-pipeline-parallel - run_test_script_for_all_models( - [ - "batch.micro_sequence_length=256", - "model.distributed.pipeline_parallel=2", - "model.multi_stage.layers_per_stage=2", - ], - num_gpus=2, - compare="test_model_ms256", - ) - - -@pytest.mark.skip -@pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script_for_all_models): - # TODO: Handle this case. - # Sequence-3d-parallel - run_test_script_for_all_models( - [ - "batch.micro_sequence_length=256", - "model.distributed.pipeline_parallel=2", - "model.distributed.tensor_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "model.distributed.sequence_data_parallel=2", - "model.multi_stage.layers_per_stage=2", - ], - num_gpus=8, - compare="test_model_ms256", - ) diff --git a/tests/models/test_seq_first.py b/tests/models/test_seq_first.py deleted file mode 100644 index 66b044df3..000000000 --- a/tests/models/test_seq_first.py +++ /dev/null @@ -1,48 +0,0 @@ -import pytest - -from tests.utils.model_configs import ModelTestingGroup - - -# TODO: Compare grads with simple -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_sf(run_test_script_for_all_models): - # Sequence-first baseline. - run_test_script_for_all_models(["model.base_model.sequence_first=True"]) - - -@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_sp2(run_test_script_for_all_models): - # Sequence-tensor-parallel. 
- run_test_script_for_all_models( - ["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], - num_gpus=2, - compare="test_model_sf", - ) - - -@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_sdp2(run_test_script_for_all_models): - # Sequence-data-parallel - run_test_script_for_all_models( - ["model.distributed.sequence_data_parallel=2"], - num_gpus=2, - compare="test_model_sf", - ) - - -@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_sp2_ce4(run_test_script_for_all_models): - # Sequence-tensor-parallel with cross-entropy splits. - run_test_script_for_all_models( - [ - "model.distributed.tensor_parallel=2", - "model.distributed.sequence_tensor_parallel=True", - "model.base_model.parallel_embeddings=False", - "model.base_model.cross_entropy_splits=4", - ], - num_gpus=2, - compare="test_model_sf", - ) diff --git a/tests/models/test_simple.py b/tests/models/test_simple.py deleted file mode 100644 index 4616942c6..000000000 --- a/tests/models/test_simple.py +++ /dev/null @@ -1,99 +0,0 @@ -import pytest - -from tests.utils.model_configs import ModelTestingGroup - - -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_safe(run_test_script_for_all_models): - # The safest possible config, identical to the one in test_match_megatron except for the initialization. - run_test_script_for_all_models( - [ - "model.distributed.training_dtype=fp32", - "run.torch_dynamo_enable=False", - "schedule.data_overlap=False", - "model.base_model.transformer.dropless_moe=False", - ], - ) - - -@pytest.mark.depends_on(on=["test_model_safe[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model(run_test_script_for_all_models): - # A baseline config (single-gpu, bf16, flash-attn). - # Also tests for multiple data loaders. - run_test_script_for_all_models(["training.num_workers=2"], compare="test_model_safe") - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2(run_test_script_for_all_models): - # Simple data-parallel. - run_test_script_for_all_models([], num_gpus=2, compare="test_model") - - -@pytest.mark.skip(reason="Flaky") -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_timeout(run_test_script_for_all_models): - # Test sampling timeout - # TODO: Find a better way to test this - run_test_script_for_all_models( - [ - # Use a short timeout - "model.distributed.timeout=4", - # Make a dataset that would timeout under the distributed timeout - 'data.datasets.training={"type":"test_slow"}', - "data.datasets.training.type=test_slow", - "data.datasets.training.sleep=6", - # Use a bigger timeout for the dataset. - "training.timeout=10", - # Remove testing clutter. - "model.multi_stage.debug_param_init=0", - "model.multi_stage.debug_layer_outputs=0", - "model.multi_stage.debug_layer_gradients=0", - "model.multi_stage.debug_all_param_gradients=0", - ], - num_gpus=2, - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_tp2(run_test_script_for_all_models): - # Simple tensor-parallel. 
- run_test_script_for_all_models( - ["model.distributed.tensor_parallel=2"], - num_gpus=2, - compare="test_model", - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_ce4(run_test_script_for_all_models): - # Cross-entropy splits. - run_test_script_for_all_models( - ["model.base_model.cross_entropy_splits=4"], - compare="test_model", - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_z2(run_test_script_for_all_models): - # Data-parallel with zero stage 2. - run_test_script_for_all_models( - ["model.multi_stage.zero_stage=2"], - num_gpus=2, - compare="test_model", - ) - - -@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_model_dp2_z3(run_test_script_for_all_models): - # Data-parallel with zero stage 3. - run_test_script_for_all_models( - ["model.multi_stage.zero_stage=3"], - num_gpus=2, - compare="test_model", - ) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py new file mode 100644 index 000000000..8bbd08d51 --- /dev/null +++ b/tests/utils/distributed_configs.py @@ -0,0 +1,304 @@ +import dataclasses +import logging + +from tests.utils.compare_tensor_logs import CompareConfig + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass(kw_only=True) +class DistributedTestingConfig: + name: str + compare: str | None = None + config_args: list[str] + num_gpus: int = 1 + compare_config: CompareConfig | None = None + + +# Baseline (also tests data-parallel workers) +SIMPLE_TESTING_CONFIG = DistributedTestingConfig( + name="simple", + compare=None, + config_args=["training.num_workers=2"], + num_gpus=1, +) + +_SINGLE_GPU_TESTING_CONFIGS = [ + # Sequence-first baseline + DistributedTestingConfig( + name="sf", + compare=None, + config_args=["model.base_model.sequence_first=True"], + num_gpus=1, + ), + # Cross-entropy splits. + DistributedTestingConfig( + name="ce4", + compare=None, + config_args=["model.base_model.cross_entropy_splits=4"], + num_gpus=1, + ), + # Micro-sequence baseline + DistributedTestingConfig( + name="ms", + compare=None, + config_args=["batch.micro_sequence_length=256"], + num_gpus=1, + ), + # Gradient accumulation baseline. + DistributedTestingConfig( + name="df4", + compare=None, + config_args=["batch.depth_first_micro_batches=4"], + num_gpus=1, + ), + # Breadth-first gradient accumulation. + DistributedTestingConfig( + name="bf4", + compare="df4", + config_args=["batch.breadth_first_micro_batches=4"], + num_gpus=1, + ), + # Mixed gradient accumulation. + DistributedTestingConfig( + name="bf2_df2", + compare="df4", + config_args=["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], + num_gpus=1, + ), + # Sequence-first gradient accumulation baseline. 
+ DistributedTestingConfig( + name="df4_sf", + compare=None, + config_args=[], + num_gpus=1, + ), +] + +SINGLE_GPU_TESTING_CONFIGS = {config.name: config for config in _SINGLE_GPU_TESTING_CONFIGS} + + +_DISTRIBUTED_TESTING_CONFIGS = [ + # ===== Data-parallel configs + # Simple + DistributedTestingConfig( + name="dp2", + compare="simple", + config_args=[], + num_gpus=2, + ), + # Zero stage 2 + DistributedTestingConfig( + name="dp2_z2", + compare="simple", + config_args=["model.multi_stage.zero_stage=2"], + num_gpus=2, + ), + # Zero stage 3 + DistributedTestingConfig( + name="dp2_z3", + compare="simple", + config_args=["model.multi_stage.zero_stage=3"], + num_gpus=2, + ), + # Depth-first micro-batches + DistributedTestingConfig( + name="dp2_df4_z3", + compare="df4", + config_args=["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], + num_gpus=2, + ), + # Sequence-data-parallel + DistributedTestingConfig( + name="sdp2", + compare="sf", + config_args=["model.distributed.sequence_data_parallel=2"], + num_gpus=2, + ), + # ===== Tensor-parallel configs + # Simple tensor-parallel + DistributedTestingConfig( + name="tp2", + compare="simple", + config_args=["model.distributed.tensor_parallel=2"], + num_gpus=2, + ), + # Simple sequence-tensor-parallel + DistributedTestingConfig( + name="stp2", + compare="simple", + config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], + num_gpus=2, + ), + # Cross-entropy splits + DistributedTestingConfig( + name="sp2_ce4", + compare="sf", + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.base_model.parallel_embeddings=False", + "model.base_model.cross_entropy_splits=4", + ], + num_gpus=2, + ), + # ===== 2d configs (Data + Tensor) + # Simple + DistributedTestingConfig( + name="dp2_sp2", + compare="sf", + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + ], + num_gpus=4, + ), + # Depth-first micro-batches, tensor-parallel + DistributedTestingConfig( + name="tp2_df4", + compare="df4", + config_args=[ + "batch.depth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + ], + num_gpus=4, + ), + # Breadth-first micro-batches + DistributedTestingConfig( + name="sdp2_sp2_bf4", + compare="df4_sf", + config_args=[ + "model.distributed.sequence_data_parallel=2", + "batch.breadth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + ], + num_gpus=4, + ), + # Sequence-data-parallel + DistributedTestingConfig( + name="sdp2_sp2", + compare="sf", + config_args=["model.distributed.tensor_parallel=2"], + num_gpus=4, + ), + # ===== Pipeline-parallel configs + # Simple [mb] + DistributedTestingConfig( + name="pp2s2_bf4", + compare="df4", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=2, + ), + # Tied weights on different ranks + DistributedTestingConfig( + name="pp2s1_bf4", + compare="df4", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=1", + ], + num_gpus=2, + compare_config=CompareConfig( + ignore_duplicates=[ + "layers.0.word_embeddings_weight", + "layers.0.position_embeddings_weight", + ] + ), + ), + # Micro-sequence [ms] + DistributedTestingConfig( + name="pp2s2_ms", + compare="ms", + config_args=[ + 
"batch.micro_sequence_length=256", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=2, + ), + # ===== Data + Pipeline + # Simple + DistributedTestingConfig( + name="dp2_pp2s2", + compare="df4", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=4, + ), + # ===== Tensor + Pipeline + # Simple [sf, mb] + DistributedTestingConfig( + name="sp2_pp2s1", + compare="df4_sf", + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=4, + compare_config=CompareConfig( + ignore_duplicates=[ + "layers.0.word_embeddings_weight", + "layers.0.position_embeddings_weight", + ] + ), + ), + # ===== Data + Tensor + Pipeline + # Simple + DistributedTestingConfig( + name="dp2_stp2_pp2s2", + compare="mb", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=8, + ), + # Tied weights on different ranks + DistributedTestingConfig( + name="dp2_tp2_pp2s1_bf4", + compare="mb", + config_args=[ + "batch.breadth_first_micro_batches=4", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=1", + ], + num_gpus=8, + compare_config=CompareConfig( + ignore_duplicates=[ + "layers.0.word_embeddings_weight", + "layers.0.position_embeddings_weight", + ] + ), + ), + # Micro-sequence + DistributedTestingConfig( + name="dp2s2_stp2_pp2s2_ms256", + compare="ms", + config_args=[ + "batch.micro_sequence_length=256", + "model.distributed.pipeline_parallel=2", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.distributed.sequence_data_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], + num_gpus=8, + ), +] + +DISTRIBUTED_TESTING_CONFIGS = {config.name: config for config in _DISTRIBUTED_TESTING_CONFIGS} diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index ab08ad734..6c0b561dd 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -7,13 +7,12 @@ import typing import pytest -import torch -from fast_llm.engine.config_utils.runnable import RunnableConfig from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.utils import Assert -from tests.utils.compare_tensor_logs import CompareConfig, compare_tensor_logs +from tests.utils.compare_tensor_logs import compare_tensor_logs from tests.utils.dataset import get_test_dataset +from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig if typing.TYPE_CHECKING: @@ -22,7 +21,7 @@ # FIXME: figure out correct import of megatron modules without this hack sys.path.append(os.getcwd()) -_ARTIFACT_PATH = "runs/0/artifacts" +ARTIFACT_PATH = "runs/0/artifacts" def do_run_distributed_script( @@ -48,96 +47,14 @@ def do_run_distributed_script( raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") -def do_run_test_script( - path: pathlib.Path, - args: list[str], - num_gpus: int = 1, - *, - model_type: str, - is_megatron: bool = False, - compare_path: pathlib.Path | None = None, - config: 
CompareConfig | None = None, - prepare_fn=None, - compare_fn=None, - do_compare: bool = True, - rendezvous_port: int, - torchrun_port: int, -): - is_parallel = DistributedConfig.default_world_size > 1 - if is_parallel: - Assert.eq(num_gpus, DistributedConfig.default_world_size) - local_rank = DistributedConfig.default_rank - - if torch.cuda.device_count() < num_gpus: - pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})") - env = os.environ.copy() - if is_megatron: - assert num_gpus == 1 - # Prevent Megatron from complaining. - env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" - env["NVTE_FLASH_ATTN"] = "0" - else: - env = None - if local_rank == 0 and prepare_fn is not None: - prepare_fn(path, None if compare_path is None else compare_path) - if is_megatron: - args = ["Megatron-LM/pretrain_gpt.py", *args, f"--structured-logs-dir={path}", f"--data-cache-path={path}"] - else: - args = ["--no-python", "fast-llm", "train", model_type, *args, f"run.experiment_dir={path}"] - get_test_dataset() - if (num_gpus == 1 or is_parallel) and not is_megatron: - print(" ".join(args[1:])) - RunnableConfig.parse_and_run(args[2:]) - else: - do_run_distributed_script( - args, rendezvous_port=rendezvous_port, torchrun_port=torchrun_port, num_gpus=num_gpus, env=env - ) - if local_rank == 0 and compare_path is not None and do_compare: - if compare_fn is not None: - compare_fn(path, compare_path) - compare_tensor_logs( - compare_path / _ARTIFACT_PATH, - path / _ARTIFACT_PATH, - config, - ) - - -def do_run_test_script_for_all_models( - extra_args: list[str], - num_gpus: int = 1, - *, - is_megatron: bool = False, - compare: str | None = None, - config: CompareConfig | None = None, - prepare_fn=None, - compare_fn=None, - do_compare: bool = True, - rendezvous_port: int, - torchrun_port: int, - test_name: str, - base_path: pathlib.Path, +@pytest.fixture(scope="session") +def run_distributed_script( + worker_resources: "WorkerResources", + run_test_script_base_path: pathlib.Path, model_testing_config: ModelTestingConfig, ): - do_run_test_script( - base_path / test_name, - (model_testing_config.megatron_args if is_megatron else model_testing_config.config_args) + extra_args, - num_gpus, - model_type=model_testing_config.model_type, - is_megatron=is_megatron, - compare_path=None if compare is None else base_path / compare, - config=config, - prepare_fn=prepare_fn, - compare_fn=compare_fn, - do_compare=do_compare, - rendezvous_port=rendezvous_port, - torchrun_port=torchrun_port, - ) - - -@pytest.fixture(scope="session") -def run_test_script(worker_resources: "WorkerResources"): return functools.partial( - do_run_test_script, + do_run_distributed_script, rendezvous_port=worker_resources.rendezvous_port, torchrun_port=worker_resources.torchrun_port, ) @@ -148,18 +65,34 @@ def run_test_script_base_path(model_testing_config, result_path, request): return result_path / "models" / model_testing_config.name +def do_run_test_script_for_all_models( + distributed_testing_config: DistributedTestingConfig, + model_testing_config: ModelTestingConfig, + base_path: pathlib.Path, +): + Assert.leq(distributed_testing_config.num_gpus, DistributedConfig.default_world_size) + get_test_dataset() + args = [ + "fast-llm", + "train", + model_testing_config.model_type, + *model_testing_config.config_args, + *distributed_testing_config.config_args, + f"model.distributed.world_size={distributed_testing_config.num_gpus}", + f"model.distributed.local_world_size={distributed_testing_config.num_gpus}", + 
f"run.experiment_dir={base_path/distributed_testing_config.name}", + ] + print(" ".join(args)) + model_testing_config.trainer_config_class.parse_and_run(args[3:]) + + @pytest.fixture(scope="function") def run_test_script_for_all_models( - worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, model_testing_config: ModelTestingConfig, - request: pytest.FixtureRequest, ): return functools.partial( do_run_test_script_for_all_models, - rendezvous_port=worker_resources.rendezvous_port, - torchrun_port=worker_resources.torchrun_port, - test_name=request.node.originalname, base_path=run_test_script_base_path, model_testing_config=model_testing_config, ) @@ -174,22 +107,16 @@ def parse_run_distributed_script(args: list[str] | None = None): @pytest.fixture(scope="session") -def run_distributed_script_for_all_models( +def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, - model_testing_config: ModelTestingConfig, - request: pytest.FixtureRequest, ): - def do_run_distributed_script_for_all_models(args: list[str], num_gpus=2, base_path: pathlib.Path | None = None): - do_run_distributed_script( - args - + [ - str(run_test_script_base_path if base_path is None else base_path), - model_testing_config.name, - ], - worker_resources.rendezvous_port, - worker_resources.torchrun_port, - num_gpus, + def do_compare_results_for_all_models(distributed_testing_config: DistributedTestingConfig): + assert distributed_testing_config.compare is not None + compare_tensor_logs( + run_test_script_base_path / distributed_testing_config.compare / ARTIFACT_PATH, + run_test_script_base_path / distributed_testing_config.name / ARTIFACT_PATH, + distributed_testing_config.compare_config, ) - return do_run_distributed_script_for_all_models + return do_compare_results_for_all_models diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 1ea7717f5..0ca596aed 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,13 +1,22 @@ +import logging import pathlib +import sys +import traceback +import typing +import _pytest.capture import pytest import torch from fast_llm.engine.base_model.base_model import BaseModel, Layer +from fast_llm.engine.config_utils.logging import configure_logging from fast_llm.engine.config_utils.tensor_space import TensorSpace from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.multi_stage.config import FastLLMModelConfig, StageConfig from fast_llm.engine.multi_stage.stage import Stage +from fast_llm.utils import header + +logger = logging.getLogger(__name__) requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") @@ -47,3 +56,50 @@ def get_stage(base_model: BaseModel | list[Layer], distributed: Distributed): stage.restore_parameters() stage.reset_gradients() return stage + + +class DistributedSubtestContext: + def __init__(self, path: pathlib.Path, rank: int) -> None: + self._path = path + self._rank = rank + self._capture_manager = _pytest.capture.CaptureManager("fd") + self.success = False + + def __enter__(self) -> typing.Self: + self._capture_manager.start_global_capturing() + # Logging is set to log to the old stdout, so we need to reconfigure. 
+ configure_logging() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + self._capture_manager.suspend_global_capture() + out, err = self._capture_manager.read_global_capture() + self._path.mkdir(parents=True, exist_ok=True) + self._path.joinpath(f"pytest_stdout_{self._rank}").write_text(out) + self._path.joinpath(f"pytest_stderr_{self._rank}").write_text(err) + if exc_type is None: + self.success = True + else: + self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) + return True + finally: + self._capture_manager.stop_global_capturing() + configure_logging() + + +def report_subtest(path: pathlib.Path, world_size: int): + try: + success = bool(int(path.joinpath("pytest_success").read_text())) + except OSError: + success = False + if not success: + for rank in range(world_size): + for fd, file_ in (("stdout", sys.stdout), ("stderr", sys.stdout), ("traceback", sys.stderr)): + print(header(f"{fd} rank {rank}", 80), file=file_) + file_path = path / f"pytest_{fd}_{rank}" + try: + print(file_path.read_text(), file=file_) + except OSError: + print(f"<<< not found {file_path}>>>", file=file_) + raise RuntimeError(f"test {path.name} failed") From 98593d4d7daee70dc69a09ff42defb20e0b76526 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 4 Jul 2025 10:03:19 -0400 Subject: [PATCH 02/14] misc --- fast_llm/engine/config_utils/run.py | 26 ++-- fast_llm/engine/config_utils/runnable.py | 1 + fast_llm/engine/distributed/config.py | 5 - fast_llm/engine/distributed/distributed.py | 9 +- fast_llm/logging.py | 13 +- fast_llm/utils.py | 66 ++++++++++ tests/conftest.py | 34 +---- tests/models/distributed_test_checkpoint.py | 2 +- tests/models/distributed_test_model.py | 40 +++--- tests/models/test_model.py | 60 ++++----- tests/utils/compare_tensor_logs.py | 6 +- tests/utils/distributed_configs.py | 58 +++++---- tests/utils/run_test_script.py | 18 ++- tests/utils/utils.py | 132 ++++++++++++++------ 14 files changed, 289 insertions(+), 181 deletions(-) diff --git a/fast_llm/engine/config_utils/run.py b/fast_llm/engine/config_utils/run.py index 126e0ae8c..f8cfa8c5b 100644 --- a/fast_llm/engine/config_utils/run.py +++ b/fast_llm/engine/config_utils/run.py @@ -10,7 +10,7 @@ from fast_llm.engine.config_utils.logging import TensorLogs, TensorLogsConfig, configure_logging from fast_llm.engine.config_utils.runnable import RunnableConfig from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.utils import Assert, log +from fast_llm.utils import log if typing.TYPE_CHECKING: from fast_llm.engine.distributed.distributed import Distributed @@ -82,12 +82,14 @@ def _show( if is_main_rank(): return super()._show(verbose, log_fn=log_fn, title=title, width=width, fill_char=fill_char) - def configure_logging(self, directory: pathlib.Path | str | None = None) -> None: + def configure_logging( + self, directory: pathlib.Path | str | None = None, distributed: DistributedConfig | None = None + ) -> None: configure_logging( log_timestamps=self.run.log_timestamps, enable_all_loggers=self.run.enable_all_loggers, - rank=DistributedConfig.default_rank, - world_size=DistributedConfig.default_world_size, + rank=DistributedConfig.default_rank if distributed is None else distributed.rank, + world_size=DistributedConfig.default_world_size if distributed is None else distributed.world_size, directory=directory, ) @@ -131,17 +133,13 @@ def __init__( distributed: "Distributed", ): self._config = config.run - self._distributed_config = distributed.config - 
Assert.eq(self._distributed_config.world_size, DistributedConfig.default_world_size) - Assert.eq(self._distributed_config.local_world_size, DistributedConfig.default_local_world_size) - Assert.eq(self._distributed_config.rank, DistributedConfig.default_rank) self._distributed = distributed # TODO: Main rank should contain the last pipeline stage so it calculates loss - self._is_main_rank = self._distributed_config.rank == _MAIN_RANK - self._is_model_parallel_main_rank = self._distributed_config.data_rank == 0 + self._is_main_rank = self._distributed.config.rank == _MAIN_RANK + self._is_model_parallel_main_rank = self._distributed.config.data_rank == 0 self._is_pipeline_parallel_main_rank = ( - self._distributed_config.data_rank == 0 and self._distributed_config.tensor_rank == 0 + self._distributed.config.data_rank == 0 and self._distributed.config.tensor_rank == 0 ) config_dict = config.to_dict() config_dict_verbose = config.to_dict(verbose=FieldVerboseLevel.performance) @@ -160,14 +158,14 @@ def __init__( # Make sure all the workers agree on the run. This also acts as a barrier. self.index = self.broadcast_int(run) run_dir = self._experiment_directory / "runs" / str(self.index) - self._artifact_dir = run_dir / "artifacts" / str(self._distributed_config.rank) + self._artifact_dir = run_dir / "artifacts" / str(self._distributed.config.rank) log_dir = run_dir / "logs" else: self._experiment_directory, self._artifact_dir, log_dir = None, None, None self.index = None - if self._config.structured_logs: - config.configure_logging(log_dir) + # Finalize logging configuration. + config.configure_logging(log_dir) self._experiment_name = self._config.experiment_name or ( "default" if self._experiment_directory is None else self._experiment_directory.name diff --git a/fast_llm/engine/config_utils/runnable.py b/fast_llm/engine/config_utils/runnable.py index bcdebb856..051163084 100644 --- a/fast_llm/engine/config_utils/runnable.py +++ b/fast_llm/engine/config_utils/runnable.py @@ -29,6 +29,7 @@ def parse_and_run(cls, args: list[str] | None = None) -> None: with NoAutoValidate(): config: "RunnableConfig" = cls._from_parsed_args(parsed, unparsed) try: + # Configure logging so validation errors are logged properly. 
config.configure_logging() config.validate() if not parsed.do_run: diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py index 9f006cdb1..6f2e2ab95 100644 --- a/fast_llm/engine/distributed/config.py +++ b/fast_llm/engine/distributed/config.py @@ -387,10 +387,5 @@ def _from_dict( strict: bool = True, flat: bool = False, ) -> typing.Self: - # TODO v0.3: Remove backward compatibility fix - if "sequence_first" in default and strict: - del default["sequence_first"] - if "separate_init_generators" in default and strict: - del default["separate_init_generators"] cls._handle_renamed_field(default, "distributed_timeout", "timeout") return super()._from_dict(default, strict, flat) diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index 977318841..ce9f660f2 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -1,5 +1,6 @@ import datetime import logging +import time import typing import torch @@ -97,10 +98,12 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro return group prefix = ( - f"range_{global_ranks.start}_{global_ranks.start}_{global_ranks.step}" + f"range_{global_ranks.start}_{global_ranks.stop}_{global_ranks.step}" if isinstance(global_ranks, range) else f"ranks_{"_".join(str(rank) for rank in global_ranks)}" ) + logger.info(f"Creating process group {prefix} (rank = {group_rank}, size = {group_size})") + time.sleep(0.1) group = torch.distributed.ProcessGroupNCCL( torch.distributed.PrefixStore(prefix + "/", self.store), @@ -108,6 +111,8 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro group_size, datetime.timedelta(seconds=self._timeout), ) + logger.info(f"Barrier process group {prefix} (rank = {group_rank}, size = {group_size})") + logger.info(f"Done process group {prefix} (rank = {group_rank}, size = {group_size})") self._process_groups[global_ranks] = group return group @@ -225,7 +230,7 @@ def add_group(self, distributed_dim: DistributedDim) -> ProcessGroup | None: """ Add a process group from its definition. 
""" - self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") + # self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") distributed_dim.check_ranks_in_range(0, self._config.world_size) group = self._pool.get_process_group(distributed_dim.global_ranks, distributed_dim.rank) distributed_dim.setup(group) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index f574aa381..4f77be7fa 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -10,7 +10,7 @@ from fast_llm.engine.config_utils.logging import TensorLogs from fast_llm.engine.distributed.config import PhaseType from fast_llm.tensor import TensorMeta -from fast_llm.utils import format_number, log +from fast_llm.utils import format_number, get_and_reset_memory_usage_mib, log if typing.TYPE_CHECKING: from fast_llm.core.distributed import ProcessGroup @@ -329,7 +329,7 @@ def log_generator[ _global_max_reserved = 0 -def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, int] | None = None) -> dict[str, float]: +def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, float] | None = None) -> dict[str, float]: global _global_max_allocated, _global_max_reserved max_allocated = torch.cuda.max_memory_allocated() / 2**20 max_reserved = torch.cuda.max_memory_reserved() / 2**20 @@ -355,12 +355,13 @@ def log_memory_usage[ header: str | None = None, log_fn: type[BaseException] | typing.Callable[[str], T] = logger.info, reset_stats: bool = True, - stats: dict[str, int] | None = None, + report: dict[str, float] | None = None, relative_to: dict[str, int] | None = None, ) -> T: - if stats is None: - stats = get_memory_usage_mib(reset_stats, relative_to) - formatted = _MEMORY_METRIC_FORMAT.format(**stats) + if report is None: + get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) + report = get_memory_usage_mib(reset_stats, relative_to) + formatted = _MEMORY_METRIC_FORMAT.format(**report) if header is not None: formatted = f"{header}: {formatted}" return log(formatted, log_fn=log_fn) diff --git a/fast_llm/utils.py b/fast_llm/utils.py index 7bbdd6979..bd2f8ef7b 100644 --- a/fast_llm/utils.py +++ b/fast_llm/utils.py @@ -1,3 +1,4 @@ +import gc import itertools import logging import math @@ -392,3 +393,68 @@ def enabled(self) -> bool: @property def interrupted(self): return self._interrupted + + +_global_max_allocated = 0 +_global_max_reserved = 0 + + +def get_and_reset_memory_usage_mib( + *, + relative_to: dict[str, int] | None = None, + clear_cache: bool = False, + global_stats: bool = False, + reset_stats: bool = True, + reset_global_stats: bool = False, +) -> dict[str, float]: + global _global_max_allocated, _global_max_reserved + import torch + + if clear_cache: + # Free memory for more accurate reporting, and to reduce OOM risk with lots of workers. + # Cublas workspace can unnecessarily keep 100s of MBs of reserved memory. + torch._C._cuda_clearCublasWorkspaces() + # Lots of tensors tend to stay allocated until the next garbage collection. + # Collect only if the remaining memory is significant enough since it's costly. + if torch.cuda.memory_allocated() > 1e7: + gc.collect() + try: + # Actually free the memory. + torch.cuda.empty_cache() + except RuntimeError: + # Happens if cuda is broken. + return {} + report = { + # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. 
+ "max_memory_reserved": max(torch.cuda.max_memory_reserved() / 2**20, _global_max_reserved), + # Actual memory usage from the test. + "max_memory_allocated": max(torch.cuda.max_memory_allocated() / 2**20, _global_max_allocated), + "memory_reserved": torch.cuda.memory_reserved() / 2**20, + "memory_allocated": torch.cuda.memory_allocated() / 2**20, + } + max_allocated = torch.cuda.max_memory_allocated() / 2**20 + max_reserved = torch.cuda.max_memory_reserved() / 2**20 + if global_stats: + report |= { + "max_memory_reserved": max(max_reserved, _global_max_reserved), + "max_memory_allocated": max(max_allocated, _global_max_allocated), + } + else: + report |= { + "max_allocated": max_allocated, + "max_reserved": max_reserved, + "global_max_reserved": _global_max_reserved, + } + + if relative_to: + report = {key: value - relative_to.get(key, 0) for key, value in report.items()} + if reset_global_stats: + torch.cuda.reset_peak_memory_stats() + _global_max_reserved = 0 + _global_max_allocated = 0 + elif reset_stats: + torch.cuda.reset_peak_memory_stats() + _global_max_allocated = max(max_allocated, _global_max_allocated) + _global_max_reserved = max(max_reserved, _global_max_reserved) + + return report diff --git a/tests/conftest.py b/tests/conftest.py index 4c9161ea6..0eb7826f2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,4 @@ import dataclasses -import gc import json import logging import math @@ -11,6 +10,7 @@ import xdist.scheduler import fast_llm.logging +from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager # Make fixtures available globally without import @@ -33,6 +33,7 @@ def pytest_addoption(parser): group.addoption("--skip-slow", action="store_true") group.addoption("--show-skipped", action="store_true") group.addoption("--show-gpu-memory", type=int, default=10) + group.addoption("--no-distributed-capture", dest="distributed_capture", action="store_false") group.addoption("--models", nargs="*") group.addoption( "--run-extra-slow", @@ -187,37 +188,12 @@ def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): # Measure GPU memory usage. (TODO: This excludes child processes) if call.when == "call" and torch.cuda.is_available(): - # Free memory for more accurate reporting, and to reduce OOM risk with lots of workers. - # Cublas workspace can unnecessarily keep 100s of MBs of reserved memory. - torch._C._cuda_clearCublasWorkspaces() - # Lots of tensors tend to stay allocated until the next garbage collection. - # Collect only if the remaining memory is significant enough since it's costly. - if torch.cuda.memory_allocated() > 1e7: - gc.collect() - try: - # Actually free the memory. - torch.cuda.empty_cache() - except RuntimeError: - # Happens if the test broke cuda. - return + report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) + report["duration"] = call.duration item.add_report_section( call.when, "resource usage", - json.dumps( - { - "duration": call.duration, - # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. - "max_memory_reserved": max( - torch.cuda.max_memory_reserved() / 2**20, fast_llm.logging._global_max_reserved - ), - # Actual memory usage from the test. 
- "max_memory_allocated": max( - torch.cuda.max_memory_allocated() / 2**20, fast_llm.logging._global_max_allocated - ), - "memory_reserved": torch.cuda.memory_reserved() / 2**20, - "memory_allocated": torch.cuda.memory_allocated() / 2**20, - } - ), + json.dumps(report), ) torch.cuda.reset_peak_memory_stats() # Reset global stats for next test. diff --git a/tests/models/distributed_test_checkpoint.py b/tests/models/distributed_test_checkpoint.py index 9e706ebee..05a0bf443 100644 --- a/tests/models/distributed_test_checkpoint.py +++ b/tests/models/distributed_test_checkpoint.py @@ -47,7 +47,7 @@ def _test_load_and_save_parallel( def main(args: list[str] | None = None) -> None: - base_path, model_testing_config = parse_run_distributed_script(args) + base_path, model_testing_config, _ = parse_run_distributed_script(args) with ProcessGroupPool(timeout=20): for pretrained_format, pretrained_path in ( diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py index ad19eeafa..36f13ec2a 100644 --- a/tests/models/distributed_test_model.py +++ b/tests/models/distributed_test_model.py @@ -1,9 +1,7 @@ import logging -import torch - from fast_llm.cli import fast_llm_main_wrapper -from fast_llm.core.distributed import allreduce_scalar, safe_barrier +from fast_llm.core.distributed import safe_barrier from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.engine.distributed.distributed import ProcessGroupPool from tests.utils.distributed_configs import DISTRIBUTED_TESTING_CONFIGS @@ -14,37 +12,29 @@ def main(args: list[str] | None = None) -> None: - base_path, model_testing_config = parse_run_distributed_script(args) + base_path, model_testing_config, do_capture = parse_run_distributed_script(args) + + if do_capture: + logger.warning( + "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." + ) - with ProcessGroupPool(timeout=20) as pool: + # TODO: Why are barriers needed? + with ProcessGroupPool(timeout=60) as pool: failures = [] world_size = DistributedConfig.default_world_size rank = DistributedConfig.default_rank group = pool.get_process_group(range(world_size), rank) for name, config in DISTRIBUTED_TESTING_CONFIGS.items(): - if config.num_gpus > world_size: - logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {config.num_gpus} > {world_size})"})") - if DistributedConfig.default_rank < config.num_gpus: - logger.info(f"Running {name}") - with DistributedSubtestContext(base_path / name, rank) as subtest: + if world_size < config.num_gpus: + logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {world_size} < {config.num_gpus})"})") + continue + with DistributedSubtestContext(base_path, name, group, config.num_gpus, enabled=do_capture) as subtest: + if rank < config.num_gpus: do_run_test_script_for_all_models(config, model_testing_config, base_path) - assert subtest._capture_manager._global_capturing is None - success = subtest.success - else: - # Worker is not needed for this one, skip. - success = True - - # Barrier so `allreduce_scalar` doesn't go crazy in case of desync. 
- safe_barrier(group, name) - success = ( - success if group is None else allreduce_scalar(success, dtype=torch.int64, group=group) == world_size - ) - logger.warning(f"{name} {"PASSED" if success else "FAILED"})") - if not success: + if not subtest.success: failures.append(name) - if rank == 0: - (base_path / name / "pytest_success").write_text(str(int(success))) # Final barrier to ensure everything is done before torchrun potentially kills workers. safe_barrier(group, "testing end") diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 7e853a24a..2aeff95cc 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -1,3 +1,5 @@ +import logging + import pytest import torch @@ -7,14 +9,16 @@ SINGLE_GPU_TESTING_CONFIGS, ) from tests.utils.model_configs import ModelTestingGroup -from tests.utils.run_test_script import ARTIFACT_PATH -from tests.utils.utils import report_subtest +from tests.utils.utils import check_subtest_success, set_subtest_success + +logger = logging.getLogger(__name__) @pytest.mark.model_testing_group(ModelTestingGroup.basic) -def test_model_simple(run_test_script_for_all_models): +def test_model_simple(run_test_script_for_all_models, run_test_script_base_path): # A simple config to prevent unnecessary testing and creation of dependency group run_test_script_for_all_models(SIMPLE_TESTING_CONFIG) + set_subtest_success(run_test_script_base_path / SIMPLE_TESTING_CONFIG.name) @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @@ -27,49 +31,49 @@ def test_and_compare_model( # We can expect tests to respect the ordering of `SINGLE_GPU_TESTING_CONFIGS`, so compare should have run already. config = SINGLE_GPU_TESTING_CONFIGS[config_name] if config.compare is not None: - for artifact in ["init", "train_1"]: - path = run_test_script_base_path / config.compare / ARTIFACT_PATH / "0" / f"tensor_logs_{artifact}.pt" - if not path.is_file(): - # Dependency likely failed, skipping this test because it will most likely fail for the same reason. - # We still need to fail because we can't confirm the failure. - pytest.fail(f"Compared test {config.compare} failed or did not run ({path} not found).", pytrace=False) + check_subtest_success(run_test_script_base_path / config.compare) # A baseline config (single-gpu, bf16, flash-attn). # Also tests for multiple data loaders. run_test_script_for_all_models(config) + set_subtest_success(run_test_script_base_path / config.name) if config.compare is not None: - compare_results_for_all_models(config) + compare_results_for_all_models(config, ("init", "train_1", "train_2")) @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.distributed) -def test_run_model_distributed(run_distributed_script, model_testing_config, run_test_script_base_path): +@pytest.mark.model_testing_group( + ModelTestingGroup.distributed, +) +def test_run_model_distributed(run_distributed_script, model_testing_config, run_test_script_base_path, request): import tests.models.distributed_test_model - run_distributed_script( - [ - tests.models.distributed_test_model.__file__, - str(run_test_script_base_path), - model_testing_config.name, - ], - num_gpus=torch.cuda.device_count(), - ) + script = [tests.models.distributed_test_model.__file__, str(run_test_script_base_path), model_testing_config.name] + if not request.config.getoption("distributed_capture"): + logger.warning( + "Capturing output and forwarding to associated tests. 
Run with `--no-distributed-capture` to disable." + ) + script.append("--no-capture") + run_distributed_script(script, num_gpus=torch.cuda.device_count()) # We don't want to depend on `test_model_distributed` because we still want to run this in cas of failure. # This should still run after `test_model_distributed` @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.distributed) -@pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)[:1]) +@pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)) def test_model_distributed( - run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path + run_test_script_for_all_models, + compare_results_for_all_models, + config_name, + run_test_script_base_path, + report_subtest, ): config = DISTRIBUTED_TESTING_CONFIGS[config_name] + if torch.cuda.device_count() < config.num_gpus: + pytest.skip(f"Not enough GPUs: {torch.cuda.device_count()} < {config.num_gpus}") report_subtest(run_test_script_base_path / config.name, config.num_gpus) if config.compare is not None: - for artifact in ["init", "train_1"]: - if not ( - run_test_script_base_path / config.compare / ARTIFACT_PATH / f"tensor_logs_{artifact}.pt" - ).is_file(): - pytest.fail(f"Compared test {config.compare} failed or did not run.", pytrace=False) - compare_results_for_all_models(config) + if not check_subtest_success(run_test_script_base_path / config.compare): + pytest.fail(f"Test {config.compare} failed", pytrace=False) + compare_results_for_all_models(config, ("init", "train_1", "train_2")) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index e34fd6007..96acf9658 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -59,7 +59,7 @@ def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: C if tensor_ref["shape"] != tensor_test["shape"]: errors.append( "\n".join( - [f">>>> [{step}] Incompatible shape for tensor {name}: {tensor_ref['shape']}!={tensor_test['shape']}"] + [f">>>> [{step}] Incompatible shape for tensor {name}: {tensor_test['shape']}!={tensor_ref['shape']}"] ) ) return @@ -67,7 +67,7 @@ def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: C errors.append( "\n".join( [ - f">>>> [{step}] Incompatible sampling rate for tensor {name}: {tensor_ref['step']}!={tensor_test['step']}" + f">>>> [{step}] Incompatible sampling rate for tensor {name}: {tensor_test['step']}!={tensor_ref['step']}" ] ) ) @@ -101,8 +101,8 @@ def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: C if tensor_errors: tensor_errors.extend( [ - f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: config.show_samples].tolist()), f" Test samples: " + "".join(f"{x:12.4e}" for x in samples_test[: config.show_samples].tolist()), + f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: config.show_samples].tolist()), ] ) errors.append("\n".join([f">>>> [{step}] Excessive diff for tensor {name}:"] + tensor_errors)) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index 8bbd08d51..c38939eae 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -70,7 +70,7 @@ class DistributedTestingConfig: DistributedTestingConfig( name="df4_sf", compare=None, - config_args=[], + config_args=["batch.depth_first_micro_batches=4", 
"model.base_model.sequence_first=True"], num_gpus=1, ), ] @@ -103,10 +103,15 @@ class DistributedTestingConfig: ), # Depth-first micro-batches DistributedTestingConfig( - name="dp2_df4_z3", + name="dp2_z3_df4", compare="df4", config_args=["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], num_gpus=2, + compare_config=CompareConfig( + ignore_duplicates=[ + "Global gradient", + ] + ), ), # Sequence-data-parallel DistributedTestingConfig( @@ -126,13 +131,13 @@ class DistributedTestingConfig: # Simple sequence-tensor-parallel DistributedTestingConfig( name="stp2", - compare="simple", + compare="sf", config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, ), # Cross-entropy splits DistributedTestingConfig( - name="sp2_ce4", + name="stp2_ce4", compare="sf", config_args=[ "model.distributed.tensor_parallel=2", @@ -145,7 +150,7 @@ class DistributedTestingConfig: # ===== 2d configs (Data + Tensor) # Simple DistributedTestingConfig( - name="dp2_sp2", + name="dp2_stp2", compare="sf", config_args=[ "model.distributed.tensor_parallel=2", @@ -158,28 +163,32 @@ class DistributedTestingConfig: name="tp2_df4", compare="df4", config_args=[ - "batch.depth_first_micro_batches=4", "model.distributed.tensor_parallel=2", + "batch.depth_first_micro_batches=4", ], num_gpus=4, ), # Breadth-first micro-batches DistributedTestingConfig( - name="sdp2_sp2_bf4", + name="sdp2_stp2_bf4", compare="df4_sf", config_args=[ "model.distributed.sequence_data_parallel=2", - "batch.breadth_first_micro_batches=4", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "batch.breadth_first_micro_batches=4", ], num_gpus=4, ), # Sequence-data-parallel DistributedTestingConfig( - name="sdp2_sp2", + name="sdp2_stp2", compare="sf", - config_args=["model.distributed.tensor_parallel=2"], + config_args=[ + "model.distributed.sequence_data_parallel=2", + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + ], num_gpus=4, ), # ===== Pipeline-parallel configs @@ -188,9 +197,9 @@ class DistributedTestingConfig: name="pp2s2_bf4", compare="df4", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=2, ), @@ -199,9 +208,9 @@ class DistributedTestingConfig: name="pp2s1_bf4", compare="df4", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=1", + "batch.breadth_first_micro_batches=4", ], num_gpus=2, compare_config=CompareConfig( @@ -216,34 +225,35 @@ class DistributedTestingConfig: name="pp2s2_ms", compare="ms", config_args=[ - "batch.micro_sequence_length=256", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.micro_sequence_length=256", ], num_gpus=2, ), - # ===== Data + Pipeline + # ===== 2d configs (Data + Pipeline) # Simple DistributedTestingConfig( - name="dp2_pp2s2", + name="dp2_pp2s2_bf4", compare="df4", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=4, ), - # ===== Tensor + Pipeline + # ===== 2d configs (Tensor + Pipeline) # Simple [sf, mb] DistributedTestingConfig( - name="sp2_pp2s1", + name="stp2_pp2s1_bf4", compare="df4_sf", config_args=[ "model.distributed.tensor_parallel=2", 
"model.distributed.sequence_tensor_parallel=True", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=4, compare_config=CompareConfig( @@ -259,10 +269,10 @@ class DistributedTestingConfig: name="dp2_stp2_pp2s2", compare="mb", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.tensor_parallel=2", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.breadth_first_micro_batches=4", ], num_gpus=8, ), @@ -271,11 +281,11 @@ class DistributedTestingConfig: name="dp2_tp2_pp2s1_bf4", compare="mb", config_args=[ - "batch.breadth_first_micro_batches=4", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=1", + "batch.breadth_first_micro_batches=4", ], num_gpus=8, compare_config=CompareConfig( @@ -287,15 +297,15 @@ class DistributedTestingConfig: ), # Micro-sequence DistributedTestingConfig( - name="dp2s2_stp2_pp2s2_ms256", + name="sdp2_stp2_pp2s2_ms", compare="ms", config_args=[ - "batch.micro_sequence_length=256", - "model.distributed.pipeline_parallel=2", + "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", - "model.distributed.sequence_data_parallel=2", + "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", + "batch.micro_sequence_length=256", ], num_gpus=8, ), diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 6c0b561dd..61bc75074 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -102,8 +102,10 @@ def parse_run_distributed_script(args: list[str] | None = None): parser = argparse.ArgumentParser() parser.add_argument("base_path", type=pathlib.Path) parser.add_argument("model_testing_config", type=str) + parser.add_argument("--no-distributed-capture", dest="distributed_capture", action="store_false") + parsed = parser.parse_args(args) - return parsed.base_path, MODEL_CONFIGS[parsed.model_testing_config] + return parsed.base_path, MODEL_CONFIGS[parsed.model_testing_config], parsed.distributed_capture @pytest.fixture(scope="session") @@ -111,12 +113,16 @@ def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, ): - def do_compare_results_for_all_models(distributed_testing_config: DistributedTestingConfig): - assert distributed_testing_config.compare is not None + def do_compare_results_for_all_models(config: DistributedTestingConfig, artifacts: typing.Iterable[str]): + assert config.compare is not None + compare_path = run_test_script_base_path / config.compare / ARTIFACT_PATH + for artifact in artifacts: + if not (artifact_path := compare_path / "0" / f"tensor_logs_{artifact}.pt").is_file(): + pytest.fail(f"Missing artifact {artifact_path} from {config.compare}.", pytrace=False) compare_tensor_logs( - run_test_script_base_path / distributed_testing_config.compare / ARTIFACT_PATH, - run_test_script_base_path / distributed_testing_config.name / ARTIFACT_PATH, - distributed_testing_config.compare_config, + compare_path, + run_test_script_base_path / config.name / ARTIFACT_PATH, + config.compare_config, ) return do_compare_results_for_all_models diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 0ca596aed..49151cbe8 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,20 +1,21 @@ import 
logging import pathlib import sys +import time import traceback import typing -import _pytest.capture import pytest import torch +from fast_llm.core.distributed import ProcessGroup, allreduce_scalar, safe_barrier from fast_llm.engine.base_model.base_model import BaseModel, Layer from fast_llm.engine.config_utils.logging import configure_logging from fast_llm.engine.config_utils.tensor_space import TensorSpace from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.multi_stage.config import FastLLMModelConfig, StageConfig from fast_llm.engine.multi_stage.stage import Stage -from fast_llm.utils import header +from fast_llm.utils import get_and_reset_memory_usage_mib, header logger = logging.getLogger(__name__) @@ -59,47 +60,102 @@ def get_stage(base_model: BaseModel | list[Layer], distributed: Distributed): class DistributedSubtestContext: - def __init__(self, path: pathlib.Path, rank: int) -> None: - self._path = path - self._rank = rank - self._capture_manager = _pytest.capture.CaptureManager("fd") + def __init__( + self, base_path: pathlib.Path, name: str, group: ProcessGroup | None, num_gpus: int, enabled: bool = True + ) -> None: + self._path = base_path / name + self._name = name + self._group = group + self._rank = 0 if group is None else group.rank() + self._rank_enabled = self._rank < num_gpus + self._enabled = enabled and self._rank_enabled self.success = False def __enter__(self) -> typing.Self: - self._capture_manager.start_global_capturing() - # Logging is set to log to the old stdout, so we need to reconfigure. - configure_logging() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - try: - self._capture_manager.suspend_global_capture() - out, err = self._capture_manager.read_global_capture() + if self._enabled: + self._sys_stdout = sys.stdout + self._sys_stderr = sys.stderr self._path.mkdir(parents=True, exist_ok=True) - self._path.joinpath(f"pytest_stdout_{self._rank}").write_text(out) - self._path.joinpath(f"pytest_stderr_{self._rank}").write_text(err) - if exc_type is None: - self.success = True - else: - self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) - return True - finally: - self._capture_manager.stop_global_capturing() + sys.stdout = self._path.joinpath(f"pytest_stdout_{self._rank}").open("w") + sys.stderr = self._path.joinpath(f"pytest_stderr_{self._rank}").open("w") + # Logging is set to log to the old stdout, so we need to reconfigure. configure_logging() + self._start = time.perf_counter() + return self - -def report_subtest(path: pathlib.Path, world_size: int): + def __exit__(self, exc_type, exc_val, exc_tb): + if self._enabled: + try: + stdout_handle = sys.stdout + stderr_handle = sys.stderr + sys.stdout = self._sys_stdout + sys.stderr = self._sys_stderr + stdout_handle.close() + stderr_handle.close() + finally: + configure_logging() + + if exc_type is None: + self.success = True + else: + self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) + + if self._group is not None: + # Barrier so `allreduce_scalar` doesn't go crazy in case of desync. + safe_barrier(self._group, self._name) + self.success = allreduce_scalar(self.success, dtype=torch.int64, group=self._group) == self._group.size() + + if self._rank_enabled: + # Free resources to limit memory usage. 
+ report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) + report["duration"] = time.perf_counter() - self._start + + self._path.joinpath(f"pytest_report_{self._rank}").write_text(traceback.format_exc()) + + logger.warning(f"{self._name} {"PASSED" if self.success else "FAILED"})") + if self._rank == 0: + set_subtest_success(self._path, self.success) + + return True + + +def set_subtest_success(path: pathlib.Path, success: bool = True): + path.joinpath("pytest_success").write_text(str(int(success))) + + +def check_subtest_success(path: pathlib, fail: bool = True) -> bool: + if not path.is_dir(): + if fail: + pytest.fail(f"Test {path.name} did not run", pytrace=False) + else: + return False try: - success = bool(int(path.joinpath("pytest_success").read_text())) + return bool(int(path.joinpath("pytest_success").read_text())) except OSError: - success = False - if not success: - for rank in range(world_size): - for fd, file_ in (("stdout", sys.stdout), ("stderr", sys.stdout), ("traceback", sys.stderr)): - print(header(f"{fd} rank {rank}", 80), file=file_) - file_path = path / f"pytest_{fd}_{rank}" - try: - print(file_path.read_text(), file=file_) - except OSError: - print(f"<<< not found {file_path}>>>", file=file_) - raise RuntimeError(f"test {path.name} failed") + return False + + +@pytest.fixture(scope="session") +def report_subtest(request): + verbose = request.config.getoption("verbose") + do_capture = request.config.getoption("distributed_capture") + + def do_report_subtest(path: pathlib.Path, world_size: int) -> None: + success = check_subtest_success(path) + if not do_capture: + logger.warning("Distributed capture is disabled. See distributed test for run output.") + elif verbose > 1 or not success: + for rank in range(world_size): + for fd, file_ in (("stdout", sys.stdout), ("stderr", sys.stdout), ("traceback", sys.stderr)): + print(header(f"{fd} rank {rank}", 80), file=file_) + file_path = path / f"pytest_{fd}_{rank}" + try: + print(file_path.read_text(), file=file_) + except OSError: + print(f"<<< not found {file_path}>>>", file=file_) + else: + print("Set verbose > 1 to show run output.") + if not success: + raise RuntimeError(f"test {path.name} failed") + + return do_report_subtest From 718a09a6eb7a141dd8822303ad70307aaaddfa1d Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 4 Jul 2025 11:12:29 -0400 Subject: [PATCH 03/14] fixes --- fast_llm/engine/distributed/distributed.py | 7 +------ fast_llm/engine/evaluation/evaluator.py | 5 +++-- fast_llm/engine/training/trainer.py | 6 +++--- fast_llm/logging.py | 21 --------------------- 4 files changed, 7 insertions(+), 32 deletions(-) diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index ce9f660f2..f53f25afc 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -1,6 +1,5 @@ import datetime import logging -import time import typing import torch @@ -102,8 +101,6 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro if isinstance(global_ranks, range) else f"ranks_{"_".join(str(rank) for rank in global_ranks)}" ) - logger.info(f"Creating process group {prefix} (rank = {group_rank}, size = {group_size})") - time.sleep(0.1) group = torch.distributed.ProcessGroupNCCL( torch.distributed.PrefixStore(prefix + "/", self.store), @@ -111,8 +108,6 @@ def get_process_group(self, global_ranks: range | tuple, group_rank: int) -> Pro group_size, 
datetime.timedelta(seconds=self._timeout), ) - logger.info(f"Barrier process group {prefix} (rank = {group_rank}, size = {group_size})") - logger.info(f"Done process group {prefix} (rank = {group_rank}, size = {group_size})") self._process_groups[global_ranks] = group return group @@ -230,7 +225,7 @@ def add_group(self, distributed_dim: DistributedDim) -> ProcessGroup | None: """ Add a process group from its definition. """ - # self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") + self._config.log_first_rank(f"Initializing group {distributed_dim.name}, size={distributed_dim.size}...") distributed_dim.check_ranks_in_range(0, self._config.world_size) group = self._pool.get_process_group(distributed_dim.global_ranks, distributed_dim.rank) distributed_dim.setup(group) diff --git a/fast_llm/engine/evaluation/evaluator.py b/fast_llm/engine/evaluation/evaluator.py index 78aad230f..3fee32baf 100644 --- a/fast_llm/engine/evaluation/evaluator.py +++ b/fast_llm/engine/evaluation/evaluator.py @@ -17,7 +17,8 @@ from fast_llm.engine.schedule.schedule import Schedule from fast_llm.engine.training.config import WandbConfig from fast_llm.engine.training.wandb import Wandb -from fast_llm.logging import format_metrics, get_memory_usage_mib +from fast_llm.logging import format_metrics +from fast_llm.utils import get_and_reset_memory_usage_mib # from fast_llm.engine.training.lm_eval.evaluator import simple_evaluate as lm_eval_simple_evaluate @@ -226,7 +227,7 @@ def _evaluate_loss( / self._schedule._distributed.world_size / time_per_iteration ), - **get_memory_usage_mib(), + **get_and_reset_memory_usage_mib(), } return metrics diff --git a/fast_llm/engine/training/trainer.py b/fast_llm/engine/training/trainer.py index 766398d01..64408bb06 100644 --- a/fast_llm/engine/training/trainer.py +++ b/fast_llm/engine/training/trainer.py @@ -36,8 +36,8 @@ TrainingEvaluatorConfig, ) from fast_llm.engine.training.wandb import Wandb -from fast_llm.logging import format_metrics, get_memory_usage_mib, log_memory_usage -from fast_llm.utils import Assert, Interrupter +from fast_llm.logging import format_metrics, log_memory_usage +from fast_llm.utils import Assert, Interrupter, get_and_reset_memory_usage_mib logger = logging.getLogger(__name__) @@ -422,7 +422,7 @@ def _train(self) -> tuple[bool, dict[PhaseType, dict[str, typing.Any]]]: ), "run": self._run.index, **train_metrics, - **get_memory_usage_mib(), + **get_and_reset_memory_usage_mib(), } formatted_metrics = format_metrics(metrics[metrics_key], self._loss_defs, PhaseType.training) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index 4f77be7fa..41fd4d99b 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -329,26 +329,6 @@ def log_generator[ _global_max_reserved = 0 -def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, float] | None = None) -> dict[str, float]: - global _global_max_allocated, _global_max_reserved - max_allocated = torch.cuda.max_memory_allocated() / 2**20 - max_reserved = torch.cuda.max_memory_reserved() / 2**20 - _global_max_allocated = max(max_allocated, _global_max_allocated) - _global_max_reserved = max(max_reserved, _global_max_reserved) - out = { - "allocated": torch.cuda.memory_allocated() / 2**20, - "max_allocated": max_allocated, - "reserved": torch.cuda.memory_reserved() / 2**20, - "max_reserved": max_reserved, - "global_max_reserved": _global_max_reserved, - } - if relative_to: - out = {key: value - relative_to.get(key, 0) for key, value in out.items()} 
- if reset_stats: - torch.cuda.reset_peak_memory_stats() - return out - - def log_memory_usage[ T ]( @@ -360,7 +340,6 @@ def log_memory_usage[ ) -> T: if report is None: get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) - report = get_memory_usage_mib(reset_stats, relative_to) formatted = _MEMORY_METRIC_FORMAT.format(**report) if header is not None: formatted = f"{header}: {formatted}" From 1625c58a4407239f081b5556a5f1d4fc866cd4f0 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 4 Jul 2025 12:00:02 -0400 Subject: [PATCH 04/14] misc --- fast_llm/logging.py | 2 +- fast_llm/utils.py | 12 ++++-------- tests/conftest.py | 25 ++++++++++--------------- tests/utils/utils.py | 39 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 51 insertions(+), 27 deletions(-) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index 41fd4d99b..385a8b960 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -339,7 +339,7 @@ def log_memory_usage[ relative_to: dict[str, int] | None = None, ) -> T: if report is None: - get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) + report = get_and_reset_memory_usage_mib(relative_to=relative_to, reset_stats=reset_stats) formatted = _MEMORY_METRIC_FORMAT.format(**report) if header is not None: formatted = f"{header}: {formatted}" diff --git a/fast_llm/utils.py b/fast_llm/utils.py index bd2f8ef7b..821ec5874 100644 --- a/fast_llm/utils.py +++ b/fast_llm/utils.py @@ -425,19 +425,15 @@ def get_and_reset_memory_usage_mib( # Happens if cuda is broken. return {} report = { - # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. - "max_memory_reserved": max(torch.cuda.max_memory_reserved() / 2**20, _global_max_reserved), - # Actual memory usage from the test. 
- "max_memory_allocated": max(torch.cuda.max_memory_allocated() / 2**20, _global_max_allocated), - "memory_reserved": torch.cuda.memory_reserved() / 2**20, - "memory_allocated": torch.cuda.memory_allocated() / 2**20, + "reserved": torch.cuda.memory_reserved() / 2**20, + "allocated": torch.cuda.memory_allocated() / 2**20, } max_allocated = torch.cuda.max_memory_allocated() / 2**20 max_reserved = torch.cuda.max_memory_reserved() / 2**20 if global_stats: report |= { - "max_memory_reserved": max(max_reserved, _global_max_reserved), - "max_memory_allocated": max(max_allocated, _global_max_allocated), + "max_reserved": max(max_reserved, _global_max_reserved), + "max_allocated": max(max_allocated, _global_max_allocated), } else: report |= { diff --git a/tests/conftest.py b/tests/conftest.py index 0eb7826f2..ef6fff695 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,6 @@ import torch import xdist.scheduler -import fast_llm.logging from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager @@ -22,7 +21,7 @@ ) from tests.utils.model_configs import model_testing_config, ModelTestingConfig, testing_group_enabled # isort: skip -from tests.utils.utils import result_path, TEST_RESULTS_PATH # isort: skip +from tests.utils.utils import result_path, TEST_RESULTS_PATH, format_resource_report, report_subtest # isort: skip manager: DependencyManager | None = None @@ -190,15 +189,18 @@ def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): if call.when == "call" and torch.cuda.is_available(): report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) report["duration"] = call.duration + if hasattr(item, "fast_llm_resource_report"): + report_ = getattr(item, "fast_llm_resource_report") + report = { + key: max(report[key] for report in (report, report_) if key in report) + for key in set(report_) | set(report) + } + item.add_report_section( call.when, "resource usage", json.dumps(report), ) - torch.cuda.reset_peak_memory_stats() - # Reset global stats for next test. 
- fast_llm.logging._global_max_reserved = 0 - fast_llm.logging._global_max_allocated = 0 @pytest.hookimpl @@ -218,18 +220,11 @@ def pytest_terminal_summary(terminalreporter): terminalreporter.write_sep("=", "Highest gpu memory usage", bold=True) sorted_nodeids = sorted( resource_reports.keys(), - key=lambda nodeid: resource_reports[nodeid]["max_memory_reserved"], + key=lambda nodeid: resource_reports[nodeid]["max_reserved"], reverse=True, ) for nodeid in sorted_nodeids[: terminalreporter.config.getoption("--show-gpu-memory")]: - terminalreporter.write_line( - f"{nodeid}:\n " - f"Max Reserved {resource_reports[nodeid]["max_memory_reserved"]:.0f} MiB | " - f"Max Allocated {resource_reports[nodeid]["max_memory_allocated"]:.0f} MiB | " - f"End Reserved {resource_reports[nodeid]["memory_reserved"]:.0f} MiB | " - f"End Allocated {resource_reports[nodeid]["memory_allocated"]:.0f} MiB | " - f"Duration {resource_reports[nodeid]["duration"]:.2f}" - ) + terminalreporter.write_line(format_resource_report(nodeid, resource_reports[nodeid])) def pytest_runtest_call(item: pytest.Function): diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 49151cbe8..600b4aecb 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,4 +1,6 @@ +import json import logging +import math import pathlib import sys import time @@ -110,7 +112,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): report = get_and_reset_memory_usage_mib(clear_cache=True, global_stats=True, reset_global_stats=True) report["duration"] = time.perf_counter() - self._start - self._path.joinpath(f"pytest_report_{self._rank}").write_text(traceback.format_exc()) + json.dump(report, self._path.joinpath(f"pytest_report_{self._rank}").open("w")) logger.warning(f"{self._name} {"PASSED" if self.success else "FAILED"})") if self._rank == 0: @@ -135,8 +137,22 @@ def check_subtest_success(path: pathlib, fail: bool = True) -> bool: return False -@pytest.fixture(scope="session") -def report_subtest(request): +def format_resource_report(title: str, report: dict[str, float]) -> str: + return "".join( + [ + f"{title}:\n ", + f"Max Reserved: {report.get("max_reserved", math.nan):.0f} MiB", + f"| Max Allocated: {report.get("max_allocated", math.nan):.0f} MiB".ljust(26), + f"| End Reserved: {report.get("reserved", math.nan):.0f} MiB".ljust(25), + f"| End Allocated: {report.get("allocated", math.nan):.0f} MiB".ljust(26), + f"| Duration: {report.get("duration", math.nan):.2f}".ljust(18), + f"| GPUs: {report["gpus"]:.0f}" if "gpus" in report else "", + ] + ) + + +@pytest.fixture(scope="function") +def report_subtest(request: pytest.FixtureRequest): verbose = request.config.getoption("verbose") do_capture = request.config.getoption("distributed_capture") @@ -155,6 +171,23 @@ def do_report_subtest(path: pathlib.Path, world_size: int) -> None: print(f"<<< not found {file_path}>>>", file=file_) else: print("Set verbose > 1 to show run output.") + + reports = {} + for rank in range(world_size): + try: + reports[f"rank_{rank}"] = json.load(path.joinpath(f"pytest_report_{rank}").open("r")) + except OSError: + reports[rank] = {} + keys = {key for report in reports.values() for key in report} + report = {key: max(report[key] for report in reports.values() if key in report) for key in keys} + report["gpus"] = world_size + reports["global"] = report + + print(header(f"Resource usage", 80), file=sys.stderr) + for name, report in reports.items(): + print(format_resource_report(name, report), file=sys.stderr) + setattr(request.node, "fast_llm_resource_report", report) 
+ if not success: raise RuntimeError(f"test {path.name} failed") From 808feccf89d19ac61c99bd3da23228343263d90c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 7 Jul 2025 19:02:03 -0400 Subject: [PATCH 05/14] misc --- tests/conftest.py | 6 + tests/models/distributed_test_checkpoint.py | 141 ++++++----------- tests/models/distributed_test_model.py | 2 + tests/models/test_checkpoint.py | 165 +++++++++----------- tests/models/test_model.py | 5 +- tests/utils/compare_tensor_logs.py | 15 +- tests/utils/run_test_script.py | 13 +- tests/utils/save_load_configs.py | 143 +++++++++++++++++ tests/utils/utils.py | 1 + 9 files changed, 290 insertions(+), 201 deletions(-) create mode 100644 tests/utils/save_load_configs.py diff --git a/tests/conftest.py b/tests/conftest.py index ef6fff695..960f0c7a4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,12 @@ from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager +from tests.utils.save_load_configs import ( # isort: skip + distributed_save_load_config, + distributed_save_load_config_non_pp, + get_convert_path, +) + # Make fixtures available globally without import from tests.utils.run_test_script import ( # isort: skip compare_results_for_all_models, diff --git a/tests/models/distributed_test_checkpoint.py b/tests/models/distributed_test_checkpoint.py index 05a0bf443..51687c6d8 100644 --- a/tests/models/distributed_test_checkpoint.py +++ b/tests/models/distributed_test_checkpoint.py @@ -1,134 +1,87 @@ import gc import logging -import pathlib -import typing import torch from fast_llm.cli import fast_llm_main_wrapper +from fast_llm.config import NoAutoValidate +from fast_llm.core.distributed import safe_barrier from fast_llm.engine.checkpoint.config import ( - CheckpointFormat, CheckpointLoadConfig, CheckpointSaveConfig, DistributedCheckpointFormat, FastLLMCheckpointFormat, ) +from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.engine.distributed.distributed import ProcessGroupPool from fast_llm.engine.multi_stage.config import StageMode -from fast_llm.utils import header -from tests.models.test_checkpoint import do_get_convert_path +from fast_llm.utils import Assert, header from tests.utils.model_configs import ModelTestingConfig from tests.utils.run_test_script import parse_run_distributed_script +from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig +from tests.utils.utils import DistributedSubtestContext logger = logging.getLogger(__name__) def _test_load_and_save_parallel( model_testing_config: ModelTestingConfig, - pretrained_path: pathlib.Path, - pretrained_format: type[CheckpointFormat], - distributed_config: dict[str, typing.Any], - save_path: pathlib.Path, + config: DistributedSaveLoadConfig, ): - logger.info(header(save_path.name)) - logger.info(f"Loading {pretrained_format.name} checkpoint from {pretrained_path}") + logger.info(header(config.name)) + logger.info(f"Loading {config.load_format} checkpoint from {config.load_path}") + with NoAutoValidate(): + load_config = CheckpointLoadConfig(path=config.load_path, format=config.load_format) + load_config.setup(model_testing_config.model_config_class) + load_config.validate() model = model_testing_config.model_class.from_pretrained( - CheckpointLoadConfig(path=pretrained_path, format=pretrained_format), + load_config, # The world size and rank are already set through environment variable. 
- {"distributed": distributed_config}, + {"distributed": config.distributed}, mode=StageMode.inference, ) for save_format in (DistributedCheckpointFormat, FastLLMCheckpointFormat): - logger.info(f"Loading {save_format.name} checkpoint to {save_path / save_format.name}") - model.save_checkpoint(CheckpointSaveConfig(path=save_path / save_format.name, format=save_format)) + logger.info(f"Loading {save_format.name} checkpoint to {config.save_path / save_format.name}") + model.save_checkpoint(CheckpointSaveConfig(path=config.save_path / save_format.name, format=save_format)) del model gc.collect() torch.cuda.empty_cache() def main(args: list[str] | None = None) -> None: - base_path, model_testing_config, _ = parse_run_distributed_script(args) + base_path, model_testing_config, do_capture = parse_run_distributed_script(args) - with ProcessGroupPool(timeout=20): - for pretrained_format, pretrained_path in ( - ( - DistributedCheckpointFormat, - do_get_convert_path( - DistributedCheckpointFormat, model_testing_config.checkpoint_format, base_path=base_path.parent - ), - ), - ( - FastLLMCheckpointFormat, - do_get_convert_path( - FastLLMCheckpointFormat, model_testing_config.checkpoint_format, base_path=base_path.parent - ), - ), - ( - model_testing_config.checkpoint_format, - do_get_convert_path( - model_testing_config.checkpoint_format, DistributedCheckpointFormat, base_path=base_path.parent - ), - ), - ): - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_dp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={"tensor_parallel": 2}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_tp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={"tensor_parallel": 2, "sequence_tensor_parallel": True}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_stp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=pretrained_path, - pretrained_format=pretrained_format, - distributed_config={"pipeline_parallel": 2}, - save_path=base_path / f"load_pretrained_{pretrained_format.name}_in_pp2", - ) - - dist = DistributedCheckpointFormat.name - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_dp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={"tensor_parallel": 2, "sequence_tensor_parallel": True}, - save_path=base_path / "load_pretrained_dp2_in_stp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_stp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={}, - save_path=base_path / "load_pretrained_stp2_in_dp2", - ) - _test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_tp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={"tensor_parallel": 2, "sequence_tensor_parallel": True}, - save_path=base_path / "load_pretrained_tp2_in_pp2", - ) - 
_test_load_and_save_parallel( - model_testing_config=model_testing_config, - pretrained_path=base_path / f"load_pretrained_{dist}_in_pp2" / dist, - pretrained_format=DistributedCheckpointFormat, - distributed_config={"tensor_parallel": 2}, - save_path=base_path / "load_pretrained_pp2_in_tp2", + if do_capture: + logger.warning( + "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." ) + with ProcessGroupPool(timeout=20) as pool: + failures = [] + world_size = DistributedConfig.default_world_size + rank = DistributedConfig.default_rank + group = pool.get_process_group(range(world_size), rank) + + for config in DISTRIBUTED_SAVE_LOAD_CONFIGS.values(): + config = config.resolve(base_path, model_testing_config) + Assert.eq(world_size, config.num_gpus) + with DistributedSubtestContext(base_path, config.name, group, world_size, enabled=do_capture) as subtest: + _test_load_and_save_parallel( + model_testing_config=model_testing_config, + config=config, + ) + if not subtest.success: + failures.append(config.name) + + # Final barrier to ensure everything is done before torchrun potentially kills workers. + safe_barrier(group, "testing end") + # Let pytest know how things went. + # These should already be reported above, we repeat for convenience. + if failures: + raise RuntimeError(f"The following subtests failed: {", ".join(failures)}") + else: + logger.warning("All tests passed") + if __name__ == "__main__": with fast_llm_main_wrapper(): diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py index 36f13ec2a..933b215e7 100644 --- a/tests/models/distributed_test_model.py +++ b/tests/models/distributed_test_model.py @@ -42,6 +42,8 @@ def main(args: list[str] | None = None) -> None: # These should already be reported above, we repeat for convenience. 
if failures: raise RuntimeError(f"The following subtests failed: {", ".join(failures)}") + else: + logger.warning("All tests passed") if __name__ == "__main__": diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 8392494e4..3b615d748 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -1,4 +1,4 @@ -import functools +import logging import pathlib import shutil @@ -22,7 +22,9 @@ from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup -from tests.utils.run_test_script import ARTIFACT_PATH +from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig + +logger = logging.getLogger(__name__) _WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" @@ -46,16 +48,12 @@ def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_confi @pytest.fixture(scope="module") def prepare_resume(run_test_script_base_path: pathlib.Path): def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): - resume_from_path = run_test_script_base_path / distributed_testing_config.compare self_path = run_test_script_base_path / distributed_testing_config.name - shutil.copytree(resume_from_path, self_path) + shutil.copytree(run_test_script_base_path / distributed_testing_config.compare, self_path) shutil.rmtree(self_path / "checkpoint" / "2") assert (self_path / "checkpoint" / "1" / "ok").is_file() # TODO: Eval shutil.rmtree(self_path / "runs") - for artifact in ["init", "train_1"]: - path = f"{ARTIFACT_PATH}/0/tensor_logs_{artifact}.pt" - shutil.copy(resume_from_path / path, self_path / path) return do_prepare_resume @@ -64,40 +62,25 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): distributed_testing_config = DistributedTestingConfig( - name="resume", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + name="resume", compare="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS ) - prepare_resume(distributed_testing_config) - # Resume from iteration=1 and compare outputs with the baseline run. run_test_script_for_all_models(distributed_testing_config) - compare_results_for_all_models("distributed_testing_config") + compare_results_for_all_models(distributed_testing_config, ("train_2",)) @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume_frozen(run_test_script_for_all_models, prepare_resume): distributed_testing_config = DistributedTestingConfig( - name="resume_frozen", compare="test_checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + name="resume_frozen", compare="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS ) prepare_resume(distributed_testing_config) # Resume with frozen mlp. No comparison. 
run_test_script_for_all_models(distributed_testing_config) -def do_get_convert_path( - to: type[CheckpointFormat] | None = None, from_: type[CheckpointFormat] | None = None, *, base_path: pathlib.Path -) -> pathlib.Path: - if to is None or from_ is None: - return base_path / "test_checkpoint_and_eval" / "checkpoint" / "2" - return base_path / "test_convert_model" / f"{to.name}_from_{from_.name}" - - -@pytest.fixture(scope="module") -def get_convert_path(run_test_script_base_path): - return functools.partial(do_get_convert_path, base_path=run_test_script_base_path) - - @pytest.fixture(scope="module") def run_conversion(model_testing_config: ModelTestingConfig, get_convert_path): def do_run_conversion( @@ -155,9 +138,12 @@ def test_conversion(model_testing_config, run_conversion, get_convert_path): def _compare_safetensor_files( - reference_path: pathlib.Path, *other_paths: pathlib.Path, expected_keys: set[str] | None = None + reference: pathlib.Path | dict[str, torch.Tensor], + *other_paths: pathlib.Path, + expected_keys: set[str] | None = None, ): - reference = safetensors.torch.load_file(reference_path) + if isinstance(reference, pathlib.Path): + reference = safetensors.torch.load_file(reference) if expected_keys is None: expected_keys = set(reference.keys()) else: @@ -341,91 +327,82 @@ def test_huggingface_model(model_testing_config, get_convert_path): raise ValueError(f"Comparison failed ({len(errors)} errors)") -@pytest.fixture(scope="module") -def load_and_save_parallel_base_path(run_test_script_base_path): - return run_test_script_base_path / "test_load_and_save_parallel" - - -@pytest.mark.depends_on( - on=[ - "test_load_pretrained[{model_testing_config}]", - ] -) +@pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) -def test_save_and_load_in_parallel(run_distributed_script, load_and_save_parallel_base_path, model_testing_config): +def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_path, model_testing_config, request): # Save and load checkpoints to and from various distributed configurations. # Combined in a single test to mitigate process creation overhead. # TODO: Test beyond 2 gpu configs? import tests.models.distributed_test_checkpoint - run_distributed_script( - [ - tests.models.distributed_test_checkpoint.__file__, - str(load_and_save_parallel_base_path), - model_testing_config.name, - ], - num_gpus=2, - ) + script = [ + tests.models.distributed_test_checkpoint.__file__, + str(run_test_script_base_path), + model_testing_config.name, + ] + if request.config.getoption("distributed_capture"): + logger.warning( + "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." 
+ ) + else: + script.append("--no-distributed-capture") + run_distributed_script(script, num_gpus=2) @pytest.fixture(scope="module") -def parallel_checkpoint_names(model_testing_config): - names = [] - for format_ in (DistributedCheckpointFormat, FastLLMCheckpointFormat, model_testing_config.checkpoint_format): - names.extend( - [ - f"load_pretrained_{format_.name}_in_dp2", - f"load_pretrained_{format_.name}_in_tp2", - f"load_pretrained_{format_.name}_in_stp2", - f"load_pretrained_{format_.name}_in_pp2", - ] - ) - - names.extend( - [ - "load_pretrained_dp2_in_stp2", - "load_pretrained_stp2_in_dp2", - "load_pretrained_tp2_in_pp2", - "load_pretrained_pp2_in_tp2", - ] - ) - return names +def reference_distributed_shard(get_convert_path) -> torch.Tensor: + # Load the file in a fixture (on cpu) so it's not loaded from disk each time. + return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] -@pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) +@pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_parallel_checkpoint_in_single_gpu( - load_and_save_parallel_base_path, get_convert_path, load_and_compare_checkpoints, parallel_checkpoint_names + distributed_save_load_config: DistributedSaveLoadConfig, + run_test_script_base_path, + model_testing_config, + load_and_compare_checkpoints, + reference_distributed_shard, + report_subtest, ): - # Test single-gpu loading of multi-gpu distributed checkpoints. - reference_shard = safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors", device="cuda")[ - _WEIGHT_SHARD_SAVE_NAME - ] + distributed_save_load_config = distributed_save_load_config.resolve( + base_path=run_test_script_base_path, model_testing_config=model_testing_config + ) + report_subtest(distributed_save_load_config.save_path, distributed_save_load_config.num_gpus) + load_and_compare_checkpoints( + DistributedCheckpointFormat, + distributed_save_load_config.save_path / DistributedCheckpointFormat.name, + None, + reference_distributed_shard.to(device="cuda"), + ) - for name in parallel_checkpoint_names: - load_and_compare_checkpoints( - DistributedCheckpointFormat, - load_and_save_parallel_base_path / name / DistributedCheckpointFormat.name, - None, - reference_shard, - ) + +@pytest.fixture(scope="module") +def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: + # Load the file in a fixture (on cpu) so it's not loaded from disk each time. 
+ return safetensors.torch.load_file( + get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" + ) @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) -def test_parallel_checkpoint_consistency(model_testing_config, load_and_save_parallel_base_path, get_convert_path): +def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_base_path): # Check the consistency of the checkpoints saved in `test_save_and_load_in_parallel` - checkpoint_formats = (DistributedCheckpointFormat, FastLLMCheckpointFormat, model_testing_config.checkpoint_format) # Compare Distributed checkpoints for config in ("dp2", "tp2", "stp2", "pp2"): for rank in range(2): _compare_safetensor_files( *[ - load_and_save_parallel_base_path - / f"load_pretrained_{format_.name}_in_{config}" - / DistributedCheckpointFormat.name - / f"rank_{rank}.safetensors" - for format_ in checkpoint_formats + DISTRIBUTED_SAVE_LOAD_CONFIGS[f"load_{format_}_in_{config}"] + .resolve(base_path=run_test_script_base_path, model_testing_config=model_testing_config) + .save_path + / f"{DistributedCheckpointFormat.name}/rank_{rank}.safetensors" + for format_ in ( + DistributedCheckpointFormat.name, + FastLLMCheckpointFormat.name, + "{checkpoint_format}", + ) ] ) @@ -433,15 +410,15 @@ def test_parallel_checkpoint_consistency(model_testing_config, load_and_save_par @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_multi_gpu_fast_llm_checkpoint( - model_testing_config, load_and_save_parallel_base_path, get_convert_path, parallel_checkpoint_names + model_testing_config, distributed_save_load_config_non_pp, run_test_script_base_path, reference_fast_llm_shard ): # Fast-LLM checkpoints are independent of the distributed configuration that saved it. # TODO: Check pipeline-parallel checkpoints (two files). + distributed_save_load_config_non_pp = distributed_save_load_config_non_pp.resolve( + base_path=run_test_script_base_path, model_testing_config=model_testing_config + ) + _compare_safetensor_files( - get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors", - *[ - load_and_save_parallel_base_path / name / FastLLMCheckpointFormat.name / f"model_0.safetensors" - for name in parallel_checkpoint_names - if "in_pp2" not in name - ], + reference_fast_llm_shard, + distributed_save_load_config_non_pp.save_path / f"{FastLLMCheckpointFormat.name}/model_0.safetensors", ) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 2aeff95cc..b1579c3f4 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -49,11 +49,12 @@ def test_run_model_distributed(run_distributed_script, model_testing_config, run import tests.models.distributed_test_model script = [tests.models.distributed_test_model.__file__, str(run_test_script_base_path), model_testing_config.name] - if not request.config.getoption("distributed_capture"): + if request.config.getoption("distributed_capture"): logger.warning( "Capturing output and forwarding to associated tests. Run with `--no-distributed-capture` to disable." 
) - script.append("--no-capture") + else: + script.append("--no-distributed-capture") run_distributed_script(script, num_gpus=torch.cuda.device_count()) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index 96acf9658..f22859dfd 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -1,6 +1,7 @@ import argparse import dataclasses import pathlib +import typing import warnings import torch @@ -20,13 +21,17 @@ class CompareConfig: ignore_duplicates: list[str] = dataclasses.field(default_factory=list) -def extract_tensor_logs(artifact_path: pathlib.Path, errors, config: CompareConfig): +def extract_tensor_logs( + artifact_path: pathlib.Path, errors, config: CompareConfig, artifacts: typing.Sequence[str] | None = None +): tensor_logs = {} ignore_keys = set() for rank_path in sorted(artifact_path.iterdir()): for p in rank_path.iterdir(): if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": step_name = p.stem[len(_TENSOR_LOG_PREFIX) :] + if artifacts is not None and step_name not in artifacts: + continue step_logs = torch.load(p) if step_name not in tensor_logs: tensor_logs[step_name] = {} @@ -112,14 +117,15 @@ def compare_tensor_logs_base( artifact_path_ref: pathlib.Path, artifact_path_test: pathlib.Path, config: CompareConfig | None = None, + artifacts: typing.Sequence[str] | None = None, ): errors = [] if config is None: config = CompareConfig() - logs_ref = extract_tensor_logs(artifact_path_ref, errors, config=config) - logs_test = extract_tensor_logs(artifact_path_test, errors, config=config) + logs_ref = extract_tensor_logs(artifact_path_ref, errors, config=config, artifacts=artifacts) + logs_test = extract_tensor_logs(artifact_path_test, errors, config=config, artifacts=artifacts) for step_key in sorted(compare_dict_keys(logs_ref, logs_test, errors, "Logged steps")): step_logs_ref = logs_ref[step_key] @@ -144,9 +150,10 @@ def compare_tensor_logs( artifact_path_ref: pathlib.Path, artifact_path_test: pathlib.Path, config: CompareConfig | None = None, + artifacts: typing.Sequence[str] | None = None, ): print(f'Comparing tensor logs in "{artifact_path_test}" with reference logs "{artifact_path_ref}"') - errors = compare_tensor_logs_base(artifact_path_ref, artifact_path_test, config) + errors = compare_tensor_logs_base(artifact_path_ref, artifact_path_test, config, artifacts) if errors: for error in errors: print(error) diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 61bc75074..602afeb23 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -29,7 +29,7 @@ def do_run_distributed_script( rendezvous_port: int, torchrun_port: int, num_gpus: int, - timeout: float = 120, + timeout: float = 240, env: dict[str, str | None] = None, ): command = [ @@ -113,16 +113,15 @@ def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, ): - def do_compare_results_for_all_models(config: DistributedTestingConfig, artifacts: typing.Iterable[str]): + def do_compare_results_for_all_models( + config: DistributedTestingConfig, artifacts: typing.Iterable[str] | None = None + ): assert config.compare is not None - compare_path = run_test_script_base_path / config.compare / ARTIFACT_PATH - for artifact in artifacts: - if not (artifact_path := compare_path / "0" / f"tensor_logs_{artifact}.pt").is_file(): - pytest.fail(f"Missing artifact {artifact_path} from {config.compare}.", pytrace=False) compare_tensor_logs( - 
compare_path, + run_test_script_base_path / config.compare / ARTIFACT_PATH, run_test_script_base_path / config.name / ARTIFACT_PATH, config.compare_config, + artifacts, ) return do_compare_results_for_all_models diff --git a/tests/utils/save_load_configs.py b/tests/utils/save_load_configs.py new file mode 100644 index 000000000..f5a15020e --- /dev/null +++ b/tests/utils/save_load_configs.py @@ -0,0 +1,143 @@ +import dataclasses +import functools +import pathlib +import typing + +import pytest + +from fast_llm.engine.checkpoint.config import CheckpointFormat, DistributedCheckpointFormat, FastLLMCheckpointFormat +from tests.utils.model_configs import ModelTestingConfig + + +@dataclasses.dataclass(kw_only=True) +class DistributedSaveLoadConfig: + load_path: pathlib.Path | str + load_format: str + save_path: pathlib.Path | str + distributed: dict[str, typing.Any] + num_gpus: int = 2 + + def resolve(self, base_path: pathlib.Path, model_testing_config: ModelTestingConfig) -> typing.Self: + return dataclasses.replace( + self, + load_path=base_path + / str(self.load_path).format(checkpoint_format=model_testing_config.checkpoint_format.name), + load_format=self.load_format.format(checkpoint_format=model_testing_config.checkpoint_format.name), + save_path=base_path + / str(self.save_path).format(checkpoint_format=model_testing_config.checkpoint_format.name), + ) + + @property + def name(self) -> str: + return pathlib.Path(self.save_path).name + + +def do_get_convert_path( + to: type[CheckpointFormat] | str | None = None, + from_: type[CheckpointFormat] | str | None = None, + *, + base_path: pathlib.Path, +) -> pathlib.Path: + if to is None or from_ is None: + return base_path / "checkpoint_and_eval" / "checkpoint" / "2" + return ( + base_path + / "convert_model" + / f"{to.name if isinstance(to,type) else to}_from_{from_.name if isinstance(from_,type) else from_}" + ) + + +@pytest.fixture(scope="module") +def get_convert_path(run_test_script_base_path): + return functools.partial(do_get_convert_path, base_path=run_test_script_base_path) + + +_DISTRIBUTED_SAVE_LOAD_CONFIGS = [] + + +for pretrained_format, pretrained_path in ( + ( + DistributedCheckpointFormat.name, + do_get_convert_path(DistributedCheckpointFormat.name, "{checkpoint_format}", base_path=pathlib.Path()), + ), + ( + FastLLMCheckpointFormat.name, + do_get_convert_path(FastLLMCheckpointFormat.name, "{checkpoint_format}", base_path=pathlib.Path()), + ), + ( + "{checkpoint_format}", + do_get_convert_path("{checkpoint_format}", DistributedCheckpointFormat.name, base_path=pathlib.Path()), + ), +): + _DISTRIBUTED_SAVE_LOAD_CONFIGS.extend( + [ + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_dp2", + distributed={}, + ), + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_tp2", + distributed={"tensor_parallel": 2}, + ), + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_stp2", + distributed={"tensor_parallel": 2, "sequence_tensor_parallel": True}, + ), + DistributedSaveLoadConfig( + load_path=pretrained_path, + load_format=pretrained_format, + save_path=f"load_{pretrained_format}_in_pp2", + distributed={"pipeline_parallel": 2}, + ), + ] + ) + +_DISTRIBUTED_SAVE_LOAD_CONFIGS.extend( + [ + DistributedSaveLoadConfig( + 
load_path=f"load_{DistributedCheckpointFormat.name}_in_dp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_dp2_in_stp2", + distributed={"tensor_parallel": 2, "sequence_tensor_parallel": True}, + ), + DistributedSaveLoadConfig( + load_path=f"load_{DistributedCheckpointFormat.name}_in_stp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_stp2_in_dp2", + distributed={}, + ), + DistributedSaveLoadConfig( + load_path=f"load_{DistributedCheckpointFormat.name}_in_tp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_tp2_in_pp2", + distributed={"pipeline_parallel": 2}, + ), + DistributedSaveLoadConfig( + load_path=f"load_{DistributedCheckpointFormat.name}_in_pp2/{DistributedCheckpointFormat.name}", + load_format=DistributedCheckpointFormat.name, + save_path="load_pp2_in_tp2", + distributed={"tensor_parallel": 2}, + ), + ] +) + +# TODO: Name isn't formated. +DISTRIBUTED_SAVE_LOAD_CONFIGS: dict[str, DistributedSaveLoadConfig] = { + config.name: config for config in _DISTRIBUTED_SAVE_LOAD_CONFIGS +} + + +@pytest.fixture(scope="module", params=DISTRIBUTED_SAVE_LOAD_CONFIGS) +def distributed_save_load_config(request): + return DISTRIBUTED_SAVE_LOAD_CONFIGS[request.param] + + +@pytest.fixture(scope="module", params=[name for name in DISTRIBUTED_SAVE_LOAD_CONFIGS if "pp2" not in name]) +def distributed_save_load_config_non_pp(request): + return DISTRIBUTED_SAVE_LOAD_CONFIGS[request.param] diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 600b4aecb..54efe0966 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -100,6 +100,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is None: self.success = True else: + self._path.mkdir(parents=True, exist_ok=True) self._path.joinpath(f"pytest_traceback_{self._rank}").write_text(traceback.format_exc()) if self._group is not None: From 5ae8388d0f62bedef1e6ee7bbd00d69adef8197e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 8 Jul 2025 12:38:05 -0400 Subject: [PATCH 06/14] stuff --- .github/workflows/ci.yaml | 2 +- fast_llm/engine/distributed/config.py | 1 - fast_llm/engine/distributed/distributed.py | 2 +- tests/models/test_checkpoint.py | 28 +++++++++++++++------- tests/models/test_model.py | 6 ++++- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0bca2dd8d..ca7ea749d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,7 +33,7 @@ jobs: MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]" - name: Run tests - run: pytest . + run: pytest -v -ra . 
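+        # -v lists each test and -ra adds a summary of all non-passed outcomes (skips,
+        # failures, errors) to the report, which makes interleaved xdist output easier to triage.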
docker: name: Docker diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py index 6f2e2ab95..9ec63517c 100644 --- a/fast_llm/engine/distributed/config.py +++ b/fast_llm/engine/distributed/config.py @@ -357,7 +357,6 @@ def _get_global_ranks(self, size: int, stride: int) -> range: def _add_distributed_dim(self, distributed_dim: DistributedDim) -> None: Assert.eq(distributed_dim.global_ranks[distributed_dim.rank], self.rank, msg=distributed_dim) - logger.info(f"Initializing group {distributed_dim}") try: distributed_dim.check_ranks_in_range(0, self.world_size) except: diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py index f53f25afc..200074ee9 100644 --- a/fast_llm/engine/distributed/distributed.py +++ b/fast_llm/engine/distributed/distributed.py @@ -37,6 +37,7 @@ def __init__( ) self._timeout = timeout self._use_cpu = use_cpu + self._process_groups = {} if self._use_cpu: Assert.eq(self._world_size, 1) @@ -60,7 +61,6 @@ def __init__( timeout=datetime.timedelta(seconds=timeout), ) ) - self._process_groups = {} @property def rank(self): diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 3b615d748..0781ee549 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -23,6 +23,7 @@ from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig +from tests.utils.utils import requires_cuda logger = logging.getLogger(__name__) @@ -58,6 +59,7 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): return do_prepare_resume +# @requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): @@ -70,6 +72,7 @@ def test_resume(run_test_script_for_all_models, compare_results_for_all_models, compare_results_for_all_models(distributed_testing_config, ("train_2",)) +# @requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume_frozen(run_test_script_for_all_models, prepare_resume): @@ -101,6 +104,7 @@ def do_run_conversion( return do_run_conversion +# @requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_conversion(model_testing_config, run_conversion, get_convert_path): @@ -156,6 +160,7 @@ def _compare_safetensor_files( Assert.all_equal(reference[key], other[key]) +# @requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_converted_round_trip(model_testing_config, get_convert_path): @@ -203,6 +208,7 @@ def do_load_and_compare_checkpoints( return do_load_and_compare_checkpoints +# @requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_pretrained( @@ -269,6 +275,7 @@ def test_load_pretrained( ) +# @requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def 
test_huggingface_model(model_testing_config, get_convert_path): @@ -327,6 +334,7 @@ def test_huggingface_model(model_testing_config, get_convert_path): raise ValueError(f"Comparison failed ({len(errors)} errors)") +# @requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_path, model_testing_config, request): @@ -355,6 +363,7 @@ def reference_distributed_shard(get_convert_path) -> torch.Tensor: return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] +# @requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_parallel_checkpoint_in_single_gpu( @@ -377,14 +386,7 @@ def test_load_parallel_checkpoint_in_single_gpu( ) -@pytest.fixture(scope="module") -def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: - # Load the file in a fixture (on cpu) so it's not loaded from disk each time. - return safetensors.torch.load_file( - get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" - ) - - +@requires_cuda @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_base_path): @@ -407,6 +409,16 @@ def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_b ) +@pytest.fixture(scope="module") +def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: + # Load the file in a fixture (on cpu) so it's not loaded from disk each time. + path = get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" + if not path.is_file(): + pytest.skip(f"Reference model failed or did not run.") + return safetensors.torch.load_file(path) + + +# @requires_cuda @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_multi_gpu_fast_llm_checkpoint( diff --git a/tests/models/test_model.py b/tests/models/test_model.py index b1579c3f4..91670b253 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -9,11 +9,12 @@ SINGLE_GPU_TESTING_CONFIGS, ) from tests.utils.model_configs import ModelTestingGroup -from tests.utils.utils import check_subtest_success, set_subtest_success +from tests.utils.utils import check_subtest_success, requires_cuda, set_subtest_success logger = logging.getLogger(__name__) +@requires_cuda @pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_simple(run_test_script_for_all_models, run_test_script_base_path): # A simple config to prevent unnecessary testing and creation of dependency group @@ -21,6 +22,7 @@ def test_model_simple(run_test_script_for_all_models, run_test_script_base_path) set_subtest_success(run_test_script_base_path / SIMPLE_TESTING_CONFIG.name) +@requires_cuda @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.basic) # Parametrize with config name so it shows in test name. 
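# `requires_cuda`, imported from `tests.utils.utils` above, is not shown in this patch; it is
# presumably the usual conditional-skip marker, along the lines of:
#
#     requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")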
@@ -41,6 +43,7 @@ def test_and_compare_model( compare_results_for_all_models(config, ("init", "train_1", "train_2")) +@requires_cuda @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group( ModelTestingGroup.distributed, @@ -60,6 +63,7 @@ def test_run_model_distributed(run_distributed_script, model_testing_config, run # We don't want to depend on `test_model_distributed` because we still want to run this in cas of failure. # This should still run after `test_model_distributed` +@requires_cuda @pytest.mark.depends_on(on=["test_model_simple[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.distributed) @pytest.mark.parametrize("config_name", list(DISTRIBUTED_TESTING_CONFIGS)) From 4626d7fe460abe9378247f9d0f7548f68bccf6dc Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 8 Jul 2025 15:31:27 -0400 Subject: [PATCH 07/14] stuff --- tests/conftest.py | 35 ++++++++++++++------------- tests/models/test_checkpoint.py | 43 +++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 960f0c7a4..e9011979a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,12 +6,25 @@ import shutil import pytest -import torch import xdist.scheduler from fast_llm.utils import get_and_reset_memory_usage_mib from tests.utils.depends import DependencyManager +if worker_name := os.environ.get("PYTEST_XDIST_WORKER"): + if gpus := os.environ.get("CUDA_VISIBLE_DEVICES"): + # We set the device through "CUDA_VISIBLE_DEVICES", and this needs to happen before importing torch. + assert worker_name.startswith("gw") + worker_id = int(worker_name[2:]) + gpus = [int(i) for i in gpus.split(",")] + num_gpus = len(gpus) + gpus = [gpus[(i + worker_id) % num_gpus] for i in range(num_gpus)] + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in gpus) + + +import torch # isort: skip + + from tests.utils.save_load_configs import ( # isort: skip distributed_save_load_config, distributed_save_load_config_non_pp, @@ -29,6 +42,7 @@ from tests.utils.model_configs import model_testing_config, ModelTestingConfig, testing_group_enabled # isort: skip from tests.utils.utils import result_path, TEST_RESULTS_PATH, format_resource_report, report_subtest # isort: skip +logger = logging.getLogger(__name__) manager: DependencyManager | None = None @@ -56,9 +70,6 @@ def pytest_addoption(parser): @dataclasses.dataclass class WorkerResources: - worker_id: int - gpu_id: int | None - num_gpus: int torchrun_port: int rendezvous_port: int @@ -92,17 +103,10 @@ def pytest_configure(config): num_gpus = torch.cuda.device_count() if num_gpus > 0 and is_parallel: # We spread workers across GPUs. - gpu_id = worker_id % num_gpus - # We set the device through "CUDA_VISIBLE_DEVICES", and this needs to happen before cuda initialization. - # The `device_count` call above doesn't initialize, but `mem_get_info` below does. - assert not torch.cuda.is_initialized() - # TODO: Support this? 
- assert "CUDA_VISIBLE_DEVICES" not in os.environ - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str((gpu_id + i) % num_gpus) for i in range(num_gpus)) + logger.warning(f"[Worker {worker_id}] Using GPUs {os.environ["CUDA_VISIBLE_DEVICES"]}") elif num_gpus > 0: - gpu_id = 0 - else: - gpu_id = None + if "CUDA_VISIBLE_DEVICES" not in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(num_gpus)) gpu_memory = torch.cuda.mem_get_info(0)[1] if num_gpus > 0 else 0 if num_gpus > 0: @@ -118,9 +122,6 @@ def pytest_configure(config): ) config.worker_resources = WorkerResources( - worker_id=worker_id, - gpu_id=gpu_id, - num_gpus=num_gpus, # Each worker needs its own set of ports for safe distributed run. Hopefully these are free. torchrun_port=TORCHRUN_DEFAULT_PORT + 2 * worker_id, rendezvous_port=TORCHRUN_DEFAULT_PORT + 2 * worker_id + 1, diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 0781ee549..4d70857ff 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -59,7 +59,7 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): return do_prepare_resume -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): @@ -72,7 +72,7 @@ def test_resume(run_test_script_for_all_models, compare_results_for_all_models, compare_results_for_all_models(distributed_testing_config, ("train_2",)) -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume_frozen(run_test_script_for_all_models, prepare_resume): @@ -104,7 +104,7 @@ def do_run_conversion( return do_run_conversion -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_conversion(model_testing_config, run_conversion, get_convert_path): @@ -160,7 +160,7 @@ def _compare_safetensor_files( Assert.all_equal(reference[key], other[key]) -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_converted_round_trip(model_testing_config, get_convert_path): @@ -208,7 +208,7 @@ def do_load_and_compare_checkpoints( return do_load_and_compare_checkpoints -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_conversion[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_pretrained( @@ -275,7 +275,7 @@ def test_load_pretrained( ) -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_huggingface_model(model_testing_config, get_convert_path): @@ -334,7 +334,7 @@ def test_huggingface_model(model_testing_config, get_convert_path): raise ValueError(f"Comparison failed ({len(errors)} errors)") -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_path, 
model_testing_config, request): @@ -358,12 +358,16 @@ def test_save_and_load_in_parallel(run_distributed_script, run_test_script_base_ @pytest.fixture(scope="module") -def reference_distributed_shard(get_convert_path) -> torch.Tensor: +def reference_distributed_shard(get_convert_path) -> torch.Tensor | None: # Load the file in a fixture (on cpu) so it's not loaded from disk each time. - return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] + try: + return safetensors.torch.load_file(get_convert_path() / "rank_0.safetensors")[_WEIGHT_SHARD_SAVE_NAME] + except OSError: + # The fixture may be evaluated even if the tests are to be skipped. + return None -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_load_pretrained[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_parallel_checkpoint_in_single_gpu( @@ -374,6 +378,8 @@ def test_load_parallel_checkpoint_in_single_gpu( reference_distributed_shard, report_subtest, ): + # This should only happen when test is skipped (failed dependency). + assert reference_distributed_shard is not None distributed_save_load_config = distributed_save_load_config.resolve( base_path=run_test_script_base_path, model_testing_config=model_testing_config ) @@ -410,20 +416,25 @@ def test_parallel_checkpoint_consistency(model_testing_config, run_test_script_b @pytest.fixture(scope="module") -def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor]: +def reference_fast_llm_shard(get_convert_path) -> dict[str, torch.Tensor] | None: # Load the file in a fixture (on cpu) so it's not loaded from disk each time. - path = get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" - if not path.is_file(): - pytest.skip(f"Reference model failed or did not run.") - return safetensors.torch.load_file(path) + try: + return safetensors.torch.load_file( + get_convert_path(FastLLMCheckpointFormat, DistributedCheckpointFormat) / f"model_0.safetensors" + ) + except OSError: + # The fixture may be evaluated even if the tests are to be skipped. + return None -# @requires_cuda +@requires_cuda @pytest.mark.depends_on(on=["test_save_and_load_in_parallel[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_multi_gpu_fast_llm_checkpoint( model_testing_config, distributed_save_load_config_non_pp, run_test_script_base_path, reference_fast_llm_shard ): + # This should only happen when test is skipped (failed dependency). + assert reference_fast_llm_shard is not None # Fast-LLM checkpoints are independent of the distributed configuration that saved it. # TODO: Check pipeline-parallel checkpoints (two files). 
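+    # One possible shape for the TODO above, assuming a pipeline-parallel save splits the
+    # Fast-LLM checkpoint into model_0.safetensors and model_1.safetensors (the file names and
+    # the reference path are assumptions, not taken from the current save logic):
+    #
+    #     for shard_name in ("model_0.safetensors", "model_1.safetensors"):
+    #         _compare_safetensor_files(pp_reference_path / shard_name, pp_save_path / shard_name)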
distributed_save_load_config_non_pp = distributed_save_load_config_non_pp.resolve( From 97d86b818b9d1acb7d6689f454fc805adf86ed4d Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 8 Jul 2025 15:40:24 -0400 Subject: [PATCH 08/14] stuff --- tests/models/test_checkpoint.py | 1 + tests/models/test_match_megatron.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 4d70857ff..ecd23649f 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -36,6 +36,7 @@ ] +@requires_cuda @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_config): # A baseline config (single-gpu, bf16, flash-attn). diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index 5d974172d..be5ddb608 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -6,8 +6,10 @@ from tests.utils.dataset import DATASET_PREFIX, get_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingGroup +from tests.utils.utils import requires_cuda +@requires_cuda @pytest.mark.model_testing_group(ModelTestingGroup.megatron) def test_megatron(run_distributed_script, model_testing_config, run_test_script_base_path): path = run_test_script_base_path / "megatron" @@ -28,6 +30,7 @@ def test_megatron(run_distributed_script, model_testing_config, run_test_script_ ) +@requires_cuda @pytest.mark.depends_on(on=["test_megatron[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.megatron) def test_match_megatron(run_test_script_for_all_models, model_testing_config, compare_results_for_all_models): From ea81ef68a847fdf28671dfebea34c3c928c207d8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 9 Jul 2025 13:04:07 -0400 Subject: [PATCH 09/14] Fp32 tests --- tests/models/test_checkpoint.py | 7 +- tests/models/test_match_megatron.py | 2 +- tests/models/test_model.py | 4 +- tests/utils/compare_tensor_logs.py | 282 +++++++++++++++------------- tests/utils/distributed_configs.py | 97 +++++++--- tests/utils/model_configs.py | 1 - tests/utils/run_test_script.py | 4 +- 7 files changed, 233 insertions(+), 164 deletions(-) diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index ecd23649f..6f30bd318 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -65,12 +65,15 @@ def do_prepare_resume(distributed_testing_config: DistributedTestingConfig): @pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models, compare_results_for_all_models, prepare_resume): distributed_testing_config = DistributedTestingConfig( - name="resume", compare="checkpoint_and_eval", config_args=_CHECKPOINT_AND_EVAL_ARGS + name="resume", + compare="checkpoint_and_eval", + config_args=_CHECKPOINT_AND_EVAL_ARGS, + compare_config=CompareConfig(sub_configs={(("init", "train_1"), None): CompareConfig(ignore_tensors=True)}), ) prepare_resume(distributed_testing_config) # Resume from iteration=1 and compare outputs with the baseline run. 
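+    # The sub_config above excludes the "init" and "train_1" artifacts from the comparison:
+    # a run resumed from the iteration-1 checkpoint only reproduces the baseline from "train_2" on.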
run_test_script_for_all_models(distributed_testing_config) - compare_results_for_all_models(distributed_testing_config, ("train_2",)) + compare_results_for_all_models(distributed_testing_config) @requires_cuda diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index be5ddb608..edc524e04 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -56,7 +56,7 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config, co "model.base_model.use_megatron_initialization=True", ], num_gpus=1, - compare_config=CompareConfig(ignore_tensors=ignore_tensors), + compare_config=CompareConfig(sub_configs={(None, ignore_tensors): CompareConfig(ignore_tensors=True)}), ) run_test_script_for_all_models(distributed_testing_config) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 91670b253..4a344cdc7 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -40,7 +40,7 @@ def test_and_compare_model( set_subtest_success(run_test_script_base_path / config.name) if config.compare is not None: - compare_results_for_all_models(config, ("init", "train_1", "train_2")) + compare_results_for_all_models(config) @requires_cuda @@ -81,4 +81,4 @@ def test_model_distributed( if config.compare is not None: if not check_subtest_success(run_test_script_base_path / config.compare): pytest.fail(f"Test {config.compare} failed", pytrace=False) - compare_results_for_all_models(config, ("init", "train_1", "train_2")) + compare_results_for_all_models(config) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index f22859dfd..59577e25a 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -9,157 +9,175 @@ _TENSOR_LOG_PREFIX = "tensor_logs_" +def _compare_pattern(pattern: typing.Iterable[str] | str | None, name: str): + # TODO: Regex? + return ( + True + if pattern is None + else pattern in name if isinstance(pattern, str) else any(pattern_ in name for pattern_ in pattern) + ) + + @dataclasses.dataclass() class CompareConfig: - rms_eps: float = 1e-3 - rms_rel_tolerance: float = 3e-2 - rms_abs_tolerance: float = 5e-3 - max_rel_tolerance: float = 1.5e-1 - max_abs_tolerance: float = 5e-2 + rms_eps: float = 1e-4 + rms_rel_tolerance: float = 3e-3 + rms_abs_tolerance: float = 5e-4 + max_rel_tolerance: float = 1.5e-2 + max_abs_tolerance: float = 5e-3 show_samples: int = 10 - ignore_tensors: list[str] = dataclasses.field(default_factory=list) - ignore_duplicates: list[str] = dataclasses.field(default_factory=list) - + ignore_tensors: bool = False + ignore_duplicates: bool = False + # Use a different config for specific step and/or tensor names. First match is used. 
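+    # For example, a config used elsewhere in this patch ignores duplicate logs of one tensor only:
+    #     CompareConfig(sub_configs={(None, "Global gradient"): CompareConfig(ignore_duplicates=True)})
+    # A `None` key matches everything; a string key matches any step or tensor name that contains it.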
+ sub_configs: dict[tuple[typing.Iterable[str] | str | None, typing.Iterable[str] | str | None], "CompareConfig"] = ( + dataclasses.field(default_factory=dict) + ) + + def _get_sub_config(self, step_name: str, tensor_name: str) -> typing.Self: + for (step_key, name_key), sub_config in self.sub_configs.items(): + if _compare_pattern(step_key, step_name) and _compare_pattern(name_key, tensor_name): + return sub_config._get_sub_config(step_name, tensor_name) + return self + + def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): + tensor_logs = {} + ignore_keys = set() + for rank_path in sorted(artifact_path.iterdir()): + for p in rank_path.iterdir(): + if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": + step_name = p.stem[len(_TENSOR_LOG_PREFIX) :] + for step_log in torch.load(p): + tensor_name = step_log["name"] + sub_config = self._get_sub_config(step_name, tensor_name) + if sub_config.ignore_tensors: + ignore_keys.add(f"{step_name}/{tensor_name}") + else: + if step_name not in tensor_logs: + tensor_logs[step_name] = {} + if ( + tensor_name in (tensor_step_logs := tensor_logs[step_name]) + and not self.ignore_duplicates + ): + errors.append(f"Duplicate tensor log in step {step_name}: {tensor_name}") + tensor_step_logs[tensor_name] = step_log + if ignore_keys: + warnings.warn(f"Ignoring keys in {artifact_path}: {ignore_keys}") + return tensor_logs + + def _compare_dict_keys(self, dict_ref, dict_test, errors, name): + keys_ref = set(dict_ref) + keys_test = set(dict_test) + if keys_ref != keys_test: + errors.append( + f">>>> {name} do not match. Missing = {keys_ref - keys_test}, extra = {keys_test - keys_ref}." + ) -def extract_tensor_logs( - artifact_path: pathlib.Path, errors, config: CompareConfig, artifacts: typing.Sequence[str] | None = None -): - tensor_logs = {} - ignore_keys = set() - for rank_path in sorted(artifact_path.iterdir()): - for p in rank_path.iterdir(): - if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": - step_name = p.stem[len(_TENSOR_LOG_PREFIX) :] - if artifacts is not None and step_name not in artifacts: - continue - step_logs = torch.load(p) - if step_name not in tensor_logs: - tensor_logs[step_name] = {} - for step_log in step_logs: - name = step_log["name"] - if any(ignore_name in name for ignore_name in config.ignore_tensors): - ignore_keys.add(name) - else: - if name in tensor_logs[step_name] and not any( - ignore_name in name for ignore_name in config.ignore_duplicates - ): - errors.append(f"Duplicate tensor log in step {step_name}: {name}") - tensor_logs[step_name][name] = step_log - if ignore_keys: - warnings.warn(f"Ignoring keys in {artifact_path}: {ignore_keys}") - return tensor_logs - - -def compare_dict_keys(dict_ref, dict_test, errors, name): - keys_ref = set(dict_ref) - keys_test = set(dict_test) - if keys_ref != keys_test: - errors.append(f">>>> {name} do not match. Missing = {keys_ref-keys_test}, extra = {keys_test-keys_ref}.") - - # Avoid set to preserve ordering. - return [key for key in dict_test if key in dict_ref] - - -def compare_logged_tensor(tensor_ref, tensor_test, errors, step, name, config: CompareConfig): - if tensor_ref["shape"] != tensor_test["shape"]: - errors.append( - "\n".join( - [f">>>> [{step}] Incompatible shape for tensor {name}: {tensor_test['shape']}!={tensor_ref['shape']}"] + # Avoid set to preserve ordering. 
+ return [key for key in dict_test if key in dict_ref] + + def _compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_name): + sub_config = self._get_sub_config(step_name, tensor_name) + if tensor_ref["shape"] != tensor_test["shape"]: + errors.append( + "\n".join( + [ + f">>>> [{step_name}] Incompatible shape for tensor {tensor_name}: {tensor_test['shape']}!={tensor_ref['shape']}" + ] + ) ) - ) - return - if tensor_ref["step"] != tensor_test["step"]: - errors.append( - "\n".join( - [ - f">>>> [{step}] Incompatible sampling rate for tensor {name}: {tensor_test['step']}!={tensor_ref['step']}" - ] + return + if tensor_ref["step"] != tensor_test["step"]: + errors.append( + "\n".join( + [ + f">>>> [{step_name}] Incompatible sampling rate for tensor {tensor_name}: {tensor_test['step']}!={tensor_ref['step']}" + ] + ) ) - ) - return - - samples_ref = tensor_ref["samples"].flatten().float() - samples_test = tensor_test["samples"].flatten().float() - scale_unreg = (samples_ref**2).mean() ** 0.5 - rms_scale = (scale_unreg**2 + config.rms_eps**2) ** 0.5 - rms = ((samples_ref - samples_test) ** 2).mean() ** 0.5 - max_diff = (samples_ref - samples_test).abs().max() - - tensor_errors = [] + return - if rms > config.rms_abs_tolerance: - tensor_errors.append(f" * RMS diff absolute = {rms} > {config.rms_abs_tolerance}") + samples_ref = tensor_ref["samples"].flatten().float() + samples_test = tensor_test["samples"].flatten().float() + scale_unreg = (samples_ref**2).mean() ** 0.5 + rms_scale = (scale_unreg**2 + sub_config.rms_eps**2) ** 0.5 + rms = ((samples_ref - samples_test) ** 2).mean() ** 0.5 + max_diff = (samples_ref - samples_test).abs().max() - if rms / rms_scale > config.rms_rel_tolerance: - tensor_errors.append( - f" * RMS diff scaled = {rms/rms_scale} > {config.rms_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" - ) + tensor_errors = [] - if max_diff > config.max_abs_tolerance: - tensor_errors.append(f" * Max diff absolute = {max_diff} > {config.max_abs_tolerance}") + if rms > sub_config.rms_abs_tolerance: + tensor_errors.append(f" * RMS diff absolute = {rms} > {sub_config.rms_abs_tolerance}") - if max_diff / rms_scale > config.max_rel_tolerance: - tensor_errors.append( - f" * Max diff scaled = {max_diff/rms_scale} > {config.max_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" - ) - - if tensor_errors: - tensor_errors.extend( - [ - f" Test samples: " + "".join(f"{x:12.4e}" for x in samples_test[: config.show_samples].tolist()), - f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: config.show_samples].tolist()), - ] - ) - errors.append("\n".join([f">>>> [{step}] Excessive diff for tensor {name}:"] + tensor_errors)) + if rms / rms_scale > sub_config.rms_rel_tolerance: + tensor_errors.append( + f" * RMS diff scaled = {rms / rms_scale} > {sub_config.rms_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" + ) + if max_diff > sub_config.max_abs_tolerance: + tensor_errors.append(f" * Max diff absolute = {max_diff} > {sub_config.max_abs_tolerance}") -def compare_tensor_logs_base( - artifact_path_ref: pathlib.Path, - artifact_path_test: pathlib.Path, - config: CompareConfig | None = None, - artifacts: typing.Sequence[str] | None = None, -): - errors = [] - - if config is None: - config = CompareConfig() - - logs_ref = extract_tensor_logs(artifact_path_ref, errors, config=config, artifacts=artifacts) - logs_test = extract_tensor_logs(artifact_path_test, errors, config=config, artifacts=artifacts) - - for step_key in 
sorted(compare_dict_keys(logs_ref, logs_test, errors, "Logged steps")): - step_logs_ref = logs_ref[step_key] - step_logs_test = logs_test[step_key] - - for tensor_key in compare_dict_keys( - step_logs_ref, step_logs_test, errors=errors, name=f"[{step_key}] Tensor keys" - ): - compare_logged_tensor( - step_logs_ref[tensor_key], - step_logs_test[tensor_key], - errors, - step_key, - tensor_key, - config, + if max_diff / rms_scale > sub_config.max_rel_tolerance: + tensor_errors.append( + f" * Max diff scaled = {max_diff / rms_scale} > {sub_config.max_rel_tolerance} (scale={rms_scale}, unregularized={scale_unreg})" ) - return errors + if tensor_errors: + tensor_errors.extend( + [ + f" Test samples: " + "".join(f"{x:12.4e}" for x in samples_test[: self.show_samples].tolist()), + f" Ref samples: " + "".join(f"{x:12.4e}" for x in samples_ref[: self.show_samples].tolist()), + ] + ) + errors.append("\n".join([f">>>> [{step_name}] Excessive diff for tensor {tensor_name}:"] + tensor_errors)) + + def _compare_tensor_logs( + self, + artifact_path_ref: pathlib.Path, + artifact_path_test: pathlib.Path, + ): + errors = [] + + logs_ref = self._extract_tensor_logs(artifact_path_ref, errors) + logs_test = self._extract_tensor_logs(artifact_path_test, errors) + + for step_key in sorted(self._compare_dict_keys(logs_ref, logs_test, errors, "Logged steps")): + step_logs_ref = logs_ref[step_key] + step_logs_test = logs_test[step_key] + + for tensor_key in self._compare_dict_keys( + step_logs_ref, step_logs_test, errors=errors, name=f"[{step_key}] Tensor keys" + ): + self._compare_tensors( + step_logs_ref[tensor_key], + step_logs_test[tensor_key], + errors, + step_key, + tensor_key, + ) + + return errors + + def compare_tensor_logs( + self, + artifact_path_ref: pathlib.Path, + artifact_path_test: pathlib.Path, + ): + print(f'Comparing tensor logs in "{artifact_path_test}" with reference logs "{artifact_path_ref}"') + errors = self._compare_tensor_logs(artifact_path_ref, artifact_path_test) + if errors: + for error in errors: + print(error) + raise ValueError(f"Comparison failed ({len(errors)} errors)") + else: + print("Comparison succeeded!") def compare_tensor_logs( + self, artifact_path_ref: pathlib.Path, artifact_path_test: pathlib.Path, - config: CompareConfig | None = None, - artifacts: typing.Sequence[str] | None = None, ): - print(f'Comparing tensor logs in "{artifact_path_test}" with reference logs "{artifact_path_ref}"') - errors = compare_tensor_logs_base(artifact_path_ref, artifact_path_test, config, artifacts) - if errors: - for error in errors: - print(error) - raise ValueError(f"Comparison failed ({len(errors)} errors)") - else: - print("Comparison succeeded!") + pass if __name__ == "__main__": @@ -167,4 +185,4 @@ def compare_tensor_logs( parser.add_argument("path_ref", type=pathlib.Path) parser.add_argument("path_test", type=pathlib.Path) args = parser.parse_args() - compare_tensor_logs(args.path_ref, args.path_test) + CompareConfig().compare_tensor_logs(args.path_ref, args.path_test) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index c38939eae..d81e1a33e 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -15,6 +15,43 @@ class DistributedTestingConfig: compare_config: CompareConfig | None = None +# TODO: Ajust +_default_compare = CompareConfig( + rms_eps=1e-4, + rms_rel_tolerance=3e-3, + rms_abs_tolerance=5e-4, + max_rel_tolerance=1.5e-2, + max_abs_tolerance=5e-3, +) +_pp_tied_weight_compare = dataclasses.replace( + 
_default_compare, + sub_configs={ + (None, ("layers.0.word_embeddings_weight", "layers.0.position_embeddings_weight")): CompareConfig( + ignore_duplicates=True + ) + }, +) + +_z3_accumulation_compare = dataclasses.replace( + _default_compare, sub_configs={(None, "Global gradient"): CompareConfig(ignore_duplicates=True)} +) +_bf16_compare = dataclasses.replace( + _default_compare, + rms_eps=1e-3, + rms_rel_tolerance=3e-2, + rms_abs_tolerance=5e-3, + max_rel_tolerance=1.5e-1, + max_abs_tolerance=5e-2, +) +_fp16_compare = dataclasses.replace( + _default_compare, + rms_eps=1e-3, + rms_rel_tolerance=3e-2, + rms_abs_tolerance=5e-3, + max_rel_tolerance=1.5e-1, + max_abs_tolerance=5e-2, +) + # Baseline (also tests data-parallel workers) SIMPLE_TESTING_CONFIG = DistributedTestingConfig( name="simple", @@ -24,12 +61,27 @@ class DistributedTestingConfig: ) _SINGLE_GPU_TESTING_CONFIGS = [ + DistributedTestingConfig( + name="bf16", + compare="simple", + config_args=["model.distributed.training_dtype=bf16"], + num_gpus=1, + compare_config=_bf16_compare, + ), + DistributedTestingConfig( + name="fp16", + compare="simple", + config_args=["model.distributed.training_dtype=bf16"], + num_gpus=1, + compare_config=_fp16_compare, + ), # Sequence-first baseline DistributedTestingConfig( name="sf", compare=None, config_args=["model.base_model.sequence_first=True"], num_gpus=1, + compare_config=_default_compare, ), # Cross-entropy splits. DistributedTestingConfig( @@ -58,6 +110,7 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.breadth_first_micro_batches=4"], num_gpus=1, + compare_config=_default_compare, ), # Mixed gradient accumulation. DistributedTestingConfig( @@ -65,6 +118,7 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], num_gpus=1, + compare_config=_default_compare, ), # Sequence-first gradient accumulation baseline. 
DistributedTestingConfig( @@ -86,6 +140,7 @@ class DistributedTestingConfig: compare="simple", config_args=[], num_gpus=2, + compare_config=_default_compare, ), # Zero stage 2 DistributedTestingConfig( @@ -93,6 +148,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=2"], num_gpus=2, + compare_config=_default_compare, ), # Zero stage 3 DistributedTestingConfig( @@ -100,6 +156,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=3"], num_gpus=2, + compare_config=_default_compare, ), # Depth-first micro-batches DistributedTestingConfig( @@ -107,11 +164,7 @@ class DistributedTestingConfig: compare="df4", config_args=["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], num_gpus=2, - compare_config=CompareConfig( - ignore_duplicates=[ - "Global gradient", - ] - ), + compare_config=_z3_accumulation_compare, ), # Sequence-data-parallel DistributedTestingConfig( @@ -119,6 +172,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.sequence_data_parallel=2"], num_gpus=2, + compare_config=_default_compare, ), # ===== Tensor-parallel configs # Simple tensor-parallel @@ -127,6 +181,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.distributed.tensor_parallel=2"], num_gpus=2, + compare_config=_default_compare, ), # Simple sequence-tensor-parallel DistributedTestingConfig( @@ -134,6 +189,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, + compare_config=_default_compare, ), # Cross-entropy splits DistributedTestingConfig( @@ -146,6 +202,7 @@ class DistributedTestingConfig: "model.base_model.cross_entropy_splits=4", ], num_gpus=2, + compare_config=_default_compare, ), # ===== 2d configs (Data + Tensor) # Simple @@ -157,6 +214,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, + compare_config=_default_compare, ), # Depth-first micro-batches, tensor-parallel DistributedTestingConfig( @@ -167,6 +225,7 @@ class DistributedTestingConfig: "batch.depth_first_micro_batches=4", ], num_gpus=4, + compare_config=_default_compare, ), # Breadth-first micro-batches DistributedTestingConfig( @@ -179,6 +238,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, + compare_config=_default_compare, ), # Sequence-data-parallel DistributedTestingConfig( @@ -190,6 +250,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, + compare_config=_default_compare, ), # ===== Pipeline-parallel configs # Simple [mb] @@ -202,6 +263,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=2, + compare_config=_default_compare, ), # Tied weights on different ranks DistributedTestingConfig( @@ -213,12 +275,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=2, - compare_config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), + compare_config=_pp_tied_weight_compare, ), # Micro-sequence [ms] DistributedTestingConfig( @@ -230,6 +287,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=2, + compare_config=_default_compare, ), # ===== 2d configs (Data + Pipeline) # Simple @@ -242,6 +300,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], 
num_gpus=4, + compare_config=_default_compare, ), # ===== 2d configs (Tensor + Pipeline) # Simple [sf, mb] @@ -256,12 +315,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, - compare_config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), + compare_config=_pp_tied_weight_compare, ), # ===== Data + Tensor + Pipeline # Simple @@ -275,6 +329,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=8, + compare_config=_default_compare, ), # Tied weights on different ranks DistributedTestingConfig( @@ -288,12 +343,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=8, - compare_config=CompareConfig( - ignore_duplicates=[ - "layers.0.word_embeddings_weight", - "layers.0.position_embeddings_weight", - ] - ), + compare_config=_pp_tied_weight_compare, ), # Micro-sequence DistributedTestingConfig( @@ -308,6 +358,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=8, + compare_config=_default_compare, ), ] diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 199d5b72c..cbe8539aa 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -144,7 +144,6 @@ def _update_and_add_testing_config( "model.multi_stage.debug_tensor_parallel=True", "model.distributed.reproducible_init=True", "model.distributed.timeout=20", - "model.distributed.training_dtype=bf16", "training.train_iters=2", "training.num_workers=0", "training.timeout=30", diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 602afeb23..6656e2bbe 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -10,7 +10,6 @@ from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.utils import Assert -from tests.utils.compare_tensor_logs import compare_tensor_logs from tests.utils.dataset import get_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig @@ -117,10 +116,9 @@ def do_compare_results_for_all_models( config: DistributedTestingConfig, artifacts: typing.Iterable[str] | None = None ): assert config.compare is not None - compare_tensor_logs( + config.compare_config.compare_tensor_logs( run_test_script_base_path / config.compare / ARTIFACT_PATH, run_test_script_base_path / config.name / ARTIFACT_PATH, - config.compare_config, artifacts, ) From 6a712229e0dfa9d2caa6832643f058a6ab478bb1 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 14 Jul 2025 14:02:21 -0400 Subject: [PATCH 10/14] stuff --- fast_llm/logging.py | 1 + tests/utils/compare_tensor_logs.py | 28 ++++-- tests/utils/dataset.py | 2 +- tests/utils/distributed_configs.py | 140 +++++++++++++++++------------ tests/utils/run_test_script.py | 11 +-- 5 files changed, 112 insertions(+), 70 deletions(-) diff --git a/fast_llm/logging.py b/fast_llm/logging.py index 385a8b960..e8334de6e 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -137,6 +137,7 @@ def log_tensor[ ) -> (T | None): if level < 1: return + tensor = tensor.detach() save_stats = TensorLogs.config.save shape = tuple(tensor.shape) _, dtype = str(tensor.dtype).split("torch.") diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index 59577e25a..743fafea0 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -25,6 +25,8 
@@ class CompareConfig: rms_abs_tolerance: float = 5e-4 max_rel_tolerance: float = 1.5e-2 max_abs_tolerance: float = 5e-3 + # Test tensors are scaled by this amount (ex. gradient scaling). Unscale (divide) them before comparison. + scale: float = 1.0 show_samples: int = 10 ignore_tensors: bool = False ignore_duplicates: bool = False @@ -33,6 +35,20 @@ class CompareConfig: dataclasses.field(default_factory=dict) ) + def rescale(self, factor: float) -> typing.Self: + # Scale all tolerances by this factor. + if factor == 1.0: + return self + return dataclasses.replace( + self, + rms_eps=self.rms_eps * factor, + rms_rel_tolerance=self.rms_rel_tolerance * factor, + rms_abs_tolerance=self.rms_abs_tolerance * factor, + max_rel_tolerance=self.max_rel_tolerance * factor, + max_abs_tolerance=self.max_abs_tolerance * factor, + sub_configs={key: sub_config.rescale(factor) for key, sub_config in self.sub_configs.items()}, + ) + def _get_sub_config(self, step_name: str, tensor_name: str) -> typing.Self: for (step_key, name_key), sub_config in self.sub_configs.items(): if _compare_pattern(step_key, step_name) and _compare_pattern(name_key, tensor_name): @@ -56,7 +72,7 @@ def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): tensor_logs[step_name] = {} if ( tensor_name in (tensor_step_logs := tensor_logs[step_name]) - and not self.ignore_duplicates + and not sub_config.ignore_duplicates ): errors.append(f"Duplicate tensor log in step {step_name}: {tensor_name}") tensor_step_logs[tensor_name] = step_log @@ -98,6 +114,8 @@ def _compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_na samples_ref = tensor_ref["samples"].flatten().float() samples_test = tensor_test["samples"].flatten().float() + if sub_config.scale != 1.0: + samples_test = samples_test / sub_config.scale scale_unreg = (samples_ref**2).mean() ** 0.5 rms_scale = (scale_unreg**2 + sub_config.rms_eps**2) ** 0.5 rms = ((samples_ref - samples_test) ** 2).mean() ** 0.5 @@ -172,14 +190,6 @@ def compare_tensor_logs( print("Comparison succeeded!") -def compare_tensor_logs( - self, - artifact_path_ref: pathlib.Path, - artifact_path_test: pathlib.Path, -): - pass - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("path_ref", type=pathlib.Path) diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py index 2a12c4f7d..713211da6 100644 --- a/tests/utils/dataset.py +++ b/tests/utils/dataset.py @@ -15,7 +15,7 @@ DATASET_CACHE = TEST_RESULTS_PATH / "dataset" DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" -TEST_VOCAB_SIZE = 8192 +TEST_VOCAB_SIZE = 384 # Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" TEST_DATASET_TOKENS = 1000000 diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index d81e1a33e..f70a87956 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -1,3 +1,4 @@ +import copy import dataclasses import logging @@ -13,45 +14,70 @@ class DistributedTestingConfig: config_args: list[str] num_gpus: int = 1 compare_config: CompareConfig | None = None + # Scale the comparison thresholds for specific models. 
+ compare_factor: float = 1.0 + + +def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareConfig: + return CompareConfig( + rms_rel_tolerance=relative, + max_rel_tolerance=relative * 10, + rms_abs_tolerance=absolute, + max_abs_tolerance=absolute * 10, + rms_eps=absolute / 10, + **kwargs, + ) # TODO: Ajust -_default_compare = CompareConfig( - rms_eps=1e-4, - rms_rel_tolerance=3e-3, - rms_abs_tolerance=5e-4, - max_rel_tolerance=1.5e-2, - max_abs_tolerance=5e-3, -) -_pp_tied_weight_compare = dataclasses.replace( - _default_compare, +_compare_layer_match = get_config( sub_configs={ - (None, ("layers.0.word_embeddings_weight", "layers.0.position_embeddings_weight")): CompareConfig( - ignore_duplicates=True - ) - }, + ("init", None): get_config(), + ("train_1", "fw"): get_config(1e-3, 3e-5), + ("train_2", "fw"): get_config(1e-3, 1e-4), + ("train_1", "bw"): get_config(3e-3, 3e-6), + ("train_2", "bw"): get_config(3e-3, 1e-5), + ("train_1", "gradient"): get_config(3e-3, 1e-5), + ("train_2", "gradient"): get_config(3e-3, 3e-5), + } ) -_z3_accumulation_compare = dataclasses.replace( - _default_compare, sub_configs={(None, "Global gradient"): CompareConfig(ignore_duplicates=True)} -) -_bf16_compare = dataclasses.replace( - _default_compare, - rms_eps=1e-3, - rms_rel_tolerance=3e-2, - rms_abs_tolerance=5e-3, - max_rel_tolerance=1.5e-1, - max_abs_tolerance=5e-2, +_compare_layer_mismatch = copy.deepcopy(_compare_layer_match) +_pp_tied_weight_compare = copy.deepcopy(_compare_layer_match) +_z3_accumulation_compare = copy.deepcopy(_compare_layer_match) +for step in ("train_1", "train_2"): + _z3_accumulation_compare.sub_configs[(step, "gradient")].ignore_duplicates = True + for tensor in ("fw", "bw"): + _compare_layer_mismatch.sub_configs[(step, tensor)].ignore_tensors = True + _pp_tied_weight_compare.sub_configs[(step, tensor)].ignore_duplicates = True + + +_bf16_compare = get_config( + sub_configs={ + ("init", None): get_config(), + ("train_1", "fw"): get_config(1e-2, 1e-3), + ("train_2", "fw"): get_config(1e-2, 1e-3), + ("train_1", "bw"): get_config(1e-2, 1e-5), + ("train_2", "bw"): get_config(1e-2, 1e-5), + ("train_1", "gradient"): get_config(2e-2, 3e-5), + ("train_2", "gradient"): get_config(2e-2, 3e-5), + } ) -_fp16_compare = dataclasses.replace( - _default_compare, - rms_eps=1e-3, - rms_rel_tolerance=3e-2, - rms_abs_tolerance=5e-3, - max_rel_tolerance=1.5e-1, - max_abs_tolerance=5e-2, + +_fp16_compare = get_config( + sub_configs={ + ("init", None): get_config(), + # Saved gradient include the gradient scaling by 2**16 (default initial value) + ("train_1", "fw"): get_config(1e-3, 1e-4), + ("train_2", "fw"): get_config(1e-3, 1e-4), + ("train_1", "bw"): get_config(3e-3, 1e-5, scale=2**16), + ("train_2", "bw"): get_config(3e-3, 1e-5, scale=2**16), + ("train_1", "gradient"): get_config(3e-3, 1e-5, scale=2**16), + ("train_2", "gradient"): get_config(3e-3, 1e-5, scale=2**16), + } ) + # Baseline (also tests data-parallel workers) SIMPLE_TESTING_CONFIG = DistributedTestingConfig( name="simple", @@ -71,38 +97,41 @@ class DistributedTestingConfig: DistributedTestingConfig( name="fp16", compare="simple", - config_args=["model.distributed.training_dtype=bf16"], + config_args=["model.distributed.training_dtype=fp16"], num_gpus=1, compare_config=_fp16_compare, ), # Sequence-first baseline DistributedTestingConfig( name="sf", - compare=None, + compare="simple", config_args=["model.base_model.sequence_first=True"], num_gpus=1, - compare_config=_default_compare, + 
compare_config=_compare_layer_mismatch, ), # Cross-entropy splits. DistributedTestingConfig( name="ce4", - compare=None, + compare="simple", config_args=["model.base_model.cross_entropy_splits=4"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), # Micro-sequence baseline DistributedTestingConfig( name="ms", - compare=None, + compare="simple", config_args=["batch.micro_sequence_length=256"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), # Gradient accumulation baseline. DistributedTestingConfig( name="df4", - compare=None, + compare="simple", config_args=["batch.depth_first_micro_batches=4"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), # Breadth-first gradient accumulation. DistributedTestingConfig( @@ -110,7 +139,7 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.breadth_first_micro_batches=4"], num_gpus=1, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Mixed gradient accumulation. DistributedTestingConfig( @@ -118,14 +147,15 @@ class DistributedTestingConfig: compare="df4", config_args=["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], num_gpus=1, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Sequence-first gradient accumulation baseline. DistributedTestingConfig( name="df4_sf", - compare=None, + compare="simple", config_args=["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"], num_gpus=1, + compare_config=_compare_layer_mismatch, ), ] @@ -140,7 +170,7 @@ class DistributedTestingConfig: compare="simple", config_args=[], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Zero stage 2 DistributedTestingConfig( @@ -148,7 +178,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=2"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Zero stage 3 DistributedTestingConfig( @@ -156,7 +186,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.multi_stage.zero_stage=3"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Depth-first micro-batches DistributedTestingConfig( @@ -172,7 +202,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.sequence_data_parallel=2"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== Tensor-parallel configs # Simple tensor-parallel @@ -181,7 +211,7 @@ class DistributedTestingConfig: compare="simple", config_args=["model.distributed.tensor_parallel=2"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Simple sequence-tensor-parallel DistributedTestingConfig( @@ -189,7 +219,7 @@ class DistributedTestingConfig: compare="sf", config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Cross-entropy splits DistributedTestingConfig( @@ -202,7 +232,7 @@ class DistributedTestingConfig: "model.base_model.cross_entropy_splits=4", ], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== 2d configs (Data + Tensor) # Simple @@ -214,7 +244,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Depth-first 
micro-batches, tensor-parallel DistributedTestingConfig( @@ -225,7 +255,7 @@ class DistributedTestingConfig: "batch.depth_first_micro_batches=4", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Breadth-first micro-batches DistributedTestingConfig( @@ -238,7 +268,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Sequence-data-parallel DistributedTestingConfig( @@ -250,7 +280,7 @@ class DistributedTestingConfig: "model.distributed.sequence_tensor_parallel=True", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== Pipeline-parallel configs # Simple [mb] @@ -263,7 +293,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Tied weights on different ranks DistributedTestingConfig( @@ -287,7 +317,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=2, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== 2d configs (Data + Pipeline) # Simple @@ -300,7 +330,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=4, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # ===== 2d configs (Tensor + Pipeline) # Simple [sf, mb] @@ -329,7 +359,7 @@ class DistributedTestingConfig: "batch.breadth_first_micro_batches=4", ], num_gpus=8, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), # Tied weights on different ranks DistributedTestingConfig( @@ -358,7 +388,7 @@ class DistributedTestingConfig: "batch.micro_sequence_length=256", ], num_gpus=8, - compare_config=_default_compare, + compare_config=_compare_layer_match, ), ] diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index 6656e2bbe..f4d4dfab0 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -2,6 +2,7 @@ import functools import os import pathlib +import pprint import subprocess import sys import typing @@ -111,15 +112,15 @@ def parse_run_distributed_script(args: list[str] | None = None): def compare_results_for_all_models( worker_resources: "WorkerResources", run_test_script_base_path: pathlib.Path, + model_testing_config: ModelTestingConfig, ): - def do_compare_results_for_all_models( - config: DistributedTestingConfig, artifacts: typing.Iterable[str] | None = None - ): + def do_compare_results_for_all_models(config: DistributedTestingConfig): assert config.compare is not None - config.compare_config.compare_tensor_logs( + compare_config = config.compare_config.rescale(config.compare_factor) + pprint.pprint(compare_config) + compare_config.compare_tensor_logs( run_test_script_base_path / config.compare / ARTIFACT_PATH, run_test_script_base_path / config.name / ARTIFACT_PATH, - artifacts, ) return do_compare_results_for_all_models From 28f1a886cc9006362785c59b90931ec9080f6542 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 15 Jul 2025 16:10:12 -0400 Subject: [PATCH 11/14] fixes --- tests/models/test_checkpoint.py | 6 ++-- tests/models/test_match_megatron.py | 6 ++-- tests/utils/compare_tensor_logs.py | 4 +-- tests/utils/dataset.py | 12 +++++++- tests/utils/distributed_configs.py | 43 +++++++++++++---------------- tests/utils/model_configs.py | 22 ++++++++++----- tests/utils/run_test_script.py | 17 +++++++----- 7 
files changed, 62 insertions(+), 48 deletions(-) diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 6f30bd318..05acf23dc 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -19,7 +19,7 @@ from fast_llm.engine.checkpoint.convert import ConvertConfig from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName from fast_llm.utils import Assert -from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor +from tests.utils.compare_tensor_logs import CompareConfig from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingConfig, ModelTestingGroup from tests.utils.save_load_configs import DISTRIBUTED_SAVE_LOAD_CONFIGS, DistributedSaveLoadConfig @@ -307,7 +307,6 @@ def test_huggingface_model(model_testing_config, get_convert_path): ) ) errors = [] - compare = CompareConfig() auto_model = ( transformers.AutoModel if model_testing_config.name in ("diffusion_llama", "dream") @@ -323,13 +322,12 @@ def test_huggingface_model(model_testing_config, get_convert_path): print(name) output = model(test_input) # TODO: Make a generic comparison util. - compare_logged_tensor( + CompareConfig().compare_tensors( {"samples": output_ref.logits, "shape": output_ref.logits.shape, "step": 0}, {"samples": output.logits, "shape": output.logits.shape, "step": 0}, errors, name, "logits", - compare, ) if errors: diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index edc524e04..c7fa623e5 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -3,7 +3,7 @@ import pytest from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.dataset import DATASET_PREFIX, get_test_dataset +from tests.utils.dataset import MODEL_DATASET_PREFIX, get_model_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingGroup from tests.utils.utils import requires_cuda @@ -17,7 +17,7 @@ def test_megatron(run_distributed_script, model_testing_config, run_test_script_ # Prevent Megatron from complaining. env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" env["NVTE_FLASH_ATTN"] = "0" - get_test_dataset() + get_model_test_dataset() run_distributed_script( [ "Megatron-LM/pretrain_gpt.py", @@ -52,7 +52,7 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config, co config_args=[ "model.distributed.training_dtype=fp32", "data.datasets={}", - f"data.path={DATASET_PREFIX}", + f"data.path={MODEL_DATASET_PREFIX}", "model.base_model.use_megatron_initialization=True", ], num_gpus=1, diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index 743fafea0..a1c17379a 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -91,7 +91,7 @@ def _compare_dict_keys(self, dict_ref, dict_test, errors, name): # Avoid set to preserve ordering. 
return [key for key in dict_test if key in dict_ref] - def _compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_name): + def compare_tensors(self, tensor_ref, tensor_test, errors, step_name, tensor_name): sub_config = self._get_sub_config(step_name, tensor_name) if tensor_ref["shape"] != tensor_test["shape"]: errors.append( @@ -165,7 +165,7 @@ def _compare_tensor_logs( for tensor_key in self._compare_dict_keys( step_logs_ref, step_logs_test, errors=errors, name=f"[{step_key}] Tensor keys" ): - self._compare_tensors( + self.compare_tensors( step_logs_ref[tensor_key], step_logs_test[tensor_key], errors, diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py index 713211da6..ad8385ae9 100644 --- a/tests/utils/dataset.py +++ b/tests/utils/dataset.py @@ -15,11 +15,14 @@ DATASET_CACHE = TEST_RESULTS_PATH / "dataset" DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" -TEST_VOCAB_SIZE = 384 +TEST_VOCAB_SIZE = 8192 # Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" TEST_DATASET_TOKENS = 1000000 +MODEL_DATASET_PREFIX = DATASET_CACHE / "common" / "model_dataset" +MODEL_TEST_VOCAB_SIZE = 384 + def get_test_dataset( prefix: pathlib.Path = DATASET_PREFIX, @@ -60,6 +63,13 @@ def get_test_dataset( ) +def get_model_test_dataset( + prefix: pathlib.Path = MODEL_DATASET_PREFIX, + vocab_size: int = MODEL_TEST_VOCAB_SIZE, +): + return get_test_dataset(prefix=prefix, vocab_size=vocab_size) + + def get_test_concatenated_memmap_dataset( path: pathlib.Path, num_files: int, diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index f70a87956..ef7d5d214 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -14,7 +14,7 @@ class DistributedTestingConfig: config_args: list[str] num_gpus: int = 1 compare_config: CompareConfig | None = None - # Scale the comparison thresholds for specific models. + # Scale the comparison thresholds for specific distributed configs. compare_factor: float = 1.0 @@ -33,34 +33,31 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon _compare_layer_match = get_config( sub_configs={ ("init", None): get_config(), - ("train_1", "fw"): get_config(1e-3, 3e-5), - ("train_2", "fw"): get_config(1e-3, 1e-4), - ("train_1", "bw"): get_config(3e-3, 3e-6), - ("train_2", "bw"): get_config(3e-3, 1e-5), - ("train_1", "gradient"): get_config(3e-3, 1e-5), - ("train_2", "gradient"): get_config(3e-3, 3e-5), + (None, "fw"): get_config(1e-3, 1e-4), + (None, "bw"): get_config(3e-3, 1e-5), + # Biases have higher absolute error. 
+ (None, "bias"): get_config(3e-3, 5e-5), + (None, "gradient"): get_config(3e-3, 3e-5), } ) _compare_layer_mismatch = copy.deepcopy(_compare_layer_match) _pp_tied_weight_compare = copy.deepcopy(_compare_layer_match) _z3_accumulation_compare = copy.deepcopy(_compare_layer_match) -for step in ("train_1", "train_2"): - _z3_accumulation_compare.sub_configs[(step, "gradient")].ignore_duplicates = True - for tensor in ("fw", "bw"): - _compare_layer_mismatch.sub_configs[(step, tensor)].ignore_tensors = True - _pp_tied_weight_compare.sub_configs[(step, tensor)].ignore_duplicates = True +_z3_accumulation_compare.sub_configs[(None, "gradient")].ignore_duplicates = True +_pp_tied_weight_compare.sub_configs[(None, "gradient")].ignore_duplicates = True +for tensor in ("fw", "bw"): + _compare_layer_mismatch.sub_configs[(None, tensor)].ignore_tensors = True + _pp_tied_weight_compare.sub_configs[(None, tensor)].ignore_duplicates = True _bf16_compare = get_config( sub_configs={ ("init", None): get_config(), - ("train_1", "fw"): get_config(1e-2, 1e-3), - ("train_2", "fw"): get_config(1e-2, 1e-3), - ("train_1", "bw"): get_config(1e-2, 1e-5), - ("train_2", "bw"): get_config(1e-2, 1e-5), - ("train_1", "gradient"): get_config(2e-2, 3e-5), - ("train_2", "gradient"): get_config(2e-2, 3e-5), + (None, "fw"): get_config(1e-2, 1e-3), + (None, "bw"): get_config(1e-2, 1e-5), + (None, "bias"): get_config(2e-2, 1e-4), + (None, "gradient"): get_config(2e-2, 3e-5), } ) @@ -68,12 +65,10 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon sub_configs={ ("init", None): get_config(), # Saved gradient include the gradient scaling by 2**16 (default initial value) - ("train_1", "fw"): get_config(1e-3, 1e-4), - ("train_2", "fw"): get_config(1e-3, 1e-4), - ("train_1", "bw"): get_config(3e-3, 1e-5, scale=2**16), - ("train_2", "bw"): get_config(3e-3, 1e-5, scale=2**16), - ("train_1", "gradient"): get_config(3e-3, 1e-5, scale=2**16), - ("train_2", "gradient"): get_config(3e-3, 1e-5, scale=2**16), + (None, "fw"): get_config(1e-3, 1e-4), + (None, "bw"): get_config(3e-3, 1e-5, scale=2**16), + (None, "bias"): get_config(3e-3, 1e-4, scale=2**16), + (None, "gradient"): get_config(3e-3, 5e-5, scale=2**16), } ) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index cbe8539aa..2c07fd0a1 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -20,7 +20,7 @@ Starcoder2GPTHuggingfaceCheckpointFormat, ) from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat -from tests.utils.dataset import DATASET_PREFIX, TEST_VOCAB_SIZE +from tests.utils.dataset import MODEL_DATASET_PREFIX, MODEL_TEST_VOCAB_SIZE _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) @@ -55,6 +55,8 @@ class ModelTestingConfig: megatron_args: list[str] | None checkpoint_format: type[CheckpointFormat] | None groups: dict[ModelTestingGroup, ModelTestingGroupAction] + # Scale the comparison thresholds for specific models. 
+ compare_factor: float = 1.0 @functools.cached_property def trainer_config_class(self) -> type[TrainerConfig]: @@ -96,6 +98,7 @@ def _update_and_add_testing_config( megatron_args: list[str] | None = ..., checkpoint_format: CheckpointFormat | None = ..., groups: dict[ModelTestingGroup, ModelTestingGroupAction], + compare_factor: float = ..., ): config = MODEL_CONFIGS[old_name] updates: dict[str, typing.Any] = { @@ -115,6 +118,8 @@ def _update_and_add_testing_config( updates["megatron_args"] = config.megatron_args + megatron_args if checkpoint_format is not ...: updates["checkpoint_format"] = checkpoint_format + if compare_factor is not ...: + updates["compare_factor"] = compare_factor MODEL_CONFIGS[new_name] = dataclasses.replace(config, **updates) @@ -136,7 +141,7 @@ def _update_and_add_testing_config( "model.base_model.transformer.num_attention_heads=8", "model.base_model.transformer.head_groups=8", "model.base_model.transformer.init_method_std=0.022", - f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", + f"model.base_model.vocab_size={MODEL_TEST_VOCAB_SIZE}", f"model.multi_stage.debug_param_init={_LOG_LEVEL}", f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", @@ -152,17 +157,17 @@ def _update_and_add_testing_config( "data.datasets.training.type=slice", "data.datasets.training.end=0.969", "data.datasets.training.dataset.type=memmap", - f"data.datasets.training.dataset.path={DATASET_PREFIX}", + f"data.datasets.training.dataset.path={MODEL_DATASET_PREFIX}", "data.datasets.validation.type=slice", "data.datasets.validation.begin=0.969", "data.datasets.validation.end=0.999", "data.datasets.validation.dataset.type=memmap", - f"data.datasets.validation.dataset.path={DATASET_PREFIX}", + f"data.datasets.validation.dataset.path={MODEL_DATASET_PREFIX}", "data.datasets.test.type=slice", "data.datasets.test.begin=0.999", "data.datasets.test.end=1", "data.datasets.test.dataset.type=memmap", - f"data.datasets.test.dataset.path={DATASET_PREFIX}", + f"data.datasets.test.dataset.path={MODEL_DATASET_PREFIX}", "optimizer.learning_rate.base=0.0001", ], megatron_args=[ @@ -189,8 +194,8 @@ def _update_and_add_testing_config( "--valid-num-workers=0", "--tokenizer-type=NullTokenizer", # Megatron messes with the vocab size, so we have to subtract 1. 
- f"--vocab-size={TEST_VOCAB_SIZE - 1}", - f"--data-path={DATASET_PREFIX}", + f"--vocab-size={MODEL_TEST_VOCAB_SIZE - 1}", + f"--data-path={MODEL_DATASET_PREFIX}", "--lr-decay-style=constant", # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) "--use-mcore-models", @@ -439,6 +444,7 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.normal, ModelTestingGroup.distributed: ModelTestingGroupAction.normal, }, + compare_factor=2.0, ) _update_and_add_testing_config( @@ -466,6 +472,7 @@ def _update_and_add_testing_config( # TODO: Fix and bring back to `testing_groups` ModelTestingGroup.distributed: ModelTestingGroupAction.broken, }, + compare_factor=10.0, ) @@ -487,6 +494,7 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, + compare_factor=10.0, ) diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index f4d4dfab0..c188ccd5c 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -6,12 +6,13 @@ import subprocess import sys import typing +import warnings import pytest from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.utils import Assert -from tests.utils.dataset import get_test_dataset +from tests.utils.dataset import get_model_test_dataset from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig @@ -71,7 +72,7 @@ def do_run_test_script_for_all_models( base_path: pathlib.Path, ): Assert.leq(distributed_testing_config.num_gpus, DistributedConfig.default_world_size) - get_test_dataset() + get_model_test_dataset() args = [ "fast-llm", "train", @@ -116,11 +117,13 @@ def compare_results_for_all_models( ): def do_compare_results_for_all_models(config: DistributedTestingConfig): assert config.compare is not None - compare_config = config.compare_config.rescale(config.compare_factor) + compare_config = config.compare_config.rescale(config.compare_factor * model_testing_config.compare_factor) pprint.pprint(compare_config) - compare_config.compare_tensor_logs( - run_test_script_base_path / config.compare / ARTIFACT_PATH, - run_test_script_base_path / config.name / ARTIFACT_PATH, - ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning, message="Ignoring keys in ") + compare_config.compare_tensor_logs( + run_test_script_base_path / config.compare / ARTIFACT_PATH, + run_test_script_base_path / config.name / ARTIFACT_PATH, + ) return do_compare_results_for_all_models From ca65becfd76eecccec5ffa87b4bec777f69b9ae0 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 15 Jul 2025 18:17:23 -0400 Subject: [PATCH 12/14] fixes --- fast_llm/engine/schedule/config.py | 6 ----- fast_llm/layers/language_model/head.py | 6 +++++ fast_llm/utils.py | 2 +- tests/data/test_concatenated_memmap.py | 36 ++++++++++++++------------ tests/models/test_match_megatron.py | 4 +-- tests/utils/compare_tensor_logs.py | 8 +----- tests/utils/distributed_configs.py | 20 +++++++++++--- tests/utils/run_test_script.py | 11 +++----- 8 files changed, 51 insertions(+), 42 deletions(-) diff --git a/fast_llm/engine/schedule/config.py b/fast_llm/engine/schedule/config.py index 141490ac3..272b7c6ae 100644 --- a/fast_llm/engine/schedule/config.py +++ b/fast_llm/engine/schedule/config.py @@ -1,6 +1,5 @@ 
import enum import functools -import warnings from fast_llm.config import Config, Field, FieldHint, check_field, config_class, test_field from fast_llm.engine.distributed.config import DistributedConfig @@ -105,11 +104,6 @@ def _validate(self) -> None: if self._distributed.pipeline_parallel > 1 and self.depth_first_micro_batches > 1: raise NotImplementedError("Depth-first pipeline parallelism not yet implemented") - if self.depth_first_micro_batches > 1 and self.breadth_first_micro_batches > 1: - warnings.warn( - "Mixing of breadth-first and depth-first gradient accumulation is not thoroughly tested." - " Use at your own risk." - ) super()._validate() diff --git a/fast_llm/layers/language_model/head.py b/fast_llm/layers/language_model/head.py index 69eebff39..25fc2b28d 100644 --- a/fast_llm/layers/language_model/head.py +++ b/fast_llm/layers/language_model/head.py @@ -6,6 +6,7 @@ from torch.distributed import all_reduce from fast_llm.config import Configurable +from fast_llm.core.ops import split_op from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.config_utils.tensor_space import DefaultDimNames, TensorDim, TensorSpace from fast_llm.engine.distributed.config import DistributedDimNames @@ -234,6 +235,11 @@ def _get_targets( lm_target = None targets = (dpo_target, lm_target, distillation_target, loss_mask) + if self._sequence_parallel_logits: + targets = [ + None if target is None else split_op(target, self._tensor_space.distributed.tensor_group, 0) + for target in targets + ] if not any(target is not None for target in targets): # Simplify so we don't have to check every time. targets = None diff --git a/fast_llm/utils.py b/fast_llm/utils.py index 821ec5874..472f5e9b7 100644 --- a/fast_llm/utils.py +++ b/fast_llm/utils.py @@ -145,7 +145,7 @@ def multiple(x, y): @staticmethod def rms_close(x, y, threshold): - rms = rms_diff(x, y).item() + rms = rms_diff(x, y).detach().item() assert rms <= threshold, f"Rms diff too big ({rms:.3e} > {threshold:.3e}) between tensors {x} and {y}" @staticmethod diff --git a/tests/data/test_concatenated_memmap.py b/tests/data/test_concatenated_memmap.py index 0ab7c7fe4..1cc22250d 100644 --- a/tests/data/test_concatenated_memmap.py +++ b/tests/data/test_concatenated_memmap.py @@ -1,3 +1,5 @@ +import pytest + from fast_llm.data.dataset.gpt.config import GPTConcatenatedMemmapConfig from tests.data.common import ( compare_indexed_dataset, @@ -42,10 +44,11 @@ def test_gpt_concatenated_memmap(): # Make sure dataset splitting works and check for unintended changes in behavior. 
_get_test_dataset_concatenated_memmap() # samples[9:18] - dataset = get_dataset_config( - {"type": "concatenated_memmap", "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP}, - GPTConcatenatedMemmapConfig, - ).build() + with pytest.warns(DeprecationWarning): + dataset = get_dataset_config( + {"type": "concatenated_memmap", "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP}, + GPTConcatenatedMemmapConfig, + ).build() compare_indexed_dataset( dataset, CONCATENATED_MEMMAP_DATASET_LENGTH, @@ -58,16 +61,17 @@ def test_gpt_concatenated_memmap(): def test_gpt_concatenated_memmap_data(): _get_test_dataset_concatenated_memmap() - get_test_data_and_compare_samples( - { - "datasets": { - "Training": { - "type": "concatenated_memmap", - "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP, + with pytest.warns(DeprecationWarning): + get_test_data_and_compare_samples( + { + "datasets": { + "Training": { + "type": "concatenated_memmap", + "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP, + } } - } - }, - 8, - sequence_length=5, - expected_samples=CONCATENATED_MEMMAP_SAMPLES, - ) + }, + 8, + sequence_length=5, + expected_samples=CONCATENATED_MEMMAP_SAMPLES, + ) diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index c7fa623e5..081f3fb1e 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -36,13 +36,13 @@ def test_megatron(run_distributed_script, model_testing_config, run_test_script_ def test_match_megatron(run_test_script_for_all_models, model_testing_config, compare_results_for_all_models): assert model_testing_config.megatron_args is not None - ignore_tensors = [ + ignore_tensors = ( ".self_attn.query_key_value.", ".self_attn.query.", ".self_attn.key_value.", ".mlp.layer_2.weight", ".mlp.experts.", - ] + ) if model_testing_config.name == "mixtral": ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) diff --git a/tests/utils/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py index a1c17379a..51ee66d31 100644 --- a/tests/utils/compare_tensor_logs.py +++ b/tests/utils/compare_tensor_logs.py @@ -2,7 +2,6 @@ import dataclasses import pathlib import typing -import warnings import torch @@ -57,7 +56,6 @@ def _get_sub_config(self, step_name: str, tensor_name: str) -> typing.Self: def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): tensor_logs = {} - ignore_keys = set() for rank_path in sorted(artifact_path.iterdir()): for p in rank_path.iterdir(): if p.name.startswith(_TENSOR_LOG_PREFIX) and p.suffix == ".pt": @@ -65,9 +63,7 @@ def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): for step_log in torch.load(p): tensor_name = step_log["name"] sub_config = self._get_sub_config(step_name, tensor_name) - if sub_config.ignore_tensors: - ignore_keys.add(f"{step_name}/{tensor_name}") - else: + if not sub_config.ignore_tensors: if step_name not in tensor_logs: tensor_logs[step_name] = {} if ( @@ -76,8 +72,6 @@ def _extract_tensor_logs(self, artifact_path: pathlib.Path, errors): ): errors.append(f"Duplicate tensor log in step {step_name}: {tensor_name}") tensor_step_logs[tensor_name] = step_log - if ignore_keys: - warnings.warn(f"Ignoring keys in {artifact_path}: {ignore_keys}") return tensor_logs def _compare_dict_keys(self, dict_ref, dict_test, errors, name): diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index ef7d5d214..d054c9889 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -44,8 +44,10 @@ def 
get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon _compare_layer_mismatch = copy.deepcopy(_compare_layer_match) _pp_tied_weight_compare = copy.deepcopy(_compare_layer_match) _z3_accumulation_compare = copy.deepcopy(_compare_layer_match) +_z3_accumulation_compare.sub_configs[(None, "bias")].ignore_duplicates = True _z3_accumulation_compare.sub_configs[(None, "gradient")].ignore_duplicates = True _pp_tied_weight_compare.sub_configs[(None, "gradient")].ignore_duplicates = True +_pp_tied_weight_compare.sub_configs[("init", None)].ignore_duplicates = True for tensor in ("fw", "bw"): _compare_layer_mismatch.sub_configs[(None, tensor)].ignore_tensors = True _pp_tied_weight_compare.sub_configs[(None, tensor)].ignore_duplicates = True @@ -55,8 +57,8 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon sub_configs={ ("init", None): get_config(), (None, "fw"): get_config(1e-2, 1e-3), - (None, "bw"): get_config(1e-2, 1e-5), - (None, "bias"): get_config(2e-2, 1e-4), + (None, "bw"): get_config(1.5e-2, 1e-5), + (None, "bias"): get_config(2e-2, 1e-3), (None, "gradient"): get_config(2e-2, 3e-5), } ) @@ -212,7 +214,11 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon DistributedTestingConfig( name="stp2", compare="sf", - config_args=["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], + config_args=[ + "model.distributed.tensor_parallel=2", + "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", + ], num_gpus=2, compare_config=_compare_layer_match, ), @@ -223,6 +229,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon config_args=[ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.base_model.parallel_embeddings=False", "model.base_model.cross_entropy_splits=4", ], @@ -237,6 +244,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon config_args=[ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", ], num_gpus=4, compare_config=_compare_layer_match, @@ -260,6 +268,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "batch.breadth_first_micro_batches=4", ], num_gpus=4, @@ -273,6 +282,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", ], num_gpus=4, compare_config=_compare_layer_match, @@ -335,6 +345,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon config_args=[ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", "batch.breadth_first_micro_batches=4", @@ -349,6 +360,8 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon compare="mb", config_args=[ "model.distributed.tensor_parallel=2", + 
"model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", "batch.breadth_first_micro_batches=4", @@ -378,6 +391,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon "model.distributed.sequence_data_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", + "model.base_model.transformer.dropless_moe=False", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2", "batch.micro_sequence_length=256", diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index c188ccd5c..b8f996a82 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -6,7 +6,6 @@ import subprocess import sys import typing -import warnings import pytest @@ -119,11 +118,9 @@ def do_compare_results_for_all_models(config: DistributedTestingConfig): assert config.compare is not None compare_config = config.compare_config.rescale(config.compare_factor * model_testing_config.compare_factor) pprint.pprint(compare_config) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning, message="Ignoring keys in ") - compare_config.compare_tensor_logs( - run_test_script_base_path / config.compare / ARTIFACT_PATH, - run_test_script_base_path / config.name / ARTIFACT_PATH, - ) + compare_config.compare_tensor_logs( + run_test_script_base_path / config.compare / ARTIFACT_PATH, + run_test_script_base_path / config.name / ARTIFACT_PATH, + ) return do_compare_results_for_all_models From a488e03dd4d77912b5729b59753e39d7791dd3e7 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 15 Jul 2025 23:05:49 -0400 Subject: [PATCH 13/14] fixes --- fast_llm/layers/ssm/discrete_mamba2.py | 3 +++ fast_llm/models/ssm/config.py | 6 ++++++ tests/models/distributed_test_model.py | 2 ++ tests/models/test_match_megatron.py | 2 +- tests/models/test_model.py | 11 ++++++++++- tests/utils/distributed_configs.py | 4 ++-- tests/utils/model_configs.py | 20 ++++++++++++-------- 7 files changed, 36 insertions(+), 12 deletions(-) diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py index 31e81e99b..b0aa96805 100644 --- a/fast_llm/layers/ssm/discrete_mamba2.py +++ b/fast_llm/layers/ssm/discrete_mamba2.py @@ -7,6 +7,7 @@ from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace from fast_llm.layers.common.linear import Linear from fast_llm.layers.ssm.config import SSMConfig, SSMDimNames +from fast_llm.layers.transformer.config import TransformerKwargs from fast_llm.tensor import ParameterMeta, init_ones_, init_uniform_, init_zeros_, kaiming_init_ from fast_llm.utils import get_lr_scale @@ -157,6 +158,8 @@ def forward(self, hidden_states, kwargs): outputs["hidden_states"]: (B, L, D). outputs["state"]: inference cache. """ + if kwargs[TransformerKwargs.sequence_first]: + raise NotImplementedError(f"Sequence-first not supported for SSMs.") assert _mamba_available input_ = hidden_states diff --git a/fast_llm/models/ssm/config.py b/fast_llm/models/ssm/config.py index 3c47ff0b2..ecd8908ee 100644 --- a/fast_llm/models/ssm/config.py +++ b/fast_llm/models/ssm/config.py @@ -197,6 +197,12 @@ def _validate(self): logger.warning( "HybridSSMModelConfig is being instantiated. This model is experimental and may not work as expected." 
) + if ( + self.base_model.sequence_first + or self.distributed.sequence_data_parallel > 1 + or self.distributed.sequence_tensor_parallel + ): + raise NotImplementedError(f"Sequence-first not supported for SSMs.") super()._validate() diff --git a/tests/models/distributed_test_model.py b/tests/models/distributed_test_model.py index 933b215e7..564920bd5 100644 --- a/tests/models/distributed_test_model.py +++ b/tests/models/distributed_test_model.py @@ -27,6 +27,8 @@ def main(args: list[str] | None = None) -> None: group = pool.get_process_group(range(world_size), rank) for name, config in DISTRIBUTED_TESTING_CONFIGS.items(): + if model_testing_config.should_skip(config): + continue if world_size < config.num_gpus: logger.warning(f"{name} {f"SKIPPED (not enough GPUs: {world_size} < {config.num_gpus})"})") continue diff --git a/tests/models/test_match_megatron.py b/tests/models/test_match_megatron.py index 081f3fb1e..30667cd17 100644 --- a/tests/models/test_match_megatron.py +++ b/tests/models/test_match_megatron.py @@ -44,7 +44,7 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config, co ".mlp.experts.", ) if model_testing_config.name == "mixtral": - ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) + ignore_tensors += (".mlp.experts.", ".mlp.layer_1.weight") distributed_testing_config = DistributedTestingConfig( name="match_megatron", diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 4a344cdc7..5c4897646 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -28,10 +28,16 @@ def test_model_simple(run_test_script_for_all_models, run_test_script_base_path) # Parametrize with config name so it shows in test name. @pytest.mark.parametrize("config_name", SINGLE_GPU_TESTING_CONFIGS) def test_and_compare_model( - run_test_script_for_all_models, compare_results_for_all_models, config_name, run_test_script_base_path + run_test_script_for_all_models, + compare_results_for_all_models, + config_name, + run_test_script_base_path, + model_testing_config, ): # We can expect tests to respect the ordering of `SINGLE_GPU_TESTING_CONFIGS`, so compare should have run already. config = SINGLE_GPU_TESTING_CONFIGS[config_name] + if model_testing_config.should_skip(config): + pytest.skip(f"Configuration not supported.") if config.compare is not None: check_subtest_success(run_test_script_base_path / config.compare) # A baseline config (single-gpu, bf16, flash-attn). 
@@ -73,8 +79,11 @@ def test_model_distributed( config_name, run_test_script_base_path, report_subtest, + model_testing_config, ): config = DISTRIBUTED_TESTING_CONFIGS[config_name] + if model_testing_config.should_skip(config): + pytest.skip(f"Configuration not supported.") if torch.cuda.device_count() < config.num_gpus: pytest.skip(f"Not enough GPUs: {torch.cuda.device_count()} < {config.num_gpus}") report_subtest(run_test_script_base_path / config.name, config.num_gpus) diff --git a/tests/utils/distributed_configs.py b/tests/utils/distributed_configs.py index d054c9889..c3064d987 100644 --- a/tests/utils/distributed_configs.py +++ b/tests/utils/distributed_configs.py @@ -59,7 +59,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon (None, "fw"): get_config(1e-2, 1e-3), (None, "bw"): get_config(1.5e-2, 1e-5), (None, "bias"): get_config(2e-2, 1e-3), - (None, "gradient"): get_config(2e-2, 3e-5), + (None, "gradient"): get_config(2e-2, 5e-5), } ) @@ -67,7 +67,7 @@ def get_config(relative: float = 0, absolute: float = 0, **kwargs) -> CompareCon sub_configs={ ("init", None): get_config(), # Saved gradient include the gradient scaling by 2**16 (default initial value) - (None, "fw"): get_config(1e-3, 1e-4), + (None, "fw"): get_config(1e-3, 3e-4), (None, "bw"): get_config(3e-3, 1e-5, scale=2**16), (None, "bias"): get_config(3e-3, 1e-4, scale=2**16), (None, "gradient"): get_config(3e-3, 5e-5, scale=2**16), diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 2c07fd0a1..f1890aff8 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -21,6 +21,7 @@ ) from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat from tests.utils.dataset import MODEL_DATASET_PREFIX, MODEL_TEST_VOCAB_SIZE +from tests.utils.distributed_configs import DistributedTestingConfig _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) @@ -57,6 +58,8 @@ class ModelTestingConfig: groups: dict[ModelTestingGroup, ModelTestingGroupAction] # Scale the comparison thresholds for specific models. compare_factor: float = 1.0 + # Option to skip specific distributed configuration with name containing any of the provided strings. 
+ skip_tests: tuple[str] = () @functools.cached_property def trainer_config_class(self) -> type[TrainerConfig]: @@ -88,6 +91,9 @@ def model_class(self): def base_model_config_class(self): return self.model_config_class.get_base_model_config_class() + def should_skip(self, distributed_config: DistributedTestingConfig) -> bool: + return any(key in distributed_config.name for key in self.skip_tests) + def _update_and_add_testing_config( old_name: str, @@ -96,9 +102,8 @@ def _update_and_add_testing_config( model_type: str | None = None, extra_args: list[str] | None = None, megatron_args: list[str] | None = ..., - checkpoint_format: CheckpointFormat | None = ..., groups: dict[ModelTestingGroup, ModelTestingGroupAction], - compare_factor: float = ..., + **kwargs, ): config = MODEL_CONFIGS[old_name] updates: dict[str, typing.Any] = { @@ -116,10 +121,7 @@ def _update_and_add_testing_config( updates["megatron_args"] = megatron_args else: updates["megatron_args"] = config.megatron_args + megatron_args - if checkpoint_format is not ...: - updates["checkpoint_format"] = checkpoint_format - if compare_factor is not ...: - updates["compare_factor"] = compare_factor + updates.update(kwargs) MODEL_CONFIGS[new_name] = dataclasses.replace(config, **updates) @@ -362,6 +364,7 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, + compare_factor=2.0, ) _update_and_add_testing_config( @@ -472,7 +475,9 @@ def _update_and_add_testing_config( # TODO: Fix and bring back to `testing_groups` ModelTestingGroup.distributed: ModelTestingGroupAction.broken, }, - compare_factor=10.0, + compare_factor=2.0, + # SSMs don't support sequence-first configurations. + skip_tests=("sf", "sdp", "stp", "ms"), ) @@ -494,7 +499,6 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, - compare_factor=10.0, ) From 5f0b87a130b0a8837e866fbfa4aac530c9aab921 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 16 Jul 2025 14:53:22 -0400 Subject: [PATCH 14/14] Parallel safe --- tests/conftest.py | 6 ++---- tests/utils/dataset.py | 12 ++++++------ tests/utils/utils.py | 11 ++++++++++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e9011979a..298117e1d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -95,10 +95,8 @@ def pytest_configure(config): else: worker_id = 0 - # TODO: Remove the whole `TEST_RESULTS_PATH` once `get_test_dataset` is parallel-safe. 
- model_result_path = TEST_RESULTS_PATH / "models" - if model_result_path.exists(): - shutil.rmtree(model_result_path) + if TEST_RESULTS_PATH.exists(): + shutil.rmtree(TEST_RESULTS_PATH) num_gpus = torch.cuda.device_count() if num_gpus > 0 and is_parallel: diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py index ad8385ae9..a4136c40e 100644 --- a/tests/utils/dataset.py +++ b/tests/utils/dataset.py @@ -7,20 +7,20 @@ from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset from fast_llm.data.dataset.gpt.sampled import GPTSample -from tests.utils.utils import TEST_RESULTS_PATH +from tests.utils.utils import SHARED_RESULT_PATH, TEST_RESULTS_PATH # TODO: Fixtures -TOKENIZER_PATH = TEST_RESULTS_PATH / "tokenizer" / "common" +TOKENIZER_PATH = SHARED_RESULT_PATH / "tokenizer" TOKENIZER_FILE = TOKENIZER_PATH / "tokenizer.json" -DATASET_CACHE = TEST_RESULTS_PATH / "dataset" -DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" -DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" +DATASET_CACHE = SHARED_RESULT_PATH / "dataset" +DATASET_PREFIX = DATASET_CACHE / "common_dataset" +DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset_sampling_cache" TEST_VOCAB_SIZE = 8192 # Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" TEST_DATASET_TOKENS = 1000000 -MODEL_DATASET_PREFIX = DATASET_CACHE / "common" / "model_dataset" +MODEL_DATASET_PREFIX = DATASET_CACHE / "model_dataset" MODEL_TEST_VOCAB_SIZE = 384 diff --git a/tests/utils/utils.py b/tests/utils/utils.py index 54efe0966..25d5221d8 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -1,6 +1,7 @@ import json import logging import math +import os import pathlib import sys import time @@ -23,9 +24,17 @@ requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") - +# Directory for all test data and results. +# Cannot be a fixture because it's used outside testing environment (ex. distributed scripts). TEST_RESULTS_PATH = pathlib.Path("/tmp/fast_llm_tests") +# Directory for data that is shared between independent tests and may not be parallel-safe, +# ex. generated dataset and downloaded files. +if worker_name := os.environ.get("PYTEST_XDIST_WORKER"): + SHARED_RESULT_PATH = TEST_RESULTS_PATH / f"common_{worker_name}" +else: + SHARED_RESULT_PATH = TEST_RESULTS_PATH / "common" + @pytest.fixture(scope="session") def result_path():
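
The changes above all serve one pattern: each distributed run is compared against a single-GPU baseline using per-tensor tolerances that can be unscaled (to undo fp16 gradient scaling) and rescaled (multiplying a per-distributed-config factor by a per-model factor before the comparison runs). Below is a minimal, self-contained sketch of that pattern. The ToleranceSketch class, its field names, and the example numbers are illustrative assumptions for this sketch only; they are not the actual CompareConfig from tests/utils/compare_tensor_logs.py.

    # Illustrative sketch of the tolerance unscale/rescale idea used by the test
    # comparison utilities. Hypothetical class; simplified to two thresholds.
    import dataclasses

    import torch


    @dataclasses.dataclass
    class ToleranceSketch:
        rms_abs_tolerance: float = 1e-5
        max_abs_tolerance: float = 1e-4
        # Test tensors may carry a known scaling (ex. fp16 gradient scaling by 2**16);
        # divide it out before comparing.
        scale: float = 1.0

        def rescale(self, factor: float) -> "ToleranceSketch":
            # Loosen (or tighten) all thresholds by a multiplicative factor, as done
            # when combining a distributed-config factor with a model-level factor.
            return dataclasses.replace(
                self,
                rms_abs_tolerance=self.rms_abs_tolerance * factor,
                max_abs_tolerance=self.max_abs_tolerance * factor,
            )

        def check(self, reference: torch.Tensor, test: torch.Tensor) -> list[str]:
            # Returns a list of error messages; empty means the tensors match.
            errors = []
            test = test / self.scale
            diff = (reference - test).float()
            rms = diff.pow(2).mean().sqrt().item()
            max_abs = diff.abs().max().item()
            if rms > self.rms_abs_tolerance:
                errors.append(f"RMS diff {rms:.3e} > {self.rms_abs_tolerance:.3e}")
            if max_abs > self.max_abs_tolerance:
                errors.append(f"Max diff {max_abs:.3e} > {self.max_abs_tolerance:.3e}")
            return errors


    if __name__ == "__main__":
        ref = torch.randn(1024)
        # Pretend the test run logged gradients scaled by 2**16 (fp16 loss scaling).
        test = (ref + 1e-6 * torch.randn(1024)) * 2**16
        # e.g. compare_factor=2.0 for a model whose outputs are known to be noisier.
        config = ToleranceSketch(scale=2**16).rescale(2.0)
        print(config.check(ref, test) or "Comparison succeeded!")

In the fixture above, the two factors multiply (config.compare_factor * model_testing_config.compare_factor) before compare_tensor_logs runs, which keeps the baseline tolerances tight while letting noisier models or parallel layouts loosen them uniformly rather than per tensor.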