4 changes: 2 additions & 2 deletions test/distributed/_tools/test_fsdp2_mem_tracker.py
@@ -166,7 +166,7 @@ def test_tracker_non_root_forward_backward(self):
self.assertAlmostEqual(
accuracy,
1.0,
- delta=0.1,
+ delta=0.16,
msg=f"Tracker Max:{tracker_max} CUDA Max:{cuda_max}",
)
del inp
@@ -258,7 +258,7 @@ def _test_tracker_with_activation_checkpointing(
self.assertAlmostEqual(
accuracy,
1.0,
- delta=0.1,
+ delta=0.25,
msg=f"Tracker Max:{tracker_max} CUDA Max:{cuda_max}",
)
del inp
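For context on the two hunks above: the test computes accuracy as the ratio of the tracker's estimated peak memory to the CUDA-measured peak and asserts that this ratio is within delta of 1.0, so widening delta from 0.1 to 0.16 (and to 0.25 for the activation-checkpointing variant) simply tolerates a larger relative error. A minimal, self-contained sketch of that style of check; the variable names come from the msg string in the diff, and the numbers are made up:

import unittest


class ToleranceSketch(unittest.TestCase):
    def test_ratio_within_delta(self):
        # Hypothetical peak-memory numbers (bytes) standing in for the values
        # the real test measures: the tracker's estimate vs. the CUDA peak.
        tracker_max, cuda_max = 900_000_000, 1_000_000_000
        accuracy = tracker_max / cuda_max  # 0.9, i.e. a 10% underestimate
        # delta=0.16 accepts any estimate within +/-16% of the CUDA peak.
        self.assertAlmostEqual(
            accuracy,
            1.0,
            delta=0.16,
            msg=f"Tracker Max:{tracker_max} CUDA Max:{cuda_max}",
        )


if __name__ == "__main__":
    unittest.main()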
5 changes: 2 additions & 3 deletions test/distributed/_tools/test_sac_ilp.py
@@ -19,9 +19,8 @@
)
from torch.testing._internal.common_cuda import TEST_CUDA
from torch.testing._internal.common_utils import (
- MI300_ARCH,
run_tests,
- skipIfRocmArch,
+ skipIfRocm,
skipIfTorchDynamo,
TestCase,
)
@@ -136,7 +135,7 @@ def _collect_module_info_with_fake_tensor_mode(self) -> ModuleInfo:

@skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
@unittest.skipIf(not TEST_CUDA, "CUDA not available")
- @skipIfRocmArch(MI300_ARCH)
+ @skipIfRocm
def test_sac_ilp_case1(self):
"""
This is a case where the memory budget is either binding or too tight,
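The swap above (together with the MI300_ARCH import removal) broadens the skip from one ROCm architecture family to every ROCm build: test_sac_ilp_case1 is now skipped on all ROCm hardware rather than only on MI300-class GPUs. A simplified sketch of the difference between the two decorator styles, assuming the real helpers in torch.testing._internal.common_utils behave roughly like this (this is not their actual implementation):

import unittest

import torch


def on_rocm() -> bool:
    # True for any ROCm/HIP build of PyTorch.
    return torch.version.hip is not None


def rocm_arch() -> str:
    # On ROCm, gcnArchName looks like "gfx942:sramecc+:xnack-"; keep the gfx id.
    return torch.cuda.get_device_properties(0).gcnArchName.split(":")[0]


# skipIfRocm-style gate: skip on every ROCm build, regardless of architecture.
skip_on_any_rocm = unittest.skipIf(on_rocm(), "skipped on ROCm")


# skipIfRocmArch-style gate: skip only on the listed architectures (e.g. MI300).
def skip_on_rocm_arch(archs: tuple):
    should_skip = on_rocm() and torch.cuda.is_available() and rocm_arch() in archs
    return unittest.skipIf(should_skip, f"skipped on ROCm arch {archs}")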
3 changes: 2 additions & 1 deletion test/distributed/tensor/test_matrix_ops.py
@@ -18,7 +18,7 @@
)
from torch.distributed.tensor.debug import CommDebugMode
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
- from torch.testing._internal.common_utils import run_tests, skipIfRocm
+ from torch.testing._internal.common_utils import run_tests, skipIfRocm, skipIfRocmArch, MI350_ARCH
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
skip_unless_torch_gpu,
@@ -140,6 +140,7 @@ def test_placement_comb(
not PLATFORM_SUPPORTS_FP8,
"FP8 is only supported on H100+, SM 8.9 and MI300+ devices",
)
+ @skipIfRocmArch(MI350_ARCH)  # Enable via https://github.com/ROCm/frameworks-internal/issues/13103
def test_scaled_mm(self):
device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
shrd0 = Shard(0)
2 changes: 0 additions & 2 deletions test/distributed/test_c10d_gloo.py
@@ -53,7 +53,6 @@
retry_on_connect_failures,
run_tests,
skip_but_pass_in_sandcastle,
- skipIfRocmArch,
skipIfRocm,
TestCase,
)
@@ -1105,7 +1104,6 @@ def test_gather_stress(self):

@skipIfRocm
@skip_if_lt_x_gpu(2)
- @skipIfRocmArch(MI300_ARCH)
@requires_gloo()
def test_gather_stress_cuda(self):
inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)]
8 changes: 8 additions & 0 deletions test/distributed/test_c10d_nccl.py
@@ -622,6 +622,14 @@ def _helper_test_extra_cuda_context_by_memory(self):
"""
device = torch.device(f"cuda:{self.rank:d}")
x = torch.empty((1,), device=device)

+ # We need this barrier to ensure that all ranks have completed init_process_group.
+ # If rank 0 takes a memory snapshot before the other ranks have finished
+ # init_process_group, we artificially see a bump in memory usage. Per the
+ # following comment, we are moving away from this function anyway:
+ # https://github.com/pytorch/pytorch/pull/154174#discussion_r2105065931
+ c10d.barrier()

# Rank 0 takes a snapshot before collective -- this snapshot should have
# included rank 0's own context.
if self.rank == 0:
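The comment in the hunk above describes the race this barrier closes: rank 0's baseline snapshot must not be taken until every rank has finished init_process_group. A minimal sketch of the pattern, assuming a CUDA/NCCL process group is already initialized; the snapshot_after_init helper and the use of torch.cuda.memory_allocated are illustrative, not part of the test:

import torch
import torch.distributed as c10d


def snapshot_after_init(rank: int) -> int:
    """Return this rank's allocated CUDA memory, but only after all ranks sync."""
    device = torch.device(f"cuda:{rank:d}")
    torch.empty((1,), device=device)  # touch the device so a CUDA context exists
    # Without this barrier, rank 0 could take its baseline snapshot while other
    # ranks are still inside init_process_group, making later growth look larger.
    c10d.barrier()
    return torch.cuda.memory_allocated(device)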
1 change: 1 addition & 0 deletions test/distributed/test_inductor_collectives.py
@@ -127,6 +127,7 @@ def compile(func, example_inputs):

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
+ @skipIfRocm  # Skipped as flaky upstream as well; enable via https://github.com/ROCm/frameworks-internal/issues/13105
def test_allreduce_inductor_cudagraph_trees(self):
"""
Tests whether cudagraph trees support all_reduce from nccl
2 changes: 2 additions & 0 deletions test/distributed/test_symmetric_memory.py
@@ -84,6 +84,7 @@ def _init_process(self, set_device: bool = True):
)
torch.manual_seed(42 + self.rank)

+ @requires_multicast_support()
def test_has_multicast_support(self) -> None:
# validate that has_multicast_support() returns "false" instead of throwing
self.assertFalse(_SymmetricMemory.has_multicast_support(DeviceType.CPU, 0))
@@ -927,6 +928,7 @@ def _verify_all_reduce_result(self, inp, res):

@skip_if_lt_x_gpu(4)
@parametrize("align_bytes", [4, 8, 16])
+ @requires_multicast_support()
def test_multimem_all_gather(self, align_bytes: int) -> None:
self._init_process()
group_name = dist.group.WORLD.group_name
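Both hunks above gate multicast-dependent tests behind @requires_multicast_support(), whose definition is not part of this diff. A rough sketch of the check such a gate has to make, reusing the has_multicast_support query that test_has_multicast_support calls; the import paths and the decorator-factory shape are assumptions:

import unittest

# NOTE: the import locations below are assumptions for this sketch; only the
# has_multicast_support(DeviceType, index) query appears in the diff above.
from torch._C._autograd import DeviceType
from torch._C._distributed_c10d import _SymmetricMemory


def requires_multicast_support_sketch():
    # Skip unless CUDA device 0 reports NVLS-style multicast support.
    has_mc = _SymmetricMemory.has_multicast_support(DeviceType.CUDA, 0)
    return unittest.skipIf(not has_mc, "multicast support not available")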
7 changes: 7 additions & 0 deletions torch/testing/_internal/distributed/distributed_test.py
@@ -2068,6 +2068,7 @@ def test_broadcast_full_group(self):
"Only NCCL backend supports high priority stream",
)
@skip_if_no_gpu
+ @skip_if_rocm_multiprocess  # Enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_nccl_high_priority_stream(self):
group, _, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@@ -3228,6 +3229,7 @@ def test_scatter(self):
BACKEND != "nccl", "Only Nccl supports CUDA gather"
)
@skip_if_no_gpu
+ @skip_if_rocm_multiprocess  # Enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_scatter_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@@ -3418,6 +3420,7 @@ def test_all_gather(self):
BACKEND != "nccl", "Only Nccl supports CUDA all gather"
)
@skip_if_no_gpu
+ @skip_if_rocm_multiprocess  # Enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_gather_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@@ -3434,6 +3437,7 @@ def test_all_gather_complex(self):
BACKEND != "nccl", "Only Nccl supports CUDA all gather"
)
@skip_if_no_gpu
+ @skip_if_rocm_multiprocess  # Enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_gather_cuda_complex(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@@ -3546,6 +3550,7 @@ def test_all_gather_into_cat_tensor_cuda(self):
BACKEND != "nccl", "Only Nccl supports CUDA all_gather_into_tensor"
)
@skip_if_no_gpu
+ @skip_if_rocm_multiprocess  # Enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_gather_into_stack_tensor_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@@ -3801,6 +3806,7 @@ def test_all_to_all_single_equal_split(self):
BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
)
@skip_if_no_gpu
+ @skip_if_rocm_multiprocess  # Enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_to_all_single_equal_split_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@@ -5410,6 +5416,7 @@ def add(fut):
f"The {BACKEND} backend does not support DistributedDataParallel",
)
@skip_if_no_gpu
+ @skip_if_rocm_multiprocess  # Enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_DistributedDataParallel(self):
_group, _group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)