Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions test/distributed/_tools/test_fsdp2_mem_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def test_tracker_non_root_forward_backward(self):
self.assertAlmostEqual(
accuracy,
1.0,
delta=0.1,
delta=0.16,
msg=f"Tracker Max:{tracker_max} CUDA Max:{cuda_max}",
)
del inp
Expand Down Expand Up @@ -258,7 +258,7 @@ def _test_tracker_with_activation_checkpointing(
self.assertAlmostEqual(
accuracy,
1.0,
delta=0.1,
delta=0.25,
msg=f"Tracker Max:{tracker_max} CUDA Max:{cuda_max}",
)
del inp
Expand Down
7 changes: 2 additions & 5 deletions test/distributed/_tools/test_sac_ilp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@
)
from torch.testing._internal.common_cuda import TEST_CUDA
from torch.testing._internal.common_utils import (
MI300_ARCH,
NAVI_ARCH,
run_tests,
skipIfRocmArch,
skipIfRocm,
skipIfTorchDynamo,
TestCase,
)
Expand Down Expand Up @@ -137,8 +135,7 @@ def _collect_module_info_with_fake_tensor_mode(self) -> ModuleInfo:

@skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
@unittest.skipIf(not TEST_CUDA, "CUDA not available")
@skipIfRocmArch(MI300_ARCH)
@skipIfRocmArch(NAVI_ARCH)
@skipIfRocm
def test_sac_ilp_case1(self):
"""
This is a case where the memory budget is either binding or too tight,
Expand Down
3 changes: 2 additions & 1 deletion test/distributed/tensor/test_matrix_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from torch.distributed.tensor.debug import CommDebugMode
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8, SM90OrLater
from torch.testing._internal.common_device_type import E4M3_MAX_POS, e4m3_type
from torch.testing._internal.common_utils import run_tests, TEST_WITH_ROCM
from torch.testing._internal.common_utils import run_tests, TEST_WITH_ROCM, skipIfRocmArch, MI350_ARCH
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
skip_unless_torch_gpu,
Expand Down Expand Up @@ -146,6 +146,7 @@ def test_placement_comb(
not PLATFORM_SUPPORTS_FP8,
"FP8 is only supported on H100+, SM 8.9 and MI300+ devices",
)
@skipIfRocmArch(MI350_ARCH) #Enable via https://github.com/ROCm/frameworks-internal/issues/13103
def test_scaled_mm(self):
device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
shrd0 = Shard(0)
Expand Down
2 changes: 0 additions & 2 deletions test/distributed/test_c10d_gloo.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@
retry_on_connect_failures,
run_tests,
skip_but_pass_in_sandcastle,
skipIfRocmArch,
skipIfRocm,
TestCase,
)
Expand Down Expand Up @@ -1134,7 +1133,6 @@ def test_gather_stress(self):

@skipIfRocm
@skip_if_lt_x_gpu(2)
@skipIfRocmArch(MI300_ARCH)
@requires_gloo()
def test_gather_stress_cuda(self):
inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)]
Expand Down
16 changes: 8 additions & 8 deletions test/distributed/test_c10d_nccl.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,6 @@
TEST_WITH_DEV_DBG_ASAN,
TEST_WITH_ROCM,
TestCase,
is_arch,
NAVI_ARCH,
)


Expand Down Expand Up @@ -624,15 +622,17 @@ def _helper_test_extra_cuda_context_by_memory(self):
"""
device = torch.device(f"cuda:{self.rank:d}")
x = torch.empty((1,), device=device)

# We need this barrier to ensure that all nodes have completed init_process_group
# If rank=0 gets a mem snapshot before other nodes have finished init_process_group,
# then we artificially see a bump in memory usage. As per the following comment,
# we are going to be moving away from this function:
# https://github.com/pytorch/pytorch/pull/154174#discussion_r2105065931
c10d.barrier()

# Rank 0 takes a snapshot before collective -- this snapshot should have
# included rank 0's own context.
if self.rank == 0:
# We need this extra sleep for NAVI_ARCH because rccl_init inside init_process_group
# is happening in a separate process and it is taking longer to finish on NAVI_ARCH.
# Sleeping here ensures that the init is competed successfully and mem_get_info can
# get stable numbers.
if is_arch(NAVI_ARCH):
time.sleep(5)
free, total = torch.cuda.mem_get_info(device)
used_before = float(total - free)

Expand Down
1 change: 1 addition & 0 deletions test/distributed/test_inductor_collectives.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def compile(func, example_inputs):

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
@skipIfRocm #Skip as flaky upstream as well, enable via https://github.com/ROCm/frameworks-internal/issues/13105
def test_allreduce_inductor_cudagraph_trees(self):
"""
Tests whether cudagraph trees support all_reduce from nccl
Expand Down
2 changes: 2 additions & 0 deletions test/distributed/test_symmetric_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def _init_process(self, set_device: bool = True):
)
torch.manual_seed(42 + self.rank)

@requires_multicast_support()
def test_has_multicast_support(self) -> None:
# validate that has_multicast_support() returns "false" instead of throwing
self.assertFalse(_SymmetricMemory.has_multicast_support(DeviceType.CPU, 0))
Expand Down Expand Up @@ -1014,6 +1015,7 @@ def _verify_reduce_scatter_result(self, inp, res):

@skip_if_lt_x_gpu(4)
@parametrize("align_bytes", [4, 8, 16])
@requires_multicast_support()
def test_multimem_all_gather(self, align_bytes: int) -> None:
self._init_process()
group_name = dist.group.WORLD.group_name
Expand Down
9 changes: 8 additions & 1 deletion torch/testing/_internal/distributed/distributed_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2068,6 +2068,7 @@ def test_broadcast_full_group(self):
"Only NCCL backend supports high priority stream",
)
@skip_if_no_gpu
@skip_if_rocm_multiprocess #enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_nccl_high_priority_stream(self):
group, _, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
Expand Down Expand Up @@ -3228,6 +3229,7 @@ def test_scatter(self):
BACKEND != "nccl", "Only Nccl supports CUDA gather"
)
@skip_if_no_gpu
@skip_if_rocm_multiprocess #enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_scatter_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
Expand Down Expand Up @@ -3418,6 +3420,7 @@ def test_all_gather(self):
BACKEND != "nccl", "Only Nccl supports CUDA all gather"
)
@skip_if_no_gpu
@skip_if_rocm_multiprocess #enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_gather_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
Expand All @@ -3434,6 +3437,7 @@ def test_all_gather_complex(self):
BACKEND != "nccl", "Only Nccl supports CUDA all gather"
)
@skip_if_no_gpu
@skip_if_rocm_multiprocess #enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_gather_cuda_complex(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
Expand Down Expand Up @@ -3546,6 +3550,7 @@ def test_all_gather_into_cat_tensor_cuda(self):
BACKEND != "nccl", "Only Nccl supports CUDA all_gather_into_tensor"
)
@skip_if_no_gpu
@skip_if_rocm_multiprocess #enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_gather_into_stack_tensor_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
Expand Down Expand Up @@ -3801,6 +3806,7 @@ def test_all_to_all_single_equal_split(self):
BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
)
@skip_if_no_gpu
@skip_if_rocm_multiprocess #enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_all_to_all_single_equal_split_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
Expand Down Expand Up @@ -4816,7 +4822,7 @@ def _test_ddp_apply_optim_in_backward(
# set_to_none for regular optimizer to match in backward
# case.
optim.zero_grad(set_to_none=True)

@skip_but_pass_in_sandcastle_if(
BACKEND == "gloo" and HAS_TORCHVISION,
"Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
Expand Down Expand Up @@ -5410,6 +5416,7 @@ def add(fut):
f"The {BACKEND} backend does not support DistributedDataParallel",
)
@skip_if_no_gpu
@skip_if_rocm_multiprocess #enable via https://github.com/ROCm/frameworks-internal/issues/13115
def test_DistributedDataParallel(self):
_group, _group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
Expand Down