From df7dff7c592897b5d691bc078271b6a0976949aa Mon Sep 17 00:00:00 2001
From: Artem Kuzmitckii
Date: Wed, 23 Jul 2025 20:02:44 +0000
Subject: [PATCH 1/5] [SWDEV-523736] Skip&Fix some testcases for Navi4x

---
 test/distributed/_tools/test_sac_ilp.py              | 4 ++--
 test/distributed/tensor/parallel/test_tp_examples.py | 1 +
 test/inductor/test_cooperative_reductions.py         | 1 -
 test/test_linalg.py                                  | 2 ++
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/distributed/_tools/test_sac_ilp.py b/test/distributed/_tools/test_sac_ilp.py
index 777b93ea3164..cdc5210c22cb 100644
--- a/test/distributed/_tools/test_sac_ilp.py
+++ b/test/distributed/_tools/test_sac_ilp.py
@@ -17,7 +17,7 @@
     get_optimal_checkpointing_policy_per_module,
     sac_milp,
 )
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import TEST_CUDA, PLATFORM_SUPPORTS_FLASH_ATTENTION
 from torch.testing._internal.common_utils import (
     run_tests,
     skipIfTorchDynamo,
@@ -180,7 +180,7 @@ def test_sac_ilp_case1(self):
 
     @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
-    @skipIfRocmArch(NAVI_ARCH)
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_sac_ilp_case2(self):
         """
         This is a case where the memory budget is not binding, meaning that no
diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py
index 194735f91236..4b481511a70e 100644
--- a/test/distributed/tensor/parallel/test_tp_examples.py
+++ b/test/distributed/tensor/parallel/test_tp_examples.py
@@ -43,6 +43,7 @@
     Transformer,
     with_comms,
 )
+from unittest import skipIf
 
 
 c10d_functional = torch.ops.c10d_functional
diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py
index fc296b12a9d7..1649ac297d59 100644
--- a/test/inductor/test_cooperative_reductions.py
+++ b/test/inductor/test_cooperative_reductions.py
@@ -151,7 +151,6 @@ def run_and_check(self, fn, args, dtype=None, *, expect_kernel_count=1):
             torch._inductor.metrics.generated_kernel_count, expect_kernel_count
         )
         return source_code
-
     @parametrize(
         "name",
         [
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 6fc46663a67e..fe8cac1c06c5 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -20,6 +20,7 @@
     TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, iter_indices,
     make_fullrank_matrices_with_distinct_singular_values,
     freeze_rng_state, IS_ARM64, IS_SANDCASTLE, TEST_OPT_EINSUM, parametrize, skipIfTorchDynamo,
+    skipIfRocmArch, NAVI4_ARCH,
     setBlasBackendsToDefaultFinally, setLinalgBackendsToDefaultFinally, serialTest,
     runOnRocmArch, MI300_ARCH)
 from torch.testing._internal.common_device_type import \
@@ -7149,6 +7150,7 @@ def test_baddbmm_input_dtypes_compatibility(self, device, dtype):
 
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error")
     @onlyCUDA
+    @skipIfRocmArch(NAVI4_ARCH)
     def test_matmul_45724(self, device):
         # https://github.com/pytorch/pytorch/issues/45724
         a = torch.rand(65537, 22, 64, device=device, dtype=torch.half)

From 02aaaa76912d6d79f87293dfba34426f9e2e15fa Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Wed, 9 Jul 2025 13:18:36 -0400
Subject: [PATCH 2/5] [SWDEV-523736] Partial cherry-pick to release/2.6

[release/2.5][SWDEV-489778] NAVI4x UT parity for distributed config (#2327)

I did a sweep of all the distributed failures on NAVI4x.
On a high level, we were running into the following issues:
- MEM_EFF_ATTENTION is not supported on NAVI4x for 2.5, causing "tensors not alike" failures
- Some UTs pass in future releases; those are skipped here
- Some needed slight tolerance fixes, since this branch uses hipblas whereas future branches use hipblaslt

Fixes #ISSUE_NUMBER
---
 functorch/experimental/__init__.py                    |  2 +-
 .../_composable/fsdp/test_fully_shard_training.py     | 12 +++++++++++-
 test/distributed/elastic/test_control_plane.py        |  8 +++++++-
 test/distributed/fsdp/test_fsdp_core.py               |  4 ++++
 .../fsdp/test_fsdp_sharded_grad_scaler.py             |  4 ++++
 .../optim/test_zero_redundancy_optimizer.py           |  2 ++
 6 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/functorch/experimental/__init__.py b/functorch/experimental/__init__.py
index 3941f6d96e1f..ec414d8c135b 100644
--- a/functorch/experimental/__init__.py
+++ b/functorch/experimental/__init__.py
@@ -1,5 +1,5 @@
 # PyTorch forward-mode is not mature yet
-from functorch import functionalize
+from torch._functorch.deprecated import functionalize
 from torch._functorch.apis import chunk_vmap
 from torch._functorch.batch_norm_replacement import replace_all_batch_norm_modules_
 from torch._functorch.eager_transforms import hessian, jacfwd, jvp
diff --git a/test/distributed/_composable/fsdp/test_fully_shard_training.py b/test/distributed/_composable/fsdp/test_fully_shard_training.py
index bc9f941101ba..129a61f16e03 100644
--- a/test/distributed/_composable/fsdp/test_fully_shard_training.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py
@@ -27,7 +27,10 @@
 )
 from torch.distributed.tensor import DTensor, init_device_mesh, Shard
 from torch.distributed.tensor.debug import CommDebugMode
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
+    TEST_CUDA,
+)
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     check_sharded_parity,
@@ -41,7 +44,9 @@
 )
 from torch.testing._internal.common_utils import (
     get_cycles_per_ms,
+    NAVI_ARCH,
     run_tests,
+    skipIfRocmArch,
     wrapSwapTensorsTest,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import (
@@ -94,6 +99,7 @@ def world_size(self) -> int:
         return 4
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
     def test_param_registration_after_forward(self):
         """Tests the parameter registration after forward."""
         device = torch.device("cuda", 0)
@@ -200,6 +206,7 @@ def world_size(self) -> int:
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
     @wrapSwapTensorsTest(True)
+    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
     def test_to_float64_after_init(self):
         """Tests that the user can cast the module to float64 after init."""
         # NOTE: Test fp64 instead of a lower precision dtype like bf16 for
@@ -310,6 +317,9 @@ def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
 
     @skip_if_lt_x_gpu(2)
     @compiled_fsdp_test(compile_compute_on_module=Transformer)
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Platform does not support fused SDPA"
+    )
     def test_train_parity_multi_group(self):
         """
         Tests train parity against DDP when using multiple parameter groups for
diff --git a/test/distributed/elastic/test_control_plane.py b/test/distributed/elastic/test_control_plane.py
index cfa221147789..10be085b70aa 100644
--- a/test/distributed/elastic/test_control_plane.py
+++ b/test/distributed/elastic/test_control_plane.py
@@ -15,7 +15,12 @@
     TORCH_WORKER_SERVER_SOCKET,
     worker_main,
 )
-from torch.testing._internal.common_utils import requires_cuda, run_tests, TestCase
+from torch.testing._internal.common_utils import (
+    requires_cuda,
+    run_tests,
+    skipIfRocm,
+    TestCase,
+)
 
 
 class UnixHTTPConnection(HTTPConnection):
@@ -151,6 +156,7 @@ def test_dump_nccl_trace_pickle_with_json(self) -> None:
         )
         self.assertEqual(resp.status, 200)
 
+    @skipIfRocm  # skipped upstream too
     def test_tcp(self) -> None:
         import requests
 
diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index 5f8b88bb6e59..e347347e2a59 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -35,8 +35,11 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    NAVI_ARCH,
     parametrize,
     run_tests,
+    skipIfRocmArch,
     TEST_HPU,
     TEST_WITH_DEV_DBG_ASAN,
 )
@@ -160,6 +163,7 @@ def test_nested_always_wrap_model(
 
     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
+    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
     def test_transformer(
         self,
         cpu_offload: CPUOffload,
diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
index 047972252fc6..8d0ce114f0d6 100644
--- a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
+++ b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
@@ -19,6 +19,7 @@
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
+from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_MEM_EFF_ATTENTION
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     DEVICEInitMode,
@@ -236,6 +237,9 @@ def _build_model_and_optim(
         return model, optim, ref_model, ref_optim
 
     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Platform does not support fused SDPA"
+    )
     def test_sharded_grad_scaler_found_inf(self):
         self.run_subtests(
             {
diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py
index 5cf402a3b6e7..671496c15fa0 100644
--- a/test/distributed/optim/test_zero_redundancy_optimizer.py
+++ b/test/distributed/optim/test_zero_redundancy_optimizer.py
@@ -917,6 +917,8 @@ def closure_sharded(input_tensor=input_tensor):
             torch.testing.assert_close(
                 loss_ddp,
                 loss_sharded_optim,
+                atol=1.3e-3,
+                rtol=3e-6,
                 msg="Losses differ between local optimizer and ZeRO",
             )
             self._check_same_model_params(

From 00f48c63a533c333865211f1f3b424cc5c16632c Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Wed, 9 Jul 2025 13:18:36 -0400
Subject: [PATCH 3/5] [SWDEV-523736] Partial cherry-pick to release/2.6

[release/2.5][SWDEV-489778] NAVI4x UT parity for distributed config (#2327)

I did a sweep of all the distributed failures on NAVI4x.
On a high level, we were running into the following issues:
- MEM_EFF_ATTENTION is not supported on NAVI4x for 2.5, causing "tensors not alike" failures
- Some UTs pass in future releases; those are skipped here
- Some needed slight tolerance fixes, since this branch uses hipblas whereas future branches use hipblaslt

Fixes #ISSUE_NUMBER
---
 test/distributed/fsdp/test_fsdp_hybrid_shard.py          | 5 +++++
 test/distributed/optim/test_zero_redundancy_optimizer.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/distributed/fsdp/test_fsdp_hybrid_shard.py b/test/distributed/fsdp/test_fsdp_hybrid_shard.py
index dc9b54be2dd7..f39752acb932 100644
--- a/test/distributed/fsdp/test_fsdp_hybrid_shard.py
+++ b/test/distributed/fsdp/test_fsdp_hybrid_shard.py
@@ -6,6 +6,7 @@
 from enum import auto, Enum
 from functools import partial
 from typing import Optional
+import unittest
 
 import torch
 import torch.distributed as dist
@@ -31,6 +32,9 @@
     FSDPTest,
     TransformerWithSharedParams,
 )
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     run_tests,
@@ -227,6 +231,7 @@ def test_invalid_pg_specification_raises(self):
     # resharded after forward.
 
     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention")
     def test_fsdp_hybrid_shard_basic_setup(self):
         """
         Tests basic functionality of HYBRID_SHARD and _HYBRID_SHARD_ZERO2:
diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py
index 671496c15fa0..10a84492c75e 100644
--- a/test/distributed/optim/test_zero_redundancy_optimizer.py
+++ b/test/distributed/optim/test_zero_redundancy_optimizer.py
@@ -917,7 +917,7 @@ def closure_sharded(input_tensor=input_tensor):
             torch.testing.assert_close(
                 loss_ddp,
                 loss_sharded_optim,
-                atol=1.3e-3,
+                atol=1.6e-3,
                 rtol=3e-6,
                 msg="Losses differ between local optimizer and ZeRO",
             )

From a6a6ee3f9096c92245b7794972d1916f906ebe73 Mon Sep 17 00:00:00 2001
From: Artem Kuzmitckii
Date: Wed, 1 Oct 2025 09:33:45 +0000
Subject: [PATCH 4/5] Remove useless change

---
 test/inductor/test_cooperative_reductions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py
index 1649ac297d59..fc296b12a9d7 100644
--- a/test/inductor/test_cooperative_reductions.py
+++ b/test/inductor/test_cooperative_reductions.py
@@ -151,6 +151,7 @@ def run_and_check(self, fn, args, dtype=None, *, expect_kernel_count=1):
             torch._inductor.metrics.generated_kernel_count, expect_kernel_count
         )
         return source_code
+
     @parametrize(
         "name",
         [

From 870f6f2f2ad07eb74cc3461ef1ca6471c0c4cb3b Mon Sep 17 00:00:00 2001
From: Artem Kuzmitckii
Date: Wed, 1 Oct 2025 11:26:48 +0000
Subject: [PATCH 5/5] Skip for Navi4x only

---
 .../_composable/fsdp/test_fully_shard_training.py | 6 +++---
 test/distributed/fsdp/test_fsdp_core.py           | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/distributed/_composable/fsdp/test_fully_shard_training.py b/test/distributed/_composable/fsdp/test_fully_shard_training.py
index 129a61f16e03..c11c0f93e03a 100644
--- a/test/distributed/_composable/fsdp/test_fully_shard_training.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py
@@ -44,7 +44,7 @@
 )
 from torch.testing._internal.common_utils import (
     get_cycles_per_ms,
-    NAVI_ARCH,
+    NAVI4_ARCH,
     run_tests,
     skipIfRocmArch,
     wrapSwapTensorsTest,
@@ -99,7 +99,7 @@ def world_size(self) -> int:
         return 4
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
-    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_param_registration_after_forward(self):
         """Tests the parameter registration after forward."""
         device = torch.device("cuda", 0)
@@ -206,7 +206,7 @@ def world_size(self) -> int:
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
     @wrapSwapTensorsTest(True)
-    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_to_float64_after_init(self):
         """Tests that the user can cast the module to float64 after init."""
         # NOTE: Test fp64 instead of a lower precision dtype like bf16 for
diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index e347347e2a59..4f415b03320f 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -36,7 +36,7 @@
 )
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
-    NAVI_ARCH,
+    NAVI4_ARCH,
     parametrize,
     run_tests,
     skipIfRocmArch,
@@ -163,7 +163,7 @@ def test_nested_always_wrap_model(
 
     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
-    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_transformer(
         self,
         cpu_offload: CPUOffload,
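
For reference only, and not part of the patch series above: a minimal sketch of the two gating patterns these patches rely on, namely capability-based skips via the PLATFORM_SUPPORTS_* flags from torch.testing._internal.common_cuda and arch-based skips via skipIfRocmArch(NAVI4_ARCH), which the patches import from the ROCm fork's torch.testing._internal.common_utils. The test class, test name, and test body below are hypothetical placeholders.

# Illustrative sketch only; assumes the ROCm fork's testing helpers used in the
# patches above (skipIfRocmArch, NAVI4_ARCH) are available in this checkout.
import unittest

import torch
from torch.testing._internal.common_cuda import (
    PLATFORM_SUPPORTS_FLASH_ATTENTION,
    TEST_CUDA,
)
from torch.testing._internal.common_utils import (
    NAVI4_ARCH,
    run_tests,
    skipIfRocmArch,
    TestCase,
)


class ExampleGatingTest(TestCase):  # hypothetical test class
    @unittest.skipIf(not TEST_CUDA, "no cuda")
    @unittest.skipIf(
        not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Platform does not support fused SDPA"
    )
    @skipIfRocmArch(NAVI4_ARCH)  # skip only on Navi4x; other ROCm archs still run
    def test_example_gating(self):
        # Trivial body; the decorator stack above is the point of the example.
        x = torch.ones(2, 2, device="cuda")
        self.assertEqual(x.sum().item(), 4.0)


if __name__ == "__main__":
    run_tests()

The capability flags skip wherever the fused SDPA kernels are genuinely unavailable, while skipIfRocmArch pins a skip to one GPU family; that distinction is why patch 5 narrows the earlier NAVI_ARCH skips to NAVI4_ARCH.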