From df7dff7c592897b5d691bc078271b6a0976949aa Mon Sep 17 00:00:00 2001
From: Artem Kuzmitckii
Date: Wed, 23 Jul 2025 20:02:44 +0000
Subject: [PATCH 1/5] [SWDEV-523736] Skip&Fix some testcases for Navi4x

---
 test/distributed/_tools/test_sac_ilp.py              | 4 ++--
 test/distributed/tensor/parallel/test_tp_examples.py | 1 +
 test/inductor/test_cooperative_reductions.py         | 1 -
 test/test_linalg.py                                  | 2 ++
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/distributed/_tools/test_sac_ilp.py b/test/distributed/_tools/test_sac_ilp.py
index 777b93ea3164..cdc5210c22cb 100644
--- a/test/distributed/_tools/test_sac_ilp.py
+++ b/test/distributed/_tools/test_sac_ilp.py
@@ -17,7 +17,7 @@
     get_optimal_checkpointing_policy_per_module,
     sac_milp,
 )
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import TEST_CUDA, PLATFORM_SUPPORTS_FLASH_ATTENTION
 from torch.testing._internal.common_utils import (
     run_tests,
     skipIfTorchDynamo,
@@ -180,7 +180,7 @@ def test_sac_ilp_case1(self):
 
     @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
-    @skipIfRocmArch(NAVI_ARCH)
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_sac_ilp_case2(self):
         """
         This is a case where the memory budget is not binding, meaning that no
diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py
index 194735f91236..4b481511a70e 100644
--- a/test/distributed/tensor/parallel/test_tp_examples.py
+++ b/test/distributed/tensor/parallel/test_tp_examples.py
@@ -43,6 +43,7 @@
     Transformer,
     with_comms,
 )
+from unittest import skipIf
 
 
 c10d_functional = torch.ops.c10d_functional
diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py
index fc296b12a9d7..1649ac297d59 100644
--- a/test/inductor/test_cooperative_reductions.py
+++ b/test/inductor/test_cooperative_reductions.py
@@ -151,7 +151,6 @@ def run_and_check(self, fn, args, dtype=None, *, expect_kernel_count=1):
             torch._inductor.metrics.generated_kernel_count, expect_kernel_count
         )
         return source_code
-
     @parametrize(
         "name",
         [
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 6fc46663a67e..fe8cac1c06c5 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -20,6 +20,7 @@
     TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, iter_indices,
     make_fullrank_matrices_with_distinct_singular_values,
     freeze_rng_state, IS_ARM64, IS_SANDCASTLE, TEST_OPT_EINSUM, parametrize, skipIfTorchDynamo,
+    skipIfRocmArch, NAVI4_ARCH,
     setBlasBackendsToDefaultFinally, setLinalgBackendsToDefaultFinally, serialTest,
     runOnRocmArch, MI300_ARCH)
 from torch.testing._internal.common_device_type import \
@@ -7149,6 +7150,7 @@ def test_baddbmm_input_dtypes_compatibility(self, device, dtype):
 
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error")
     @onlyCUDA
+    @skipIfRocmArch(NAVI4_ARCH)
     def test_matmul_45724(self, device):
         # https://github.com/pytorch/pytorch/issues/45724
         a = torch.rand(65537, 22, 64, device=device, dtype=torch.half)

From 02aaaa76912d6d79f87293dfba34426f9e2e15fa Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Wed, 9 Jul 2025 13:18:36 -0400
Subject: [PATCH 2/5] [SWDEV-523736] Partial cherry-pick to release/2.6

[release/2.5][SWDEV-489778] NAVI4x UT parity for distributed config (#2327)

I did a sweep of all the distributed failures on NAVI4x.
On a high level, we were running into the following issues:
- MEM_EFF_ATTENTION is not supported on NAVI4x for 2.5, causing "tensors not alike" failures
- Some UTs pass in future releases; those are skipped here
- Some needed slight tolerance fixes, since this branch uses hipblas whereas future branches use hipblaslt

Fixes #ISSUE_NUMBER
---
 functorch/experimental/__init__.py                    |  2 +-
 .../_composable/fsdp/test_fully_shard_training.py     | 12 +++++++++++-
 test/distributed/elastic/test_control_plane.py        |  8 +++++++-
 test/distributed/fsdp/test_fsdp_core.py               |  4 ++++
 .../fsdp/test_fsdp_sharded_grad_scaler.py             |  4 ++++
 .../optim/test_zero_redundancy_optimizer.py           |  2 ++
 6 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/functorch/experimental/__init__.py b/functorch/experimental/__init__.py
index 3941f6d96e1f..ec414d8c135b 100644
--- a/functorch/experimental/__init__.py
+++ b/functorch/experimental/__init__.py
@@ -1,5 +1,5 @@
 # PyTorch forward-mode is not mature yet
-from functorch import functionalize
+from torch._functorch.deprecated import functionalize
 from torch._functorch.apis import chunk_vmap
 from torch._functorch.batch_norm_replacement import replace_all_batch_norm_modules_
 from torch._functorch.eager_transforms import hessian, jacfwd, jvp
diff --git a/test/distributed/_composable/fsdp/test_fully_shard_training.py b/test/distributed/_composable/fsdp/test_fully_shard_training.py
index bc9f941101ba..129a61f16e03 100644
--- a/test/distributed/_composable/fsdp/test_fully_shard_training.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py
@@ -27,7 +27,10 @@
 )
 from torch.distributed.tensor import DTensor, init_device_mesh, Shard
 from torch.distributed.tensor.debug import CommDebugMode
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
+    TEST_CUDA,
+)
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     check_sharded_parity,
@@ -41,7 +44,9 @@
 )
 from torch.testing._internal.common_utils import (
     get_cycles_per_ms,
+    NAVI_ARCH,
     run_tests,
+    skipIfRocmArch,
     wrapSwapTensorsTest,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import (
@@ -94,6 +99,7 @@ def world_size(self) -> int:
         return 4
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
     def test_param_registration_after_forward(self):
         """Tests the parameter registration after forward."""
         device = torch.device("cuda", 0)
@@ -200,6 +206,7 @@ def world_size(self) -> int:
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
     @wrapSwapTensorsTest(True)
+    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
     def test_to_float64_after_init(self):
         """Tests that the user can cast the module to float64 after init."""
         # NOTE: Test fp64 instead of a lower precision dtype like bf16 for
@@ -310,6 +317,9 @@ def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
 
     @skip_if_lt_x_gpu(2)
     @compiled_fsdp_test(compile_compute_on_module=Transformer)
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Platform does not support fused SDPA"
+    )
     def test_train_parity_multi_group(self):
         """
         Tests train parity against DDP when using multiple parameter groups for
diff --git a/test/distributed/elastic/test_control_plane.py b/test/distributed/elastic/test_control_plane.py
index cfa221147789..10be085b70aa 100644
--- a/test/distributed/elastic/test_control_plane.py
+++ b/test/distributed/elastic/test_control_plane.py
@@ -15,7 +15,12 @@
     TORCH_WORKER_SERVER_SOCKET,
     worker_main,
 )
-from torch.testing._internal.common_utils import requires_cuda, run_tests, TestCase
+from torch.testing._internal.common_utils import (
+    requires_cuda,
+    run_tests,
+    skipIfRocm,
+    TestCase,
+)
 
 
 class UnixHTTPConnection(HTTPConnection):
@@ -151,6 +156,7 @@ def test_dump_nccl_trace_pickle_with_json(self) -> None:
         )
         self.assertEqual(resp.status, 200)
 
+    @skipIfRocm  # skipped upstream too
     def test_tcp(self) -> None:
         import requests
 
diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index 5f8b88bb6e59..e347347e2a59 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -35,8 +35,11 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    NAVI_ARCH,
     parametrize,
     run_tests,
+    skipIfRocmArch,
     TEST_HPU,
     TEST_WITH_DEV_DBG_ASAN,
 )
@@ -160,6 +163,7 @@ def test_nested_always_wrap_model(
 
     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
+    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
     def test_transformer(
         self,
         cpu_offload: CPUOffload,
diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
index 047972252fc6..8d0ce114f0d6 100644
--- a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
+++ b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
@@ -19,6 +19,7 @@
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
+from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_MEM_EFF_ATTENTION
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     DEVICEInitMode,
@@ -236,6 +237,9 @@ def _build_model_and_optim(
         return model, optim, ref_model, ref_optim
 
     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Platform does not support fused SDPA"
+    )
     def test_sharded_grad_scaler_found_inf(self):
         self.run_subtests(
             {
diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py
index 5cf402a3b6e7..671496c15fa0 100644
--- a/test/distributed/optim/test_zero_redundancy_optimizer.py
+++ b/test/distributed/optim/test_zero_redundancy_optimizer.py
@@ -917,6 +917,8 @@ def closure_sharded(input_tensor=input_tensor):
             torch.testing.assert_close(
                 loss_ddp,
                 loss_sharded_optim,
+                atol=1.3e-3,
+                rtol=3e-6,
                 msg="Losses differ between local optimizer and ZeRO",
             )
             self._check_same_model_params(

From 00f48c63a533c333865211f1f3b424cc5c16632c Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Wed, 9 Jul 2025 13:18:36 -0400
Subject: [PATCH 3/5] [SWDEV-523736] Partial cherry-pick to release/2.6

[release/2.5][SWDEV-489778] NAVI4x UT parity for distributed config (#2327)

I did a sweep of all the distributed failures on NAVI4x.
On a high level, we were running into the following issues:
- MEM_EFF_ATTENTION is not supported on NAVI4x for 2.5, causing "tensors not alike" failures
- Some UTs pass in future releases; those are skipped here
- Some needed slight tolerance fixes, since this branch uses hipblas whereas future branches use hipblaslt

Fixes #ISSUE_NUMBER
---
 test/distributed/fsdp/test_fsdp_hybrid_shard.py          | 5 +++++
 test/distributed/optim/test_zero_redundancy_optimizer.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/distributed/fsdp/test_fsdp_hybrid_shard.py b/test/distributed/fsdp/test_fsdp_hybrid_shard.py
index dc9b54be2dd7..f39752acb932 100644
--- a/test/distributed/fsdp/test_fsdp_hybrid_shard.py
+++ b/test/distributed/fsdp/test_fsdp_hybrid_shard.py
@@ -6,6 +6,7 @@
 from enum import auto, Enum
 from functools import partial
 from typing import Optional
+import unittest
 
 import torch
 import torch.distributed as dist
@@ -31,6 +32,9 @@
     FSDPTest,
     TransformerWithSharedParams,
 )
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     run_tests,
@@ -227,6 +231,7 @@ def test_invalid_pg_specification_raises(self):
     # resharded after forward.
 
     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention")
     def test_fsdp_hybrid_shard_basic_setup(self):
         """
         Tests basic functionality of HYBRID_SHARD and _HYBRID_SHARD_ZERO2:
diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py
index 671496c15fa0..10a84492c75e 100644
--- a/test/distributed/optim/test_zero_redundancy_optimizer.py
+++ b/test/distributed/optim/test_zero_redundancy_optimizer.py
@@ -917,7 +917,7 @@ def closure_sharded(input_tensor=input_tensor):
             torch.testing.assert_close(
                 loss_ddp,
                 loss_sharded_optim,
-                atol=1.3e-3,
+                atol=1.6e-3,
                 rtol=3e-6,
                 msg="Losses differ between local optimizer and ZeRO",
             )

From a6a6ee3f9096c92245b7794972d1916f906ebe73 Mon Sep 17 00:00:00 2001
From: Artem Kuzmitckii
Date: Wed, 1 Oct 2025 09:33:45 +0000
Subject: [PATCH 4/5] Remove useless change

---
 test/inductor/test_cooperative_reductions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py
index 1649ac297d59..fc296b12a9d7 100644
--- a/test/inductor/test_cooperative_reductions.py
+++ b/test/inductor/test_cooperative_reductions.py
@@ -151,6 +151,7 @@ def run_and_check(self, fn, args, dtype=None, *, expect_kernel_count=1):
             torch._inductor.metrics.generated_kernel_count, expect_kernel_count
         )
         return source_code
+
     @parametrize(
         "name",
         [

From 870f6f2f2ad07eb74cc3461ef1ca6471c0c4cb3b Mon Sep 17 00:00:00 2001
From: Artem Kuzmitckii
Date: Wed, 1 Oct 2025 11:26:48 +0000
Subject: [PATCH 5/5] Skip for Navi4x only

---
 .../_composable/fsdp/test_fully_shard_training.py | 6 +++---
 test/distributed/fsdp/test_fsdp_core.py           | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/distributed/_composable/fsdp/test_fully_shard_training.py b/test/distributed/_composable/fsdp/test_fully_shard_training.py
index 129a61f16e03..c11c0f93e03a 100644
--- a/test/distributed/_composable/fsdp/test_fully_shard_training.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py
@@ -44,7 +44,7 @@
 )
 from torch.testing._internal.common_utils import (
     get_cycles_per_ms,
-    NAVI_ARCH,
+    NAVI4_ARCH,
     run_tests,
     skipIfRocmArch,
     wrapSwapTensorsTest,
@@ -99,7 +99,7 @@ def world_size(self) -> int:
         return 4
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
-    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_param_registration_after_forward(self):
         """Tests the parameter registration after forward."""
         device = torch.device("cuda", 0)
@@ -206,7 +206,7 @@ def world_size(self) -> int:
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
     @wrapSwapTensorsTest(True)
-    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_to_float64_after_init(self):
         """Tests that the user can cast the module to float64 after init."""
         # NOTE: Test fp64 instead of a lower precision dtype like bf16 for
diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index e347347e2a59..4f415b03320f 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -36,7 +36,7 @@
 )
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
-    NAVI_ARCH,
+    NAVI4_ARCH,
     parametrize,
     run_tests,
     skipIfRocmArch,
@@ -163,7 +163,7 @@ def test_nested_always_wrap_model(
 
     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
-    @skipIfRocmArch(NAVI_ARCH)  # Supported in future releases
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_transformer(
         self,
         cpu_offload: CPUOffload,
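
For reference only, and not part of the patch series above: a minimal sketch of the two gating patterns these patches rely on, namely capability-based skips via the PLATFORM_SUPPORTS_* flags from torch.testing._internal.common_cuda and arch-based skips via skipIfRocmArch(NAVI4_ARCH), which the patches import from the ROCm fork's torch.testing._internal.common_utils. The test class, test name, and test body below are hypothetical placeholders.

# Illustrative sketch only; assumes the ROCm fork's testing helpers used in the
# patches above (skipIfRocmArch, NAVI4_ARCH) are available in this checkout.
import unittest

import torch
from torch.testing._internal.common_cuda import (
    PLATFORM_SUPPORTS_FLASH_ATTENTION,
    TEST_CUDA,
)
from torch.testing._internal.common_utils import (
    NAVI4_ARCH,
    run_tests,
    skipIfRocmArch,
    TestCase,
)


class ExampleGatingTest(TestCase):  # hypothetical test class
    @unittest.skipIf(not TEST_CUDA, "no cuda")
    @unittest.skipIf(
        not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Platform does not support fused SDPA"
    )
    @skipIfRocmArch(NAVI4_ARCH)  # skip only on Navi4x; other ROCm archs still run
    def test_example_gating(self):
        # Trivial body; the decorator stack above is the point of the example.
        x = torch.ones(2, 2, device="cuda")
        self.assertEqual(x.sum().item(), 4.0)


if __name__ == "__main__":
    run_tests()

The capability flags skip wherever the fused SDPA kernels are genuinely unavailable, while skipIfRocmArch pins a skip to one GPU family; that distinction is why patch 5 narrows the earlier NAVI_ARCH skips to NAVI4_ARCH.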