From 9b01e6d6a952e92af6de50851f1435bdd3026a9d Mon Sep 17 00:00:00 2001
From: Prachi Gupta <pracgupt@amd.com>
Date: Fri, 16 May 2025 10:48:11 -0400
Subject: [PATCH 1/2] [release/2.7][ROCm] update state check for
 test_trace_while_active* (#2110)

When timing is enabled, the ROCR runtime used to sleep for a small amount
of time, which ensured that the application saw the correct state. However,
this sleep was removed for performance reasons, so the state is no longer
guaranteed to be "started". The test's state check is therefore updated to
accept either "started" or "scheduled".

Fixes https://ontrack-internal.amd.com/browse/SWDEV-525883

Upstream PR: https://github.com/pytorch/pytorch/pull/153545

(cherry picked from commit 8a1ad2c45e2851466c7e14e35e2bfad8459a4bd8)
---
 test/distributed/test_c10d_nccl.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 158fa46633c3e..09ee216430f3f 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -39,7 +39,7 @@
 from torch import nn
 from torch._C._distributed_c10d import ErrorType, OpType, WorkResult
 from torch.nn.parallel import DistributedDataParallel
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_cuda import TEST_MULTIGPU, _get_torch_rocm_version
 from torch.testing._internal.common_distributed import (
     get_timeout,
     init_multigpu_helper,
@@ -4634,9 +4634,17 @@ def test_trace_while_active(self, timing_enabled, only_active):
                 else:
                     self.assertEqual(t[-1]["profiling_name"], "nccl:all_reduce")
                     self.assertEqual(t[-1]["collective_seq_id"], 2)
-                    self.assertEqual(
-                        t[-1]["state"], self.started_or_scheduled(timing_enabled)
-                    )
+
+                    #ROCm runtime used to call uSleep(20 µs)inside the default‑signal busy-wait loop.
+                    #Now, this sleep is removed which lets the host thread spin continuously
+                    #Therefore, the state can either be scheduled or started before test dumps the trace.
+                    if TEST_WITH_ROCM and _get_torch_rocm_version() >= (6,4) and timing_enabled:
+                        assert(
+                            t[-1]["state"] in ("scheduled", "started"))
+                    else:
+                        self.assertEqual(
+                            t[-1]["state"], self.started_or_scheduled(timing_enabled)
+                        )
 
             self.parent.send("next")
             self.assertEqual("next", self.parent.recv())

From f1ac3b26cf95f1112830bbcc8efc541457806cbc Mon Sep 17 00:00:00 2001
From: Prachi Gupta <prachi.gupta@amd.com>
Date: Wed, 28 May 2025 15:29:04 +0000
Subject: [PATCH 2/2] Use torch.version.hip

---
 test/distributed/test_c10d_nccl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 09ee216430f3f..fbd4a7b1e6d22 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -4638,7 +4638,7 @@ def test_trace_while_active(self, timing_enabled, only_active):
                     #ROCm runtime used to call uSleep(20 µs)inside the default‑signal busy-wait loop.
                     #Now, this sleep is removed which lets the host thread spin continuously
                     #Therefore, the state can either be scheduled or started before test dumps the trace.
-                    if TEST_WITH_ROCM and _get_torch_rocm_version() >= (6,4) and timing_enabled:
+                    if torch.version.hip and _get_torch_rocm_version() >= (6,4) and timing_enabled:
                         assert(
                             t[-1]["state"] in ("scheduled", "started"))
                     else: