From ec9987dc71a1e9b84863b0262c838780e7b3576e Mon Sep 17 00:00:00 2001 From: Pruthvi Madugundu Date: Fri, 8 Mar 2024 11:29:12 -0800 Subject: [PATCH 01/16] CONSOLIDATED COMMITS: Triton build updates ========================================== Triton build conditionalized on ROCM_VERSION Include the ROCm version in triton version (cherry picked from commit 7d33910198d20c755e40f30de6c9b9e2e03b1a8c) (cherry picked from commit 0412eb4eed668567d394615aa644eacc2741dcd3) Update triton-rocm.txt to triton.txt (cherry picked from commit 0ce9f6ea5f6a654e05fa098f1693c05f3c33d6ec) Use ROCm/triton for install_triton.sh (cherry picked from commit 6e9714bbfbe8f003643252ec58f7b42a3c9c364c) update triton commit Revert "Use ROCm/triton for install_triton.sh" This reverts commit 81b0cbc8435122030044049c661f252ee8aa7ae5. change triton repo Update triton.txt to use release/internal/3.3.x branch Use ROCm/triton Use ROCm/triton for install_triton.sh (cherry picked from commit 0036db5ab1a6fcf13662cd8ccd99f021422fe547) --- .ci/docker/common/install_triton.sh | 2 +- .github/scripts/build_triton_wheel.py | 37 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index 1b68e3c247839..b2fdebdcc4747 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then TRITON_REPO="https://github.com/triton-lang/triton-cpu" TRITON_TEXT_FILE="triton-cpu" else - TRITON_REPO="https://github.com/triton-lang/triton" + TRITON_REPO="https://github.com/ROCm/triton" TRITON_TEXT_FILE="triton" fi diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 11fa8404273d3..b12d7bfbeb1ee 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os +import re import shutil import sys from pathlib import Path @@ -50,6 
+51,31 @@ def patch_init_py( with open(path, "w") as f: f.write(orig) +def get_rocm_version() -> str: + rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm" + rocm_version = "0.0.0" + rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" + if not os.path.isfile(rocm_version_h): + rocm_version_h = f"{rocm_path}/include/rocm_version.h" + + # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install. + if os.path.isfile(rocm_version_h): + RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") + RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)") + RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)") + major, minor, patch = 0, 0, 0 + for line in open(rocm_version_h): + match = RE_MAJOR.search(line) + if match: + major = int(match.group(1)) + match = RE_MINOR.search(line) + if match: + minor = int(match.group(1)) + match = RE_PATCH.search(line) + if match: + patch = int(match.group(1)) + rocm_version = str(major)+"."+str(minor)+"."+str(patch) + return rocm_version def build_triton( *, @@ -65,6 +91,14 @@ def build_triton( max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) + version_suffix = "" + if not release: + # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 + # while release build should only include the version, i.e. 
2.1.0 + rocm_version = get_rocm_version() + version_suffix = f"+rocm{rocm_version}_{commit_hash[:10]}" + version += version_suffix + with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" @@ -72,6 +106,7 @@ def build_triton( triton_repo = "https://github.com/openai/triton" if device == "rocm": triton_pkg_name = "pytorch-triton-rocm" + triton_repo = "https://github.com/ROCm/triton/" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" @@ -104,6 +139,8 @@ def build_triton( cwd=triton_basedir, shell=True, ) + cur_rocm_ver = get_rocm_version() + check_call(["scripts/amd/setup_rocm_libs.sh", cur_rocm_ver], cwd=triton_basedir) print("ROCm libraries setup for triton installation...") # old triton versions have setup.py in the python/ dir, From 68824af7d8c9d49ca7f0b4992f31c293299a55b6 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 12 Aug 2025 11:40:53 -0700 Subject: [PATCH 02/16] [rocm7.1_internal_testing] Change pytorch-triton-rocm to pytorch-triton (#2482) Related to https://github.com/ROCm/builder/pull/90/files http://rocm-ci.amd.com/job/mainline-pytorch_internal-manylinux-wheels/305/ PyTorch wheel installs successfully when building torchvision/torchaudio (cherry picked from commit c1ee54d9804886c5de9cb8eff295185afffafd1d) --- .circleci/scripts/binary_populate_env.sh | 4 ++-- .github/scripts/build_triton_wheel.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 11f9678579935..f876ac8efcf7f 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -84,10 +84,10 @@ fi # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == 
"Linux" ]]; then - TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" + TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) - TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" + TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" fi if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index b12d7bfbeb1ee..3b5877b6a987c 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -105,8 +105,8 @@ def build_triton( triton_repo = "https://github.com/openai/triton" if device == "rocm": - triton_pkg_name = "pytorch-triton-rocm" - triton_repo = "https://github.com/ROCm/triton/" + triton_pkg_name = "triton" + triton_repo = "https://github.com/ROCm/triton" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" From 05e104c7c9941bab925c2d61859587856070a358 Mon Sep 17 00:00:00 2001 From: Prachi Gupta Date: Fri, 22 Aug 2025 18:35:35 -0500 Subject: [PATCH 03/16] [rocm7.1_internal_testing] Add triton_kernels wheel generation (#2566) Fixes #ISSUE_NUMBER (cherry picked from commit 0ea0592f5fd18fd105995af578ce8156de71d57c) --- .github/scripts/build_triton_wheel.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 3b5877b6a987c..16eb6a7851974 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -164,6 +164,13 @@ def build_triton( cwd=triton_basedir, ) + # For gpt-oss models, 
triton requires this extra triton_kernels wheel + # triton_kernels came after pytorch release/2.8 + triton_kernels_dir = Path(f"{triton_basedir}/python/triton_kernels") + check_call([sys.executable, "-m", "build", "--wheel"], cwd=triton_kernels_dir, env=env) + kernels_whl_path = next(iter((triton_kernels_dir / "dist").glob("*.whl"))) + shutil.copy(kernels_whl_path, Path.cwd()) + return Path.cwd() / whl_path.name From 9b4ad7dd6afea77b14c337d80c32f812f98a9888 Mon Sep 17 00:00:00 2001 From: Bo Li <110066325+BLOrange-AMD@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:12:30 -0500 Subject: [PATCH 04/16] CONSOLIDATED COMMITS: Implementation of PyTorch ut parsing script - QA helper functions ======================================================================================= Implementation of PyTorch ut parsing script - QA helper function (#1386) * Initial implementation of PyTorch ut parsing script * Extracted path variables * Use nested dict to save results * Fixes typo * Cleanup * Fixes several issues * Minor name change * Update run_pytorch_unit_tests.py * Added file banners * Supported running from API * Added more help info * Consistent naming * Format help text --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Co-authored-by: Jithun Nair Print consolidated log file for pytorch unit test automation scripts (#1433) * Print consolidated log file for pytorch uts * Update run_entire_tests subprocess call as well * lint * Add ERROR string [SWDEV-466849] Enhancements for PyTorch UT helper scripts (#1491) * Check that >1 GPUs are visible when running TEST_CONFIG=distributed * Add EXECUTION_TIME to file-level and aggregate statistics PyTorch unit test helper scripts enhancements (#1517) * Fail earlier for distributed-on-1-GPU scenario * print cmd in consolidated log with prettier formatting * python->python3 Fixes https://ontrack-internal.amd.com/browse/SWDEV-477264 --------- Co-authored-by: blorange-amd Several issues fix of QA 
helper script (#1564) Fixes SWDEV-475071: https://ontrack-internal.amd.com/browse/SWDEV-475071 Removed args inside function (#1595) Fixes SWDEV-475071 (cherry picked from commit 041aa1b47978154de63edc6b7ffcdea218a847a3) QA script - Added multi gpu check with priority_tests (#1604) Fixes SWDEV-487907. Verified throwing exception for distributed is working correctly on single gpu with command: python .automation_scripts/run_pytorch_unit_tests.py --priority_test (cherry picked from commit 57cc742271cbf4547f9213710e57f6444bbc983e) (cherry picked from commit 6d5c3dcae5d02196749df11b57e6d0d5e4e2212e) (cherry picked from commit 2ee3aa2de081680756002bbfc4006b5f68e68c16) --- .automation_scripts/parse_xml_results.py | 178 ++++++ .automation_scripts/run_pytorch_unit_tests.py | 518 ++++++++++++++++++ 2 files changed, 696 insertions(+) create mode 100644 .automation_scripts/parse_xml_results.py create mode 100644 .automation_scripts/run_pytorch_unit_tests.py diff --git a/.automation_scripts/parse_xml_results.py b/.automation_scripts/parse_xml_results.py new file mode 100644 index 0000000000000..7db2e1ce9233c --- /dev/null +++ b/.automation_scripts/parse_xml_results.py @@ -0,0 +1,178 @@ +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Dict, Tuple + +# Backends list +BACKENDS_LIST = [ + "dist-gloo", + "dist-nccl" +] + +TARGET_WORKFLOW = "--rerun-disabled-tests" + +def get_job_id(report: Path) -> int: + # [Job id in artifacts] + # Retrieve the job id from the report path. In our GHA workflows, we append + # the job id to the end of the report name, so `report` looks like: + # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml + # and we want to get `5596745227` out of it. 
+ try: + return int(report.parts[0].rpartition("_")[2]) + except ValueError: + return -1 + +def is_rerun_disabled_tests(root: ET.ElementTree) -> bool: + """ + Check if the test report is coming from rerun_disabled_tests workflow + """ + skipped = root.find(".//*skipped") + # Need to check against None here, if not skipped doesn't work as expected + if skipped is None: + return False + + message = skipped.attrib.get("message", "") + return TARGET_WORKFLOW in message or "num_red" in message + +def parse_xml_report( + tag: str, + report: Path, + workflow_id: int, + workflow_run_attempt: int, + work_flow_name: str +) -> Dict[Tuple[str], Dict[str, Any]]: + """Convert a test report xml file into a JSON-serializable list of test cases.""" + print(f"Parsing {tag}s for test report: {report}") + + job_id = get_job_id(report) + print(f"Found job id: {job_id}") + + test_cases: Dict[Tuple[str], Dict[str, Any]] = {} + + root = ET.parse(report) + # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops + # includes skipped messages multiple times (50 times by default). This slows down + # this script too much (O(n)) because it tries to gather all the stats. This should + # be fixed later in the way we use pytest-flakefinder. A zipped test report from rerun + # disabled test is only few MB, but will balloon up to a much bigger XML file after + # extracting from a dozen to few hundred MB + if is_rerun_disabled_tests(root): + return test_cases + + for test_case in root.iter(tag): + case = process_xml_element(test_case) + if tag == 'testcase': + case["workflow_id"] = workflow_id + case["workflow_run_attempt"] = workflow_run_attempt + case["job_id"] = job_id + case["work_flow_name"] = work_flow_name + + # [invoking file] + # The name of the file that the test is located in is not necessarily + # the same as the name of the file that invoked the test. + # For example, `test_jit.py` calls into multiple other test files (e.g. + # jit/test_dce.py). 
For sharding/test selection purposes, we want to + # record the file that invoked the test. + # + # To do this, we leverage an implementation detail of how we write out + # tests (https://bit.ly/3ajEV1M), which is that reports are created + # under a folder with the same name as the invoking file. + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["work_flow_name"] ) ] = case + elif tag == 'testsuite': + case["work_flow_name"] = work_flow_name + case["invoking_xml"] = report.name + case["running_time_xml"] = case["time"] + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + + test_cases[ ( case["invoking_file"], case["invoking_xml"], case["work_flow_name"] ) ] = case + + return test_cases + +def process_xml_element(element: ET.Element) -> Dict[str, Any]: + """Convert a test suite element into a JSON-serializable dict.""" + ret: Dict[str, Any] = {} + + # Convert attributes directly into dict elements. + # e.g. + # + # becomes: + # {"name": "test_foo", "classname": "test_bar"} + ret.update(element.attrib) + + # The XML format encodes all values as strings. Convert to ints/floats if + # possible to make aggregation possible in Rockset. + for k, v in ret.items(): + try: + ret[k] = int(v) + except ValueError: + pass + try: + ret[k] = float(v) + except ValueError: + pass + + # Convert inner and outer text into special dict elements. + # e.g. 
+ # my_inner_text my_tail + # becomes: + # {"text": "my_inner_text", "tail": " my_tail"} + if element.text and element.text.strip(): + ret["text"] = element.text + if element.tail and element.tail.strip(): + ret["tail"] = element.tail + + # Convert child elements recursively, placing them at a key: + # e.g. + # + # hello + # world + # another + # + # becomes + # { + # "foo": [{"text": "hello"}, {"text": "world"}], + # "bar": {"text": "another"} + # } + for child in element: + if child.tag not in ret: + ret[child.tag] = process_xml_element(child) + else: + # If there are multiple tags with the same name, they should be + # coalesced into a list. + if not isinstance(ret[child.tag], list): + ret[child.tag] = [ret[child.tag]] + ret[child.tag].append(process_xml_element(child)) + return ret \ No newline at end of file diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py new file mode 100644 index 0000000000000..514afd19624c3 --- /dev/null +++ b/.automation_scripts/run_pytorch_unit_tests.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 + +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import argparse +import os +import shutil +import subprocess +from subprocess import STDOUT, CalledProcessError + +from collections import namedtuple +from datetime import datetime +from pathlib import Path +from parse_xml_results import ( + parse_xml_report +) +from pprint import pprint +from typing import Any, Dict, List + +# unit test status list +UT_STATUS_LIST = [ + "PASSED", + "MISSED", + "SKIPPED", + "FAILED", + "XFAILED", + "ERROR" +] + +DEFAULT_CORE_TESTS = [ + "test_nn", + "test_torch", + "test_cuda", + "test_ops", + "test_unary_ufuncs", + "test_autograd", + "inductor/test_torchinductor" +] + +DISTRIBUTED_CORE_TESTS = [ + "distributed/test_c10d_common", + "distributed/test_c10d_nccl", + "distributed/test_distributed_spawn" +] + +CONSOLIDATED_LOG_FILE_NAME="pytorch_unit_tests.log" + +def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, workflow_name, path="."): + test_cases = {} + items_list = os.listdir(path) + for dir in items_list: + new_dir = path + '/' + dir + '/' + if os.path.isdir(new_dir): + for xml_report in Path(new_dir).glob("**/*.xml"): + test_cases.update( + parse_xml_report( + tag, + xml_report, + workflow_run_id, + workflow_run_attempt, + workflow_name + ) + ) + return test_cases + +def get_test_status(test_case): + # In order of priority: S=skipped, F=failure, E=error, P=pass + if "skipped" in test_case and test_case["skipped"]: + type_message = test_case["skipped"] + if type_message.__contains__('type') and type_message['type'] == 
"pytest.xfail": + return "XFAILED" + else: + return "SKIPPED" + elif "failure" in test_case and test_case["failure"]: + return "FAILED" + elif "error" in test_case and test_case["error"]: + return "ERROR" + else: + return "PASSED" + +def get_test_message(test_case, status=None): + if status == "SKIPPED": + return test_case["skipped"] if "skipped" in test_case else "" + elif status == "FAILED": + return test_case["failure"] if "failure" in test_case else "" + elif status == "ERROR": + return test_case["error"] if "error" in test_case else "" + else: + if "skipped" in test_case: + return test_case["skipped"] + elif "failure" in test_case: + return test_case["failure"] + elif "error" in test_case: + return test_case["error"] + else: + return "" + +def get_test_file_running_time(test_suite): + if test_suite.__contains__('time'): + return test_suite["time"] + return 0 + +def get_test_running_time(test_case): + if test_case.__contains__('time'): + return test_case["time"] + return "" + +def summarize_xml_files(path, workflow_name): + # statistics + TOTAL_TEST_NUM = 0 + TOTAL_PASSED_NUM = 0 + TOTAL_SKIPPED_NUM = 0 + TOTAL_XFAIL_NUM = 0 + TOTAL_FAILED_NUM = 0 + TOTAL_ERROR_NUM = 0 + TOTAL_EXECUTION_TIME = 0 + + #parse the xml files + test_cases = parse_xml_reports_as_dict(-1, -1, 'testcase', workflow_name, path) + test_suites = parse_xml_reports_as_dict(-1, -1, 'testsuite', workflow_name, path) + test_file_and_status = namedtuple("test_file_and_status", ["file_name", "status"]) + # results dict + res = {} + res_item_list = [ "PASSED", "SKIPPED", "XFAILED", "FAILED", "ERROR" ] + test_file_items = set() + for (k,v) in list(test_suites.items()): + file_name = k[0] + if not file_name in test_file_items: + test_file_items.add(file_name) + # initialization + for item in res_item_list: + temp_item = test_file_and_status(file_name, item) + res[temp_item] = {} + temp_item_statistics = test_file_and_status(file_name, "STATISTICS") + res[temp_item_statistics] = {'TOTAL': 0, 'PASSED': 
0, 'SKIPPED': 0, 'XFAILED': 0, 'FAILED': 0, 'ERROR': 0, 'EXECUTION_TIME': 0} + test_running_time = get_test_file_running_time(v) + res[temp_item_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + else: + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + test_running_time = get_test_file_running_time(v) + res[test_tuple_key_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + + for (k,v) in list(test_cases.items()): + file_name = k[0] + class_name = k[1] + test_name = k[2] + combined_name = file_name + "::" + class_name + "::" + test_name + test_status = get_test_status(v) + test_running_time = get_test_running_time(v) + test_message = get_test_message(v, test_status) + test_info_value = "" + test_tuple_key_status = test_file_and_status(file_name, test_status) + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + TOTAL_TEST_NUM += 1 + res[test_tuple_key_statistics]["TOTAL"] += 1 + if test_status == "PASSED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["PASSED"] += 1 + TOTAL_PASSED_NUM += 1 + elif test_status == "SKIPPED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["SKIPPED"] += 1 + TOTAL_SKIPPED_NUM += 1 + elif test_status == "XFAILED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["XFAILED"] += 1 + TOTAL_XFAIL_NUM += 1 + elif test_status == "FAILED": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["FAILED"] += 1 + TOTAL_FAILED_NUM += 1 + elif test_status == "ERROR": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + 
res[test_tuple_key_statistics]["ERROR"] += 1 + TOTAL_ERROR_NUM += 1 + + # generate statistics_dict + statistics_dict = {} + statistics_dict["TOTAL"] = TOTAL_TEST_NUM + statistics_dict["PASSED"] = TOTAL_PASSED_NUM + statistics_dict["SKIPPED"] = TOTAL_SKIPPED_NUM + statistics_dict["XFAILED"] = TOTAL_XFAIL_NUM + statistics_dict["FAILED"] = TOTAL_FAILED_NUM + statistics_dict["ERROR"] = TOTAL_ERROR_NUM + statistics_dict["EXECUTION_TIME"] = TOTAL_EXECUTION_TIME + aggregate_item = workflow_name + "_aggregate" + total_item = test_file_and_status(aggregate_item, "STATISTICS") + res[total_item] = statistics_dict + + return res + +def run_command_and_capture_output(cmd): + try: + print(f"Running command '{cmd}'") + with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file: + print(f"========================================", file=output_file, flush=True) + print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well + print(f"========================================", file=output_file, flush=True) + p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True) + except CalledProcessError as e: + print(f"ERROR: Cmd {cmd} failed with return code: {e.returncode}!") + +def run_entire_tests(workflow_name, test_shell_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_entire_tests/" + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_entire_tests/" + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_entire_tests/" + # 
use test.sh for tests execution + run_command_and_capture_output(test_shell_path) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + entire_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + return entire_results_dict + +def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/" + # use run_test.py for tests execution + default_priority_test_suites = " ".join(DEFAULT_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/" + # use run_test.py for tests execution + distributed_priority_test_suites = " ".join(DISTRIBUTED_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + priority_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return priority_results_dict + +def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src, selected_list): + if os.path.exists(test_reports_src): + 
shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/" + # use run_test.py for tests execution + default_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/" + # use run_test.py for tests execution + distributed_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_selected_tests/" + inductor_selected_test_suites = "" + non_inductor_selected_test_suites = "" + for item in selected_list: + if "inductor/" in item: + inductor_selected_test_suites += item + inductor_selected_test_suites += " " + else: + non_inductor_selected_test_suites += item + non_inductor_selected_test_suites += " " + if inductor_selected_test_suites != "": + inductor_selected_test_suites = inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + if non_inductor_selected_test_suites != "": + non_inductor_selected_test_suites = 
non_inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + selected_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return selected_results_dict + +def run_test_and_summarize_results( + pytorch_root_dir: str, + priority_tests: bool, + test_config: List[str], + default_list: List[str], + distributed_list: List[str], + inductor_list: List[str], + skip_rerun: bool) -> Dict[str, Any]: + + # copy current environment variables + _environ = dict(os.environ) + + # modify path + test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh" + test_run_test_path = pytorch_root_dir + "/test/run_test.py" + repo_test_log_folder_path = pytorch_root_dir + "/.automation_logs/" + test_reports_src = pytorch_root_dir + "/test/test-reports/" + run_test_python_file = pytorch_root_dir + "/test/run_test.py" + + # change directory to pytorch root + os.chdir(pytorch_root_dir) + + # all test results dict + res_all_tests_dict = {} + + # patterns + search_text = "--reruns=2" + replace_text = "--reruns=0" + + # create logs folder + if not os.path.exists(repo_test_log_folder_path): + os.mkdir(repo_test_log_folder_path) + + # Set common environment variables for all scenarios + os.environ['CI'] = '1' + os.environ['PYTORCH_TEST_WITH_ROCM'] = '1' + os.environ['HSA_FORCE_FINE_GRAIN_PCIE'] = '1' + os.environ['PYTORCH_TESTING_DEVICE_ONLY_FOR'] = 'cuda' + os.environ['CONTINUE_THROUGH_ERROR'] = 'True' + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(search_text, replace_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + # Time stamp + current_datetime = datetime.now().strftime("%Y%m%d_%H-%M-%S") + print("Current date & time : ", 
current_datetime) + # performed as Job ID + str_current_datetime = str(current_datetime) + overall_logs_path_current_run = repo_test_log_folder_path + str_current_datetime + "/" + os.mkdir(overall_logs_path_current_run) + + global CONSOLIDATED_LOG_FILE_PATH + CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME + + # Check multi gpu availability if distributed tests are enabled + if ("distributed" in test_config) or len(distributed_list) != 0: + check_num_gpus_for_distributed() + + # Install test requirements + command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt" + run_command_and_capture_output(command) + + # Run entire tests for each workflow + if not priority_tests and not default_list and not distributed_list and not inductor_list: + # run entire tests for default, distributed and inductor workflows → use test.sh + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + # distributed test process + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + # inductor test process + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + if "distributed" in workflow_list: + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + 
res_all_tests_dict["distributed"] = res_distributed_all + if "inductor" in workflow_list: + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + # Run priority test for each workflow + elif priority_tests and not default_list and not distributed_list and not inductor_list: + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + # distributed test process + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + # will not run inductor priority tests + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + if "distributed" in workflow_list: + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + if "inductor" in workflow_list: + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + # Run specified tests for each workflow + elif (default_list or distributed_list or inductor_list) and not test_config and not priority_tests: + if default_list: + default_workflow_list = [] + for item in default_list: + default_workflow_list.append(item) + res_default_selected = run_selected_tests("default", 
test_run_test_path, overall_logs_path_current_run, test_reports_src, default_workflow_list) + res_all_tests_dict["default"] = res_default_selected + if distributed_list: + distributed_workflow_list = [] + for item in distributed_list: + distributed_workflow_list.append(item) + res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) + res_all_tests_dict["distributed"] = res_distributed_selected + if inductor_list: + inductor_workflow_list = [] + for item in inductor_list: + inductor_workflow_list.append(item) + res_inductor_selected = run_selected_tests("inductor", test_run_test_path, overall_logs_path_current_run, test_reports_src, inductor_workflow_list) + res_all_tests_dict["inductor"] = res_inductor_selected + else: + raise Exception("Invalid test configurations!") + + # restore environment variables + os.environ.clear() + os.environ.update(_environ) + + # restore files + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(replace_text, search_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + return res_all_tests_dict + +def parse_args(): + parser = argparse.ArgumentParser(description='Run PyTorch unit tests and generate xml results summary', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--test_config', nargs='+', default=[], type=str, help="space-separated list of test workflows to be executed eg. 'default distributed'") + parser.add_argument('--priority_tests', action='store_true', help="run priority tests only") + parser.add_argument('--default_list', nargs='+', default=[], help="space-separated list of 'default' config test suites/files to be executed eg. 
'test_weak test_dlpack'") + parser.add_argument('--distributed_list', nargs='+', default=[], help="space-separated list of 'distributed' config test suites/files to be executed eg. 'distributed/test_c10d_common distributed/test_c10d_nccl'") + parser.add_argument('--inductor_list', nargs='+', default=[], help="space-separated list of 'inductor' config test suites/files to be executed eg. 'inductor/test_torchinductor test_ops'") + parser.add_argument('--pytorch_root', default='.', type=str, help="PyTorch root directory") + parser.add_argument('--skip_rerun', action='store_true', help="skip rerun process") + parser.add_argument('--example_output', type=str, help="{'workflow_name': {\n" + " test_file_and_status(file_name='workflow_aggregate', status='STATISTICS'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='ERROR'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='FAILED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='PASSED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n" + "}}\n") + parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n" + "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n" + "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor") + return parser.parse_args() + +def check_num_gpus_for_distributed(): + p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True) + num_gpus_visible = int(p.stdout) + assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests" + +def main(): + args = parse_args() + all_tests_results = run_test_and_summarize_results(args.pytorch_root, args.priority_tests, 
args.test_config, args.default_list, args.distributed_list, args.inductor_list, args.skip_rerun) + pprint(dict(all_tests_results)) + +if __name__ == "__main__": + main() From 799f8dbd9611d30d94a9703602e0d7e1daf81483 Mon Sep 17 00:00:00 2001 From: Ethan Wee <158101733+ethanwee1@users.noreply.github.com> Date: Thu, 20 Feb 2025 14:54:08 -0800 Subject: [PATCH 05/16] [rocm6.4_internal_testing] Enable wheels (#1884) (#1907) * Use triton commit same as that used for release/2.6 branch since both are triton version 3.2.0, so assuming they're compatible. Relates to: https://github.com/ROCm/rocAutomation/pull/660/files https://github.com/ROCm/builder/pull/70/files Validation http://ml-ci-internal.amd.com:8080/job/pytorch/job/manylinux_rocm_wheels/568/ --------- Co-authored-by: Jithun Nair Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit 14c1417d4b13abb77db5fcba521791a9b6086737) (cherry picked from commit c20a8f8632a447adc3ed369a5f85ea87a2b518f0) --- .circleci/scripts/binary_populate_env.sh | 42 ------------------------ .github/scripts/build_triton_wheel.py | 5 ++- CMakeLists.txt | 6 ++-- 3 files changed, 5 insertions(+), 48 deletions(-) diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index f876ac8efcf7f..e64a690af1d6a 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -69,48 +69,6 @@ fi export PYTORCH_BUILD_NUMBER=1 -# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS -TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) -TRITON_CONSTRAINT="platform_system == 'Linux'" - -if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! 
"$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then - TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" - if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then - TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) - TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" - fi - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" -fi - -# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package -if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then - TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" - if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then - TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) - TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" - fi - if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" - else - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" - fi -fi - -# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package -if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then - TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_xpu_version.txt) - TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}" - if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then - TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt) - TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+git${TRITON_SHORTHASH}" - fi - if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then - export 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" - else - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" - fi -fi - USE_GLOO_WITH_OPENSSL="ON" if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then USE_GLOO_WITH_OPENSSL="OFF" diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 16eb6a7851974..e541e7a86f653 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -96,7 +96,7 @@ def build_triton( # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 # while release build should only include the version, i.e. 2.1.0 rocm_version = get_rocm_version() - version_suffix = f"+rocm{rocm_version}_{commit_hash[:10]}" + version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" version += version_suffix with TemporaryDirectory() as tmpdir: @@ -124,6 +124,7 @@ def build_triton( # change built wheel name and version env["TRITON_WHEEL_NAME"] = triton_pkg_name + env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix if with_clang_ldd: env["TRITON_BUILD_WITH_CLANG_LLD"] = "1" @@ -139,8 +140,6 @@ def build_triton( cwd=triton_basedir, shell=True, ) - cur_rocm_ver = get_rocm_version() - check_call(["scripts/amd/setup_rocm_libs.sh", cur_rocm_ver], cwd=triton_basedir) print("ROCm libraries setup for triton installation...") # old triton versions have setup.py in the python/ dir, diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b88247df27a5..991ea336a175b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,11 +56,11 @@ set(CMAKE_C_STANDARD # ---[ Utils include(cmake/public/utils.cmake) -# --- [ Check that minimal gcc version is 9.3+ -if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3) +# --- [ Check that minimal gcc version is 9.2+ +if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.2) message( FATAL_ERROR - "GCC-9.3 or newer is required to compile PyTorch, but found 
${CMAKE_CXX_COMPILER_VERSION}" + "GCC-9.2 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" ) endif() From 2c73b981e12b7d91d592ad32b427e7b32d69ea49 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Fri, 18 Apr 2025 18:56:37 -0700 Subject: [PATCH 06/16] ROCm: Enable tf32 testing on test_nn (#55) * Add trailing comma for consistency in gfx architecture list Signed-off-by: Jagadish Krishnamoorthy * ROCm: Enable tf32 testing on test_nn Signed-off-by: Jagadish Krishnamoorthy --------- Signed-off-by: Jagadish Krishnamoorthy (cherry picked from commit c113e1482c68c9bcf11b43351e9187003747804e) --- torch/testing/_internal/common_cuda.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 74dfe0c56c232..3f475bd6823b5 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -192,6 +192,9 @@ def tf32_off(): @contextlib.contextmanager def tf32_on(self, tf32_precision=1e-5): + if torch.version.hip: + hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None) + os.environ["HIPBLASLT_ALLOW_TF32"] = "1" old_allow_tf32_matmul = torch.backends.cuda.matmul.allow_tf32 old_precision = self.precision try: @@ -200,6 +203,11 @@ def tf32_on(self, tf32_precision=1e-5): with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=True): yield finally: + if torch.version.hip: + if hip_allow_tf32 is not None: + os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32 + else: + del os.environ["HIPBLASLT_ALLOW_TF32"] torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul self.precision = old_precision From 9237b263e12ef8bb8e4f87fa9432a3f90dd2a8ce Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 13 May 2025 13:14:36 -0500 Subject: [PATCH 07/16] [AUTOGENERATED] [rocm6.5_internal_testing] Remove --no-index and --no-deps flags (#2121) 
Cherry-pick of https://github.com/ROCm/pytorch/pull/2103 Co-authored-by: Ethan Wee (cherry picked from commit 1dea6e825b91409af44d873d025eae9d6f578e7c) --- .ci/pytorch/common_utils.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index ff9d8ad41cc92..9c9d223777466 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -67,13 +67,13 @@ function pip_install_whl() { # Loop through each path and install individually for path in "${paths[@]}"; do echo "Installing $path" - python3 -mpip install --no-index --no-deps "$path" + python3 -mpip install "$path" done else # Loop through each argument and install individually for path in "${args[@]}"; do echo "Installing $path" - python3 -mpip install --no-index --no-deps "$path" + python3 -mpip install "$path" done fi } From ee8c4e91866b3749f176e381de4475f910f26b17 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 10 Jun 2025 07:08:01 -0700 Subject: [PATCH 08/16] [rocm7.0_internal_testing] upgrading numpy (#2256) Relates to: https://github.com/ROCm/builder/pull/82 Validation: http://rocm-ci.amd.com/job/mainline-pytorch_internal-manylinux-wheels/98/ Using `registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16180_ubuntu24.04_py3.12_pytorch_lw_rocm7.0_IT_upgrade_numpy_452f3df6`: ``` root@d92befdbb2a6:/# pip list | egrep "numpy|pandas" numpy 2.1.2 pandas 2.2.3 root@d92befdbb2a6:/# python3 Python 3.12.3 (main, Feb 4 2025, 14:48:35) [GCC 13.3.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import pandas >>> import torch >>> import numpy >>> exit() root@d92befdbb2a6:/data/pytorch-micro-benchmarking# HIP_VISIBLE_DEVICES=1 python3 micro_benchmarking_pytorch.py --network resnet50 INFO: running forward and backward for warmup. INFO: running the benchmark.. OK: finished running benchmark.. 
--------------------SUMMARY-------------------------- Microbenchmark for network : resnet50 Num devices: 1 Dtype: FP32 Mini batch size [img] : 64 Time per mini-batch : 0.11369450092315674 Throughput [img/sec] : 562.9120096428937 ``` --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit cf324795568f35f91a78fae9cb8f0efc8d955351) --- .ci/docker/requirements-ci.txt | 14 ++++++-------- requirements-build.txt | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index d44dfb1ed67ae..47c9a44a775bb 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -117,10 +117,10 @@ ninja==1.11.1.4 #Pinned versions: 1.11.1.4 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py -numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x" -numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" +numba==0.60.0 ; python_version == "3.9" +numba==0.61.2 ; python_version > "3.9" #Description: Just-In-Time Compiler for Numerical Functions -#Pinned versions: 0.55.2, 0.60.0 +#Pinned versions: 0.61.2, 0.60.0 #test that import: test_numba_integration.py #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 @@ -136,12 +136,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.22.4; python_version == "3.10" -numpy==1.26.2; python_version == "3.11" or python_version == "3.12" -numpy==2.1.2; python_version >= "3.13" +numpy==2.0.2; python_version == "3.9" +numpy==2.1.2; python_version >= "3.9" -pandas==2.0.3; python_version < "3.13" -pandas==2.2.3; python_version >= "3.13" +pandas==2.2.3 #onnxruntime #Description: scoring engine for Open Neural Network 
Exchange (ONNX) models diff --git a/requirements-build.txt b/requirements-build.txt index 85923ae39cbdb..170868e57db1a 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,7 +2,7 @@ setuptools>=70.1.0 cmake>=3.27 ninja -numpy +numpy==2.1.2 packaging pyyaml requests From 9621d7f956ebf0f1302cd61a5c59d2c4d6a7407b Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Thu, 12 Jun 2025 23:32:43 -0500 Subject: [PATCH 09/16] [rocm7.0_internal_testing] Use different package versions for py3.9 (#2269) Fixes SWDEV-536456 Fixes error post-https://github.com/ROCm/pytorch/pull/2256: ``` 00:12:44.248 #22 155.3 ERROR: Ignored the following versions that require a different python version: 0.52.0 Requires-Python >=3.6,<3.9; 0.52.0rc3 Requires-Python >=3.6,<3.9; 0.61.0 Requires-Python >=3.10; 0.61.0rc1 Requires-Python >=3.10; 0.61.0rc2 Requires-Python >=3.10; 0.61.1rc1 Requires-Python >=3.10; 0.61.2 Requires-Python >=3.10; 3.3 Requires-Python >=3.10; 3.3rc0 Requires-Python >=3.10; 3.4 Requires-Python >=3.10; 3.4.1 Requires-Python >=3.10; 3.4.2 Requires-Python >=3.10; 3.4rc0 Requires-Python >=3.10; 3.5 Requires-Python >=3.11; 3.5rc0 Requires-Python >=3.11; 8.2.0 Requires-Python >=3.10; 8.2.1 Requires-Python >=3.10 00:12:44.248 #22 155.3 ERROR: Could not find a version that satisfies the requirement numba==0.61.2 (from versions: 0.1, 0.2, 0.3, 0.5.0, 0.6.0, 0.7.0, 0.7.1, 0.7.2, 0.8.0, 0.8.1, 0.9.0, 0.10.0, 0.10.1, 0.11.0, 0.12.0, 0.12.1, 0.12.2, 0.13.0, 0.13.2, 0.13.3, 0.13.4, 0.14.0, 0.15.1, 0.16.0, 0.17.0, 0.18.1, 0.18.2, 0.19.1, 0.19.2, 0.20.0, 0.21.0, 0.22.0, 0.22.1, 0.23.0, 0.23.1, 0.24.0, 0.25.0, 0.26.0, 0.27.0, 0.28.1, 0.29.0, 0.30.0, 0.30.1, 0.31.0, 0.32.0, 0.33.0, 0.34.0, 0.35.0, 0.36.1, 0.36.2, 0.37.0, 0.38.0, 0.38.1, 0.39.0, 0.40.0, 0.40.1, 0.41.0, 0.42.0, 0.42.1, 0.43.0, 0.43.1, 0.44.0, 0.44.1, 0.45.0, 0.45.1, 0.46.0, 0.47.0, 0.48.0, 0.49.0, 0.49.1rc1, 0.49.1, 0.50.0rc1, 0.50.0, 0.50.1, 0.51.0rc1, 0.51.0, 
0.51.1, 0.51.2, 0.52.0rc2, 0.53.0rc1.post1, 0.53.0rc2, 0.53.0rc3, 0.53.0, 0.53.1, 0.54.0rc2, 0.54.0rc3, 0.54.0, 0.54.1rc1, 0.54.1, 0.55.0rc1, 0.55.0, 0.55.1, 0.55.2, 0.56.0rc1, 0.56.0, 0.56.2, 0.56.3, 0.56.4, 0.57.0rc1, 0.57.0, 0.57.1rc1, 0.57.1, 0.58.0rc1, 0.58.0rc2, 0.58.0, 0.58.1, 0.59.0rc1, 0.59.0, 0.59.1, 0.60.0rc1, 0.60.0) 00:12:44.248 #22 155.3 ERROR: No matching distribution found for numba==0.61.2 ``` Validation: * Docker image: http://rocm-ci.amd.com/job/mainline-framework-pytorch-internal-cs9-ci/132 * Wheels: http://rocm-ci.amd.com/job/mainline-pytorch_internal-manylinux-wheels/102/ From `registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16180_ubuntu22.04_py3.9_pytorch_lw_rocm7.0_IT_py3.9_a11d94ad`: ``` root@f43861a0a856:/# pip list | egrep "numpy|pandas" numpy 2.0.2 pandas 2.2.3 root@f43861a0a856:/# python Python 3.9.23 (main, Jun 4 2025, 08:55:38) [GCC 11.4.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import torch >>> import numpy >>> import pandas root@f43861a0a856:/data/pytorch-micro-benchmarking# HIP_VISIBLE_DEVICES=1 python3 micro_benchmarking_pytorch.py --network resnet50 INFO: running forward and backward for warmup. INFO: running the benchmark.. OK: finished running benchmark.. 
--------------------SUMMARY-------------------------- Microbenchmark for network : resnet50 Num devices: 1 Dtype: FP32 Mini batch size [img] : 64 Time per mini-batch : 0.11354223489761353 Throughput [img/sec] : 563.6669038416574 ``` (cherry picked from commit a0a9d816537b921a3d82edf77d103fbe0fbb5fe1) --- .ci/docker/requirements-ci.txt | 10 +++++----- requirements-build.txt | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 47c9a44a775bb..0081762c320d3 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -120,7 +120,7 @@ ninja==1.11.1.4 numba==0.60.0 ; python_version == "3.9" numba==0.61.2 ; python_version > "3.9" #Description: Just-In-Time Compiler for Numerical Functions -#Pinned versions: 0.61.2, 0.60.0 +#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 @@ -136,8 +136,8 @@ numba==0.61.2 ; python_version > "3.9" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==2.0.2; python_version == "3.9" -numpy==2.1.2; python_version >= "3.9" +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" pandas==2.2.3 @@ -249,8 +249,8 @@ scikit-image==0.22.0 #Pinned versions: 0.20.3 #test that import: -scipy==1.10.1 ; python_version <= "3.11" -scipy==1.14.1 ; python_version >= "3.12" +scipy==1.13.1 ; python_version == "3.9" +scipy==1.14.1 ; python_version > "3.9" # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python #Pinned versions: 1.10.1 diff --git a/requirements-build.txt b/requirements-build.txt index 170868e57db1a..25f1e47a14968 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,7 +2,8 @@ setuptools>=70.1.0 cmake>=3.27 ninja 
-numpy==2.1.2 +numpy==2.0.2 ; python_version == "3.9"" +numpy==2.1.2 ; python_version > "3.9" packaging pyyaml requests From b387c9e429d4f463e06b6032eab0ce076c3a6c81 Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Mon, 30 Jun 2025 19:01:04 +0200 Subject: [PATCH 10/16] [rocm7.0_internal_testing] fix enabling sparse tests fp16/bf16 for rocm7.0/7.1 (#2239) Revamped version of #2108 PR to: - enable complex data types for sparse matmul on ROCm - fix sparse addmm/baddbmm on ROCm - fix sparse hipification for ROCm - fix/enable sparse tests on ROCm (~50 tests total for non-fp16/bf16): - enable fp16/bf16 sparse path for rocm7.0 - enable fp16/bf16 sparse tests for rocm7.0/7.1 ``` test_sparse_csr.py::TestSparseCSRCUDA::test_bmm_cuda_* test_sparse.py::TestSparseCUDA::test_sparse_matmul_cuda_* test_sparse_csr.py::TestSparseCSRCUDA::test_mm_cuda_float64 test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_all_sparse_csr_SparseCS* test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_sizes_all_sparse_csr_* test_sparse_csr.py::TestSparseCSRCUDA::test_sparse_addmm_cuda_float16 ``` (cherry picked from commit cc2a69c6b6c35e6024f40189ceebe530c1968810) --- .../ATen/native/sparse/cuda/SparseMatMul.cu | 20 +++++++++++++++++++ test/test_sparse.py | 6 ++++++ test/test_sparse_csr.py | 7 ++++--- torch/utils/hipify/cuda_to_hip_mappings.py | 8 ++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu index 49bea10c65104..8402555a5c340 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu @@ -40,7 +40,27 @@ #include +#if defined(__CUDACC__) && (defined(CUSPARSE_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)) +#define IS_CUSPARSE11_AVAILABLE() 1 +#else +#define IS_CUSPARSE11_AVAILABLE() 0 +#endif + +#if defined(USE_ROCM) && (ROCM_VERSION >= 70000) +#define 
HIPSPARSE_FP16_SUPPORT 1 +#else +#define HIPSPARSE_FP16_SUPPORT 0 +#endif + +#if defined(USE_ROCM) && (ROCM_VERSION >= 70100) +#define HIPSPARSE_FP16_BF16_SUPPORT 1 +#else +#define HIPSPARSE_FP16_BF16_SUPPORT 0 +#endif + +#if IS_CUSPARSE11_AVAILABLE() #include +#endif namespace at::native { diff --git a/test/test_sparse.py b/test/test_sparse.py index 5150dab4b7cf1..eb6877b419d0b 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -69,6 +69,12 @@ def _op_supports_any_sparse(op): ) or (not IS_WINDOWS and not TEST_WITH_ROCM) HIPSPARSE_SPMM_COMPLEX128_SUPPORTED = torch.version.hip and version.parse(torch.version.hip.split("-")[0]) >= version.parse("6.0") +HIPSPARSE_FP16_SUPPORTED = torch.version.hip and version.parse(torch.version.hip.split("-")[0]) >= version.parse("7.0") +HIPSPARSE_BF16_SUPPORTED = torch.version.hip and version.parse(torch.version.hip.split("-")[0]) >= version.parse("7.1") + +SPARSE_COMPLEX128_SUPPORTED = CUSPARSE_SPMM_COMPLEX128_SUPPORTED or HIPSPARSE_SPMM_COMPLEX128_SUPPORTED +SPARSE_FLOAT16_SUPPORTED = (SM53OrLater and torch.version.cuda) or (HIPSPARSE_FP16_SUPPORTED) +SPARSE_BFLOAT16_SUPPORTED = (SM80OrLater and torch.version.cuda) or (HIPSPARSE_BF16_SUPPORTED) def all_sparse_layouts(test_name='layout', include_strided=False): return parametrize(test_name, [ diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index f84adcc7bd262..e1bfd3f146991 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -25,7 +25,8 @@ all_types_and_complex, floating_and_complex_types_and) from torch.testing._internal.opinfo.definitions.linalg import sample_inputs_linalg_solve from torch.testing._internal.opinfo.definitions.sparse import validate_sample_input_sparse -from test_sparse import CUSPARSE_SPMM_COMPLEX128_SUPPORTED, HIPSPARSE_SPMM_COMPLEX128_SUPPORTED +from test_sparse import HIPSPARSE_BF16_SUPPORTED, HIPSPARSE_FP16_SUPPORTED, \ + SPARSE_FLOAT16_SUPPORTED, SPARSE_BFLOAT16_SUPPORTED, SPARSE_COMPLEX128_SUPPORTED import 
operator if TEST_SCIPY: @@ -1940,8 +1941,8 @@ def test_shape(d1, d2, d3, nnz, transposed, index_dtype): @dtypes(*floating_and_complex_types()) @dtypesIfCUDA(*floating_and_complex_types_and( - *[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [], - *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else [])) + *[torch.half] if SPARSE_FLOAT16_SUPPORTED else [], + *[torch.bfloat16] if SPARSE_BFLOAT16_SUPPORTED else [])) @precisionOverride({torch.bfloat16: 3.5e-2, torch.float16: 1e-2}) def test_sparse_addmm(self, device, dtype): def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None): diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 82547c8e28540..12e1a1209c2cd 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -8593,6 +8593,14 @@ "CUSPARSE_STATUS_ZERO_PIVOT", ("HIPSPARSE_STATUS_ZERO_PIVOT", CONV_NUMERIC_LITERAL, API_SPECIAL), ), + ( + "CUSPARSE_STATUS_NOT_SUPPORTED", + ("HIPSPARSE_STATUS_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES", + ("HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), ( "CUSPARSE_OPERATION_TRANSPOSE", ("HIPSPARSE_OPERATION_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPECIAL), From 8812312ad426d56aeae74294c06bc7fe09d41e13 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Wed, 9 Jul 2025 11:59:44 -0700 Subject: [PATCH 11/16] [rocm7.0_internal_testing] upgrade tensorboard compatible with numpy 2 (#2326) Fixes https://ontrack-internal.amd.com/browse/SWDEV-541809 Upgrading tensorboard after numpy upgrade Ran in **registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16381_ubuntu24.04_py3.12_pytorch_lw_rocm7.0_internal_testing_afe8b782** ``` 7 git checkout rocm7.0_IT_upgrade_tensorboard 8 pip install .ci/docker/requirements-ci.txt 9 pip install -r .ci/docker/requirements-ci.txt 10 PYTORCH_TEST_WITH_ROCM=1 python 
test/test_monitor.py TestMonitorTensorboard.test_event_handler root@ubb4-rack-22:/var/lib/jenkins/pytorch# PYTORCH_TEST_WITH_ROCM=1 python test/test_monitor.py TestMonitorTensorboard.test_event_handler /opt/venv/lib/python3.12/site-packages/google/protobuf/internal/well_known_types.py:91: DeprecationWarning: datetime.datetime.utcfromtimestamp() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.fromtimestamp(timestamp, datetime.UTC). _EPOCH_DATETIME_NAIVE = datetime.datetime.utcfromtimestamp(0) . ---------------------------------------------------------------------- Ran 1 test in 0.327s OK root@ubb4-rack-22:/var/lib/jenkins/pytorch# ``` (cherry picked from commit c7f61f42059a9a56956190eef5ef4e5f6f127038) --- .ci/docker/requirements-ci.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 0081762c320d3..93d32b803b199 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -309,8 +309,7 @@ z3-solver==4.15.1.0 ; platform_machine != "s390x" #Pinned versions: #test that import: -tensorboard==2.13.0 ; python_version < "3.13" -tensorboard==2.18.0 ; python_version >= "3.13" +tensorboard==2.18.0 #Description: Also included in .ci/docker/requirements-docs.txt #Pinned versions: #test that import: test_tensorboard From 504fea97dc8f70ea61573463cb68781c181f90bb Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Fri, 15 Aug 2025 13:28:53 -0700 Subject: [PATCH 12/16] [rocm7.1_internal_testing] Pin requirements (#2526) Tested locally successfully ``` root@rocm-framework-47:/var/lib/jenkins/pytorch# pip install -r requirements.txt Ignoring numpy: markers 'python_version == "3.9"' don't match your environment Requirement already satisfied: setuptools<80.0,>=70.1.0 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 2)) (79.0.1) 
Requirement already satisfied: cmake>=3.31.4 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 3)) (4.0.0) Requirement already satisfied: ninja==1.11.1.3 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 4)) (1.11.1.3) Requirement already satisfied: numpy==2.1.2 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 5)) (2.1.2) Requirement already satisfied: packaging==25.0 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 6)) (25.0) Requirement already satisfied: pyyaml==6.0.2 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 7)) (6.0.2) Requirement already satisfied: requests==2.32.4 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (2.32.4) Requirement already satisfied: six==1.17.0 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 9)) (1.17.0) Requirement already satisfied: typing-extensions==4.14.1 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 10)) (4.14.1) Requirement already satisfied: expecttest==0.3.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 8)) (0.3.0) Requirement already satisfied: filelock==3.18.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 9)) (3.18.0) Requirement already satisfied: fsspec==2025.7.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 10)) (2025.7.0) Requirement already satisfied: hypothesis==5.35.1 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 11)) (5.35.1) Requirement already satisfied: jinja2==3.1.6 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 12)) (3.1.6) 
Requirement already satisfied: lintrunner==0.12.7 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 13)) (0.12.7) Requirement already satisfied: networkx==2.8.8 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 14)) (2.8.8) Requirement already satisfied: optree==0.13.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 18)) (0.13.0) Requirement already satisfied: psutil==7.0.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 19)) (7.0.0) Requirement already satisfied: sympy==1.13.3 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 20)) (1.13.3) Requirement already satisfied: wheel==0.45.1 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 22)) (0.45.1) Requirement already satisfied: build[uv] in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 7)) (1.3.0) Requirement already satisfied: charset_normalizer<4,>=2 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (3.4.3) Requirement already satisfied: idna<4,>=2.5 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (2.5.0) Requirement already satisfied: certifi>=2017.4.17 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (2025.8.3) Requirement already satisfied: attrs>=19.2.0 in /opt/venv/lib/python3.10/site-packages (from hypothesis==5.35.1->-r requirements.txt (line 11)) (25.3.0) Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /opt/venv/lib/python3.10/site-packages (from hypothesis==5.35.1->-r requirements.txt (line 11)) (2.4.0) 
Requirement already satisfied: MarkupSafe>=2.0 in /opt/venv/lib/python3.10/site-packages (from jinja2==3.1.6->-r requirements.txt (line 12)) (3.0.2) Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/venv/lib/python3.10/site-packages (from sympy==1.13.3->-r requirements.txt (line 20)) (1.3.0) Requirement already satisfied: pyproject_hooks in /opt/venv/lib/python3.10/site-packages (from build[uv]->-r requirements.txt (line 7)) (1.2.0) Requirement already satisfied: tomli>=1.1.0 in /opt/venv/lib/python3.10/site-packages (from build[uv]->-r requirements.txt (line 7)) (2.2.1) Requirement already satisfied: uv>=0.1.18 in /opt/venv/lib/python3.10/site-packages (from build[uv]->-r requirements.txt (line 7)) (0.8.10) root@rocm-framework-47:/var/lib/jenkins/pytorch# pip install -r requirements-build.txt ``` (cherry picked from commit 6e6e45424f4ba643f9f642f69206f6d1df4e9192) --- requirements-build.txt | 20 ++++++++++---------- requirements.txt | 27 +++++++++++++++------------ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/requirements-build.txt b/requirements-build.txt index 25f1e47a14968..f2edf387fb97a 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,12 +1,12 @@ # Build System requirements -setuptools>=70.1.0 -cmake>=3.27 -ninja -numpy==2.0.2 ; python_version == "3.9"" -numpy==2.1.2 ; python_version > "3.9" -packaging -pyyaml -requests -six # dependency chain: NNPACK -> PeachPy -> six -typing-extensions>=4.10.0 pip # not technically needed, but this makes setup.py invocation work +setuptools>=70.1.0,<80.0 # setuptools develop deprecated on 80.0 +cmake>=3.31.4 +ninja==1.11.1.3 +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" +packaging==25.0 +pyyaml==6.0.2 +requests==2.32.4 +six==1.17.0 # dependency chain: NNPACK -> PeachPy -> six +typing-extensions==4.14.1 diff --git a/requirements.txt b/requirements.txt index fc4b53dfd49ea..090a733726658 100644 --- a/requirements.txt +++ b/requirements.txt @@ 
-5,15 +5,18 @@ # Install / Development extra requirements build[uv] # for building sdist and wheel -expecttest>=0.3.0 -filelock -fsspec>=0.8.5 -hypothesis -jinja2 -lintrunner ; platform_machine != "s390x" and platform_machine != "riscv64" -networkx>=2.5.1 -optree>=0.13.0 -psutil -sympy>=1.13.3 -typing-extensions>=4.13.2 -wheel +expecttest==0.3.0 +filelock==3.18.0 +fsspec==2025.7.0 +hypothesis==5.35.1 +jinja2==3.1.6 +lintrunner==0.12.7 ; platform_machine != "s390x" +networkx==2.8.8 +ninja==1.11.1.3 +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" +optree==0.13.0 +psutil==7.0.0 +sympy==1.13.3 +typing-extensions==4.14.1 +wheel==0.45.1 From 3a11b6dddb446419314fd1d0f08fcf9b2f3774f6 Mon Sep 17 00:00:00 2001 From: Xinya Zhang Date: Wed, 20 Dec 2023 16:26:17 -0600 Subject: [PATCH 13/16] Enable gesvda for ROCM >= 6.1 (#1339) This also fixes a problem in gesvd driver when UV is not needed. (cherry picked from commit 4ce57ecb3665796ff737d23cccba314c7ea12e92) (cherry picked from commit 167b4c1c24abc61968b776a30db6794065de5625) --- .../native/cuda/linalg/BatchLinearAlgebraLib.cpp | 14 +++++++------- aten/src/ATen/native/cuda/linalg/CUDASolver.cpp | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index 267d1f5acea52..5b28cc6eccf01 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -332,11 +332,11 @@ static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S // gesvd just knows how to handle m >= n, so in the other case we need to transpose A const auto not_A_H = A.size(-2) >= A.size(-1); Tensor Vcopy = V; // Shallow copy -#ifdef USE_ROCM +#ifdef ROCM_VERSION // Similar to the case in svd_magma(), experiments have shown Vh tensor is // not guaranteed to be column major on ROCM, we have to create a copy 
to // deal with this - if (!not_A_H) { + if (compute_uv && !not_A_H) { Vcopy = at::empty_like(V.mT(), V.options() .device(V.device()) @@ -351,8 +351,8 @@ static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S infos, full_matrices, compute_uv, calculate_all_batches, batches); }); -#ifdef USE_ROCM - if (!not_A_H) { +#ifdef ROCM_VERSION + if (compute_uv && !not_A_H) { V.copy_(Vcopy); } #endif @@ -526,8 +526,8 @@ static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const T template <typename scalar_t> static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) { -#ifndef CUDART_VERSION - TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend.") +#if !(defined(CUDART_VERSION) || defined(ROCM_VERSION) && ROCM_VERSION >= 60100) + TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend or ROCM >= 6.1.0.") #else using value_t = typename c10::scalar_value_type<scalar_t>::type; int m = cuda_int_cast(A.size(-2), "m"); @@ -665,7 +665,7 @@ void svd_cusolver(const Tensor& A, static constexpr const char* check_svd_doc = "Check doc at https://pytorch.org/docs/stable/generated/torch.linalg.svd.html"; // The default heuristic is to use gesvdj driver -#ifdef USE_ROCM +#if defined(ROCM_VERSION) && ROCM_VERSION < 60100 const auto driver_v = std::string_view("gesvdj"); #else const auto driver_v = driver.value_or("gesvdj"); diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp index 99c38077611d6..af183038bb8e4 100644 --- a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp +++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp @@ -470,8 +470,8 @@ void gesvdjBatched>( } -// ROCM does not implement gesdva yet -#ifdef CUDART_VERSION +// ROCM does not implement gesvda correctly before 6.1 +#if defined(CUDART_VERSION) || defined(ROCM_VERSION) && ROCM_VERSION >= 
60100 template<> void gesvdaStridedBatched_buffersize( cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, float *A, int lda, long long int strideA, From 2a0b33a356d74bd459ac812343316338bb709e3e Mon Sep 17 00:00:00 2001 From: Pruthvi Madugundu Date: Tue, 12 Mar 2024 09:56:58 -0700 Subject: [PATCH 14/16] Remove ROCmloops specific test (cherry picked from commit d6879fa6b482188ccfc54a8ea8be272a35fc7cdf) (cherry picked from commit 123a16486958b171b538126ef4fe0045464a505e) --- aten/src/ATen/test/cuda_vectorized_test.cu | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index e4c18102526ac..1b3ed4dc4ac42 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -32,23 +32,6 @@ void reset_buffers() { } } -#if defined(USE_ROCM) && !defined(_WIN32) -TEST(TestLoops, HasSameArgTypes) { - // This is a compile-time unit test. If this file compiles without error, - // then the test passes and during runtime, we just need to return. 
- using namespace at::native::modern::detail; - using func1_t = int (*)(float, float); - using func2_t = int (*)(bool, float, float); - using func3_t = int (*)(float); - using func4_t = int (*)(); - static_assert(has_same_arg_types::value, "func1_t has the same argument types"); - static_assert(!has_same_arg_types::value, "func2_t does not have the same argument types"); - static_assert(has_same_arg_types::value, "func3_t has the same argument types"); - static_assert(has_same_arg_types::value, "func4_t has the same argument types"); - return; -} -#endif - TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) { char *ptr = reinterpret_cast(buffer1); From 462187f60c9bc525eab466068cb8b2a559935220 Mon Sep 17 00:00:00 2001 From: Prachi Gupta Date: Wed, 29 Oct 2025 17:41:21 +0000 Subject: [PATCH 15/16] Bump triton to 3.5.x and update related_commits --- .ci/docker/ci_commit_pins/triton.txt | 2 +- related_commits | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 related_commits diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 10f1207e60e6c..d893bdd32ab34 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd +ac80c4190aa0321f761a08af97e1e1eee41f01d9 diff --git a/related_commits b/related_commits new file mode 100644 index 0000000000000..ee36e55601d0f --- /dev/null +++ b/related_commits @@ -0,0 +1,10 @@ +ubuntu|pytorch|apex|master|2190fbaeb88384ed792373adbb83c182af117ca0|https://github.com/ROCm/apex +centos|pytorch|apex|master|2190fbaeb88384ed792373adbb83c182af117ca0|https://github.com/ROCm/apex +ubuntu|pytorch|torchvision|main|218d2ab791d437309f91e0486eb9fa7f00badc17|https://github.com/pytorch/vision +centos|pytorch|torchvision|main|218d2ab791d437309f91e0486eb9fa7f00badc17|https://github.com/pytorch/vision 
+ubuntu|pytorch|torchdata|main|92950795e0790eb74df995daf40b658e85fd2c9f|https://github.com/pytorch/data +centos|pytorch|torchdata|main|92950795e0790eb74df995daf40b658e85fd2c9f|https://github.com/pytorch/data +ubuntu|pytorch|torchaudio|main|3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2|https://github.com/pytorch/audio +centos|pytorch|torchaudio|main|3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2|https://github.com/pytorch/audio +ubuntu|pytorch|ao|main|3577306c8b32517afe8eb6eb7e84335601180598|https://github.com/pytorch/ao +centos|pytorch|ao|main|3577306c8b32517afe8eb6eb7e84335601180598|https://github.com/pytorch/ao From ba4531d2560231e22e80f0d0cae1ec7d555d7ea1 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Tue, 1 Apr 2025 09:19:40 -0700 Subject: [PATCH 16/16] Revert to prev sccache by ROCm Signed-off-by: Jagadish Krishnamoorthy (cherry picked from commit 1ad5bb95d796283d5f56ac1edd16f1731d24a49d) (cherry picked from commit 519160d466782f5a62365be051fcb3ef90fa0b00) --- .ci/docker/common/install_cache.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh index f38cb3d06d88b..80839990e4e6f 100644 --- a/.ci/docker/common/install_cache.sh +++ b/.ci/docker/common/install_cache.sh @@ -36,7 +36,12 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment export PATH="/opt/cache/bin:$PATH" # Setup compiler cache -install_ubuntu +if [ -n "$ROCM_VERSION" ]; then + curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache +else + install_ubuntu +fi + chmod a+x /opt/cache/bin/sccache function write_sccache_stub() {