From 8734655ed3b8d7ab0990936d775e3be291933218 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Tue, 22 Apr 2025 12:13:05 +0000 Subject: [PATCH 01/83] [release/2.8] Enable wheels (cherry picked from commit e294d4d6fb7f0552e430fd38b2acf864c8e051f2 with modifications for release/2.8) Reintroduce CIRCLE_TAG to be able to set PYTORCH_BUILD_VERSION without date --- .circleci/scripts/binary_populate_env.sh | 7 ++++- .github/scripts/build_triton_wheel.py | 33 +++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 7f89c5c2dd8e6..b2df131ec33cb 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -5,7 +5,9 @@ export TZ=UTC tagged_version() { GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*" - if [[ ! -d "${GIT_DIR}" ]]; then + if [[ -n "${CIRCLE_TAG:-}" ]]; then + echo "${CIRCLE_TAG}" + elif [[ ! -d "${GIT_DIR}" ]]; then echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -69,6 +71,8 @@ fi export PYTORCH_BUILD_NUMBER=1 +# This part is done in the builder scripts so commenting the duplicate code +: <<'BLOCK_COMMENT' # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) @@ -117,6 +121,7 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" fi fi +BLOCK_COMMENT USE_GLOO_WITH_OPENSSL="ON" if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index beec9f96aba21..1302570432046 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os +import re import shutil import sys from pathlib import Path @@ -50,6 +51,30 @@ def patch_init_py( with open(path, "w") as f: f.write(orig) +def get_rocm_version() -> str: + rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm" + rocm_version = "0.0.0" + rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" + if not os.path.isfile(rocm_version_h): + rocm_version_h = f"{rocm_path}/include/rocm_version.h" + # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install. + if os.path.isfile(rocm_version_h): + RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") + RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)") + RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)") + major, minor, patch = 0, 0, 0 + for line in open(rocm_version_h): + match = RE_MAJOR.search(line) + if match: + major = int(match.group(1)) + match = RE_MINOR.search(line) + if match: + minor = int(match.group(1)) + match = RE_PATCH.search(line) + if match: + patch = int(match.group(1)) + rocm_version = str(major)+"."+str(minor)+"."+str(patch) + return rocm_version def build_triton( *, @@ -64,7 +89,12 @@ def build_triton( if "MAX_JOBS" not in env: max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) - + if not release: + # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 + # while release build should only include the version, i.e. 
2.1.0 + rocm_version = get_rocm_version() + version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" + version += version_suffix with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" @@ -88,6 +118,7 @@ def build_triton( # change built wheel name and version env["TRITON_WHEEL_NAME"] = triton_pkg_name + env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix if with_clang_ldd: env["TRITON_BUILD_WITH_CLANG_LLD"] = "1" From dc95b0c29fab05322806bf322b8fd2f442af2e42 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Tue, 15 Jul 2025 23:13:35 +0000 Subject: [PATCH 02/83] Updates to build for Noble (Ubuntu 24.04) and py3.12 (Changes selected from ef226be26bcf41291ad05ff6a51688ae7cc0bb37 and https://github.com/pytorch/pytorch/commit/fadc936fad0793e931ed2eb89577e1d10d212f71) --- .ci/docker/build.sh | 2 ++ .ci/docker/common/common_utils.sh | 4 ++++ .ci/docker/common/install_base.sh | 3 +++ .ci/docker/common/install_conda.sh | 4 ++++ .ci/docker/common/install_rocm.sh | 8 +++++--- .ci/docker/requirements-ci.txt | 3 +++ 6 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 6b978b8f4b552..6624d9928cbe0 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -52,6 +52,8 @@ fi if [[ "$image" == *-jammy* ]]; then UBUNTU_VERSION=22.04 +elif [[ "$image" == *-noble* ]]; then + UBUNTU_VERSION=24.04 elif [[ "$image" == *ubuntu* ]]; then extract_version_from_image_name ubuntu UBUNTU_VERSION fi diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh index 27c1b815a0ea8..110065698b587 100644 --- a/.ci/docker/common/common_utils.sh +++ b/.ci/docker/common/common_utils.sh @@ -23,6 +23,10 @@ conda_install() { as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $* } +conda_install_through_forge() { + as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $* +} + conda_run() { as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $* } diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index 64304fec6ed9d..7d8ae247d7a0b 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -15,6 +15,9 @@ install_ubuntu() { elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then cmake3="cmake=3.22*" maybe_libiomp_dev="" + elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then + cmake3="cmake=3.28*" + maybe_libiomp_dev="" else cmake3="cmake=3.5*" maybe_libiomp_dev="libiomp-dev" diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 11c51cac0bf83..b33f7f0a1e9d3 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' 
<<< ${CUDA_VERSION}) fi + if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then + conda_install_through_forge libstdcxx-ng=14 + fi + # Install some other packages, including those needed for Python test reporting pip_install -r /opt/conda/requirements-ci.txt diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 2a8d5b30e74e3..fe2f35838fd9c 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -8,9 +8,11 @@ ver() { install_ubuntu() { apt-get update - if [[ $UBUNTU_VERSION == 20.04 ]]; then - # gpg-agent is not available by default on 20.04 - apt-get install -y --no-install-recommends gpg-agent + # gpg-agent is not available by default + apt-get install -y --no-install-recommends gpg-agent + if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then + echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ + | sudo tee /etc/apt/preferences.d/rocm-pin-600 fi apt-get install -y kmod apt-get install -y wget diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 4ecdde62408de..9f9360fc53270 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -16,6 +16,7 @@ click #test that import: coremltools==5.0b5 ; python_version < "3.12" +coremltools==8.3 ; python_version == "3.12" #Description: Apple framework for ML integration #Pinned versions: 5.0b5 #test that import: @@ -63,6 +64,7 @@ lark==0.12.0 #test that import: librosa>=0.6.2 ; python_version < "3.11" +librosa==0.10.2 ; python_version == "3.12" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 #test that import: test_spectral_ops.py @@ -111,6 +113,7 @@ ninja==1.11.1.3 numba==0.49.0 ; python_version < "3.9" numba==0.55.2 ; python_version == "3.9" numba==0.55.2 ; python_version == "3.10" +numba==0.60.0 ; python_version == "3.12" #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py From b741af3b5be771c131352cc063fe495aa654c3d4 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 20 May 2025 02:00:27 -0500 Subject: [PATCH 03/83] [release/2.8] Make triton build ROCm version agnostic Cherry-pick of https://github.com/ROCm/pytorch/pull/2130 Validation: http://rocm-ci.amd.com/job/rocm-pytorch-manylinux-wheel-builder-lw/155/ --------- Co-authored-by: Ethan Wee Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit 2c220b2ee62d4a9391ff5e441bee46477f65849e) --- .github/scripts/amd/package_triton_wheel.sh | 31 +++++---------------- .github/scripts/amd/patch_triton_wheel.sh | 16 +++++------ 2 files changed, 14 insertions(+), 33 deletions(-) diff --git a/.github/scripts/amd/package_triton_wheel.sh b/.github/scripts/amd/package_triton_wheel.sh index 6ecf8bab116b9..fe8d915422dac 100755 --- a/.github/scripts/amd/package_triton_wheel.sh +++ b/.github/scripts/amd/package_triton_wheel.sh @@ -1,3 +1,4 @@ +#!/bin/bash set -ex # Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm @@ -50,29 +51,15 @@ do cp $lib $TRITON_ROCM_DIR/lib/ done -# Required ROCm libraries -if [[ "${MAJOR_VERSION}" == "6" ]]; then - libamdhip="libamdhip64.so.6" -else - libamdhip="libamdhip64.so.5" -fi - # Required ROCm libraries - ROCm 6.0 ROCM_SO=( - "${libamdhip}" - "libhsa-runtime64.so.1" - "libdrm.so.2" - "libdrm_amdgpu.so.1" + "libamdhip64.so" + "libhsa-runtime64.so" + "libdrm.so" + "libdrm_amdgpu.so" + "libamd_comgr.so" + "librocprofiler-register.so" ) -if [[ 
$ROCM_INT -ge 60400 ]]; then - ROCM_SO+=("libamd_comgr.so.3") -else - ROCM_SO+=("libamd_comgr.so.2") -fi - -if [[ $ROCM_INT -ge 60100 ]]; then - ROCM_SO+=("librocprofiler-register.so.0") -fi for lib in "${ROCM_SO[@]}" do @@ -94,10 +81,6 @@ do fi cp $file_path $TRITON_ROCM_DIR/lib - # When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions - LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g') - ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME - done # Copy Include Files diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh index 3669134631546..fb3c0f36ddb47 100755 --- a/.github/scripts/amd/patch_triton_wheel.sh +++ b/.github/scripts/amd/patch_triton_wheel.sh @@ -19,15 +19,13 @@ replace_needed_sofiles() { find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do origname=$2 patchedname=$3 - if [[ "$origname" != "$patchedname" ]]; then - set +e - origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") - ERRCODE=$? - set -e - if [ "$ERRCODE" -eq "0" ]; then - echo "patching $sofile entry $origname to $patchedname" - $PATCHELF_BIN --replace-needed $origname $patchedname $sofile - fi + set +e + origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") + ERRCODE=$? + set -e + if [ "$ERRCODE" -eq "0" ]; then + echo "patching $sofile entry $origname to $patchedname" + $PATCHELF_BIN --replace-needed $origname $patchedname $sofile fi done } From b4c293a73d6d7d340dbcafb9c31d60f55313e2be Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 03:33:26 +0000 Subject: [PATCH 04/83] [release/2.8] Replace upstream install_rocm_magma.sh with rocm fork version since we need to rebuild magma for all supported architectures, we cannot use upstream magma tarball anyway --- .ci/docker/common/install_rocm_magma.sh | 82 +++++++++++++++++-------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 364ee23b97e57..db826ed6e0278 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -1,32 +1,60 @@ -#!/usr/bin/env bash -# Script used only in CD pipeline +#!/bin/bash +# Script used in CI and CD pipeline -set -eou pipefail +set -ex -function do_install() { - rocm_version=$1 - rocm_version_nodot=${1//./} +ver() { + printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); +} - # Version 2.7.2 + ROCm related updates - MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 - magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" +# Magma build scripts need `python` +ln -sf /usr/bin/python3 /usr/bin/python - rocm_dir="/opt/rocm" - ( - set -x - tmp_dir=$(mktemp -d) - pushd ${tmp_dir} - curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} - if tar -xvf "${magma_archive}" - then - mkdir -p "${rocm_dir}/magma" - mv include "${rocm_dir}/magma/include" - mv lib "${rocm_dir}/magma/lib" - else - echo "${magma_archive} not found, skipping magma install" - fi - popd - ) -} +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +case "$ID" in + almalinux) + yum install -y gcc-gfortran + ;; + *) + echo "No preinstalls to build magma..." 
+ ;; +esac + +MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} + +# "install" hipMAGMA into /opt/rocm/magma by copying after build +if [[ $(ver $ROCM_VERSION) -ge $(ver 7.0) ]]; then + git clone https://github.com/ROCm/utk-magma.git -b release/2.9.0_rocm70 magma + pushd magma + # version 2.9 + ROCm 7.0 related updates + git checkout 91c4f720a17e842b364e9de41edeef76995eb9ad +else + git clone https://bitbucket.org/icl/magma.git + pushd magma + # Version 2.7.2 + ROCm related updates + git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6 +fi -do_install $1 +cp make.inc-examples/make.inc.hip-gcc-mkl make.inc +echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc +if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then + echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc +fi +echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc +echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc +export PATH="${PATH}:/opt/rocm/bin" +if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then + amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'` +else + amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` +fi +for arch in $amdgpu_targets; do + echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc +done +# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition +sed -i 's/^FOPENMP/#FOPENMP/g' make.inc +make -f make.gen.hipMAGMA -j $(nproc) +LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" +make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" +popd +mv magma /opt/rocm From 9ed3d2e978a03b2c28109d6485db39d919fc2385 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 03:43:38 +0000 Subject: [PATCH 05/83] [release/2.8] Upgrade numpy versions; Use different package versions for py3.9; upgrade tensorboard compatible with numpy 2 Co-authored-by: Ethan Wee (cherry picked from commit e867a3de4b0196621e8e53d5338a8bb8bb62e828) (cherry picked from commit c7a1e32fbcf9e0a458d959a453de65c27c51452c) (cherry picked from commit 2a215e4a2115c999e4bb058956d888aed67787d1) (cherry picked from commit 866cc1dbb9c93f807af1ef59801c645062cbb95e) --- .ci/docker/requirements-ci.txt | 21 ++++++++------------- requirements.txt | 3 ++- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 9f9360fc53270..72811c384900c 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -110,10 +110,8 @@ ninja==1.11.1.3 #Pinned versions: 1.11.1.3 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py -numba==0.49.0 ; python_version < "3.9" -numba==0.55.2 ; python_version == "3.9" -numba==0.55.2 ; python_version == "3.10" -numba==0.60.0 ; python_version == "3.12" +numba==0.60.0 ; python_version == "3.9" +numba==0.61.2 ; python_version > "3.9" #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py @@ -131,12 +129,10 @@ numba==0.60.0 ; python_version == "3.12" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.22.4; python_version == "3.9" or python_version == "3.10" -numpy==1.26.2; 
python_version == "3.11" or python_version == "3.12" -numpy==2.1.2; python_version >= "3.13" +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" -pandas==2.0.3; python_version < "3.13" -pandas==2.2.3; python_version >= "3.13" +pandas==2.2.3 #onnxruntime #Description: scoring engine for Open Neural Network Exchange (ONNX) models @@ -247,8 +243,8 @@ scikit-image==0.22.0 ; python_version >= "3.10" #Pinned versions: 0.20.3 #test that import: -scipy==1.10.1 ; python_version <= "3.11" -scipy==1.14.1 ; python_version >= "3.12" +scipy==1.13.1 ; python_version == "3.9" +scipy==1.14.1 ; python_version > "3.9" # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python #Pinned versions: 1.10.1 @@ -312,8 +308,7 @@ z3-solver==4.12.6.0 #Pinned versions: #test that import: -tensorboard==2.13.0 ; python_version < "3.13" -tensorboard==2.18.0 ; python_version >= "3.13" +tensorboard==2.18.0 #Description: Also included in .ci/docker/requirements-docs.txt #Pinned versions: #test that import: test_tensorboard diff --git a/requirements.txt b/requirements.txt index 18f7810de9512..f65837a0097e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,8 @@ jinja2 lintrunner ; platform_machine != "s390x" networkx ninja -numpy +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" optree>=0.13.0 packaging psutil From 12508fdaa70eba2f786c2438465ea590d8e83749 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Wed, 18 Jun 2025 22:32:20 -0700 Subject: [PATCH 06/83] [release/2.8] Removing --user flag from all pip install commands From upstream PR https://github.com/pytorch/pytorch/pull/154900 Resolves:https://ontrack-internal.amd.com/browse/SWDEV-536994 After following steps to reproduce in container **registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16231_ubuntu22.04_py3.10_pytorch_lw_release2.7_no_user_66a18277**: ``` root@ubb4-rack-22:/var/lib/jenkins/pytorch# history 1 cd /var/lib/jenkins/pytorch 2 TEST_CONFIG=default CONTINUE_THROUGH_ERROR=True .ci/pytorch/test.sh Name: gfx90a Marketing Name: AMD Instinct MI250X/MI250 + MAYBE_ROCM=rocm/ + [[ rocm == *xpu* ]] + [[ rocm != *-bazel-* ]] + pip_install ninja==1.10.2 + pip_install_pkg='python3 -m pip install --progress-bar off' + python3 -m pip install --progress-bar off ninja==1.10.2 Collecting ninja==1.10.2 Downloading ninja-1.10.2-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (5.0 kB) Downloading ninja-1.10.2-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (108 kB) Installing collected packages: ninja Attempting uninstall: ninja Found existing installation: ninja 1.11.1.4 Uninstalling ninja-1.11.1.4: Successfully uninstalled ninja-1.11.1.4 Successfully installed ninja-1.10.2 + export PATH=/root/.local/bin:/opt/venv/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + PATH=/root/.local/bin:/opt/venv/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + [[ rocm == *aarch64* ]] + [[ rocm == *asan* ]] + [[ rocm == *-debug* ]] + [[ rocm != *-bazel-* ]] + echo 'We are not in debug mode: rocm. 
Expect the assertion to pas ``` http://rocm-ci.amd.com/job/mainline-pytorch2.7-manylinux-wheels/126/ --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit 0bd4030892d02407ccb9f844727326579764c6b4) --- .ci/caffe2/test.sh | 6 +++--- .ci/onnx/test.sh | 2 +- .ci/pytorch/common_utils.sh | 18 +++++++++--------- .ci/pytorch/test.sh | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.ci/caffe2/test.sh b/.ci/caffe2/test.sh index eaef1e3ebf88a..7d1ce2fb4fa10 100755 --- a/.ci/caffe2/test.sh +++ b/.ci/caffe2/test.sh @@ -5,7 +5,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" if [[ ${BUILD_ENVIRONMENT} == *onnx* ]]; then pip install click mock tabulate networkx==2.0 - pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx" + pip -q install "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx" fi # Skip tests in environments where they are not built/applicable @@ -147,8 +147,8 @@ export DNNL_MAX_CPU_ISA=AVX2 if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then # TODO(sdym@meta.com) remove this when the linked issue resolved. # py is temporary until https://github.com/Teemu/pytest-sugar/issues/241 is fixed - pip install --user py==1.11.0 - pip install --user pytest-sugar + pip install py==1.11.0 + pip install pytest-sugar # NB: Warnings are disabled because they make it harder to see what # the actual erroring test is "$PYTHON" \ diff --git a/.ci/onnx/test.sh b/.ci/onnx/test.sh index a7d3b72c62a7e..d42ca2c218dec 100755 --- a/.ci/onnx/test.sh +++ b/.ci/onnx/test.sh @@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # TODO: This can be removed later once vision is also part of the Docker image - pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" + pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" # JIT C++ extensions require ninja, so put it into PATH. 
export PATH="/var/lib/jenkins/.local/bin:$PATH" # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 8b05766ef4002..092d88d6387fb 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -127,9 +127,9 @@ function install_torchaudio() { if [[ "$1" == "cuda" ]]; then # TODO: This is better to be passed as a parameter from _linux-test workflow # so that it can be consistent with what is set in build - TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}" + TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" else - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" fi } @@ -139,8 +139,8 @@ function install_torchtext() { local text_commit data_commit=$(get_pinned_commit data) text_commit=$(get_pinned_commit text) - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/data.git@${data_commit}" - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/text.git@${text_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/data.git@${data_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/text.git@${text_commit}" } function install_torchvision() { @@ -153,7 +153,7 @@ function install_torchvision() { echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c - LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so fi - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/vision.git@${commit}" if [ -n "${LD_PRELOAD}" ]; then LD_PRELOAD=${orig_preload} fi @@ -173,7 +173,7 @@ function install_torchrec_and_fbgemm() { if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then # install torchrec first because it installs fbgemm nightly on top of rocm fbgemm - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" pip_uninstall fbgemm-gpu-nightly pip_install tabulate # needed for newer fbgemm @@ -190,8 +190,8 @@ function install_torchrec_and_fbgemm() { rm -rf fbgemm else # See https://github.com/pytorch/pytorch/issues/106971 - CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu" - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" + CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu" + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" fi } @@ -234,7 +234,7 @@ function checkout_install_torchbench() { function install_torchao() { local commit commit=$(get_pinned_commit torchao) - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/ao.git@${commit}" } function print_sccache_stats() { diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 425cc2a80dc73..52fb572b81c46 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh 
@@ -201,7 +201,7 @@ fi if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then # JIT C++ extensions require ninja. - pip_install --user "ninja==1.10.2" + pip_install "ninja==1.10.2" # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins # but this script should be runnable by any user, including root export PATH="$HOME/.local/bin:$PATH" @@ -502,7 +502,7 @@ DYNAMO_BENCHMARK_FLAGS=() pr_time_benchmarks() { - pip_install --user "fbscribelogger" + pip_install "fbscribelogger" TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" @@ -1469,8 +1469,8 @@ test_bazel() { test_benchmarks() { if [[ "$BUILD_ENVIRONMENT" == *cuda* && $TEST_CONFIG != *nogpu* ]]; then - pip_install --user "pytest-benchmark==3.2.3" - pip_install --user "requests" + pip_install "pytest-benchmark==3.2.3" + pip_install "requests" BENCHMARK_DATA="benchmarks/.data" mkdir -p ${BENCHMARK_DATA} pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default From 90d7d4b7726ccff40ffde92138e1660efe22d0aa Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 04:21:26 +0000 Subject: [PATCH 07/83] [ROCm] Remove use of warpsize on host-side compilation (pytorch#156979) (cherry picked from upstream commit 04bd7e6850e8efec77994963ffee87549555b9c3) --- aten/src/ATen/native/cuda/Embedding.cu | 2 +- .../src/ATen/native/cuda/MultinomialKernel.cu | 2 +- aten/src/ATen/native/cuda/SoftMax.cu | 19 ++++++++++--------- aten/src/ATen/native/cuda/TensorModeKernel.cu | 2 +- aten/src/ATen/native/cuda/TensorTopK.cu | 4 ++++ aten/src/ATen/native/cuda/block_reduce.cuh | 12 +++++++++++- c10/macros/Macros.h | 16 +++++++++++++++- .../c10d/symm_mem/CUDASymmetricMemory.cu | 6 +++--- .../c10d/symm_mem/CUDASymmetricMemoryOps.cu | 2 +- 9 files changed, 47 insertions(+), 18 deletions(-) diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 5d19b95b32f9b..4b1e420d5da98 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -369,7 +369,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 && - num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads, + num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(), "BlockReduceSum requires all warps be active"); const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr(); dim3 grid = unique_indices.numel(); diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index 65770e40a8b2b..8132e7df57b51 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -86,7 +86,7 @@ void renormRows(Tensor& t) { TORCH_CHECK(props != nullptr); int numSM = props->multiProcessorCount; const int64_t maxThreads = std::min( - props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads); + props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads()); int warp_size = at::cuda::warp_size(); dim3 grid(rows < numSM * 4 ? 
rows : numSM * 4); diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 5157d37f68b34..f27d76256cdb3 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -183,15 +183,16 @@ inline dim3 SoftMaxForward_getBlockSize(uint64_t dim_size) { uint64_t block_size = 1; uint64_t max_block_size = std::min(dim_size, static_cast(max_threads)); - // We need a block size that is a multiple of C10_WARP_SIZE in order + // We need a block size that is a multiple of at::cuda::warp_size() in order // to perform block size reductions using warp shuffle instructions. - // Since max_threads is also a multiple of C10_WARPS_SIZE we do not + // Since max_threads is also a multiple of at::cuda::warp_size() we do not // risk creating a block size larger than the limit. - if (max_block_size % C10_WARP_SIZE == 0) { + int warp_size = at::cuda::warp_size(); + if (max_block_size % warp_size == 0) { block_size = max_block_size; } else { - block_size = (max_block_size / C10_WARP_SIZE + 1) * C10_WARP_SIZE; + block_size = (max_block_size / warp_size + 1) * warp_size; } return dim3(block_size); @@ -1107,7 +1108,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t constexpr int ILP = sizeof(float4) / sizeof(scalar_t); if constexpr (use_fast_softmax) { dim3 block(512); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); if (dim_size % ILP == 0) { cunn_SoftMaxForwardGmem <<>>(output_ptr, input_ptr, dim_size); @@ -1117,7 +1118,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } } else { dim3 block = SoftMaxForward_getBlockSize(dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); @@ -1198,7 +1199,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t constexpr int ILP = sizeof(float4) / sizeof(scalar_t); if constexpr (use_fast_softmax) { dim3 block(512); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); if (dim_size % ILP == 0) { cunn_SoftMaxForwardGmem <<>>(output_ptr, input_ptr, dim_size); @@ -1208,7 +1209,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } } else { dim3 block = SoftMaxForward_getBlockSize(dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); @@ -1274,7 +1275,7 @@ void dispatch_host_softmax_backward(int64_t dim_size, dim3 grid, Tensor &grad, T constexpr int ILP = sizeof(float4) / sizeof(output_t); dim3 block = SoftMax_getBlockSize(ILP, dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(output_t); bool can_use_smem = static_cast(dim_size) < max_elements_per_smem; diff 
--git a/aten/src/ATen/native/cuda/TensorModeKernel.cu b/aten/src/ATen/native/cuda/TensorModeKernel.cu index 4764b078c050b..0c97ab742103f 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cu +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cu @@ -207,7 +207,7 @@ void handle_fused_mode( constexpr int num_threads = size / 2; int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads % warp_size == 0 && - num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, ""); + num_threads <= cuda_utils::kCUDABlockReduceMaxThreads(), ""); const auto memsize = (sizeof(scalar_t) * size) + (2 * size * sizeof(unsigned int)); compute_mode diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index 103b360bcb868..49086c42cd4a2 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -439,8 +439,12 @@ __global__ void computeBlockwiseWithinKCounts( warp_counts[warp] = count; } __syncthreads(); +#ifdef USE_ROCM + CUDA_KERNEL_ASSERT(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE); +#else static_assert(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE, "Assuming only 1 warp is needed for final reduction"); +#endif if (warp != 0) { return; } diff --git a/aten/src/ATen/native/cuda/block_reduce.cuh b/aten/src/ATen/native/cuda/block_reduce.cuh index 2a272d22c0c60..1818987c6a588 100644 --- a/aten/src/ATen/native/cuda/block_reduce.cuh +++ b/aten/src/ATen/native/cuda/block_reduce.cuh @@ -12,7 +12,17 @@ constexpr int kCUDABlockReduceNumThreads = 512; // of which reduces C10_WARP_SIZE elements. So, at most // C10_WARP_SIZE**2 elements can be reduced at a time. // NOTE: This is >= the max block size on current hardware anyway (1024). -constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; +// ROCm NOTE: C10_WARP_SIZE should only be used inside device functions, +// and kCUDABlockReduceMaxThreads is a host-side variable. +#ifdef USE_ROCM +static int kCUDABlockReduceMaxThreads() { + return at::cuda::warp_size() * at::cuda::warp_size(); +} +#else +constexpr int kCUDABlockReduceMaxThreads() { + return C10_WARP_SIZE * C10_WARP_SIZE; +} +#endif // Sums `val` across all threads in a warp. // diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 7d8238f910464..6b51a39f2a943 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -312,7 +312,21 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #endif #if defined(USE_ROCM) -#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) +// C10_WARP_SIZE is only allowed for device code. +// Host code _must_ use at::cuda::warp_size() +// HIP header used to define warpSize as a constexpr that was either 32 or 64 +// depending on the target device, and then always set it to 64 for host code. +// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we +// set it to something unreasonable to trigger obvious host code errors. 
+#if defined(__HIP_DEVICE_COMPILE__) +#if defined(__GFX9__) +static constexpr int C10_WARP_SIZE = 64; +#else // __GFX9__ +static constexpr int C10_WARP_SIZE = 32; +#endif // __GFX9__ +#else +static constexpr int C10_WARP_SIZE = 1; +#endif // __HIP_DEVICE_COMPILE__ #else #define C10_WARP_SIZE 32 #endif diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu index 4cc29b0c347fc..20ccf0d74b601 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu @@ -255,7 +255,7 @@ static __global__ void barrier_kernel( void CUDASymmetricMemory::barrier(int channel, size_t timeout_ms) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - barrier_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + barrier_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), channel, rank_, @@ -293,7 +293,7 @@ void CUDASymmetricMemory::put_signal( size_t timeout_ms) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - put_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + put_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), dst_rank, channel, @@ -337,7 +337,7 @@ void CUDASymmetricMemory::wait_signal( size_t timeout_ms) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - wait_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + wait_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), src_rank, channel, diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu index d7652b77ebab4..a2d5f8f9f67ba 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu @@ -114,7 +114,7 @@ void init_elementwise_launch_config( num_blocks = 1; num_threads = at::round_up( at::ceil_div(numel_per_split, numel_per_thread), - static_cast(C10_WARP_SIZE)); + static_cast(at::cuda::warp_size())); } else { num_blocks = std::min( at::ceil_div(numel_per_split, max_num_threads * numel_per_thread), From 186180d9aa1dd92919be12b7a1a8b4d9db5a8479 Mon Sep 17 00:00:00 2001 From: Xinya Zhang Date: Tue, 15 Jul 2025 23:12:21 -0500 Subject: [PATCH 08/83] [release/2.8] Improve C10_WARP_SIZE compatibility Cherry-pick of https://github.com/ROCm/pytorch/pull/2328 Co-authored-by: Xinya Zhang Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit fe3d37a958142978d027dc921d66b46ecc6ffdad) --- .../src/ATen/native/cuda/layer_norm_kernel.cu | 5 ++++ .../sparse/cuda/SparseCUDAApplyUtils.cuh | 4 +++ c10/macros/Macros.h | 30 ++++++++++++++----- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index bdb169e26b142..0709164e60555 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -33,7 +33,12 @@ namespace at::native { namespace { constexpr int kCUDANumThreads = 256; +#ifdef USE_ROCM +// C10_WARP_SIZE is not constexpr for host code. 
+#define kWarpSize C10_WARP_SIZE +#else constexpr unsigned int kWarpSize = C10_WARP_SIZE; +#endif constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh) diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index c9412d74e9cda..693ca536a3198 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -242,7 +242,11 @@ __global__ void coalesceValuesKernel( // `if constexpr` when CUDA codes will be compiled under C++-17, see // gh-56055 for blockers. template +#ifdef USE_ROCM +C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE_STATIC*4) +#else C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) +#endif __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, bool *values, bool *newValues, diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 6b51a39f2a943..77ca999090d93 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -318,16 +318,32 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; // depending on the target device, and then always set it to 64 for host code. // Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we // set it to something unreasonable to trigger obvious host code errors. -#if defined(__HIP_DEVICE_COMPILE__) + +namespace at::cuda { +TORCH_CUDA_CPP_API int warp_size(); +} +#ifdef __HIPCC__ +static inline int __host__ C10_WARP_SIZE_INTERNAL() { + return at::cuda::warp_size(); +} + +static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() { #if defined(__GFX9__) -static constexpr int C10_WARP_SIZE = 64; + return 64; #else // __GFX9__ -static constexpr int C10_WARP_SIZE = 32; + return 32; #endif // __GFX9__ -#else -static constexpr int C10_WARP_SIZE = 1; -#endif // __HIP_DEVICE_COMPILE__ -#else +} +#else // __HIPCC__ +inline int C10_WARP_SIZE_INTERNAL() { + return at::cuda::warp_size(); +} +#endif // __HIPCC__ + +#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL()) +#define C10_WARP_SIZE_STATIC 64 + +#else // defined(USE_ROCM) #define C10_WARP_SIZE 32 #endif From 8e7b99f5b048edae5df3f2557dc8d0e9085be867 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 06:18:32 +0000 Subject: [PATCH 09/83] Fix sha256 for aotriton ROCm7.0 tarball --- cmake/External/aotriton.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake index 8004b0f400a8d..8b380d24f6c8c 100644 --- a/cmake/External/aotriton.cmake +++ b/cmake/External/aotriton.cmake @@ -24,7 +24,7 @@ if(NOT __AOTRITON_INCLUDED) set(__AOTRITON_SHA256_LIST "861cd9f7479eec943933c27cb86920247e5b5dd139bc7c1376c81808abb7d7fe" # rocm6.3 "acea7d811a2d3bbe718b6e07fc2a9f739e49eecd60b4b6a36fcb3fe8edf85d78" # rocm6.4 - "7e29c325d5bd33ba896ddb106f5d4fc7d715274dca7fe937f724fffa82017838" # rocm7.0 + "1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b" # rocm7.0 ) set(__AOTRITON_Z "gz") From d7c64fc4be65e8b567b94a0649fbcc21fa9809d2 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 06:26:24 +0000 Subject: [PATCH 10/83] Update third_party/composable_kernel submodule commit as per https://github.com/ROCm/pytorch/commit/80cca7006d94df97ee932fd5903ed20c08c2eb34 to enable PyTorch build on ROCm7.0 --- third_party/composable_kernel | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/third_party/composable_kernel b/third_party/composable_kernel index 8086bbe3a78d9..df6023e305f38 160000 --- a/third_party/composable_kernel +++ b/third_party/composable_kernel @@ -1 +1 @@ -Subproject commit 8086bbe3a78d931eb96fe12fdc014082e18d18d3 +Subproject commit df6023e305f389bbf7249b0c4414e649f3ad6598 From b81d4d1ff7ce03630110354ddfc34af708cb0d7c Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 20:08:23 +0000 Subject: [PATCH 11/83] Use ROCm/triton and update triton.txt --- .ci/docker/ci_commit_pins/triton.txt | 2 +- .ci/docker/common/install_triton.sh | 2 +- .github/scripts/build_triton_wheel.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 568756a804f07..cf43cba72a42b 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -ae848267bebc65c6181e8cc5e64a6357d2679260 +5e5685356b9fc7b5ad9cdf4e510a1994a5b8601a diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index f5e39fbaf9ccb..f5b552e079710 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then TRITON_REPO="https://github.com/triton-lang/triton-cpu" TRITON_TEXT_FILE="triton-cpu" else - TRITON_REPO="https://github.com/triton-lang/triton" + TRITON_REPO="https://github.com/ROCm/triton" TRITON_TEXT_FILE="triton" fi diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 1302570432046..695b4a9c865a6 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -102,6 +102,7 @@ def build_triton( triton_repo = "https://github.com/openai/triton" if device == "rocm": triton_pkg_name = "pytorch-triton-rocm" + triton_repo = "https://github.com/ROCm/triton" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" From 98e953717022b6980037b0f88ac05f5cf7a8a562 Mon Sep 17 00:00:00 2001 From: Prachi Gupta Date: Tue, 22 Jul 2025 14:31:27 -0400 Subject: [PATCH 12/83] Add related_commits file (#2396) --- related_commits | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 related_commits diff --git a/related_commits b/related_commits new file mode 100644 index 0000000000000..fd2787398fc13 --- /dev/null +++ b/related_commits @@ -0,0 +1,10 @@ +ubuntu|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex +centos|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex +ubuntu|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision +centos|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision +ubuntu|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data +centos|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data +ubuntu|pytorch|torchaudio|release/2.8|6e1c7fe9ff6d82b8665d0a46d859d3357d2ebaaa|https://github.com/pytorch/audio +centos|pytorch|torchaudio|release/2.8|6e1c7fe9ff6d82b8665d0a46d859d3357d2ebaaa|https://github.com/pytorch/audio +ubuntu|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao 
+centos|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao From 12a145a630f9659ceb708899af6c23c0226ba6d3 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 19 Feb 2025 18:11:37 +0000 Subject: [PATCH 13/83] Add QA automation scripts for running PyTorch unit tests (cherry picked from commit ba1ba2680558fee0c93ea030ab1a375c8c1a83fc) (cherry picked from commit 4e3462e257fd8245424bb0df67191659d8fadfda) --- .automation_scripts/parse_xml_results.py | 178 ++++++ .automation_scripts/run_pytorch_unit_tests.py | 518 ++++++++++++++++++ 2 files changed, 696 insertions(+) create mode 100644 .automation_scripts/parse_xml_results.py create mode 100644 .automation_scripts/run_pytorch_unit_tests.py diff --git a/.automation_scripts/parse_xml_results.py b/.automation_scripts/parse_xml_results.py new file mode 100644 index 0000000000000..7db2e1ce9233c --- /dev/null +++ b/.automation_scripts/parse_xml_results.py @@ -0,0 +1,178 @@ +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Dict, Tuple + +# Backends list +BACKENDS_LIST = [ + "dist-gloo", + "dist-nccl" +] + +TARGET_WORKFLOW = "--rerun-disabled-tests" + +def get_job_id(report: Path) -> int: + # [Job id in artifacts] + # Retrieve the job id from the report path. In our GHA workflows, we append + # the job id to the end of the report name, so `report` looks like: + # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml + # and we want to get `5596745227` out of it. 
+ try: + return int(report.parts[0].rpartition("_")[2]) + except ValueError: + return -1 + +def is_rerun_disabled_tests(root: ET.ElementTree) -> bool: + """ + Check if the test report is coming from rerun_disabled_tests workflow + """ + skipped = root.find(".//*skipped") + # Need to check against None here, if not skipped doesn't work as expected + if skipped is None: + return False + + message = skipped.attrib.get("message", "") + return TARGET_WORKFLOW in message or "num_red" in message + +def parse_xml_report( + tag: str, + report: Path, + workflow_id: int, + workflow_run_attempt: int, + work_flow_name: str +) -> Dict[Tuple[str], Dict[str, Any]]: + """Convert a test report xml file into a JSON-serializable list of test cases.""" + print(f"Parsing {tag}s for test report: {report}") + + job_id = get_job_id(report) + print(f"Found job id: {job_id}") + + test_cases: Dict[Tuple[str], Dict[str, Any]] = {} + + root = ET.parse(report) + # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops + # includes skipped messages multiple times (50 times by default). This slows down + # this script too much (O(n)) because it tries to gather all the stats. This should + # be fixed later in the way we use pytest-flakefinder. A zipped test report from rerun + # disabled test is only few MB, but will balloon up to a much bigger XML file after + # extracting from a dozen to few hundred MB + if is_rerun_disabled_tests(root): + return test_cases + + for test_case in root.iter(tag): + case = process_xml_element(test_case) + if tag == 'testcase': + case["workflow_id"] = workflow_id + case["workflow_run_attempt"] = workflow_run_attempt + case["job_id"] = job_id + case["work_flow_name"] = work_flow_name + + # [invoking file] + # The name of the file that the test is located in is not necessarily + # the same as the name of the file that invoked the test. + # For example, `test_jit.py` calls into multiple other test files (e.g. + # jit/test_dce.py). For sharding/test selection purposes, we want to + # record the file that invoked the test. + # + # To do this, we leverage an implementation detail of how we write out + # tests (https://bit.ly/3ajEV1M), which is that reports are created + # under a folder with the same name as the invoking file. + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["work_flow_name"] ) ] = case + elif tag == 'testsuite': + case["work_flow_name"] = work_flow_name + case["invoking_xml"] = report.name + case["running_time_xml"] = case["time"] + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + + test_cases[ ( case["invoking_file"], case["invoking_xml"], case["work_flow_name"] ) ] = case + + return test_cases + +def process_xml_element(element: ET.Element) -> Dict[str, Any]: + """Convert a test suite element into a JSON-serializable dict.""" + ret: Dict[str, Any] = {} + + # Convert attributes directly into dict elements. + # e.g. + # + # becomes: + # {"name": "test_foo", "classname": "test_bar"} + ret.update(element.attrib) + + # The XML format encodes all values as strings. Convert to ints/floats if + # possible to make aggregation possible in Rockset. 
+ for k, v in ret.items(): + try: + ret[k] = int(v) + except ValueError: + pass + try: + ret[k] = float(v) + except ValueError: + pass + + # Convert inner and outer text into special dict elements. + # e.g. + # my_inner_text my_tail + # becomes: + # {"text": "my_inner_text", "tail": " my_tail"} + if element.text and element.text.strip(): + ret["text"] = element.text + if element.tail and element.tail.strip(): + ret["tail"] = element.tail + + # Convert child elements recursively, placing them at a key: + # e.g. + # + # hello + # world + # another + # + # becomes + # { + # "foo": [{"text": "hello"}, {"text": "world"}], + # "bar": {"text": "another"} + # } + for child in element: + if child.tag not in ret: + ret[child.tag] = process_xml_element(child) + else: + # If there are multiple tags with the same name, they should be + # coalesced into a list. + if not isinstance(ret[child.tag], list): + ret[child.tag] = [ret[child.tag]] + ret[child.tag].append(process_xml_element(child)) + return ret \ No newline at end of file diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py new file mode 100644 index 0000000000000..514afd19624c3 --- /dev/null +++ b/.automation_scripts/run_pytorch_unit_tests.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 + +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+""" + +import argparse +import os +import shutil +import subprocess +from subprocess import STDOUT, CalledProcessError + +from collections import namedtuple +from datetime import datetime +from pathlib import Path +from parse_xml_results import ( + parse_xml_report +) +from pprint import pprint +from typing import Any, Dict, List + +# unit test status list +UT_STATUS_LIST = [ + "PASSED", + "MISSED", + "SKIPPED", + "FAILED", + "XFAILED", + "ERROR" +] + +DEFAULT_CORE_TESTS = [ + "test_nn", + "test_torch", + "test_cuda", + "test_ops", + "test_unary_ufuncs", + "test_autograd", + "inductor/test_torchinductor" +] + +DISTRIBUTED_CORE_TESTS = [ + "distributed/test_c10d_common", + "distributed/test_c10d_nccl", + "distributed/test_distributed_spawn" +] + +CONSOLIDATED_LOG_FILE_NAME="pytorch_unit_tests.log" + +def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, workflow_name, path="."): + test_cases = {} + items_list = os.listdir(path) + for dir in items_list: + new_dir = path + '/' + dir + '/' + if os.path.isdir(new_dir): + for xml_report in Path(new_dir).glob("**/*.xml"): + test_cases.update( + parse_xml_report( + tag, + xml_report, + workflow_run_id, + workflow_run_attempt, + workflow_name + ) + ) + return test_cases + +def get_test_status(test_case): + # In order of priority: S=skipped, F=failure, E=error, P=pass + if "skipped" in test_case and test_case["skipped"]: + type_message = test_case["skipped"] + if type_message.__contains__('type') and type_message['type'] == "pytest.xfail": + return "XFAILED" + else: + return "SKIPPED" + elif "failure" in test_case and test_case["failure"]: + return "FAILED" + elif "error" in test_case and test_case["error"]: + return "ERROR" + else: + return "PASSED" + +def get_test_message(test_case, status=None): + if status == "SKIPPED": + return test_case["skipped"] if "skipped" in test_case else "" + elif status == "FAILED": + return test_case["failure"] if "failure" in test_case else "" + elif status == "ERROR": + return test_case["error"] if "error" in test_case else "" + else: + if "skipped" in test_case: + return test_case["skipped"] + elif "failure" in test_case: + return test_case["failure"] + elif "error" in test_case: + return test_case["error"] + else: + return "" + +def get_test_file_running_time(test_suite): + if test_suite.__contains__('time'): + return test_suite["time"] + return 0 + +def get_test_running_time(test_case): + if test_case.__contains__('time'): + return test_case["time"] + return "" + +def summarize_xml_files(path, workflow_name): + # statistics + TOTAL_TEST_NUM = 0 + TOTAL_PASSED_NUM = 0 + TOTAL_SKIPPED_NUM = 0 + TOTAL_XFAIL_NUM = 0 + TOTAL_FAILED_NUM = 0 + TOTAL_ERROR_NUM = 0 + TOTAL_EXECUTION_TIME = 0 + + #parse the xml files + test_cases = parse_xml_reports_as_dict(-1, -1, 'testcase', workflow_name, path) + test_suites = parse_xml_reports_as_dict(-1, -1, 'testsuite', workflow_name, path) + test_file_and_status = namedtuple("test_file_and_status", ["file_name", "status"]) + # results dict + res = {} + res_item_list = [ "PASSED", "SKIPPED", "XFAILED", "FAILED", "ERROR" ] + test_file_items = set() + for (k,v) in list(test_suites.items()): + file_name = k[0] + if not file_name in test_file_items: + test_file_items.add(file_name) + # initialization + for item in res_item_list: + temp_item = test_file_and_status(file_name, item) + res[temp_item] = {} + temp_item_statistics = test_file_and_status(file_name, "STATISTICS") + res[temp_item_statistics] = {'TOTAL': 0, 'PASSED': 0, 'SKIPPED': 0, 'XFAILED': 0, 'FAILED': 0, 
'ERROR': 0, 'EXECUTION_TIME': 0} + test_running_time = get_test_file_running_time(v) + res[temp_item_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + else: + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + test_running_time = get_test_file_running_time(v) + res[test_tuple_key_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + + for (k,v) in list(test_cases.items()): + file_name = k[0] + class_name = k[1] + test_name = k[2] + combined_name = file_name + "::" + class_name + "::" + test_name + test_status = get_test_status(v) + test_running_time = get_test_running_time(v) + test_message = get_test_message(v, test_status) + test_info_value = "" + test_tuple_key_status = test_file_and_status(file_name, test_status) + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + TOTAL_TEST_NUM += 1 + res[test_tuple_key_statistics]["TOTAL"] += 1 + if test_status == "PASSED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["PASSED"] += 1 + TOTAL_PASSED_NUM += 1 + elif test_status == "SKIPPED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["SKIPPED"] += 1 + TOTAL_SKIPPED_NUM += 1 + elif test_status == "XFAILED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["XFAILED"] += 1 + TOTAL_XFAIL_NUM += 1 + elif test_status == "FAILED": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["FAILED"] += 1 + TOTAL_FAILED_NUM += 1 + elif test_status == "ERROR": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["ERROR"] += 1 + TOTAL_ERROR_NUM += 1 + + # generate statistics_dict + statistics_dict = {} + statistics_dict["TOTAL"] = TOTAL_TEST_NUM + statistics_dict["PASSED"] = TOTAL_PASSED_NUM + statistics_dict["SKIPPED"] = TOTAL_SKIPPED_NUM + statistics_dict["XFAILED"] = TOTAL_XFAIL_NUM + statistics_dict["FAILED"] = TOTAL_FAILED_NUM + statistics_dict["ERROR"] = TOTAL_ERROR_NUM + statistics_dict["EXECUTION_TIME"] = TOTAL_EXECUTION_TIME + aggregate_item = workflow_name + "_aggregate" + total_item = test_file_and_status(aggregate_item, "STATISTICS") + res[total_item] = statistics_dict + + return res + +def run_command_and_capture_output(cmd): + try: + print(f"Running command '{cmd}'") + with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file: + print(f"========================================", file=output_file, flush=True) + print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well + print(f"========================================", file=output_file, flush=True) + p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True) + except CalledProcessError as e: + print(f"ERROR: Cmd {cmd} failed with return code: {e.returncode}!") + +def run_entire_tests(workflow_name, test_shell_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + copied_logs_path = 
overall_logs_path_current_run + "default_xml_results_entire_tests/" + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_entire_tests/" + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_entire_tests/" + # use test.sh for tests execution + run_command_and_capture_output(test_shell_path) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + entire_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + return entire_results_dict + +def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/" + # use run_test.py for tests execution + default_priority_test_suites = " ".join(DEFAULT_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/" + # use run_test.py for tests execution + distributed_priority_test_suites = " ".join(DISTRIBUTED_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + priority_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return priority_results_dict + +def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src, selected_list): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/" + # use run_test.py for tests execution + default_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/" + # use run_test.py for tests execution + distributed_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del 
os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_selected_tests/" + inductor_selected_test_suites = "" + non_inductor_selected_test_suites = "" + for item in selected_list: + if "inductor/" in item: + inductor_selected_test_suites += item + inductor_selected_test_suites += " " + else: + non_inductor_selected_test_suites += item + non_inductor_selected_test_suites += " " + if inductor_selected_test_suites != "": + inductor_selected_test_suites = inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + if non_inductor_selected_test_suites != "": + non_inductor_selected_test_suites = non_inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + selected_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return selected_results_dict + +def run_test_and_summarize_results( + pytorch_root_dir: str, + priority_tests: bool, + test_config: List[str], + default_list: List[str], + distributed_list: List[str], + inductor_list: List[str], + skip_rerun: bool) -> Dict[str, Any]: + + # copy current environment variables + _environ = dict(os.environ) + + # modify path + test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh" + test_run_test_path = pytorch_root_dir + "/test/run_test.py" + repo_test_log_folder_path = pytorch_root_dir + "/.automation_logs/" + test_reports_src = pytorch_root_dir + "/test/test-reports/" + run_test_python_file = pytorch_root_dir + "/test/run_test.py" + + # change directory to pytorch root + os.chdir(pytorch_root_dir) + + # all test results dict + res_all_tests_dict = {} + + # patterns + search_text = "--reruns=2" + replace_text = "--reruns=0" + + # create logs folder + if not os.path.exists(repo_test_log_folder_path): + os.mkdir(repo_test_log_folder_path) + + # Set common environment variables for all scenarios + os.environ['CI'] = '1' + os.environ['PYTORCH_TEST_WITH_ROCM'] = '1' + os.environ['HSA_FORCE_FINE_GRAIN_PCIE'] = '1' + os.environ['PYTORCH_TESTING_DEVICE_ONLY_FOR'] = 'cuda' + os.environ['CONTINUE_THROUGH_ERROR'] = 'True' + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(search_text, replace_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + # Time stamp + current_datetime = datetime.now().strftime("%Y%m%d_%H-%M-%S") + print("Current date & time : ", current_datetime) + # performed as Job ID + str_current_datetime = str(current_datetime) + overall_logs_path_current_run = repo_test_log_folder_path + str_current_datetime + "/" + os.mkdir(overall_logs_path_current_run) + + global CONSOLIDATED_LOG_FILE_PATH + CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME + + # Check multi gpu availability if distributed tests are enabled + if ("distributed" in test_config) or len(distributed_list) != 0: + check_num_gpus_for_distributed() + + # Install test requirements + command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt" + run_command_and_capture_output(command) + + # Run 
entire tests for each workflow + if not priority_tests and not default_list and not distributed_list and not inductor_list: + # run entire tests for default, distributed and inductor workflows → use test.sh + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + # distributed test process + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + # inductor test process + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + if "distributed" in workflow_list: + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + if "inductor" in workflow_list: + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + # Run priority test for each workflow + elif priority_tests and not default_list and not distributed_list and not inductor_list: + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + # distributed test process + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + # will not run inductor priority tests + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + if "distributed" in workflow_list: + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + if "inductor" in workflow_list: + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + # Run specified tests for each workflow + elif (default_list or distributed_list or inductor_list) and not test_config and not priority_tests: + if default_list: + default_workflow_list = [] + for item in default_list: + default_workflow_list.append(item) + res_default_selected = run_selected_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src, default_workflow_list) + res_all_tests_dict["default"] = res_default_selected + if distributed_list: + distributed_workflow_list = [] + for item in distributed_list: + 
distributed_workflow_list.append(item) + res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) + res_all_tests_dict["distributed"] = res_distributed_selected + if inductor_list: + inductor_workflow_list = [] + for item in inductor_list: + inductor_workflow_list.append(item) + res_inductor_selected = run_selected_tests("inductor", test_run_test_path, overall_logs_path_current_run, test_reports_src, inductor_workflow_list) + res_all_tests_dict["inductor"] = res_inductor_selected + else: + raise Exception("Invalid test configurations!") + + # restore environment variables + os.environ.clear() + os.environ.update(_environ) + + # restore files + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(replace_text, search_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + return res_all_tests_dict + +def parse_args(): + parser = argparse.ArgumentParser(description='Run PyTorch unit tests and generate xml results summary', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--test_config', nargs='+', default=[], type=str, help="space-separated list of test workflows to be executed eg. 'default distributed'") + parser.add_argument('--priority_tests', action='store_true', help="run priority tests only") + parser.add_argument('--default_list', nargs='+', default=[], help="space-separated list of 'default' config test suites/files to be executed eg. 'test_weak test_dlpack'") + parser.add_argument('--distributed_list', nargs='+', default=[], help="space-separated list of 'distributed' config test suites/files to be executed eg. 'distributed/test_c10d_common distributed/test_c10d_nccl'") + parser.add_argument('--inductor_list', nargs='+', default=[], help="space-separated list of 'inductor' config test suites/files to be executed eg. 
'inductor/test_torchinductor test_ops'") + parser.add_argument('--pytorch_root', default='.', type=str, help="PyTorch root directory") + parser.add_argument('--skip_rerun', action='store_true', help="skip rerun process") + parser.add_argument('--example_output', type=str, help="{'workflow_name': {\n" + " test_file_and_status(file_name='workflow_aggregate', status='STATISTICS'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='ERROR'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='FAILED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='PASSED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n" + "}}\n") + parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n" + "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n" + "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor") + return parser.parse_args() + +def check_num_gpus_for_distributed(): + p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True) + num_gpus_visible = int(p.stdout) + assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests" + +def main(): + args = parse_args() + all_tests_results = run_test_and_summarize_results(args.pytorch_root, args.priority_tests, args.test_config, args.default_list, args.distributed_list, args.inductor_list, args.skip_rerun) + pprint(dict(all_tests_results)) + +if __name__ == "__main__": + main() From 3c7ddbfc71f24138fe78c9484ec44447dc9f65f3 Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Fri, 11 Apr 2025 18:08:28 +0200 Subject: [PATCH 14/83] [release/2.6] enable NHWC batchnorm with MIOpen (#2023) This PR enables NHWC batchnorm on MIOpen in release/2.6 branch `ROCm version >= 6.5` and `PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM=1` environment variable required to enable nhwc batchnorm This PR branch for `release/2.6` was built and tested using docker image: `compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:15845_ubuntu22.04_py3.10_pytorch_rocm6.4_internal_testing_8190c80`. New batchnorm tests introduced: train: ``` test_batchnorm_train_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.040s) test_batchnorm_train_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.007s) test_batchnorm_train_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.005s) test_batchnorm_train_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.089s) test_batchnorm_train_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.020s) test_batchnorm_train_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.006s) test_batchnorm_train_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.006s) test_batchnorm_train_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... 
ok (0.004s) test_batchnorm_train_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s) ``` inference: ``` test_batchnorm_inference_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.025s) test_batchnorm_inference_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.005s) test_batchnorm_inference_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.102s) test_batchnorm_inference_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.003s) test_batchnorm_inference_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.018s) test_batchnorm_inference_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.003s) test_batchnorm_inference_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.003s) test_batchnorm_inference_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.003s) ``` --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit 45896ac60e0d80f9038ec7d4d655cfd1deae4463) (cherry picked from commit 7010d60047c85d679c4a3062b3146003be8d2bfe) --- aten/src/ATen/native/Normalization.cpp | 15 +- .../ATen/native/miopen/BatchNorm_miopen.cpp | 13 +- test/test_nn.py | 175 +++++++++++++++++- tools/autograd/derivatives.yaml | 2 +- 4 files changed, 195 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ecad7d7f34197..3f473a7146453 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -520,6 +520,11 @@ BatchNormBackend _select_batch_norm_backend( return BatchNormBackend::Cudnn; } + // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen + // See #64427 + // non static variable is used to be able to change environment variable in runtime for testing + bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(false); + if ( detail::getCUDAHooks().compiledWithMIOpen() && cudnn_enabled @@ -527,6 +532,9 @@ BatchNormBackend _select_batch_norm_backend( && input.dim() <= MIOPEN_DIM_MAX && input.dim() >= 3 && input.scalar_type() != at::kDouble +#if (defined(USE_ROCM) && ROCM_VERSION < 60400) + && (input.scalar_type() != at::kBFloat16) +#endif && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input && weight.defined() && bias.defined() @@ -534,6 +542,11 @@ BatchNormBackend _select_batch_norm_backend( || (!running_mean.defined() && !running_var.defined() && training)) && input.suggest_memory_format() != MemoryFormat::ChannelsLast && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d + && (input.suggest_memory_format() == MemoryFormat::Contiguous +#if (defined(USE_ROCM) && ROCM_VERSION >= 60500) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) +#endif + ) ) { return BatchNormBackend::Miopen; } @@ -613,7 +626,7 @@ std::tuple 
_batch_norm_impl_index( if (backend == BatchNormBackend::Miopen) { return std::tuple_cat( at::miopen_batch_norm( - input.contiguous(), weight.contiguous(), bias.contiguous(), + input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), running_mean.defined() ? running_mean.contiguous() : running_mean, running_var.defined() ? running_var.contiguous() : running_var, training, momentum, eps), diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index af69dfc76e571..f21325cd0848f 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -102,7 +102,7 @@ std::tuple miopen_batch_norm( mode = miopenBNSpatial; } - auto output_t = at::empty(input->sizes(), input->options()); + auto output_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format()); TensorArg output{ output_t, "output", 0 }; auto handle = getMiopenHandle(); @@ -179,8 +179,10 @@ std::tuple miopen_batch_norm_backward( const Tensor& save_var_t = save_var_t_opt.value_or(Tensor()); + auto grad_output_contig = + grad_output_t.contiguous(input_t.suggest_memory_format()); TensorArg input{ input_t, "input", 1 }, - grad_output{ grad_output_t, "grad_output", 2 }, + grad_output{ grad_output_contig, "grad_output", 2 }, weight{ weight_t, "weight", 3 }, save_mean{ save_mean_t, "save_mean", 4 }, save_var{ save_var_t, "save_var", 5 }; @@ -195,7 +197,9 @@ std::tuple miopen_batch_norm_backward( } checkAllSameType(c, {input, grad_output}); checkAllSameType(c, {weight, save_mean, save_var}); - checkAllContiguous(c, {input, grad_output, save_mean, save_var}); + checkAllContiguous(c, {save_mean, save_var}); + TORCH_CHECK(input->is_contiguous(input->suggest_memory_format())); + TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format())); checkDimRange(c, input, 2, 6 /* exclusive */); checkSameSize(c, input, grad_output); auto num_features = input->size(1); @@ -210,7 +214,8 @@ std::tuple miopen_batch_norm_backward( mode = miopenBNSpatial; } - auto grad_input_t = at::empty(input->sizes(), input->options()); + auto grad_input_t = at::empty( + input->sizes(), input->options(), input->suggest_memory_format()); auto grad_weight_t = at::empty(weight->sizes(), weight->options()); auto grad_bias_t = at::empty(weight->sizes(), weight->options()); diff --git a/test/test_nn.py b/test/test_nn.py index 2ece5fbdbd72c..3adc5a5e2e3a2 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -8,6 +8,7 @@ import io import itertools import warnings +import os import pickle import re from copy import deepcopy @@ -30,12 +31,12 @@ from torch.nn import Buffer, Parameter from torch.nn.parallel._functions import Broadcast from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes, floating_types -from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ +from torch.testing._internal.common_utils import dtype_name, freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \ download_file, get_function_arglist, load_tests, skipIfMPS, \ IS_PPC, \ parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \ - skipIfTorchDynamo, gcIfJetson, set_default_dtype + skipIfTorchDynamo, skipIfRocmVersionLessThan, gcIfJetson, set_default_dtype from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, 
PLATFORM_SUPPORTS_FLASH_ATTENTION from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \ @@ -5136,7 +5137,174 @@ def test_batchnorm_nhwc_cuda(self): inp2 = inp1.contiguous(memory_format=torch.channels_last) out1 = model(inp1) out2 = model(inp2) - self.assertTrue(torch.equal(out1, out2)) + self.assertEqual(out1, out2) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + @parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x) + @parametrize_test( + # test verifies cudnn/miopen batchnorm with the reference backend or memory format + # memory_format - one of ("NCHW", NHWC") + # ref_backend - one of ("cpu", "native", "NCHW", "NHWC") + # "cpu" - cpu backend with the same memory_format will be used as reference + # "native" - native backend (`with torch.backends.cudnn.flags(enabled=False)`) + # with the same memory_format will be used + # "NCHW" or "NHWC" - the same backend will be used but another memory format + # mixed - True or False. Mixed batchnorm mode where inputs are 16-bit and batchnorm is fp32 + # + "memory_format,ref_backend,mixed,dtype", + [ + ("NCHW", "cpu", False, torch.float), + ("NCHW", "cpu", True, torch.half), + # NCHW bfloat16 path uses native kernels for rocm<=6.3 + # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 + subtest(("NCHW", "cpu", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]), + + ("NCHW", "native", False, torch.float), + ("NCHW", "native", True, torch.half), + # this config failed for train and passed for inference on ROCm6.4 + # subtest(("NCHW", "native", True, torch.bfloat16), decorators=[unittest.expectedFailure]), + + ("NHWC", "cpu", False, torch.float), + ("NHWC", "cpu", True, torch.half), + ("NHWC", "cpu", True, torch.bfloat16), + + ("NHWC", "native", False, torch.float), + ("NHWC", "native", True, torch.half), + ("NHWC", "native", True, torch.bfloat16), + + ("NHWC", "NCHW", False, torch.float), + ("NHWC", "NCHW", True, torch.half), + # NCHW bfloat16 path uses native kernels for rocm<=6.3 + # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 + subtest(("NHWC", "NCHW", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]), + ], + name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}" + ) + def test_batchnorm(self, mode, memory_format, ref_backend, mixed, dtype): + def _create_tensor(size, memory_format, dtype, device): + t = torch.empty(size=size, memory_format=memory_format, dtype=dtype, device=device) + t = t.random_(1, 10) + return t + + def _get_ref_device(backend: str , device: str): + # If 'backend' specifies the memory format, return 'device' arg, otherwise return a device matches the backend + if backend in ("NHWC", "NCHW"): + return device + if backend == "native": + return "cuda" + if backend == "cpu": + return "cpu" + else: + raise ValueError("Unknown backend") + + def _get_backend_memory_format(backend: str, memory_format: torch.memory_format) -> torch.memory_format: + # If 'backend' specifies the memory format, return it, otherwise look at 'memory_format' arg + if backend == "NHWC": + return torch.channels_last + if backend == "NCHW": + return torch.contiguous_format + if memory_format in (torch.contiguous_format, torch.channels_last): + return memory_format + raise ValueError("Unable to detect memory format for backend={backend} and memory_format={memory_format}") + + def _get_memory_format(t: 
torch.Tensor) -> torch.memory_format: + if t.is_contiguous(memory_format=torch.contiguous_format): + return torch.contiguous_format + if t.is_contiguous(memory_format=torch.channels_last): + return torch.channels_last + return ValueError("Unsupported memory_format") + + def _create_backend(inp: torch.Tensor, mixed: bool = False): + mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) + return mod + + def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend): + mod = _create_backend(inp, mixed).train() + mod.weight.data.uniform_() + mod.bias.data.uniform_() + + ref_mod = _create_backend(ref_inp, mixed).train() + ref_mod.load_state_dict(mod.state_dict()) + + out = mod(inp) + out.backward(grad) + + with torch.backends.cudnn.flags(enabled=False) if ref_backend == "native" else contextlib.nullcontext(): + ref_out = ref_mod(ref_inp) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=_get_memory_format(inp))) + self.assertTrue(ref_out.is_contiguous(memory_format=_get_memory_format(ref_inp))) + self.assertEqual(out, ref_out) + self.assertEqual(mod.weight.grad, ref_mod.weight.grad) + self.assertEqual(mod.bias.grad, ref_mod.bias.grad) + self.assertEqual(mod.running_mean, ref_mod.running_mean) + self.assertEqual(mod.running_var, ref_mod.running_var) + self.assertEqual(inp.grad, ref_inp.grad) + + def _train(memory_format, ref_backend, mixed, dtype): + memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) + ref_device = _get_ref_device(ref_backend, device="cuda") + + size = (4, 8, 2, 2) + inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_() + grad = _create_tensor(size, memory_format, dtype, device="cuda") + ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_() + ref_grad = grad.detach().clone(memory_format=ref_memory_format).to(device=ref_device) + + _test_batchnorm_train(inp=inp, grad=grad, mixed=mixed, + ref_inp=ref_inp, ref_grad=ref_grad, ref_backend=ref_backend) + + # TODO: enable permute logic later + # size = (2, 8, 8, 1) + # input = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_() + # grad = _create_tensor(size, memory_format=torch.contiguous_format, dtype=dtype, device="cuda") + # # grad = _create_tensor(size, memory_format=memory_format, dtype=dtype, device="cuda") + + # ref_input = input.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_(True) + # ref_grad = grad.detach().clone(memory_format=torch.contiguous_format).to(device=ref_device) + # # ref_grad = grad.detach().clone(memory_format=ref_memory_format).to(device=ref_device) + + # if memory_format == torch.channels_last: + # grad = grad.permute(0, 2, 1, 3) + # # grad = grad.permute(0, 2, 3, 1) + # if ref_memory_format == torch.channels_last: + # ref_grad = ref_grad.permute(0, 2, 1, 3) + # # ef_grad = ref_grad.permute(0, 2, 3, 1) + # _test_batchnorm_train(input=input, grad=grad, mixed=mixed, + # ref_input=ref_input, ref_grad=ref_grad, ref_backend=ref_backend) + + def _inference(memory_format, ref_backend, mixed, dtype): + memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) + ref_device = _get_ref_device(ref_backend, device="cuda") + + size = (2, 64, 50, 50) + inp = 
_create_tensor(size, memory_format, dtype, device="cuda") + ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device) + mod = _create_backend(inp, mixed).eval() + ref_mod = _create_backend(ref_inp, mixed).eval() + + out = mod(inp) + with torch.backends.cudnn.flags(enabled=False) if ref_backend == "native" else contextlib.nullcontext(): + ref_out = ref_mod(ref_inp) + self.assertEqual(out, ref_out) + + # TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen + PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = "PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM" + prev_val = os.getenv(PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) + try: + os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] = "1" + if mode == "train": + _train(memory_format, ref_backend, mixed, dtype) + else: + _inference(memory_format, ref_backend, mixed, dtype) + finally: + if prev_val is None: + del os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] + else: + os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] = prev_val def test_batchnorm_load_state_dict(self): bn = torch.nn.BatchNorm2d(3) @@ -8377,7 +8545,6 @@ def test_affine_3d_rotateRandom(self, device): self.assertEqual(scipy_ary, gridsample_ary.reshape_as(scipy_ary)) - @onlyCUDA @dtypes(torch.float, torch.half) def test_batchnorm_large_batch(self, device, dtype): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index e2419aab268b1..d711480bb85ef 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2793,7 +2793,7 @@ self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) - input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" + input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(input.suggest_memory_format()), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, epsilon) - name: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon) -> (Tensor, Tensor, Tensor) From fb20451c34bab48f9c4a94a0bf102f9dfa13ff4b Mon Sep 17 00:00:00 2001 From: iupaikov-amd Date: Tue, 13 May 2025 18:52:32 +0200 Subject: [PATCH 15/83] test_decompose_mem_bound_mm.py tolerance increase for navi3x (cherry picked from commit 03c7da05f61890bbf5ae41e23c8df6d5f6805bac) --- test/inductor/test_decompose_mem_bound_mm.py | 45 ++++++++++++++++---- torch/testing/_internal/common_utils.py | 37 +++++++++++++++- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/test/inductor/test_decompose_mem_bound_mm.py b/test/inductor/test_decompose_mem_bound_mm.py index d21de3178cf1e..828d05738739f 100644 --- a/test/inductor/test_decompose_mem_bound_mm.py +++ b/test/inductor/test_decompose_mem_bound_mm.py @@ -12,6 +12,8 @@ from torch.testing import FileCheck from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, + patch_test_members, + is_navi3_arch, parametrize, TEST_XPU, ) @@ -61,31 +63,46 @@ def forward(self, input1, input2): ) @instantiate_parametrized_tests class TestDecomposeMemMM(TestCase): - def compare_dict_tensors(self, ref_dict, res_dict, rtol=1e-3, atol=1e-3): + def __init__(self, method_name='runTest', methodName='runTest'): + super().__init__(method_name, methodName) + self.atol = 1e-3 + self.rtol = 1e-3 + + def setup_tolerance(self, rtol=None, atol=None): + if rtol is None: + rtol = self.rtol + if atol is None: + atol = self.rtol + + def compare_dict_tensors(self, ref_dict, res_dict, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) if len(set(ref_dict.keys())) != len(set(res_dict.keys())): return False for key1 in ref_dict.keys(): key2 = "_orig_mod." + key1 assert key2 in res_dict, f"{key1} does not exist in traced module" - if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=rtol, atol=atol): + if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=self.rtol, atol=self.atol): return False return True - def compare_pred(self, module, traced, input, rtol=1e-3, atol=1e-3): + def compare_pred(self, module, traced, input, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) ref = module(*input) res = traced(*input) - self.assertEqual(ref, res, rtol=rtol, atol=atol) + self.assertEqual(ref, res, rtol=self.rtol, atol=self.atol) - def compare_parameters(self, module, traced, rtol=1e-3, atol=1e-3): + def compare_parameters(self, module, traced, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) ref_params = dict(module.named_parameters()) res_params = dict(traced.named_parameters()) - self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol, atol)) + self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol=self.rtol, atol=self.atol)) - def compare_gradients(self, module, traced, rtol=1e-3, atol=1e-3): + def compare_gradients(self, module, traced, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) ref_grad = {key: param.grad for key, param in module.named_parameters()} res_grad = {key: param.grad for key, param in traced.named_parameters()} self.assertTrue( - self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol) + self.compare_dict_tensors(ref_grad, res_grad, rtol=self.rtol, atol=self.atol) ) @parametrize( @@ -192,6 +209,12 @@ def test_decompose_linear(self, m, n, k, has_bias, should_decompose): ) counters.clear() + # We have to increase tolerance for navi3 because all fp16, bf16 + # GEMMs operations have an accuracy issue caused by hardware limitation + @patch_test_members({ + "atol": 2e-3 if is_navi3_arch() else 1e-3, + 
"rtol": 2e-3 if is_navi3_arch() else 1e-3 + }) @parametrize( "m,k,n, should_decompose", [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)], @@ -302,6 +325,12 @@ def test_decompose_mm_cpu(self, m, n, k, should_decompose): ) counters.clear() + # We have to increase tolerance for navi3 because all fp16, bf16 + # GEMMs operations have an accuracy issue caused by hardware limitation + @patch_test_members({ + "atol": 3e-3 if is_navi3_arch() else 1e-3, + "rtol": 4e-3 if is_navi3_arch() else 1e-3 + }) @parametrize( "m,k,n, should_decompose", [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)], diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 45b7378f88cc8..e2bd47fe62000 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -102,8 +102,18 @@ has_pytest = False -MI300_ARCH = ("gfx942",) - +MI300_ARCH = ("gfx940", "gfx941", "gfx942") +NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201") +NAVI3_ARCH = ("gfx1100", "gfx1101") +NAVI4_ARCH = ("gfx1200", "gfx1201") + +def is_navi3_arch(): + if torch.cuda.is_available(): + prop = torch.cuda.get_device_properties(0) + gfx_arch = prop.gcnArchName.split(":")[0] + if gfx_arch in NAVI3_ARCH: + return True + return False def freeze_rng_state(*args, **kwargs): return torch.testing._utils.freeze_rng_state(*args, **kwargs) @@ -5710,3 +5720,26 @@ def load_inline(*args, **kwargs): return func(*args, load_inline=load_inline, **kwargs) return wrapper + +# Decorator to patch multiple test class members for the duration of the subtest +def patch_test_members(updates: Dict[str, Any]): + def decorator(test_func): + @wraps(test_func) + def wrapper(self, *args, **kwargs): + # Store the original values of the specified members + original_values = {member: getattr(self, member) for member in updates} + + # Update the members before running the subtest + for member, value in updates.items(): + setattr(self, member, value) + + # Run the test function, allowing subtests to run + try: + return test_func(self, *args, **kwargs) + finally: + # Restore the original values of the specified members after the subtest finishes + for member, original_value in original_values.items(): + setattr(self, member, original_value) + + return wrapper + return decorator \ No newline at end of file From 32449c9a7af659e157423ab834330142d870254d Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Thu, 22 May 2025 22:35:04 +0200 Subject: [PATCH 16/83] [release/2.7] enable NHWC batchnorm by default on ROCm7.0+ (#2180) NHWC batchnorm enabled by default if ROCm>=7.0 (cherry picked from commit e0afc3acaab703224e0faf9f54fa3ed39cb16e2b) --- aten/src/ATen/native/Normalization.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 3f473a7146453..6021f9d7edf1e 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -523,7 +523,8 @@ BatchNormBackend _select_batch_norm_backend( // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen // See #64427 // non static variable is used to be able to change environment variable in runtime for testing - bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(false); + // enabled by default for ROCm >= 7.0.0 + bool 
PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); if ( detail::getCUDAHooks().compiledWithMIOpen() From 23f0b5f3b7870eb689b72f3de28eb10964c29bed Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Sat, 24 May 2025 04:51:06 +0200 Subject: [PATCH 17/83] [release/2.7] import 'Dict' to fix common_utils.py (#2181) Bug introduced by https://github.com/ROCm/pytorch/commit/03c7da05f61890bbf5ae41e23c8df6d5f6805bac (cherry picked from commit bbd0112a2cfbefbf687f5e907766aba4d030258e) --- torch/testing/_internal/common_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index e2bd47fe62000..052a968d51e22 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -49,6 +49,7 @@ from typing import ( Any, Callable, + Dict, Optional, TypeVar, Union, From 48630d8997007790924d82f117b91c43be6c4ab0 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Thu, 29 May 2025 00:33:36 -0500 Subject: [PATCH 18/83] [AUTOGENERATED] [release/2.7] [rocm6.4_internal_testing] Replaced ROCm specific skips to generalized conditions (#2126) Cherry-pick of https://github.com/ROCm/pytorch/pull/2100 Need to resolve conflicts --------- Co-authored-by: iupaikov-amd (cherry picked from commit f0c1ce8cd1232adc80fae55aa054626ad0d2b430) --- test/inductor/test_aot_inductor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 9f93a3959d831..6a768a3dbbb15 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -31,7 +31,11 @@ from torch.export.pt2_archive._package import load_pt2 from torch.testing import FileCheck from torch.testing._internal import common_utils -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8, SM80OrLater +from torch.testing._internal.common_cuda import ( + SM80OrLater, + SM90OrLater, + PLATFORM_SUPPORTS_FLASH_ATTENTION +) from torch.testing._internal.common_device_type import ( _has_sufficient_memory, skipCUDAIf, @@ -1363,6 +1367,7 @@ def forward(self, q, k, v): self.check_model(Model(), example_inputs) @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+") + @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") def test_sdpa_2(self): class Model(torch.nn.Module): def __init__(self) -> None: @@ -1615,6 +1620,7 @@ def forward(self, values, repeats, mask, embeddings, x, y, z, lst): self.check_model(Repro(), example_inputs, dynamic_shapes=spec) @skipIfXpu(msg="_scaled_dot_product_flash_attention is not supported on XPU yet") + @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") def test_fallback_kernel_with_symexpr_output(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") @@ -4173,6 +4179,7 @@ def grid(meta): dynamic_shapes=dynamic_shapes, ) + @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") def test_scaled_dot_product_efficient_attention(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") From ae17c3a98ffe389c67c3258408cdca277d77c753 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 3 Jun 2025 19:20:18 -0500 Subject: [PATCH 
19/83] [release/2.7] [SWDEV-535259] enable miopen channels last 3d for conv and batchnorm (#2232) Cherry-pick of https://github.com/ROCm/pytorch/pull/2209 Co-authored-by: Jeff Daily (cherry picked from commit bf0079dce7c9fb5acb4d1ab0b3219c6a415302e7) --- aten/src/ATen/native/ConvUtils.h | 16 ++++++++++------ aten/src/ATen/native/Convolution.cpp | 2 +- aten/src/ATen/native/Normalization.cpp | 1 + aten/src/ATen/native/miopen/Conv_miopen.cpp | 12 ++++++------ 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 6e99e9565240c..84381efe55b0b 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -362,20 +362,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten return false; } - bool can_use_miopen_channels_last_2d = false; // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen // See #64427 static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); + static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC; auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); + auto weight_ndim = weight.ndimension(); - can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && ( - ( (input_memory_format == at::MemoryFormat::ChannelsLast) || - (weight_memory_format == at::MemoryFormat::ChannelsLast) ) - ); + bool can_use_miopen_channels_last_2d = suggest_nhwc && (weight_ndim == 4) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) + ); - bool can_use_miopen_channels_last_3d = false; + bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d) + ); return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index d06fc5168a0fd..1122d9c8d38af 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -1421,7 +1421,7 @@ static inline at::MemoryFormat determine_backend_memory_format( if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { TORCH_INTERNAL_ASSERT((k == 4 || k == 5), "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + backend_memory_format = (k == 5) ? 
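+      // 5-d (3-D convolution) weights now select the ChannelsLast3d layout instead of falling back to Contiguous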
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } break; case ConvBackend::Mkldnn: diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 6021f9d7edf1e..b9cf3c907f09f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -546,6 +546,7 @@ BatchNormBackend _select_batch_norm_backend( && (input.suggest_memory_format() == MemoryFormat::Contiguous #if (defined(USE_ROCM) && ROCM_VERSION >= 60500) || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) #endif ) ) { diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index d2cef0ca62188..f4e67e4fc307a 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -763,7 +763,7 @@ Tensor miopen_convolution_forward( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor output_t = at::detail::empty_cuda( @@ -872,7 +872,7 @@ Tensor miopen_depthwise_convolution_forward( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor output_t = at::detail::empty_cuda( @@ -1074,7 +1074,7 @@ Tensor miopen_depthwise_convolution_backward_weight( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_output_contig_t = grad_output->contiguous(memory_format); @@ -1127,7 +1127,7 @@ Tensor miopen_convolution_backward_weight( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_output_contig_t = grad_output->contiguous(memory_format); @@ -1281,7 +1281,7 @@ Tensor miopen_convolution_backward_input( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? 
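+      // grad_input for 3-D convolutions is likewise allocated with the ChannelsLast3d layout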
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_input_t = at::detail::empty_cuda( @@ -1389,7 +1389,7 @@ Tensor miopen_depthwise_convolution_backward_input( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_input_t = at::detail::empty_cuda( From e4d62b137876a53c00324e967d23f91f3f9837ac Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Wed, 4 Jun 2025 18:33:11 -0500 Subject: [PATCH 20/83] [AUTOGENERATED] [release/2.7] Add 3D batchnorm tests (#2243) Cherry-pick of https://github.com/ROCm/pytorch/pull/2214 Co-authored-by: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Co-authored-by: Jeff Daily (cherry picked from commit 5631e0791910c7ea40e81027fa6de7b5e0e24253) --- test/test_nn.py | 75 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index 3adc5a5e2e3a2..14d4eed971d2e 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -37,7 +37,7 @@ IS_PPC, \ parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \ skipIfTorchDynamo, skipIfRocmVersionLessThan, gcIfJetson, set_default_dtype -from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION +from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION, _get_torch_rocm_version from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \ ctcloss_reference, get_new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input @@ -5140,6 +5140,7 @@ def test_batchnorm_nhwc_cuda(self): self.assertEqual(out1, out2) @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + @parametrize_test("dims", [2, 3], name_fn=lambda x: f"{x}D") @parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x) @parametrize_test( # test verifies cudnn/miopen batchnorm with the reference backend or memory format @@ -5155,14 +5156,11 @@ def test_batchnorm_nhwc_cuda(self): [ ("NCHW", "cpu", False, torch.float), ("NCHW", "cpu", True, torch.half), - # NCHW bfloat16 path uses native kernels for rocm<=6.3 - # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 - subtest(("NCHW", "cpu", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]), + ("NCHW", "cpu", True, torch.bfloat16), ("NCHW", "native", False, torch.float), ("NCHW", "native", True, torch.half), - # this config failed for train and passed for inference on ROCm6.4 - # subtest(("NCHW", "native", True, torch.bfloat16), decorators=[unittest.expectedFailure]), + ("NCHW", "native", True, torch.bfloat16), ("NHWC", "cpu", False, torch.float), ("NHWC", "cpu", True, torch.half), @@ -5174,13 +5172,33 @@ def test_batchnorm_nhwc_cuda(self): ("NHWC", "NCHW", False, torch.float), ("NHWC", "NCHW", True, torch.half), - # NCHW bfloat16 path uses native kernels for rocm<=6.3 - # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 - subtest(("NHWC", "NCHW", True, torch.bfloat16), 
decorators=[skipIfRocmVersionLessThan((6, 4))]), + ("NHWC", "NCHW", True, torch.bfloat16), ], name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}" ) - def test_batchnorm(self, mode, memory_format, ref_backend, mixed, dtype): + def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype): + if torch.version.hip: + if self._testMethodName in ("test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16", + "test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16", + "test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16", + "test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16" + ) and _get_torch_rocm_version() < (6, 4): + # NCHW bfloat16 path uses native kernels for rocm<=6.3 + # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 + self.skipTest("bfloat16 NHWC train failed on ROCm <= 6.3") + + if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_native_mixed_bfloat16", + "test_batchnorm_3D_train_NCHW_vs_native_mixed_bfloat16" + ) and _get_torch_rocm_version() >= (6, 4): + self.skipTest("bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600") + + if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16" \ + and _get_torch_rocm_version() < (6, 4): + self.skipTest("3D float16 NCHW train failed on ROCm<=6.3 ") + + if dims == 3 and memory_format in ("NHWC", "NCHW"): + memory_format = memory_format + "3D" + def _create_tensor(size, memory_format, dtype, device): t = torch.empty(size=size, memory_format=memory_format, dtype=dtype, device=device) t = t.random_(1, 10) @@ -5188,7 +5206,7 @@ def _create_tensor(size, memory_format, dtype, device): def _get_ref_device(backend: str , device: str): # If 'backend' specifies the memory format, return 'device' arg, otherwise return a device matches the backend - if backend in ("NHWC", "NCHW"): + if backend in ("NHWC", "NHWC3D", "NCHW", "NCHW3D"): return device if backend == "native": return "cuda" @@ -5201,9 +5219,11 @@ def _get_backend_memory_format(backend: str, memory_format: torch.memory_format) # If 'backend' specifies the memory format, return it, otherwise look at 'memory_format' arg if backend == "NHWC": return torch.channels_last - if backend == "NCHW": + if backend == "NHWC3D": + return torch.channels_last_3d + if backend in ("NCHW", "NCHW3D"): return torch.contiguous_format - if memory_format in (torch.contiguous_format, torch.channels_last): + if memory_format in (torch.contiguous_format, torch.channels_last, torch.channels_last_3d): return memory_format raise ValueError("Unable to detect memory format for backend={backend} and memory_format={memory_format}") @@ -5212,10 +5232,24 @@ def _get_memory_format(t: torch.Tensor) -> torch.memory_format: return torch.contiguous_format if t.is_contiguous(memory_format=torch.channels_last): return torch.channels_last + if t.is_contiguous(memory_format=torch.channels_last_3d): + return torch.channels_last_3d + return ValueError("Unsupported memory_format") + + def _get_memory_format_from_name(memory_format_name: str) -> torch.memory_format: + if memory_format_name == "NHWC": + return torch.channels_last + elif memory_format_name == "NHWC3D": + return torch.channels_last_3d + elif memory_format_name in ("NCHW", "NCHW3D"): + return torch.contiguous_format return ValueError("Unsupported memory_format") def _create_backend(inp: torch.Tensor, mixed: bool = False): - mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) + + mod = nn.BatchNorm2d(inp.size(1), device=inp.device, 
dtype=torch.float if mixed else inp.dtype) \ + if inp.dim() == 4 else \ + nn.BatchNorm3d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) return mod def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend): @@ -5242,12 +5276,13 @@ def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend): self.assertEqual(mod.running_var, ref_mod.running_var) self.assertEqual(inp.grad, ref_inp.grad) - def _train(memory_format, ref_backend, mixed, dtype): - memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + def _train(memory_format_name, ref_backend, mixed, dtype): + memory_format = _get_memory_format_from_name(memory_format_name) + ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) ref_device = _get_ref_device(ref_backend, device="cuda") - size = (4, 8, 2, 2) + size = (4, 8, 2, 2, 2) if memory_format_name in ("NCHW3D", "NHWC3D") else (4, 8, 2, 2) inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_() grad = _create_tensor(size, memory_format, dtype, device="cuda") ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_() @@ -5275,12 +5310,12 @@ def _train(memory_format, ref_backend, mixed, dtype): # _test_batchnorm_train(input=input, grad=grad, mixed=mixed, # ref_input=ref_input, ref_grad=ref_grad, ref_backend=ref_backend) - def _inference(memory_format, ref_backend, mixed, dtype): - memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + def _inference(memory_format_name, ref_backend, mixed, dtype): + memory_format = _get_memory_format_from_name(memory_format_name) ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) ref_device = _get_ref_device(ref_backend, device="cuda") - size = (2, 64, 50, 50) + size = (2, 64, 50, 50, 50) if memory_format_name in ("NCHW3D", "NHWC3D") else (2, 64, 50, 50) inp = _create_tensor(size, memory_format, dtype, device="cuda") ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device) mod = _create_backend(inp, mixed).eval() From d40f3c8babe65790a438c55bf3bba909c09b186a Mon Sep 17 00:00:00 2001 From: rocm-mici <49319081+rocm-mici@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:35:55 -0600 Subject: [PATCH 21/83] [AUTOGENERATED] [release/2.5] [ROCm][layer_norm] Use __builtin_amdgcn_rcpf(x) instead of 1.f/x (#1800) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-pick of https://github.com/ROCm/pytorch/pull/1688 Co-authored-by: Michael Halkenhäuser Co-authored-by: Hashem Hashemi (cherry picked from commit f8544afa6d17de9cc63b6ff28b4b32cf0c1a4381) (cherry picked from commit ed487541c625699693d7ba025593bb41f555968b) (cherry picked from commit d62a39eb77f5d7c922e93f3292ef7e072cdf6f84) --- aten/src/ATen/native/cuda/layer_norm_kernel.cu | 8 ++++++++ cmake/Dependencies.cmake | 16 ++++++++++++++++ setup.py | 4 ++++ 3 files changed, 28 insertions(+) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 0709164e60555..aa25ad5f6ee0c 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -131,7 +131,11 @@ WelfordDataLN cuWelfordOnlineSum( { U delta = val - curr_sum.mean; U new_count = curr_sum.count + 1.f; +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + U new_mean = curr_sum.mean + delta * 
__builtin_amdgcn_rcpf(new_count); +#else U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster +#endif return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; } @@ -145,7 +149,11 @@ WelfordDataLN cuWelfordCombine( U count = dataA.count + dataB.count; U mean, sigma2; if (count > decltype(dataB.count){0}) { +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + auto coef = __builtin_amdgcn_rcpf(count); +#else auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division +#endif auto nA = dataA.count * coef; auto nB = dataB.count * coef; mean = nA*dataA.mean + nB*dataB.mean; diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8d8..1a06f2915787d 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1048,6 +1048,22 @@ if(USE_ROCM) list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) endif(CMAKE_BUILD_TYPE MATCHES Debug) + # Get EnVar 'PYTORCH_LAYERNORM_FAST_RECIPROCAL' (or default to on). + if(DEFINED ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL}) + set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE $ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL}) + else() + set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE ON) + endif() + + set(PYTORCH_LAYERNORM_FAST_RECIPROCAL + ${PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE} + CACHE BOOL "Enable fast reciprocals within layer normalization." FORCE + ) + + if(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + add_definitions(-DPYTORCH_LAYERNORM_FAST_RECIPROCAL) + endif() + # needed for compat with newer versions of hip-clang that introduced C++20 mangling rules list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17) diff --git a/setup.py b/setup.py index b4ebc92f59268..4db59ecd0b087 100644 --- a/setup.py +++ b/setup.py @@ -153,6 +153,10 @@ # USE_ROCM_KERNEL_ASSERT=1 # Enable kernel assert in ROCm platform # +# PYTORCH_LAYERNORM_FAST_RECIPROCAL +# If set, enables the use of builtin functions for fast reciprocals (1/x) w.r.t. +# layer normalization. Default: enabled. +# # Environment variables we respect (these environment variables are # conventional and are often understood/set by other software.) 
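The Welford combine that gains the fast-reciprocal path above boils down to the arithmetic below. This is an illustrative Python sketch with hypothetical names, not the kernel code; on ROCm builds where PYTORCH_LAYERNORM_FAST_RECIPROCAL is defined, the `1.0 / n` reciprocal is the quantity the kernel approximates with `__builtin_amdgcn_rcpf(n)`.

```python
def welford_combine(mean_a, m2_a, n_a, mean_b, m2_b, n_b):
    """Merge two partial (mean, M2, count) accumulators (reference sketch only)."""
    n = n_a + n_b
    if n == 0:
        return mean_a, m2_a, n
    coef = 1.0 / n                     # the ROCm fast path uses a hardware reciprocal here
    w_a, w_b = n_a * coef, n_b * coef
    mean = w_a * mean_a + w_b * mean_b
    delta = mean_b - mean_a
    m2 = m2_a + m2_b + delta * delta * n_a * w_b   # standard parallel-variance (Chan) update
    return mean, m2, n
```

The intrinsic trades a small amount of accuracy for skipping a full-rate division, which is why the change is gated behind a build flag that defaults to on.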
# From dbb9f2a1d56edf6f738ee3afd61a6673b7d4ae1a Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Wed, 30 Apr 2025 19:11:29 +0200 Subject: [PATCH 22/83] [release/2.6] remove xfail from 'batch_norm_with_update' (#2070) remove `xfail` from `batch_norm_with_update` op in `test_grad` and `test_vmap_autograd_grad` these tests are passed since ROCm6.4 Fixes https://ontrack-internal.amd.com/browse/SWDEV-529820 (cherry picked from commit 99b075842070e2b73625440d8712b87e1833515c) (cherry picked from commit a7044a4005eeb6d60cdc77a23e629a63b977eb1a) (cherry picked from commit 3fc00a85a28c854ffc4832551d720e839c48700c) --- test/functorch/test_ops.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py index cef00f83eb72d..244d8518c6a10 100644 --- a/test/functorch/test_ops.py +++ b/test/functorch/test_ops.py @@ -436,13 +436,6 @@ class TestOperators(TestCase): ), # Works on ROCm xfail("torch.ops.aten._flash_attention_forward"), xfail("torch.ops.aten._efficient_attention_forward"), - # RuntimeError: Expected contiguous tensor, but got - # non-contiguous tensor for argument #2 'grad_output' - decorate( - "_batch_norm_with_update", - decorator=expectedFailureIf(TEST_WITH_ROCM), - device_type="cuda", - ), } ), ) @@ -2368,13 +2361,6 @@ def fn(input, weight, bias): skip("sparse.sampled_addmm", ""), skip("sparse.mm", "reduce"), skip("native_layer_norm", "", device_type="cpu"), - # RuntimeError: Expected contiguous tensor, but got - # non-contiguous tensor for argument #2 'grad_output' - decorate( - "_batch_norm_with_update", - decorator=expectedFailureIf(TEST_WITH_ROCM), - device_type="cuda", - ), }, ) @opsToleranceOverride( From e62e3947b3b80e90f3e70261d045809de5444e08 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Wed, 4 Jun 2025 09:40:52 -0700 Subject: [PATCH 23/83] [release/2.7] Enable mx fp8 support on ROCm (#2199) Ported mx fp8 part from https://github.com/ROCm/pytorch/pull/2046 Current test stats (accounting only blockwise scale tests) PYTORCH_TEST_WITH_ROCM=1 python test/test_matmul_cuda.py -k test_blockwise -v Ran 225 tests in 8.256s FAILED (failures=1, skipped=150) _74 test pass_ **fp8 mx data type sample test case.** test_blockwise_mxfp8_numerics_test_case_name_data_random_scales_one_fast_accum_True_512_128_256_cuda (__main__.TestFP8MatmulCudaCUDA) hipblaslt-bench --api_method c -m 256 -n 512 -k 128 --lda 128 --ldb 128 --ldc 256 --ldd 256 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1 --beta 0 --transA T --transB N --batch_count 1 --scaleA 3 --scaleB 3 --a_type f8_r --b_type f8_r --c_type bf16_r --d_type bf16_r --compute_type f32_r --algo_method index --solution_index -2146957310 --rotating 0 --cold_iters 0 --iters 0 --------- Signed-off-by: Jagadish Krishnamoorthy (cherry picked from commit d17e2227ecfdf81b3684aa034113a747928ea6cb) --- aten/src/ATen/cuda/CUDABlas.cpp | 14 +++++++-- aten/src/ATen/native/cuda/Blas.cpp | 36 ++++++++++++++++++++-- test/test_matmul_cuda.py | 8 +++-- torch/testing/_internal/common_cuda.py | 10 +++++- torch/utils/hipify/cuda_to_hip_mappings.py | 5 +++ 5 files changed, 65 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 89350a11bea7e..d884fd7ffa915 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -1879,6 +1879,16 @@ void scaled_gemm( matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = 
HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; } + else if(mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { +#if ROCM_VERSION >= 70000 + if (at::detail::getCUDAHooks().isGPUArch(0, {"gfx950"})) { + // Validate matrix dimensions for MX format + TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), + "Matrix dimensions must be multiples of 32 for MX format. ", + "Got m=", m, ", n=", n, ", k=", k); + } +#endif + } #else // rowwise isn't supported using older hipblaslt TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt"); @@ -1917,11 +1927,11 @@ void scaled_gemm( } if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { -#if CUDA_VERSION >= 12080 +#if (!defined(USE_ROCM) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000) computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); #else - TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above"); + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0(with gfx950) and above"); #endif // if CUDA_VERSION >= 12080 } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) { #if CUDA_VERSION >= 12080 diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 1834839bb6e85..21e6f9f65dd70 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1133,12 +1133,15 @@ ScalingType get_scaling_type( auto expected_b_size = BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks; + //TODO: enable the checks for ROCm +#ifndef USE_ROCM TORCH_CHECK(scale_a.numel() == expected_a_size, "For BlockWise scaling: Expected scale_a size to be ", expected_a_size, " but got ", scale_a.numel()); TORCH_CHECK(scale_b.numel() == expected_b_size, "For BlockWise scaling: Expected scale_b size to be ", expected_b_size, " but got ", scale_b.numel()); +#endif TORCH_CHECK( scale_a.is_contiguous() && scale_b.is_contiguous(), @@ -1205,6 +1208,7 @@ ScalingType get_scaling_type( } // namespace + // Computes matrix multiply + bias while applying scaling to input and output matrices // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. // If output matrix type is 16 or 32-bit type, scale_result is not applied. 
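To make the new ROCm constraints concrete: this patch only enables the block-wise (MX, `torch.float8_e8m0fnu`-scaled) path on gfx950 with ROCm 7.0 or newer, and it requires every GEMM dimension to be a multiple of 32, which is why the test changes later in the patch skip other shapes on HIP. A hypothetical helper (not part of the patch) capturing the shape rule:

```python
def mx_scaled_gemm_shape_ok(m: int, n: int, k: int, block: int = 32) -> bool:
    # gfx950 / ROCm 7.0 block-wise scaling: all three GEMM dimensions must be
    # multiples of the 32-element scaling block, mirroring the TORCH_CHECKs in this patch.
    return m % block == 0 and n % block == 0 and k % block == 0

# The shape from the sample hipblaslt-bench line above (m=256, n=512, k=128) satisfies this:
assert mx_scaled_gemm_shape_ok(256, 512, 128)
```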
@@ -1268,6 +1272,14 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // Type restrictions imposed by CuBLASLt as of CUDA-12.1 TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, "Multiplication of two Float8_e5m2 matrices is not supported"); +#endif +#ifdef USE_ROCM + if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above"); + } #endif if (use_fast_accum) { TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype."); @@ -1327,7 +1339,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, } #else if (scaling_choice == ScalingType::RowWise) { - // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes. + // For ROCm, match behavior of f8f8bf16_rowwise type checking Tensor b = mat2; if (_scaled_mm_is_fnuz()) { TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz); @@ -1335,9 +1347,25 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, else { TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn); } - // Until more than bf16 is supported. + // Until more than bf16 is supported TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, - "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); + "hipblaslt rowwise _scaled_mm only supports BFloat16 output"); + } + else if (scaling_choice == ScalingType::BlockWise) { +#if ROCM_VERSION >= 70000 + TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}, 0), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && + mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif } #endif @@ -1416,10 +1444,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, params.k = args.k; params.a = args.mata->data_ptr(); params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = scale_a.scalar_type(); params.lda = args.lda; params.a_dtype = args.mata->scalar_type(); params.b = args.matb->data_ptr(); params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = scale_b.scalar_type(); params.ldb = args.ldb; params.b_dtype = args.matb->scalar_type(); params.bias_ptr = bias ? 
bias->data_ptr(): nullptr; diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index e8b4d9092cdd0..8ec832e40a163 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -1453,6 +1453,10 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r device = "cuda" M, K, N = mkn + if torch.version.hip: + if not (M % 32 == 0 and K % 32 == 0 and N % 32 == 0): + raise unittest.SkipTest("Matrix dimensions must be multiples of 32 on ROCm, skipping") + if recipe == "nvfp4" and K % 32 != 0: return unittest.skip("K must be divisible by 32 for nvfp4 cublas gemm, skipping") @@ -1462,7 +1466,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r if test_case_name == "a_eye_b_eye": if not ((M == K) and (M == N)): - return unittest.skip("this test is only defined for M == K == N, skipping") + raise unittest.SkipTest("this test is only defined for M == K == N, skipping") A_ref = torch.eye(M, device=device, dtype=torch.bfloat16) B_ref = torch.eye(M, device=device, dtype=torch.bfloat16) @@ -1601,7 +1605,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r elif test_case_name == "data_random_scales_from_data": if not K % BLOCK_SIZE == 0: - return unittest.skip(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping") + raise unittest.SkipTest(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping") require_exact_match = False # random data, scales from data A_ref = torch.randn((M, K), device=device, dtype=torch.bfloat16) * 1000 diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index a211851d671fa..2620c64a95ef1 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -108,7 +108,15 @@ def evaluate_platform_supports_fp8(): PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8()) -PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: TEST_CUDA and SM100OrLater) +def _platform_supports_mx_gemm(): + if torch.cuda.is_available(): + if torch.version.hip: + return 'gfx95' in torch.cuda.get_device_properties(0).gcnArchName + else: + return SM100OrLater + return False + +PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: _platform_supports_mx_gemm()) if TEST_NUMBA: try: diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index a5145a2f4870a..b251a85e245a5 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -3870,6 +3870,7 @@ ("CUDA_C_64U", ("HIP_C_64U", CONV_TYPE, API_RUNTIME)), ("CUDA_R_8F_E4M3", ("HIP_R_8F_E4M3", CONV_TYPE, API_RUNTIME)), ("CUDA_R_8F_E5M2", ("HIP_R_8F_E5M2", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_4F_E2M1", ("HIP_R_4F_E2M1", CONV_TYPE, API_RUNTIME)), ( "MAJOR_VERSION", ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), @@ -7347,6 +7348,10 @@ ("CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", ("HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_DESC_AMAX_D_POINTER", ("HIPBLASLT_MATMUL_DESC_AMAX_D_POINTER", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_DESC_A_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_A_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_DESC_B_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_B_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", 
("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayout_t", ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayoutOpaque_t", ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayoutAttribute_t", ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS)), From e0160f1b99ba589db23d421697a4b861038c485b Mon Sep 17 00:00:00 2001 From: Andres Lugo <108368282+alugorey@users.noreply.github.com> Date: Tue, 1 Apr 2025 22:03:37 -0500 Subject: [PATCH 24/83] Extend CK gemm/sdpa support to gfx950 (#45) Update CK for gfx950 (#49) (cherry picked from commit 8ccfc47420df0a17b79e351ef240e743ebfd9a77) (cherry picked from commit b5d5987eb6d4eaf6577e7a9f80d53fa9a3f79177) --- aten/src/ATen/Context.cpp | 2 +- .../src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index fd346b2d9af00..fefcf731cd825 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -419,7 +419,7 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { if(b == at::ROCmFABackend::Ck) { static const bool ck_unsupported = []() { static const std::vector archs = { - "gfx90a", "gfx942" + "gfx90a", "gfx942", "gfx950" }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { if (!detail::getCUDAHooks().isGPUArch(archs, index)) { diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp index 38ec2ef20c5cc..affa40619b598 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp @@ -453,4 +453,5 @@ struct fmha_bwd_traits bool is_deterministic; // TODO: padding check is inside this api }; +template float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&); From 08390c7ce36e76624a0a1c6f503313c783056f9c Mon Sep 17 00:00:00 2001 From: pmaybank <113125070+pmaybank@users.noreply.github.com> Date: Thu, 29 May 2025 16:41:38 +0100 Subject: [PATCH 25/83] [release/2.6] [SWDEV-529824] Fix Float16 CooperativeReduction Test Failure (#2204) - Previously expected values were calculated on GPU using same dtype as result values - Now expected values are calculated on CPU using Float32 dtype - This fixes a test failure that was observed on Navi48 where difference between Eager mode (expected) and Inductor / Triton (result) did not meet the error tolerance when sum was evaluated on an array of Float16 values Co-authored-by: pnikolic-amd (cherry picked from commit 8fe3cdd5a0a8529b07ccb0c4735b013aaa2828ca) (cherry picked from commit 34f3b3e005f1b16011b1bc9642ad0ca054dae619) --- test/inductor/test_cooperative_reductions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py index 469ceec2e1b2b..a913ffb25bf33 100644 --- a/test/inductor/test_cooperative_reductions.py +++ b/test/inductor/test_cooperative_reductions.py @@ -58,7 +58,8 @@ def setUp(self): torch._dynamo.reset() def run_and_check(self, fn, args, *, expect_kernel_count=1): - expected = fn(*args) + args_cpu = [tensor.cpu().to(torch.float32) for tensor in args] + expected = fn(*args_cpu).to(torch.float16) fn = torch.compile(fn, 
fullgraph=True) result, (source_code,) = run_and_get_code(fn, *args) self.assertEqual(result, expected) From 01857c6576812b96148fae55655f034c60941112 Mon Sep 17 00:00:00 2001 From: Jerry Mannil <65309407+jerrymannil@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:55:55 -0700 Subject: [PATCH 26/83] [ROCm] Set thread_work_size to 16 for vectorized elementwise kernels (#2259) * thread_work_size of 16 is giving better perf with many workloads (cherry picked from commit 7edf50cab4e34f2e0acdb35abf40c7f3967a3426) --- aten/src/ATen/native/cuda/CUDALoops.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 9b104a7966363..f96b8d687bc86 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -226,8 +226,9 @@ C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { using traits = function_traits; constexpr auto io_size = calc_io_size(); -#ifdef __gfx942__ - constexpr int tws = (io_size >= 2) ? 8 : 16; +#if defined(USE_ROCM) && defined(__gfx942__) + // Similar check in launch_vectorized_kernel() as well. Both should be in sync. + constexpr int tws = 16; #else constexpr int tws = elems_per_thread(); #endif @@ -296,7 +297,7 @@ static inline void launch_vectorized_kernel( int vec_size = memory::can_vectorize_up_to(data); c10::DeviceIndex curDevice = -1; AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice)); - int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? ((io_size >= 2) ? 8 : 16) : elems_per_thread(); + int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? 16 : elems_per_thread(); #else using cpp_type = typename function_traits::result_type; const uint16_t max_vec_size = memory::can_vectorize_up_to(data); From e60c0c46add9d4d12c6e375a2c7874eb634c39c8 Mon Sep 17 00:00:00 2001 From: Sampsa Riikonen Date: Thu, 19 Jun 2025 17:05:56 +0300 Subject: [PATCH 27/83] [release/2.7] Fix SDPA skip logic (#2281) fixes https://ontrack-internal.amd.com/browse/SWDEV-522391 for PT 2.7 (cherry picked from commit df38cca03ece6d06485f5cebc26db909fd258246) --- test/inductor/test_aot_inductor.py | 20 +++++++++++++++----- test/inductor/test_torchinductor.py | 3 +++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 6a768a3dbbb15..32a36653b225a 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -32,9 +32,10 @@ from torch.testing import FileCheck from torch.testing._internal import common_utils from torch.testing._internal.common_cuda import ( + PLATFORM_SUPPORTS_FLASH_ATTENTION, + PLATFORM_SUPPORTS_FP8, + PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, SM80OrLater, - SM90OrLater, - PLATFORM_SUPPORTS_FLASH_ATTENTION ) from torch.testing._internal.common_device_type import ( _has_sufficient_memory, @@ -1367,7 +1368,12 @@ def forward(self, q, k, v): self.check_model(Model(), example_inputs) @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") + @unittest.skipIf( + # for archs where this isn't lowered to flash attention, the math + # backend will be used and it doesn't work for bfloat16 + not PLATFORM_SUPPORTS_FLASH_ATTENTION, + "Some archs don't support SDPA with bfloat16", + ) def test_sdpa_2(self): class Model(torch.nn.Module): def __init__(self) -> None: 
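The remaining hunks below apply the same distinction: tests that rely on the flash lowering keep `PLATFORM_SUPPORTS_FLASH_ATTENTION`, while tests that call the memory-efficient kernel directly switch to `PLATFORM_SUPPORTS_MEM_EFF_ATTENTION`. A hedged recap of that policy, using the flags imported at the top of this diff (the decorator names are illustrative, not part of the patch):

```python
import unittest

from torch.testing._internal.common_cuda import (
    PLATFORM_SUPPORTS_FLASH_ATTENTION,
    PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
)

# bf16 SDPA tests: without flash attention they fall back to the math backend,
# which doesn't work for bfloat16, so they must be skipped on those archs.
skip_unless_flash_sdpa = unittest.skipIf(
    not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support flash SDPA"
)

# Tests that invoke scaled_dot_product_efficient_attention need the mem-efficient kernel.
skip_unless_mem_eff_sdpa = unittest.skipIf(
    not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Some archs don't support mem eff SDPA"
)
```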
@@ -1620,7 +1626,9 @@ def forward(self, values, repeats, mask, embeddings, x, y, z, lst): self.check_model(Repro(), example_inputs, dynamic_shapes=spec) @skipIfXpu(msg="_scaled_dot_product_flash_attention is not supported on XPU yet") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support flash SDPA" + ) def test_fallback_kernel_with_symexpr_output(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") @@ -4179,7 +4187,9 @@ def grid(meta): dynamic_shapes=dynamic_shapes, ) - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") + @unittest.skipIf( + not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Some archs don't support mem eff SDPA" + ) def test_scaled_dot_product_efficient_attention(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 20ce486f6faab..ee18c3eb508f0 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -11538,6 +11538,9 @@ def fn(q, k, v): @xfail_if_mps_unimplemented @expectedFailureXPU + @unittest.skipIf( + not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Some archs don't support mem eff SDPA" + ) def test_scaled_dot_product_efficient_attention(self): if self.device == "cpu": raise unittest.SkipTest(f"requires {GPU_TYPE}") From 01eaee8f31327832cc4da7d44f1cf0ac2acb41ce Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 1 Jul 2025 13:27:44 -0700 Subject: [PATCH 28/83] [release/2.7] Update test_binary_ufuncs.py after numpy upgrade (#2289) Related to https://github.com/ROCm/pytorch/commit/c7a1e32fbcf9e0a458d959a453de65c27c51452c Fixes https://ontrack-internal.amd.com/browse/SWDEV-537835 Not a Navi specific failure: ``` File "/opt/conda/envs/py_3.12/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1412, in only_fn return fn(slf, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/lib/jenkins/pytorch/test/test_binary_ufuncs.py", line 1671, in test_cuda_tensor_pow_scalar_tensor self._test_pow(base, exp) File "/var/lib/jenkins/pytorch/test/test_binary_ufuncs.py", line 1482, in _test_pow self.assertEqual(actual, expected) File "/opt/conda/envs/py_3.12/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 4052, in assertEqual raise error_metas.pop()[0].to_error( AssertionError: The values for attribute 'dtype' do not match: torch.float32 != torch.float64. ``` Using .to(actual) without specifying dtype/device assumes actual is a tensor or tensor-like, which may fail silently or promote. Fixed by explicitly matching dtype and device. Going from https://github.com/pytorch/pytorch/issues/107302 Fix: ``` root@ubb4-rack-22:/var/lib/jenkins/pytorch# TEST_CONFIG=default HIP_VISIBLE_DEVICES=0 PYTORCH_TEST_WITH_ROCM=1 python test/test_binary_ufuncs.py TestBinaryUfuncsCUDA.test_cuda_tensor_pow_scalar_tensor_cuda /opt/conda/envs/py_3.12/lib/python3.12/site-packages/hypothesis/entry_points.py:23: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources Running tests... ---------------------------------------------------------------------- . 
---------------------------------------------------------------------- Ran 1 test in 0.141s OK Generating XML reports... root@ubb4-rack-22:/var/lib/jenkins/pytorch# pip list | grep numpy numpy 2.1.2 ``` (cherry picked from commit a4d60fa319fc275d138abba367528d0a45243ea7) --- test/test_binary_ufuncs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index bdc0d7329df59..b3f81f1332863 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1447,7 +1447,7 @@ def to_np(value): try: np_res = np.power(to_np(base), to_np(np_exponent)) expected = ( - torch.from_numpy(np_res) + torch.from_numpy(np_res).to(dtype=base.dtype) if isinstance(np_res, np.ndarray) else torch.tensor(np_res, dtype=base.dtype) ) From 80e89740c17525e4065a2160f084664d1caca3f8 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 8 Jul 2025 01:07:52 -0500 Subject: [PATCH 29/83] [AUTOGENERATED] [release/2.7] fix jit_utils.cpp (#2320) Cherry-pick of https://github.com/ROCm/pytorch/pull/2319 Co-authored-by: Jeff Daily (cherry picked from commit e725e2e7c51be0a0b022e4404bfc69777e2ef010) --- aten/src/ATen/native/cuda/jit_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 19d0447369914..cd1441a14b1e9 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -45,7 +45,7 @@ namespace at::cuda::jit { // Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above. // If not compiling for ROCm, return the original get_traits_string(). std::string get_traits_string_but_hiprtc_safe() { -#ifdef USE_ROCM +#if defined(USE_ROCM) && ROCM_VERSION < 70000 return R"ESCAPE( namespace std { From bb44c0cc737ba8aa4978f57ff2763a729af1cd6a Mon Sep 17 00:00:00 2001 From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Date: Mon, 14 Jul 2025 12:23:45 -0500 Subject: [PATCH 30/83] Clean up CUDA state between tests (#2335) This PR fixes the unit test, test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction FAILED [0.1163s] ``` Traceback (most recent call last): File "/var/lib/jenkins/pytorch/test/test_cuda.py", line 471, in test_set_per_process_memory_fraction tmp_tensor = torch.empty(application, dtype=torch.int8, device="cuda") RuntimeError: Trying to create tensor with negative dimension -5681285432: [-5681285432] ``` This error occurs only on gfx1101 arch. This error is coming from an integer overflow when another unit test, test/test_cuda.py::TestCuda::test_randint_generation_for_large_numel creates a tensor with a huge numel, which overflows into a higher torch.cuda.max_memory_reserved() when you call test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction afterward. To avoid this we introduced torch.cuda.empty_cache() and torch.cuda.reset_peak_memory_stats() to clean up CUDA states. 
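In other words, the test now clears the allocator's cached blocks and resets its peak statistics before computing the memory fraction. A minimal sketch of the added guard (the full hunk is below; it only takes effect on ROCm builds running on gfx1101):

```python
import torch

if torch.version.hip and "gfx1101" in torch.cuda.get_device_properties(0).gcnArchName:
    torch.cuda.empty_cache()              # release cached blocks left behind by earlier tests
    torch.cuda.reset_peak_memory_stats()  # reset the inflated peak/reserved statistics
```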
JIRA: https://ontrack-internal.amd.com/browse/SWDEV-535295 (cherry picked from commit f86d18439897232a374504c36b40da99c14ade1a) --- test/test_cuda.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_cuda.py b/test/test_cuda.py index 5e5adcf56a60e..35e834d3b62ee 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -467,6 +467,9 @@ def test_out_of_memory_retry(self): IS_JETSON, "oom reporting has issues on jetson igx due to partial nvml support" ) def test_set_per_process_memory_fraction(self): + if torch.version.hip and ('gfx1101' in torch.cuda.get_device_properties(0).gcnArchName): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() orig = torch.cuda.get_per_process_memory_fraction(0) torch.cuda.reset_peak_memory_stats(0) try: From 1f312c4ff01cfebbf8da9d623e8183183d266c09 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Sat, 28 Jun 2025 22:38:43 +0000 Subject: [PATCH 31/83] cublaslt/hipblaslt persistent workspace (#156495) Similar to cublas/hipblas, LT now allocates one workspace per handle+stream combo. - fixes hipblaslt issue where memory use increased during graph capture - preserves CUDA env var TORCH_CUBLASLT_UNIFIED_WORKSPACE - moves LT workspace and size from CUDABlas.cpp into CublasHandlePool.cpp, new APIs - size_t getCUDABlasLtWorkspaceSize() - void* getCUDABlasLtWorkspace() Fixes https://github.com/ROCm/pytorch/issues/2286. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156495 Approved by: https://github.com/eqy (cherry picked from commit 996206e66fcafff25d0af5177497e8f792000869) --- aten/src/ATen/cuda/CUDABlas.cpp | 85 ++------------------ aten/src/ATen/cuda/CUDAContextLight.h | 3 + aten/src/ATen/cuda/CublasHandlePool.cpp | 91 ++++++++++++++++++++++ aten/src/ATen/cuda/tunable/GemmHipblaslt.h | 32 +------- 4 files changed, 102 insertions(+), 109 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index d884fd7ffa915..573389bcd7c70 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -188,82 +188,11 @@ uint32_t _getAlignment(uintptr_t address) { } #endif -static size_t _parseChosenWorkspaceSize() { - auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE"); -#ifdef USE_ROCM - if (!val.has_value()) { - // accept either env var - val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE"); - } - size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */ -#else - size_t workspace_size = 1024; /* default size in KiB according to #73328 */ -#endif - - if (val.has_value()) { - try { - workspace_size = std::stoi(val.value()); - } catch (std::invalid_argument const&) { - TORCH_WARN( - "invalid CUBLASLT_WORKSPACE_SIZE,", - " using default workspace size of ", - workspace_size, - " KiB."); - } catch (std::out_of_range const&) { - TORCH_WARN( - "CUBLASLT_WORKSPACE_SIZE out of range,", - " using default workspace size of ", - workspace_size, - " KiB."); - } - } - return workspace_size * 1024; -} - -static size_t _getWorkspaceSize() { - static size_t workspace_size = _parseChosenWorkspaceSize(); - return workspace_size; -} - -void* _getUnifiedWorkspaceWithoutHandle() { - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - auto stream = c10::cuda::getCurrentCUDAStream(); - cudaStream_t _stream = stream; - auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); - TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); - return 
workspace_it->second.mutable_get(); -} - struct CublasLtWorkspace { CublasLtWorkspace() { - size = _getWorkspaceSize(); -#ifndef USE_ROCM - static bool unified = c10::utils::check_env("TORCH_CUBLASLT_UNIFIED_WORKSPACE") == true; - if (unified) { - auto cublasWorkspaceSize = at::cuda::getChosenWorkspaceSize(); - if (cublasWorkspaceSize < size) { - TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", size, - " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize, - " bytes. Please increase CUBLAS workspace size", - " via CUBLAS_WORKSPACE_CONFIG or decrease requested" - " CUBLASLT_WORKSPACE_SIZE. Otherwise CUBLASLT workspace" - " size will be limited to the CUBLAS workspace size."); - size = cublasWorkspaceSize; - } - ptr = _getUnifiedWorkspaceWithoutHandle(); - } else { - auto allocator = c10::cuda::CUDACachingAllocator::get(); - stashed_ptr_ = allocator->allocate(size); - ptr = stashed_ptr_.mutable_get(); - } -#else - auto allocator = c10::cuda::CUDACachingAllocator::get(); - stashed_ptr_ = allocator->allocate(size); - ptr = stashed_ptr_.mutable_get(); -#endif + size = at::cuda::getCUDABlasLtWorkspaceSize(); + ptr = at::cuda::getCUDABlasLtWorkspace(); } - at::DataPtr stashed_ptr_; void * ptr; size_t size; }; @@ -2111,10 +2040,8 @@ void int8_gemm( #ifdef USE_ROCM CuBlasLtMatmulPreference preference; - size_t workspaceSize = _getWorkspaceSize(); - preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); - auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - auto workspace = allocator.allocate(workspaceSize); + auto ltworkspace = CublasLtWorkspace(); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( @@ -2152,12 +2079,12 @@ void int8_gemm( nullptr, // Heuristics don't seem to work for int8 #endif #ifdef USE_ROCM - workspace.mutable_get(), + ltworkspace.ptr, #else nullptr, // Non-zero workspace doesn't seem to work. #endif #ifdef USE_ROCM - workspaceSize, + ltworkspace.size, #else 0, #endif diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index 65019bb6097c9..86e960cc1ab4a 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -89,7 +89,10 @@ TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); TORCH_CUDA_CPP_API void clearCublasWorkspaces(); TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublas_handle_stream_to_workspace(); +TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace(); TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize(); +TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize(); +TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace(); #if defined(CUDART_VERSION) || defined(USE_ROCM) TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle(); diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 720304ad198e8..81159d5c0e272 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -23,6 +23,9 @@ * To work around this difference in behavior, a separate handle pool is available for ROCm builds. * For CUDA builds, getCurrentCUDABlasLtHandle will alias for getCurrentCUDABlasHandle, * whereas for ROCm builds, it is a distinct function. + * + * The workspace pools are separate for ROCm. 
On CUDA, the env var + * TORCH_CUBLASLT_UNIFIED_WORKSPACE can be used to opt-in to unifying the workspace pools. */ namespace at::cuda { @@ -109,8 +112,14 @@ std::map, at::DataPtr>& cublas_handle_stream_to_works return instance; } +std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace() { + static auto& instance = *new std::map, at::DataPtr>; + return instance; +} + void clearCublasWorkspaces() { cublas_handle_stream_to_workspace().clear(); + cublaslt_handle_stream_to_workspace().clear(); } size_t parseChosenWorkspaceSize() { @@ -157,15 +166,97 @@ size_t parseChosenWorkspaceSize() { } } +size_t parseCUDABlasLtWorkspaceSize() { + auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE"); +#ifdef USE_ROCM + if (!val.has_value()) { + // accept either env var + val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE"); + } + size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */ +#else + size_t workspace_size = 1024; /* default size in KiB according to #73328 */ +#endif + + if (val.has_value()) { + try { + workspace_size = std::stoi(val.value()); + } catch (std::invalid_argument const&) { + TORCH_WARN( + "invalid CUBLASLT_WORKSPACE_SIZE,", + " using default workspace size of ", + workspace_size, + " KiB."); + } catch (std::out_of_range const&) { + TORCH_WARN( + "CUBLASLT_WORKSPACE_SIZE out of range,", + " using default workspace size of ", + workspace_size, + " KiB."); + } + } + return workspace_size * 1024; +} + size_t getChosenWorkspaceSize() { size_t pool_size = parseChosenWorkspaceSize(); return pool_size; } +#define TORCH_CUBLASLT_UNIFIED_WORKSPACE "TORCH_CUBLASLT_UNIFIED_WORKSPACE" + +size_t getCUDABlasLtWorkspaceSize() { + size_t pool_size = parseCUDABlasLtWorkspaceSize(); +#ifndef USE_ROCM + static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true; + if (unified) { + auto cublasWorkspaceSize = getChosenWorkspaceSize(); + if (cublasWorkspaceSize < pool_size) { + TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", pool_size, + " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize, + " bytes. Please increase CUBLAS workspace size", + " via CUBLAS_WORKSPACE_CONFIG or decrease requested" + " CUBLASLT_WORKSPACE_SIZE. 
Otherwise CUBLASLT workspace" + " size will be limited to the CUBLAS workspace size."); + pool_size = cublasWorkspaceSize; + } + } +#endif + return pool_size; +} + at::DataPtr getNewWorkspace() { return c10::cuda::CUDACachingAllocator::get()->allocate(getChosenWorkspaceSize()); } +at::DataPtr getNewCUDABlasLtWorkspace() { + return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize()); +} + +void* getCUDABlasLtWorkspace() { +#ifndef USE_ROCM + static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true; + if (unified) { + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + auto stream = c10::cuda::getCurrentCUDAStream(); + cudaStream_t _stream = stream; + auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); + auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); + TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); + return workspace_it->second.mutable_get(); + } +#endif + cublasLtHandle_t handle = getCurrentCUDABlasLtHandle(); + auto stream = c10::cuda::getCurrentCUDAStream(); + cudaStream_t _stream = stream; + auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); + auto workspace_it = cublaslt_handle_stream_to_workspace().find(key); + if (workspace_it == cublaslt_handle_stream_to_workspace().end()) { + workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()}); + } + return workspace_it->second.mutable_get(); +} + cublasHandle_t getCurrentCUDABlasHandle() { c10::DeviceIndex device = 0; AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index a23a2d720c5c4..fe6d1161d1ba9 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -381,28 +381,6 @@ static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) { return HIPBLAS_OP_T; } -static size_t GetHipblasltWorkspaceSize() { - static const auto env = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE"); - // 256MB is max workspace size allowed for hipblaslt - // hipblaslt-bench uses 32MB - // recommendation from hipblaslt author was 76MB - // TunableOp hipBLASLt workspace size is aligned with - // PyTorch's default in CUDABlas.cpp (_parseChosenWorkspaceSize) - size_t workspace_size = 76*1024; - if (env) { - try { - workspace_size = std::stoi(env.value()); - } catch(std::invalid_argument const& e) { - TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,", - " using default workspace size of ", workspace_size, " KiB."); - } catch(std::out_of_range const& e) { - TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,", - " using default workspace size of ", workspace_size, " KiB."); - } - } - return workspace_size * 1024; -} - template struct HipBlasLtDeleter { void operator()(T* x) { @@ -550,7 +528,7 @@ class HipblasltGemmOp : public Callable { } } - size_t workspace_size = GetHipblasltWorkspaceSize(); + size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize(); auto op_handle = at::cuda::getCurrentCUDABlasLtHandle(); @@ -575,10 +553,7 @@ class HipblasltGemmOp : public Callable { return FAIL; } - void* workspace_buffer = nullptr; - if (workspace_size > 0) { - workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size); - } + void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace(); TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle, matmul.descriptor(), @@ -601,9 +576,6 @@ class 
HipblasltGemmOp : public Callable { TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a)); TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b)); TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c)); - if (workspace_size > 0) { - c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer); - } return OK; } From 3b7f377541a9aea6548faecfd2e22cb00c78d0c0 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 15 Jul 2025 11:54:34 -0500 Subject: [PATCH 32/83] [AUTOGENERATED] [release/2.7] [release/2.6] Fix dtype before comparing torch and numpy tensors (#2362) Cherry-pick of https://github.com/ROCm/pytorch/pull/2340 Co-authored-by: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> (cherry picked from commit 22c98ea3175f5520a7724ada85a6d6ed482b4785) --- test/test_binary_ufuncs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index b3f81f1332863..7772134fd1534 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1447,7 +1447,7 @@ def to_np(value): try: np_res = np.power(to_np(base), to_np(np_exponent)) expected = ( - torch.from_numpy(np_res).to(dtype=base.dtype) + torch.from_numpy(np_res) if isinstance(np_res, np.ndarray) else torch.tensor(np_res, dtype=base.dtype) ) @@ -1480,8 +1480,8 @@ def to_np(value): self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent) elif torch.can_cast(torch.result_type(base, exponent), base.dtype): actual2 = actual.pow_(exponent) - self.assertEqual(actual, expected) - self.assertEqual(actual2, expected) + self.assertEqual(actual, expected.to(actual)) + self.assertEqual(actual2, expected.to(actual)) else: self.assertRaisesRegex( RuntimeError, From 8b2361443564d9d9753124fe9e39e3d8d187e114 Mon Sep 17 00:00:00 2001 From: tvukovic-amd Date: Fri, 27 Jun 2025 07:23:28 +0000 Subject: [PATCH 33/83] [ROCm][Windows] Fixing undefined symbol linker error after exposing MIOpen symbols (#156479) Fixing undefined symbol linker error after [exposing MIOpen symbols](https://github.com/pytorch/pytorch/pull/154545). This fix: - Hipifies `aten/src/ATen/miopen` and `aten/src/ATen/native/miopen` files - Adds `aten/src/ATen/miopen` and `aten/src/ATen/native/miopen` hipified source files to `all_hip_cpp` list Pull Request resolved: https://github.com/pytorch/pytorch/pull/156479 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily --- aten/src/ATen/miopen/Descriptors.h | 12 ++++++------ aten/src/ATen/miopen/Handle.h | 2 +- aten/src/ATen/miopen/Types.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index a0ad4a4e1098a..2eee837cd533d 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -39,7 +39,7 @@ struct DescriptorDeleter { // function. template // NOLINTNEXTLINE(bugprone-exception-escape) -class TORCH_CUDA_CPP_API Descriptor { +class TORCH_HIP_CPP_API Descriptor { public: // Use desc() to access the underlying descriptor pointer in // a read-only fashion. Most client code should use this. 
@@ -65,7 +65,7 @@ class TORCH_CUDA_CPP_API Descriptor { std::unique_ptr> desc_; }; -class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< +class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -88,7 +88,7 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); -class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< +class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -105,7 +105,7 @@ class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< } }; -struct TORCH_CUDA_CPP_API ConvolutionDescriptor +struct TORCH_HIP_CPP_API ConvolutionDescriptor : public Descriptor< miopenConvolutionDescriptor, &miopenCreateConvolutionDescriptor, @@ -121,7 +121,7 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor }; // NOLINTNEXTLINE(bugprone-exception-escape) -struct TORCH_CUDA_CPP_API DropoutDescriptor +struct TORCH_HIP_CPP_API DropoutDescriptor : public Descriptor< miopenDropoutDescriptor, &miopenCreateDropoutDescriptor, @@ -137,7 +137,7 @@ struct TORCH_CUDA_CPP_API DropoutDescriptor } }; -struct TORCH_CUDA_CPP_API RNNDescriptor +struct TORCH_HIP_CPP_API RNNDescriptor : public Descriptor diff --git a/aten/src/ATen/miopen/Handle.h b/aten/src/ATen/miopen/Handle.h index 4c80c3aea65bf..b1637fca0a582 100644 --- a/aten/src/ATen/miopen/Handle.h +++ b/aten/src/ATen/miopen/Handle.h @@ -5,5 +5,5 @@ namespace at::native { -TORCH_CUDA_CPP_API miopenHandle_t getMiopenHandle(); +TORCH_HIP_CPP_API miopenHandle_t getMiopenHandle(); } // namespace at::native diff --git a/aten/src/ATen/miopen/Types.h b/aten/src/ATen/miopen/Types.h index 0a8a1a952e2e2..fdc0f6a607b71 100644 --- a/aten/src/ATen/miopen/Types.h +++ b/aten/src/ATen/miopen/Types.h @@ -6,7 +6,7 @@ namespace at::native { -TORCH_CUDA_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); +TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); int64_t miopen_version(); From 5446c03d3e6335a63adec8b0191e6bf45120d0fe Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 16 Jul 2025 15:14:59 -0400 Subject: [PATCH 34/83] [MPS] Fix `index_kernel` for large tensors (#158239) [MPS] Fix `index_kernel` for large tensors (#158064) Move `MetalShaderLibrary::bind_tensors` private method to OperatorUtils.h and extract `iter_tensor_offset` method, that returns an offset from the start of the storage associated with given tensor inside the iterator Migrated `index`, `index_put[_accumulate][_serial]` to the new paradigm that does not require additional tensor for indices nor special handling for 32 vs 64-bit offset, which resulted in almost 2x perf gain for 2000x2000 tensor, see results below before ``` [------------------------------------------------------------ -----------------------------------------------------------] | 11x50x50 | 11x100x100 | 11x500x500 | 11x1000x1000 | 11x2000x2000 1 threads: ---------------------------------------------------------------------------------------------------------------- __getitem__ (torch.int8, torch.int64) | 383.5 | 379.8 | 470.9 | 1232.9 | 4410.3 __getitem__ (torch.float16, torch.int64) | 379.6 | 354.5 | 533.2 | 1290.3 | 4442.2 __getitem__ (torch.float32, torch.int64) | 360.8 | 338.6 | 478.6 | 1348.9 | 4870.4 Times are in microseconds (us). 
``` and after ``` [------------------------------------------------------------ -----------------------------------------------------------] | 11x50x50 | 11x100x100 | 11x500x500 | 11x1000x1000 | 11x2000x2000 1 threads: ---------------------------------------------------------------------------------------------------------------- __getitem__ (torch.int8, torch.int64) | 349.8 | 330.5 | 432.6 | 764.5 | 1961.2 __getitem__ (torch.float16, torch.int64) | 342.5 | 330.7 | 434.7 | 741.0 | 1969.4 __getitem__ (torch.float32, torch.int64) | 332.2 | 326.1 | 445.4 | 751.3 | 1972.6 Times are in microseconds (us). ``` While migrating also fixed index_put_accumulate for boolean types, by using compare_and_exchange trick over uint Fixes https://github.com/pytorch/pytorch/issues/153560 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158064 Approved by: https://github.com/dcci (cherry picked from commit beed033b6e6ac57c0b4a1f47eb436e115a52e41b) Co-authored-by: Nikita Shulga --- aten/src/ATen/native/mps/MetalShaderLibrary.h | 1 - aten/src/ATen/native/mps/OperationUtils.h | 29 +- aten/src/ATen/native/mps/OperationUtils.mm | 21 +- .../ATen/native/mps/kernels/Indexing.metal | 341 ++++++++---------- .../ATen/native/mps/operations/Indexing.mm | 155 ++++---- c10/metal/atomic.h | 31 ++ test/bench_mps_ops.py | 11 + test/test_mps.py | 17 + torch/testing/_internal/common_mps.py | 2 - 9 files changed, 309 insertions(+), 299 deletions(-) diff --git a/aten/src/ATen/native/mps/MetalShaderLibrary.h b/aten/src/ATen/native/mps/MetalShaderLibrary.h index 6d35a5e9b2a31..535edd29ebd7a 100644 --- a/aten/src/ATen/native/mps/MetalShaderLibrary.h +++ b/aten/src/ATen/native/mps/MetalShaderLibrary.h @@ -156,7 +156,6 @@ class MetalShaderLibrary { MTLLibrary_t lib, const std::string& fname); MTLLibrary_t compileLibrary(const std::string& src); - void bind_tensors(MTLComputeCommandEncoder_t, TensorIteratorBase&); std::string shaderSource; unsigned nparams; MTLCompileOptions* compile_options; diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index 6474faac43ab8..976b62c7ac4b4 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -5,6 +5,7 @@ #include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include @@ -35,10 +36,6 @@ name:(NSString*)name; @end -// Fwd declarations -namespace at { -struct TensorIteratorBase; -} using namespace at::mps; namespace at::native::mps { @@ -508,6 +505,30 @@ static inline void mtl_setBytes(id encoder, const MPSS [encoder setBytes:&s.value length:s.size atIndex:idx]; } +static size_t iter_tensor_offset(TensorIteratorBase& iter, unsigned idx) { + // At the moment, MPS storage data is not the real GPU pointer, but rather a pointer to id object + // But TensorIterator constructs data_ptr as if base was just a raw pointer + // Workaround this problem by computing an offset from the start of the tensor, which works for both + // tensor views and sliced 64-bit iterators + return reinterpret_cast(iter.data_ptr(idx)) - + reinterpret_cast(iter.tensor_base(idx).storage().data()); +} + +static inline void bind_iter_tensors(id encoder, + TensorIteratorBase& iter, + std::optional ntensors = std::nullopt) { + for (auto idx : c10::irange(ntensors.value_or(iter.ntensors()))) { + auto& t = iter.tensor_base(idx); + // Handle CPU scalars + if (C10_UNLIKELY(t.device().type() == kCPU)) { + mtl_setBuffer(encoder, t, idx); + continue; + } + auto offs = iter_tensor_offset(iter, idx); + 
[encoder setBuffer:getMTLBufferStorage(t) offset:offs atIndex:idx]; + } +} + namespace detail { template inline void mtl_setArg(id encoder, const T& val, unsigned idx) { diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index 583eb41034508..142186b748b17 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -971,23 +971,6 @@ static dispatch_data_t getSectionData(const std::string& name) { } }; -void MetalShaderLibrary::bind_tensors(id encoder, TensorIteratorBase& iter) { - for (auto idx : c10::irange(iter.ntensors())) { - auto& t = iter.tensor_base(idx); - // Handle CPU scalars - if (C10_UNLIKELY(t.device().type() == kCPU)) { - mtl_setBuffer(encoder, t, idx); - continue; - } - // At the moment, MPS storage data is not the real GPU pointer, but rather a pointer to id object - // But TensorIterator constructs data_ptr as if base was just a raw pointer - // Workaround this problem by computing an offset from the start of the tensor, which works for both - // tensor vies and sliced 64-bit iterators - auto offs = reinterpret_cast(iter.data_ptr(idx)) - reinterpret_cast(t.storage().data()); - [encoder setBuffer:getMTLBufferStorage(t) offset:offs atIndex:idx]; - } -} - void MetalShaderLibrary::exec_unary_kernel(TensorIteratorBase& iter, const std::string& name, std::optional alpha, @@ -1024,7 +1007,7 @@ static dispatch_data_t getSectionData(const std::string& name) { getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor}); [computeEncoder setComputePipelineState:cplState]; - bind_tensors(computeEncoder, iter); + bind_iter_tensors(computeEncoder, iter); if (!iter.is_contiguous()) { mtl_setArgs<2>(computeEncoder, outputTensor.sizes(), @@ -1100,7 +1083,7 @@ static dispatch_data_t getSectionData(const std::string& name) { getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other}); [computeEncoder setComputePipelineState:binaryPSO]; // Set input and output tensors - bind_tensors(computeEncoder, iter); + bind_iter_tensors(computeEncoder, iter); // Iterator is contiguous if all of its elements are dense in storage, // i.e. 
it's true for both row-first and column-first tensors if (iter.is_contiguous()) { diff --git a/aten/src/ATen/native/mps/kernels/Indexing.metal b/aten/src/ATen/native/mps/kernels/Indexing.metal index 444b4c1711510..5651d4cbd4c4a 100644 --- a/aten/src/ATen/native/mps/kernels/Indexing.metal +++ b/aten/src/ATen/native/mps/kernels/Indexing.metal @@ -9,164 +9,191 @@ struct IndexAB { constant int64_t* indexArray; }; -template +template kernel void index_select( - constant IndexAB* indexAB [[buffer(0)]], - constant void* indexSizes [[buffer(1)]], - constant void* indexStrides [[buffer(2)]], - constant OffsetsT* offsets [[buffer(3)]], - constant void* inputData [[buffer(4)]], - device void* outputData [[buffer(5)]], - constant uint32_t& num_indices [[buffer(6)]], + device T* output, + constant T* input, + constant IndexAB* indices, + constant int64_t* sizes, + constant int64_t* output_strides, + constant int64_t* input_strides, + constant int64_t* indices_strides, + constant int64_t* index_sizes, + constant int64_t* index_strides, + constant uint4& ndim_nindices_numel, uint thread_index [[thread_position_in_grid]]) { - constant int64_t* index_sizes = (constant int64_t*)indexSizes; - constant int64_t* index_strides = (constant int64_t*)indexStrides; - int64_t offset = 0; - for (uint32_t i = 0; i < num_indices; i++) { - constant int64_t* indexArray = indexAB[i].indexArray; - int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; - if (index < 0) { - index += index_sizes[i]; + const auto ndim = ndim_nindices_numel.x; + const auto num_indices = ndim_nindices_numel.y; + uint pos[max_ndim]; + pos_from_thread_index(thread_index, pos, sizes, ndim); + const auto output_offs = offset_from_coord(pos, output_strides, ndim); + OffsetT input_offs = offset_from_coord(pos, input_strides, ndim); + const auto indices_offs = + offset_from_coord(pos, indices_strides, ndim) / sizeof(int64_t); + for (uint i = 0; i < num_indices; i++) { + auto idx = indices[i].indexArray[indices_offs]; + if (idx < 0) { + idx += index_sizes[i]; } - offset += index * index_strides[i]; + input_offs += idx * index_strides[i]; } - device T* out = - (device T*)((device char*)outputData + offsets[thread_index].x); - constant T* in = (constant T*)((constant char*)inputData + - offsets[thread_index].y + offset); - *out = *in; + output[output_offs / sizeof(T)] = input[input_offs / sizeof(T)]; } -template -void index_put_impl( - constant IndexAB* indexAB, +template +inline void index_put_impl( + device T* output, + constant T* input, + constant IndexAB* indices, + constant int64_t* sizes, + constant int64_t* output_strides, + constant int64_t* input_strides, + constant int64_t* indices_strides, constant int64_t* index_sizes, constant int64_t* index_strides, - constant OffsetsT* offsets, - constant void* inputData, - device void* outputData, - constant uint32_t& num_indices, + constant uint4& ndim_nindices_numel, uint thread_index) { - int64_t offset = 0; - for (uint32_t i = 0; i < num_indices; i++) { - constant int64_t* indexArray = indexAB[i].indexArray; - int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; - - if (index < 0) { - index += index_sizes[i]; + const auto ndim = ndim_nindices_numel.x; + const auto num_indices = ndim_nindices_numel.y; + uint pos[max_ndim]; + pos_from_thread_index(thread_index, pos, sizes, ndim); + OffsetT output_offs = offset_from_coord(pos, output_strides, ndim); + const auto input_offs = offset_from_coord(pos, input_strides, ndim); + const auto indices_offs = + offset_from_coord(pos, 
indices_strides, ndim) / sizeof(int64_t); + for (uint i = 0; i < num_indices; i++) { + auto idx = indices[i].indexArray[indices_offs]; + if (idx < 0) { + idx += index_sizes[i]; } - offset += index * index_strides[i]; + output_offs += idx * index_strides[i]; } - device T* out = - (device T*)((device char*)outputData + offsets[thread_index].x + offset); - constant T* in = - (constant T*)((constant char*)inputData + offsets[thread_index].y); - *out = *in; + output[output_offs / sizeof(T)] = input[input_offs / sizeof(T)]; +} + +template +kernel void index_put( + device T* output, + constant T* input, + constant IndexAB* indices, + constant int64_t* sizes, + constant int64_t* output_strides, + constant int64_t* input_strides, + constant int64_t* indices_strides, + constant int64_t* index_sizes, + constant int64_t* index_strides, + constant uint4& ndim_nindices_numel, + uint thread_index [[thread_position_in_grid]]) { + index_put_impl( + output, + input, + indices, + sizes, + output_strides, + input_strides, + indices_strides, + index_sizes, + index_strides, + ndim_nindices_numel, + thread_index); } -template +template kernel void index_put_serial( - constant IndexAB* indexAB [[buffer(0)]], - constant void* indexSizes [[buffer(1)]], - constant void* indexStrides [[buffer(2)]], - constant OffsetsT* offsets [[buffer(3)]], - constant void* inputData [[buffer(4)]], - device void* outputData [[buffer(5)]], - constant uint32_t& num_indices [[buffer(6)]], - constant uint* numIters [[buffer(7)]]) { - constant int64_t* index_sizes = (constant int64_t*)indexSizes; - constant int64_t* index_strides = (constant int64_t*)indexStrides; - - for (uint iter_i = 0; iter_i < *numIters; iter_i++) { - index_put_impl( - indexAB, + device T* output, + constant T* input, + constant IndexAB* indices, + constant int64_t* sizes, + constant int64_t* output_strides, + constant int64_t* input_strides, + constant int64_t* indices_strides, + constant int64_t* index_sizes, + constant int64_t* index_strides, + constant uint4& ndim_nindices_numel, + uint thread_index [[thread_position_in_grid]]) { + (void)thread_index; // Suppress unused vairable varning + for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) { + index_put_impl( + output, + input, + indices, + sizes, + output_strides, + input_strides, + indices_strides, index_sizes, index_strides, - offsets, - inputData, - outputData, - num_indices, - iter_i); + ndim_nindices_numel, + idx); } } -template -kernel void index_put( - constant IndexAB* indexAB [[buffer(0)]], - constant void* indexSizes [[buffer(1)]], - constant void* indexStrides [[buffer(2)]], - constant OffsetsT* offsets [[buffer(3)]], - constant void* inputData [[buffer(4)]], - device void* outputData [[buffer(5)]], - constant uint32_t& num_indices [[buffer(6)]], +template +kernel void index_put_accumulate( + device T* output, + constant T* input, + constant IndexAB* indices, + constant int64_t* sizes, + constant int64_t* output_strides, + constant int64_t* input_strides, + constant int64_t* indices_strides, + constant int64_t* index_sizes, + constant int64_t* index_strides, + constant uint4& ndim_nindices_numel, uint thread_index [[thread_position_in_grid]]) { - constant int64_t* index_sizes = (constant int64_t*)indexSizes; - constant int64_t* index_strides = (constant int64_t*)indexStrides; - index_put_impl( - indexAB, - index_sizes, - index_strides, - offsets, - inputData, - outputData, - num_indices, - thread_index); + const auto ndim = ndim_nindices_numel.x; + const auto num_indices = ndim_nindices_numel.y; + 
uint pos[max_ndim]; + pos_from_thread_index(thread_index, pos, sizes, ndim); + OffsetT output_offs = offset_from_coord(pos, output_strides, ndim); + const auto input_offs = offset_from_coord(pos, input_strides, ndim); + const auto indices_offs = + offset_from_coord(pos, indices_strides, ndim) / sizeof(int64_t); + for (uint i = 0; i < num_indices; i++) { + auto idx = indices[i].indexArray[indices_offs]; + if (idx < 0) { + idx += index_sizes[i]; + } + output_offs += idx * index_strides[i]; + } + AtomicType::atomic_add( + reinterpret_cast*>(output), + output_offs / sizeof(T), + input[input_offs / sizeof(T)]); } -#define REGISTER_INDEX_OP( \ - DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE) \ - template [[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE \ - "_" #IDX_SIZE)]] kernel void \ - index_##INDEX_OP_TYPE( \ - constant IndexAB * indexAB [[buffer(0)]], \ - constant void* indexSizes [[buffer(1)]], \ - constant void* indexStrides [[buffer(2)]], \ - constant IDX_DTYPE* offsets [[buffer(3)]], \ - constant void* inputData [[buffer(4)]], \ - device void* outputData [[buffer(5)]], \ - constant uint32_t& num_indices [[buffer(6)]], \ +#define REGISTER_INDEX_OP(OP_NAME, SUFFIX, DTYPE) \ + template [[host_name("index_" #OP_NAME "_" #SUFFIX)]] kernel void \ + index_##OP_NAME( \ + device DTYPE * output, \ + constant DTYPE * input, \ + constant IndexAB * indices, \ + constant int64_t* sizes, \ + constant int64_t* output_strides, \ + constant int64_t* input_strides, \ + constant int64_t* indices_strides, \ + constant int64_t* index_sizes, \ + constant int64_t* index_strides, \ + constant uint4& ndim_nindices_numel, \ uint thread_index [[thread_position_in_grid]]) -#define REGISTER_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE) \ - REGISTER_INDEX_OP(8bit, idx32, char, INDEX_OP_TYPE, uint3); \ - REGISTER_INDEX_OP(8bit, idx64, char, INDEX_OP_TYPE, ulong3); \ - REGISTER_INDEX_OP(16bit, idx32, short, INDEX_OP_TYPE, uint3); \ - REGISTER_INDEX_OP(16bit, idx64, short, INDEX_OP_TYPE, ulong3); \ - REGISTER_INDEX_OP(32bit, idx32, int, INDEX_OP_TYPE, uint3); \ - REGISTER_INDEX_OP(32bit, idx64, int, INDEX_OP_TYPE, ulong3); \ - REGISTER_INDEX_OP(64bit, idx32, long, INDEX_OP_TYPE, uint3); \ - REGISTER_INDEX_OP(64bit, idx64, long, INDEX_OP_TYPE, ulong3); +#define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \ + REGISTER_INDEX_OP(OP_NAME, 8bit, char); \ + REGISTER_INDEX_OP(OP_NAME, 16bit, short); \ + REGISTER_INDEX_OP(OP_NAME, 32bit, int); \ + REGISTER_INDEX_OP(OP_NAME, 64bit, long) REGISTER_INDEX_OP_ALL_DTYPES(select); REGISTER_INDEX_OP_ALL_DTYPES(put); +REGISTER_INDEX_OP_ALL_DTYPES(put_serial); -#define REGISTER_SINGLE_THREADED_INDEX_OP( \ - DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE) \ - template [[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE \ - "_" #IDX_SIZE)]] kernel void \ - index_##INDEX_OP_TYPE( \ - constant IndexAB * indexAB [[buffer(0)]], \ - constant void* indexSizes [[buffer(1)]], \ - constant void* indexStrides [[buffer(2)]], \ - constant IDX_DTYPE* offsets [[buffer(3)]], \ - constant void* inputData [[buffer(4)]], \ - device void* outputData [[buffer(5)]], \ - constant uint32_t& num_indices [[buffer(6)]], \ - constant uint* numIters [[buffer(7)]]) - -#define REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE) \ - REGISTER_SINGLE_THREADED_INDEX_OP(8bit, idx32, char, INDEX_OP_TYPE, uint3); \ - REGISTER_SINGLE_THREADED_INDEX_OP(8bit, idx64, char, INDEX_OP_TYPE, ulong3); \ - REGISTER_SINGLE_THREADED_INDEX_OP( \ - 16bit, idx32, short, INDEX_OP_TYPE, uint3); \ - REGISTER_SINGLE_THREADED_INDEX_OP( \ 
- 16bit, idx64, short, INDEX_OP_TYPE, ulong3); \ - REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx32, int, INDEX_OP_TYPE, uint3); \ - REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx64, int, INDEX_OP_TYPE, ulong3); \ - REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx32, long, INDEX_OP_TYPE, uint3); \ - REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx64, long, INDEX_OP_TYPE, ulong3); - -REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(put_serial); +REGISTER_INDEX_OP(put_accumulate, float, float); +REGISTER_INDEX_OP(put_accumulate, half, half); +REGISTER_INDEX_OP(put_accumulate, int, int); +REGISTER_INDEX_OP(put_accumulate, bool, bool); +#if __METAL_VERSION__ >= 310 +REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat); +#endif template kernel void kernel_index_offsets( @@ -201,60 +228,6 @@ kernel_index_offsets( constant uint& num_dimensions [[buffer(3)]], uint thread_index [[thread_position_in_grid]]); -template -kernel void index_put_accumulate( - constant IndexAB* indexAB [[buffer(0)]], - constant void* indexSizes [[buffer(1)]], - constant void* indexStrides [[buffer(2)]], - constant OffsetsT* offsets [[buffer(3)]], - constant void* inputData [[buffer(4)]], - device void* outputData [[buffer(5)]], - constant uint32_t& num_indices [[buffer(6)]], - uint thread_index [[thread_position_in_grid]]) { - constant int64_t* index_sizes = (constant int64_t*)indexSizes; - constant int64_t* index_strides = (constant int64_t*)indexStrides; - int64_t offset = offsets[thread_index].x; - for (uint32_t i = 0; i < num_indices; i++) { - constant int64_t* indexArray = indexAB[i].indexArray; - int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; - if (index < 0) { - index += index_sizes[i]; - } - offset += index * index_strides[i]; - } - const auto in = - *(constant T*)((constant char*)inputData + offsets[thread_index].y); - AtomicType::atomic_add( - reinterpret_cast*>(outputData), - offset / sizeof(T), - in); -} - -#define REGISTER_INDEX_PUT_ACCUMULATE(DTS, DTYPE, IDXS, IDX_DTYPE) \ - template [[host_name("index_put_accumulate_" #DTS "_" #DTYPE \ - "_" #IDXS)]] kernel void \ - index_put_accumulate( \ - constant IndexAB * indexAB [[buffer(0)]], \ - constant void* indexSizes [[buffer(1)]], \ - constant void* indexStrides [[buffer(2)]], \ - constant IDX_DTYPE* offsets [[buffer(3)]], \ - constant void* inputData [[buffer(4)]], \ - device void* outputData [[buffer(5)]], \ - constant uint32_t& num_indices [[buffer(6)]], \ - uint thread_index [[thread_position_in_grid]]) - -REGISTER_INDEX_PUT_ACCUMULATE(32bit, float, idx32, uint3); -REGISTER_INDEX_PUT_ACCUMULATE(32bit, float, idx64, ulong3); -REGISTER_INDEX_PUT_ACCUMULATE(32bit, int, idx32, uint3); -REGISTER_INDEX_PUT_ACCUMULATE(32bit, int, idx64, ulong3); -REGISTER_INDEX_PUT_ACCUMULATE(16bit, half, idx32, uint3); -REGISTER_INDEX_PUT_ACCUMULATE(16bit, half, idx64, ulong3); - -#if __METAL_VERSION__ >= 310 -REGISTER_INDEX_PUT_ACCUMULATE(16bit, bfloat, idx32, uint3); -REGISTER_INDEX_PUT_ACCUMULATE(16bit, bfloat, idx64, ulong3); -#endif - template kernel void masked_fill_scalar_dense( device T* input, diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index a226a7327b842..8d54ebd692191 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -102,91 +102,9 @@ TORCH_CHECK(scalarBitSize <= 64, "Unsupported data type: ", getMPSTypeString(scalar_type)); return std::to_string(scalarBitSize) + "bit"; } -static std::string getIndexFunctionName(ScalarType 
scalar_type, - bool index_select, - bool accumulate, - bool serial, - bool use_64bit_indexing) { - std::string indexFunction = index_select ? "index_select_" - : (accumulate && (scalar_type != kBool)) ? "index_put_accumulate_" - : (serial ? "index_put_serial_" : "index_put_"); - - indexFunction.append(getBitSizeString(scalar_type)); - if (accumulate) { - indexFunction.append(1, '_'); - indexFunction.append(scalarToMetalTypeString(scalar_type)); - } - indexFunction.append(use_64bit_indexing ? "_idx64" : "_idx32"); - return indexFunction; -} - -static bool dispatchIndexKernel(TensorIteratorBase& iter, - IntArrayRef index_size, - IntArrayRef index_stride, - bool index_select, - bool accumulate) { - using namespace mps; - - if (iter.numel() == 0) { - return true; - } - const bool serial_index_put = at::globalContext().deterministicAlgorithms() && !accumulate && !index_select; - - const Tensor& inputTensor = iter.tensor(1); - Tensor outputTensor = iter.tensor(0); - MPSStream* mpsStream = getCurrentMPSStream(); - id device = MPSDevice::getInstance()->device(); - - dispatch_sync_with_rethrow(mpsStream->queue(), ^() { - @autoreleasepool { - NSError* error = nil; - const int64_t num_indices = index_size.size(); - const uint32_t numIters = serial_index_put ? iter.numel() : 1; - uint32_t numThreads = iter.numel(); - - id computeEncoder = mpsStream->commandEncoder(); - const bool use_64bit_indexing = !iter.can_use_32bit_indexing(); - auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter, use_64bit_indexing); - - auto indexFunction = getIndexFunctionName( - inputTensor.scalar_type(), index_select, accumulate, serial_index_put, use_64bit_indexing); - auto indexSelectPSO = lib.getPipelineStateForFunc(indexFunction); - size_t argumentBufferLength = sizeof(uint64_t) * num_indices; - auto indexAB = [[device newBufferWithLength:argumentBufferLength options:0] autorelease]; - uint64_t* indexABContents = (uint64_t*)(indexAB.contents); - for (uint32_t idx = 0; idx < num_indices; idx++) { - const Tensor& indexTensor = iter.tensor(idx + 2); - indexABContents[idx] = - getMTLBufferStorage(indexTensor).gpuAddress + (indexTensor.storage_offset() * indexTensor.element_size()); - TORCH_CHECK(indexTensor.scalar_type() == ScalarType::Long, "index(): Expected dtype int64 for Index"); - [computeEncoder useResource:getMTLBufferStorage(indexTensor) usage:MTLResourceUsageRead]; - } - // this function call is a no-op if MPS Profiler is not enabled - getMPSProfiler().beginProfileKernel(indexSelectPSO, indexFunction, {inputTensor}); - - [computeEncoder setComputePipelineState:indexSelectPSO]; - mtl_setArgs( - computeEncoder, indexAB, index_size, index_stride, kernelDataOffsets, inputTensor, outputTensor, num_indices); - MTLSize gridSize = MTLSizeMake(numThreads, 1, 1); - if (serial_index_put) { - mtl_setBytes(computeEncoder, numIters, 7); - gridSize = MTLSizeMake(1, 1, 1); - numThreads = 1; - } - - NSUInteger tgSize = indexSelectPSO.maxTotalThreadsPerThreadgroup; - if (tgSize > numThreads) { - tgSize = numThreads; - } - - MTLSize threadGroupSize = MTLSizeMake(tgSize, 1, 1); - [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadGroupSize]; - getMPSProfiler().endProfileKernel(indexSelectPSO); - } - }); - - return true; +static std::string getBitSizeString(const TensorBase& t) { + return getBitSizeString(t.scalar_type()); } static void validateInputData(const TensorIteratorBase& iter, @@ -237,11 +155,56 @@ static void validateInputData(const TensorIteratorBase& iter, return result; } -static void 
index_kernel_mps(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride) { - @autoreleasepool { - validateInputData(iter, index_size, index_stride, "index.Tensor_out", /*accumulate=*/false); - dispatchIndexKernel(iter, index_size, index_stride, /*index_select=*/true, /*accumulate=*/false); +static void dispatch_index_kernel(TensorIteratorBase& iter, + IntArrayRef index_size, + IntArrayRef index_stride, + const std::string& kernel_name, + const bool serial = false) { + validateInputData(iter, index_size, index_stride, "index.Tensor_out", /*accumulate=*/false); + if (iter.numel() == 0) + return; + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + dispatch_index_kernel(sub_iter, index_size, index_stride, kernel_name); + } + return; } + const auto mpsStream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + const int64_t num_indices = index_size.size(); + auto indexSelectPSO = lib.getPipelineStateForFunc(kernel_name); + auto computeEncoder = mpsStream->commandEncoder(); + size_t argumentBufferLength = sizeof(uint64_t) * num_indices; + std::vector indexAB; + std::array ndim_nindiees = {static_cast(iter.ndim()), + static_cast(index_size.size()), + static_cast(iter.numel()), + 0}; + for (uint32_t idx = 0; idx < num_indices; idx++) { + const auto& indexTensor = iter.tensor_base(idx + 2); + indexAB.push_back(getMTLBufferStorage(indexTensor).gpuAddress + iter_tensor_offset(iter, idx + 2)); + TORCH_CHECK(indexTensor.scalar_type() == ScalarType::Long, "index(): Expected dtype int64 for Index"); + [computeEncoder useResource:getMTLBufferStorage(indexTensor) usage:MTLResourceUsageRead]; + } + [computeEncoder setComputePipelineState:indexSelectPSO]; + bind_iter_tensors(computeEncoder, iter, 2); + mtl_setArgs<2>(computeEncoder, + indexAB, + iter.shape(), + iter.strides(0), + iter.strides(1), + iter.strides(2), + index_size, + index_stride, + ndim_nindiees); + mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 
1 : iter.numel()); + }); +} + +static void index_kernel_mps(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride) { + validateInputData(iter, index_size, index_stride, "index.Tensor_out", /*accumulate=*/false); + dispatch_index_kernel( + iter, index_size, index_stride, fmt::format("index_select_{}", getBitSizeString(iter.tensor_base(0)))); } static void index_put_kernel_mps(TensorIterator& iter, @@ -250,7 +213,21 @@ static void index_put_kernel_mps(TensorIterator& iter, bool accumulate) { @autoreleasepool { validateInputData(iter, index_size, index_stride, "index_put_impl", accumulate); - dispatchIndexKernel(iter, index_size, index_stride, /*index_select=*/false, accumulate); + if (accumulate) { + dispatch_index_kernel(iter, + index_size, + index_stride, + fmt::format("index_put_accumulate_{}", scalarToMetalTypeString(iter.tensor_base(0)))); + } else if (at::globalContext().deterministicAlgorithms()) { + dispatch_index_kernel(iter, + index_size, + index_stride, + fmt::format("index_put_serial_{}", getBitSizeString(iter.tensor_base(0))), + true); + } else { + dispatch_index_kernel( + iter, index_size, index_stride, fmt::format("index_put_{}", getBitSizeString(iter.tensor_base(0)))); + } } } } // namespace mps diff --git a/c10/metal/atomic.h b/c10/metal/atomic.h index 141fa692ab5da..84698024e8873 100644 --- a/c10/metal/atomic.h +++ b/c10/metal/atomic.h @@ -70,5 +70,36 @@ struct AtomicType { }; #endif +// Metal supports atomic_store_explicit for bools, but +// sizeof(::metal::atomic_bool) is 4 Therefore it could not be used to +// atomically modify unaligned memory, so fall back to compare and exchange +// trick As accumulation over booleans are just or operation, do nothing if +// value is false +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, bool value) { + if (!value) { + return; + } + auto ptr = data + (offset >> 2); + auto old = + ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed); + union { + uint i; + bool t[4]; + } val; + do { + val.i = old; + val.t[offset & 3] = true; + } while (!::metal::atomic_compare_exchange_weak_explicit( + ptr, + &old, + val.i, + ::metal::memory_order_relaxed, + ::metal::memory_order_relaxed)); + } +}; + } // namespace metal } // namespace c10 diff --git a/test/bench_mps_ops.py b/test/bench_mps_ops.py index 7bc2da5455950..c3f1999f1deb4 100644 --- a/test/bench_mps_ops.py +++ b/test/bench_mps_ops.py @@ -153,6 +153,17 @@ def main() -> None: if torch.backends.mps.is_macos_or_newer(14, 0): dtypes.append(torch.bfloat16) + # Profile index ops + B = 11 + rc = [] + for dtype, N in itertools.product( + [torch.int8, torch.float16, torch.float32], [50, 100, 500, 1000, 2000] + ): + x = torch.testing.make_tensor((B, N, N), device="mps", dtype=dtype) + y = torch.randint(0, B, (3,)) + rc.append(bench_binary_op(torch.Tensor.__getitem__, x, y, f"{B}x{N}x{N}")) + Compare(rc).print() + # Profile unary ops rc = [] for op, dtype in itertools.product([torch.sqrt, torch.sin], dtypes): diff --git a/test/test_mps.py b/test/test_mps.py index 9bae2889f5843..d6170e0793336 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -7968,6 +7968,23 @@ def test_64bit_binops(self): rc_slice_cpu = (a.cpu() + b.cpu()[slice_idx:]).sin() self.assertEqual(rc_slice, rc_slice_cpu) + @serialTest() + def test_64bit_index_select(self): + if torch.mps.recommended_max_memory() < 16_000_000_000: + raise unittest.SkipTest("Needs at least 16Gb of RAM") + B, N = 11, 20000 + x = torch.empty(B, N, 
N, dtype=torch.float16, device='mps') + for i in range(B): + x[i] = 1.0 * i + batch_idx = torch.tensor([9], device='mps') + y = x[batch_idx] + self.assertEqual(y[0, 1, 2].item(), 9.0) + # Reclaim memory after running the tests + del y + del x + gc.collect() + torch.mps.empty_cache() + class TestLogical(TestCaseMPS): def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False): diff --git a/torch/testing/_internal/common_mps.py b/torch/testing/_internal/common_mps.py index 503312e843954..5674154bac006 100644 --- a/torch/testing/_internal/common_mps.py +++ b/torch/testing/_internal/common_mps.py @@ -544,7 +544,6 @@ def mps_ops_modifier( "rounddecimals_0": [torch.bfloat16], # atomic operations not supported "_unsafe_masked_index_put_accumulate": [ - torch.bool, torch.int8, torch.uint8, torch.int16, @@ -645,7 +644,6 @@ def mps_ops_modifier( torch.bfloat16, ], "index_put": [ - torch.bool, torch.uint8, torch.int8, torch.int16, From 71c68bcc275df19f471f4e3e01d8b92baad82c06 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 16 Jul 2025 15:43:51 -0400 Subject: [PATCH 35/83] Add flag to fx.passes.split_module to normalize input names (#157793) Add flag to fx.passes.split_module to normalize input names (#157733) This is useful for vLLM, which runs AOTAutograd directly on graphs after they have been split. I created a new flag for this instead of reusing `keep_original_node_name` (please let me know if you think I should reuse this). The reasoning is: - The names of the placeholder nodes is different from the targets of the placehoder nodes. The targets are the actual input names. - Backwards compatibility: this API has been out for ~4 years, it looks public, and it has extensive public use. For example, this change would actually be BC-breaking to vLLM (they rely on the subgraph input names being different at the moment). 
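For illustration, a minimal sketch of how the flag is intended to be used once this change lands (the toy module and the single-partition callback below are made up for the example, not part of the patch):

```
import torch
from torch.fx.passes.split_module import split_module

class M(torch.nn.Module):
    def forward(self, x, y):
        return (x + y) * y

mod = M()
traced = torch.fx.symbolic_trace(mod)

# Trivial partitioning: put every compute node into partition 0.
split = split_module(traced, mod, lambda node: 0, keep_original_input_name=False)

# With keep_original_input_name=False the submodule placeholders are
# normalized to arg_0, arg_1, ... instead of reusing "x" and "y".
print([n.target for n in split.submod_0.graph.nodes if n.op == "placeholder"])
```
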
Test Plan: - new tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/157733 Approved by: https://github.com/ezyang (cherry picked from commit b9afdd9bcc738697c6eefc90899508ab783bf6ab) Co-authored-by: rzou --- ...t-fx_backcompat_function_signatures.expect | 2 +- test/test_fx_experimental.py | 40 +++++++++++++++++++ torch/fx/passes/split_module.py | 35 +++++++++++----- 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect index cd7d6374f6dad..fab0dbd066761 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect @@ -64,7 +64,7 @@ torch.fx.node.map_aggregate(a: torch.fx.node.Argument, fn: Callable[[torch.fx.no torch.fx.node.map_arg(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Node], torch.fx.node.Argument]) -> torch.fx.node.Argument torch.fx.passes.reinplace.reinplace(gm, *sample_args) torch.fx.passes.runtime_assert.insert_deferred_runtime_asserts(gm: torch.fx.graph_module.GraphModule, shape_env: Any, name: str, export: bool = False) -> None -torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int], qualname_map: Optional[Dict[str, str]] = None, keep_original_order: Optional[bool] = False, keep_original_node_name: Optional[bool] = False) +torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int], qualname_map: Optional[Dict[str, str]] = None, keep_original_order: Optional[bool] = False, keep_original_node_name: Optional[bool] = False, keep_original_input_name: bool = True) torch.fx.proxy.Attribute.__init__(self, root: torch.fx.proxy.Proxy, attr: str) torch.fx.proxy.Proxy.__init__(self, node: torch.fx.node.Node, tracer: 'Optional[TracerBase]' = None) torch.fx.proxy.Proxy.keys(self) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 434de5243c139..91b574c9b04c0 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -791,6 +791,46 @@ def mod_partition(node: Node): self.assertEqual(orig_out, submodules_out) + def test_split_module_input_names(self): + class Mod(torch.nn.Module): + def forward(self, x, a0, a1, b0, b1, c0, c1): + x = x + (a0 ** 2) + (a1 / 2) + x = x + (b0 ** 2) + (b1 / 2) + x = x + (c0 ** 2) + (c1 / 2) + return x + + mod = Mod() + traced = torch.fx.symbolic_trace(mod) + + seen = 0 + + def split(n): + nonlocal seen + result = seen // 4 + seen += 1 + return result + + split = split_module(traced, mod, split, keep_original_input_name=False) + + # All the submodules should take in the inputs in the same order. 
+ args = [torch.tensor(2.), torch.tensor(3.), torch.tensor(4.)] + output0 = split.submod_0(*args) + output1 = split.submod_1(*args) + output2 = split.submod_2(*args) + self.assertEqual(output0, output1) + self.assertEqual(output1, output2) + + # Each submodule should have normalized input names + def check_ph(gm): + nodes = list(gm.graph.nodes) + self.assertEqual(nodes[0].target, "arg_0") + self.assertEqual(nodes[1].target, "arg_1") + self.assertEqual(nodes[2].target, "arg_2") + + check_ph(split.submod_0) + check_ph(split.submod_1) + check_ph(split.submod_2) + def test_split_module_dead_code(self): class ModWithDeadCode(torch.nn.Module): def forward(self, x): diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py index 59c560423d401..413584070d133 100644 --- a/torch/fx/passes/split_module.py +++ b/torch/fx/passes/split_module.py @@ -58,6 +58,7 @@ def split_module( qualname_map: Optional[dict[str, str]] = None, keep_original_order: Optional[bool] = False, keep_original_node_name: Optional[bool] = False, + keep_original_input_name: bool = True, ): """ Creates subgraphs out of main graph @@ -76,7 +77,10 @@ def split_module( names in the original module. keep_original_order: Optional[bool]: keep the original order of the GraphModule or use the Topological order of the new constructed GraphModule - + keep_original_node_name: Optional[bool]: If the partitioned graphs should + have the same node names as the original graph. + keep_original_input_name: bool: If the partitioned graphs should + have the same input names as the original graph. Returns: GraphModule: the module after split. @@ -419,11 +423,28 @@ def instantiate_node_partition_mapping(node): for partition_name in sorted_partitions: partition = partitions[partition_name] new_inputs: dict[str, None] = {} + + counter = 0 + for inp in partition.inputs: orig_node = orig_nodes[inp] # We don't pass in get_attr nodes as inputs to the partition, but # instead set them as targets and use getattr within the module + def add_placeholder(): + if keep_original_input_name: + name = inp + else: + nonlocal counter + name = f"arg_{counter}" + counter += 1 + placeholder = partition.graph.placeholder( + name, + type_expr=orig_nodes[inp].type, + ) + new_inputs[inp] = None + return placeholder + if orig_node.op == "get_attr": assert isinstance(orig_node.target, str) @@ -432,17 +453,9 @@ def instantiate_node_partition_mapping(node): placeholder = partition.graph.get_attr(orig_node.target) partition.targets[orig_node.target] = orig_attr else: - placeholder = partition.graph.placeholder( - inp, - type_expr=orig_nodes[inp].type, - ) - new_inputs[inp] = None + placeholder = add_placeholder() else: - placeholder = partition.graph.placeholder( - inp, - type_expr=orig_nodes[inp].type, - ) - new_inputs[inp] = None + placeholder = add_placeholder() placeholder.meta = orig_nodes[inp].meta.copy() partition.environment[orig_nodes[inp]] = placeholder partition.inputs = new_inputs From 4c1d666ee8170a32fd4b527f3fed58cb3ef6079a Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 16 Jul 2025 19:14:19 -0400 Subject: [PATCH 36/83] Add warning about removed sm50 and sm60 arches (#158478) Add warning about removed sm50 and sm60 arches (#158301) Related to https://github.com/pytorch/pytorch/issues/157517 Detect when users are executing torch build with cuda 12.8/12.9 and running on Maxwell or Pascal architectures. 
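The gist of the check: compare each visible device's compute capability against the minimum and maximum sm architectures baked into the wheel (reported by torch.cuda.get_arch_list()) and warn when it falls outside that range. A rough standalone sketch of the idea, not the actual code in the diff below (the helper name is made up):

```
import torch

def find_unsupported_devices():
    # sm_XY entries compiled into this build, e.g. ['sm_70', ..., 'sm_120']
    arches = [int(a.rsplit("_", 1)[1]) for a in torch.cuda.get_arch_list() if a.startswith("sm_")]
    if not arches:
        return []
    min_arch, max_arch = min(arches), max(arches)
    unsupported = []
    for d in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(d)
        if not (min_arch <= major * 10 + minor <= max_arch):
            unsupported.append((d, torch.cuda.get_device_name(d), f"{major}.{minor}"))
    return unsupported
```
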
We would like to include reference to the issue: https://github.com/pytorch/pytorch/issues/157517 as well as ask people to install CUDA 12.6 builds if they are running on sm50 or sm60 architectures. Test: ``` >>> torch.cuda.get_arch_list() ['sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90', 'sm_100', 'sm_120', 'compute_120'] >>> torch.cuda.init() /home/atalman/.conda/envs/py312/lib/python3.12/site-packages/torch/cuda/__init__.py:263: UserWarning: Found which is of cuda capability 5.0. PyTorch no longer supports this GPU because it is too old. The minimum cuda capability supported by this library is 7.0. warnings.warn( /home/atalman/.conda/envs/py312/lib/python3.12/site-packages/torch/cuda/__init__.py:268: UserWarning: Support for Maxwell and Pascal architectures is removed for CUDA 12.8+ builds. Please see https://github.com/pytorch/pytorch/issues/157517 Please install CUDA 12.6 builds if you require Maxwell or Pascal support. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158301 Approved by: https://github.com/nWEIdia, https://github.com/albanD (cherry picked from commit fb731fe371cb1b5bf95de84b19c213590526acb2) Co-authored-by: atalman --- torch/cuda/__init__.py | 56 +++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index b70808a57c2da..e3793234f2bc8 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -244,21 +244,25 @@ def _extract_arch_version(arch_string: str): def _check_capability(): - incorrect_binary_warn = """ - Found GPU%d %s which requires CUDA_VERSION >= %d to - work properly, but your PyTorch was compiled - with CUDA_VERSION %d. Please install the correct PyTorch binary - using instructions from https://pytorch.org - """ # noqa: F841 - - old_gpu_warn = """ + incompatible_gpu_warn = """ Found GPU%d %s which is of cuda capability %d.%d. - PyTorch no longer supports this GPU because it is too old. - The minimum cuda capability supported by this library is %d.%d. 
+ Minimum and Maximum cuda capability supported by this version of PyTorch is + (%d.%d) - (%d.%d) """ + matched_cuda_warn = """ + Please install PyTorch with a following CUDA + configurations: {} following instructions at + https://pytorch.org/get-started/locally/ + """ + + # Binary CUDA_ARCHES SUPPORTED by PyTorch + CUDA_ARCHES_SUPPORTED = { + "12.6": {"min": 50, "max": 90}, + "12.8": {"min": 70, "max": 120}, + "12.9": {"min": 70, "max": 120}, + } if torch.version.cuda is not None: # on ROCm we don't want this check - CUDA_VERSION = torch._C._cuda_getCompiledVersion() # noqa: F841 for d in range(device_count()): capability = get_device_capability(d) major = capability[0] @@ -267,13 +271,35 @@ def _check_capability(): current_arch = major * 10 + minor min_arch = min( (_extract_arch_version(arch) for arch in torch.cuda.get_arch_list()), - default=35, + default=50, ) - if current_arch < min_arch: + max_arch = max( + (_extract_arch_version(arch) for arch in torch.cuda.get_arch_list()), + default=50, + ) + if current_arch < min_arch or current_arch > max_arch: warnings.warn( - old_gpu_warn - % (d, name, major, minor, min_arch // 10, min_arch % 10) + incompatible_gpu_warn + % ( + d, + name, + major, + minor, + min_arch // 10, + min_arch % 10, + max_arch // 10, + max_arch % 10, + ) ) + matched_arches = "" + for arch, arch_info in CUDA_ARCHES_SUPPORTED.items(): + if ( + current_arch >= arch_info["min"] + and current_arch <= arch_info["max"] + ): + matched_arches += f" {arch}" + if matched_arches != "": + warnings.warn(matched_cuda_warn.format(matched_arches)) def _check_cubins(): From 352edf2a8db0b387b762f7f1446d4b0be032aac9 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 18 Jul 2025 10:44:04 -0400 Subject: [PATCH 37/83] [cherry-pick][inductor][triton] Update HAS_WARP_SPEC to check triton.Config params. Update Triton Hash to top of release/3.4.x stack (#158646) * [inductor][triton] Update HAS_WARP_SPEC to check triton.Config params. Update Triton Hash to top of release/3.4.x stack (#158459) Update triton commit hash to `11ec6354315768a85da41032535e3b7b99c5f706`, which is the new release/3.4.x branch in triton-lang/triton. Also, update HAS_WARP_SPEC handling: In triton 3.4, warp spec will have a different interface: num_consumer_groups will be determined automatically by the compiler. This breaks the current Inductor integration, so for now, update HAS_WARP_SPEC to check whether triton.Config takes num_consumer_groups and num_buffers_warp_spec as parameters. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158459 Approved by: https://github.com/atalman * dont_upde_hash * Revert "dont_upde_hash" This reverts commit 5fffb12a3adead5c1ac9f6d9d1f505cbc74f3421. 
* fix_docker_builds --------- Co-authored-by: David Berard --- .ci/docker/ci_commit_pins/triton.txt | 2 +- .ci/docker/common/install_conda.sh | 2 ++ test/inductor/test_static_cuda_launcher.py | 31 ---------------------- torch/_inductor/runtime/triton_compat.py | 13 ++++++++- 4 files changed, 15 insertions(+), 33 deletions(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index cf43cba72a42b..6dc1c44507ebd 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -5e5685356b9fc7b5ad9cdf4e510a1994a5b8601a +11ec6354315768a85da41032535e3b7b99c5f706 diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index b33f7f0a1e9d3..3029b5967a87a 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -54,6 +54,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then export SYSROOT_DEP="sysroot_linux-64=2.17" fi + # Please see: https://github.com/pytorch/pytorch/pull/158370#issuecomment-3084705725 + export CONDA_PLUGINS_AUTO_ACCEPT_TOS="yes" # Install correct Python version # Also ensure sysroot is using a modern GLIBC to match system compilers as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\ diff --git a/test/inductor/test_static_cuda_launcher.py b/test/inductor/test_static_cuda_launcher.py index 477d5ac2e6c20..2ce294ed0ff55 100644 --- a/test/inductor/test_static_cuda_launcher.py +++ b/test/inductor/test_static_cuda_launcher.py @@ -14,7 +14,6 @@ from torch._inductor.test_case import TestCase from torch.testing._internal.common_utils import skipIfRocm from torch.testing._internal.triton_utils import requires_cuda -from torch.torch_version import TorchVersion @requires_cuda @@ -140,36 +139,6 @@ def signed_integers( launcher.run(1, 1, 1, stream, new_arg0, 50, 50, 50, 50) self.assertEqual(new_arg0, arg0) - # TODO: floats don't work properly, triton seems to think they're all tl.float32 - # despite type annotations. - # There's also not really a good way for me to make a float16 in python... 
- @skipIfRocm - def test_floats(self): - @triton.jit - def floats(arg0, arg1: tl.float16, arg2: tl.float32, arg3: tl.float64): - x = tl.load(arg0) - y = arg1 + arg2 + arg3 - tl.store(arg0, x + y) - - arg0 = torch.zeros(1, dtype=torch.float64, device="cuda") - - args = (arg0, 1.0, 1.0, 1.0) - - compiled_kernel = floats[1,](*args) - launcher = self._make_launcher(compiled_kernel) - if TorchVersion(triton.__version__) >= TorchVersion("3.4.0"): - self.assertEqual(launcher.arg_tys, "Offd") - else: - self.assertEqual(launcher.arg_tys, "Offf") - # TODO this line fails on Triton 3.4.0 (https://github.com/triton-lang/triton/issues/6176) - # Add the check back when this is fixed in Triton - # self.assertEqual(arg0, torch.tensor([3.0], dtype=torch.float64, device="cuda")) - new_arg0 = torch.zeros(1, dtype=torch.float64, device="cuda") - device_interface = get_interface_for_device("cuda") - stream = device_interface.get_raw_stream(device_interface.current_device()) - launcher.run(1, 1, 1, stream, new_arg0, 1.0, 1.0, 1.0) - self.assertEqual(new_arg0, arg0) - @skipIfRocm def test_basic_1arg(self): @triton.jit diff --git a/torch/_inductor/runtime/triton_compat.py b/torch/_inductor/runtime/triton_compat.py index b753c492a2899..d5aeb90d7d684 100644 --- a/torch/_inductor/runtime/triton_compat.py +++ b/torch/_inductor/runtime/triton_compat.py @@ -69,7 +69,18 @@ def GPUTarget( def _log2(x: Any) -> Any: raise NotImplementedError - HAS_WARP_SPEC = hasattr(tl, "async_task") + def _triton_config_has(param_name: str) -> bool: + if not hasattr(triton, "Config"): + return False + if not hasattr(triton.Config, "__init__"): + return False + return param_name in inspect.signature(triton.Config.__init__).parameters + + HAS_WARP_SPEC = ( + hasattr(tl, "async_task") + and _triton_config_has("num_consumer_groups") + and _triton_config_has("num_buffers_warp_spec") + ) try: from triton import knobs From 66b89d19f7d0d510d96af45a5a777f8bb37afb8f Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 18 Jul 2025 10:45:13 -0400 Subject: [PATCH 38/83] [CUDA] Use runtime driver API for cuStreamWriteValue32 (#158585) [CUDA] Use runtime driver API for cuStreamWriteValue32 (#158295) Reopen https://github.com/pytorch/pytorch/pull/156097 Fixes https://github.com/pytorch/pytorch/issues/154073 Reference: https://github.com/NVIDIA/Fuser/pull/4197 See PR https://github.com/pytorch/pytorch/pull/156097 and https://github.com/pytorch/pytorch/pull/154097 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158295 Approved by: https://github.com/Skylion007, https://github.com/ngimel, https://github.com/eqy, https://github.com/huydhn (cherry picked from commit a9f902add02383ca1b0386eb865767641975fede) Co-authored-by: Frank Lin Co-authored-by: Wei Wang --- c10/cuda/driver_api.cpp | 55 ++++++++++++++++----- c10/cuda/driver_api.h | 60 ++++++++++++++--------- test/distributed/test_symmetric_memory.py | 4 -- 3 files changed, 81 insertions(+), 38 deletions(-) diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp index bb201b5c0397f..f4b62e53fcc00 100644 --- a/c10/cuda/driver_api.cpp +++ b/c10/cuda/driver_api.cpp @@ -1,30 +1,35 @@ #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#include #include #include #include +#include +#include #include namespace c10::cuda { namespace { +void* get_symbol(const char* name, int version); + DriverAPI create_driver_api() { - void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD); - TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror()); void* handle_1 = 
DriverAPI::get_nvml_handle(); DriverAPI r{}; -#define LOOKUP_LIBCUDA_ENTRY(name) \ - r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \ - TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror()) - C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY) -#undef LOOKUP_LIBCUDA_ENTRY +#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED(name, version) \ + r.name##_ = reinterpret_cast(get_symbol(#name, version)); \ + TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name); + C10_LIBCUDA_DRIVER_API_REQUIRED(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED) +#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED -#define LOOKUP_LIBCUDA_ENTRY(name) \ - r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \ - dlerror(); - C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY) -#undef LOOKUP_LIBCUDA_ENTRY +// Users running drivers between 12.0 and 12.3 will not have these symbols, +// they would be resolved into nullptr, but we guard their usage at runtime +// to ensure safe fallback behavior. +#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL(name, version) \ + r.name##_ = reinterpret_cast(get_symbol(#name, version)); + C10_LIBCUDA_DRIVER_API_OPTIONAL(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL) +#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL if (handle_1) { #define LOOKUP_NVML_ENTRY(name) \ @@ -35,6 +40,32 @@ DriverAPI create_driver_api() { } return r; } + +void* get_symbol(const char* name, int version) { + void* out = nullptr; + cudaDriverEntryPointQueryResult qres{}; + + // CUDA 12.5+ supports version-based lookup +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050) + if (auto st = cudaGetDriverEntryPointByVersion( + name, &out, version, cudaEnableDefault, &qres); + st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) { + return out; + } +#endif + + // This fallback to the old API to try getting the symbol again. + if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres); + st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) { + return out; + } + + // If the symbol cannot be resolved, report and return nullptr; + // the caller is responsible for checking the pointer. + LOG(INFO) << "Failed to resolve symbol " << name; + return nullptr; +} + } // namespace void* DriverAPI::get_nvml_handle() { diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h index a8ded9de68d72..9800809d1e535 100644 --- a/c10/cuda/driver_api.h +++ b/c10/cuda/driver_api.h @@ -20,29 +20,42 @@ } \ } while (0) -#define C10_LIBCUDA_DRIVER_API(_) \ - _(cuDeviceGetAttribute) \ - _(cuMemAddressReserve) \ - _(cuMemRelease) \ - _(cuMemMap) \ - _(cuMemAddressFree) \ - _(cuMemSetAccess) \ - _(cuMemUnmap) \ - _(cuMemCreate) \ - _(cuMemGetAllocationGranularity) \ - _(cuMemExportToShareableHandle) \ - _(cuMemImportFromShareableHandle) \ - _(cuMemsetD32Async) \ - _(cuStreamWriteValue32) \ - _(cuGetErrorString) +// The integer in the second column specifies the requested CUDA Driver API +// version. The dynamic loader will accept a driver with a newer version, but it +// ensures that the requested symbol exists in *at least* the specified version +// or earlier. + +// Keep these requested versions as low as possible to maximize compatibility +// across different driver versions. + +// Why do we pin to an older version instead of using the latest? +// If a user installs a newer driver, blindly resolving the symbol may bind to a +// newer version of the function with different behavior, potentially breaking +// PyTorch. 
+ +#define C10_LIBCUDA_DRIVER_API_REQUIRED(_) \ + _(cuDeviceGetAttribute, 12000) \ + _(cuMemAddressReserve, 12000) \ + _(cuMemRelease, 12000) \ + _(cuMemMap, 12000) \ + _(cuMemAddressFree, 12000) \ + _(cuMemSetAccess, 12000) \ + _(cuMemUnmap, 12000) \ + _(cuMemCreate, 12000) \ + _(cuMemGetAllocationGranularity, 12000) \ + _(cuMemExportToShareableHandle, 12000) \ + _(cuMemImportFromShareableHandle, 12000) \ + _(cuMemsetD32Async, 12000) \ + _(cuStreamWriteValue32, 12000) \ + _(cuGetErrorString, 12000) #if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030) -#define C10_LIBCUDA_DRIVER_API_12030(_) \ - _(cuMulticastAddDevice) \ - _(cuMulticastBindMem) \ - _(cuMulticastCreate) +#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \ + _(cuMulticastAddDevice, 12030) \ + _(cuMulticastBindMem, 12030) \ + _(cuMulticastCreate, 12030) #else -#define C10_LIBCUDA_DRIVER_API_12030(_) +#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) #endif #define C10_NVML_DRIVER_API(_) \ @@ -56,11 +69,14 @@ namespace c10::cuda { struct DriverAPI { +#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_; #define CREATE_MEMBER(name) decltype(&name) name##_; - C10_LIBCUDA_DRIVER_API(CREATE_MEMBER) - C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER) + C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED) + C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED) C10_NVML_DRIVER_API(CREATE_MEMBER) +#undef CREATE_MEMBER_VERSIONED #undef CREATE_MEMBER + static DriverAPI* get(); static void* get_nvml_handle(); }; diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py index 77820dee85f26..57cd9071bc07d 100644 --- a/test/distributed/test_symmetric_memory.py +++ b/test/distributed/test_symmetric_memory.py @@ -1078,10 +1078,6 @@ class SymmMemSingleProcTest(TestCase): not TEST_WITH_ROCM and _get_torch_cuda_version() < (12, 0), "stream_write_value32 currently only supports cuda version>=12.0", ) - @skipIf( - _get_torch_cuda_version() >= (12, 6), - "https://github.com/pytorch/pytorch/issues/154073", - ) @runOnRocmArch(MI300_ARCH) def test_stream_write_value32(self): tensor = torch.zeros(4, dtype=torch.uint32, device="cuda") From 10eb3f210f903cae582e1a0f1b6c94e6a769c78d Mon Sep 17 00:00:00 2001 From: Cao E Date: Sat, 19 Jul 2025 01:11:55 +0800 Subject: [PATCH 39/83] Add stride check for attn_mask on non-cpu device (#158618) Add stride check for attn_mask on non-cpu device (#158424) Fixes #158374 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158424 Approved by: https://github.com/Valentine233, https://github.com/drisspg, https://github.com/atalman --- .../ATen/native/transformers/sdp_utils_cpp.h | 31 +++++++++++++------ test/inductor/test_fused_attention.py | 12 ++++++- test/test_transformers.py | 28 +++++++++++++++++ 3 files changed, 60 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/native/transformers/sdp_utils_cpp.h b/aten/src/ATen/native/transformers/sdp_utils_cpp.h index aa5c2b6cdd641..c63ca928613e6 100644 --- a/aten/src/ATen/native/transformers/sdp_utils_cpp.h +++ b/aten/src/ATen/native/transformers/sdp_utils_cpp.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -503,17 +504,27 @@ inline bool check_last_dim_stride_equals_1_dense(sdp_params const& params, bool if (ignore_singleton_dim){ qkv_strides_equal_1 = qkv_strides_equal_1 || params.query.sym_size(-1) == 1; } - if (!qkv_strides_equal_1) { + bool is_cpu = params.query.device().type() == c10::DeviceType::CPU; + bool mask_stride_equal_1 = params.attn_mask.has_value() + ? 
params.attn_mask.value().sym_stride(-1) == 1 + : true; + bool mask_stride_valid = is_cpu ? true : mask_stride_equal_1; + if (!(qkv_strides_equal_1 && mask_stride_valid)) { if (debug) { - TORCH_WARN( - "All fused kernels require the last dimension of the input to have stride 1. ", - "Got Query.stride(-1): ", - params.query.sym_stride(-1), - ", Key.stride(-1): ", - params.key.sym_stride(-1), - ", Value.stride(-1): ", - params.value.sym_stride(-1), - " instead."); + std::ostringstream message; + message + << "All fused kernels require the last dimension of the input to have stride 1. "; + message << "Got Query.stride(-1): " << params.query.sym_stride(-1) + << ", Key.stride(-1): " << params.key.sym_stride(-1) + << ", Value.stride(-1): " << params.value.sym_stride(-1); + + if (params.attn_mask.has_value()) { + message + << ", Attn_mask.stride(-1): " + << params.attn_mask.value().sym_stride(-1) + << " (GPU backends require attn_mask's last dimension to have stride 1 while the CPU does not)."; + } + TORCH_WARN(message.str()); } return false; diff --git a/test/inductor/test_fused_attention.py b/test/inductor/test_fused_attention.py index 4d52775ccbade..90b3ac877d709 100644 --- a/test/inductor/test_fused_attention.py +++ b/test/inductor/test_fused_attention.py @@ -1023,7 +1023,7 @@ def dot_prod_attention( return attn_weights.matmul(value), key, value tensor_shape = (4, 2, 16, 32) - attn_mask = torch.randn((1, 1, 1, 2), dtype=torch.float, device=self.device) + attn_mask = torch.randn((1, 1, 2, 2), dtype=torch.float, device=self.device) args = [ torch.randn(tensor_shape, device=self.device), torch.randn(tensor_shape, device=self.device), @@ -1036,6 +1036,16 @@ def dot_prod_attention( has_dropout=False, check_train=False, ) + # test attn_mask with stride of last dim != 1 + attn_mask_ = attn_mask.transpose(2, 3) + args[3] = attn_mask_ + self._check_common( + dot_prod_attention, + args1=args, + has_dropout=False, + check_train=False, + contains=self.device == "cpu", + ) def _test_sdpa_rewriter_23(self): def dot_prod_attention( diff --git a/test/test_transformers.py b/test/test_transformers.py index 5460f7466e097..8bdad854cd223 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -1618,6 +1618,34 @@ def test_invalid_last_dim_stride(self, device, kernel: SDPBackend): self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention( q, k, v, None, 0.0, False)) + @onlyCUDA + @unittest.skipIf( + not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION + or not PLATFORM_SUPPORTS_CUDNN_ATTENTION, + "Efficient or cuDNN Attention was not built for this system", + ) + @parametrize("kernel", [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.CUDNN_ATTENTION]) + def test_mask_invalid_last_dim_stride(self, device, kernel): + with sdpa_kernel(backends=[kernel]): + dtype = torch.float16 + make_tensor = partial(torch.rand, device=device, dtype=dtype) + size = SdpaShape(2, 2, 8, 8) + q, k, v = make_tensor(size), make_tensor(size), make_tensor(size) + attn_mask = make_tensor((2, 2, 8, 8)) + # Passing in a attn_mask with last dim stride not equal to 1 will error + attn_mask.as_strided_(size, [2, 2, 2, 2]) + + with self.assertWarnsRegex( + UserWarning, + "GPU backends require attn_mask's last dimension to have stride 1 while the CPU does not", + ): + self.assertRaises( + RuntimeError, + lambda: torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask, 0.0, False + ), + ) + @onlyCUDA @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Does not support SDPA or pre-SM80 hardware") 
@parametrize("fused_kernel", [SDPBackend.EFFICIENT_ATTENTION]) From 117d9d4b5973a823de2bd5c505a053ae5f053c9f Mon Sep 17 00:00:00 2001 From: Sidharth Subbarao <55903556+Sidharth123-cpu@users.noreply.github.com> Date: Fri, 18 Jul 2025 12:37:48 -0700 Subject: [PATCH 40/83] [cherry-pick] temporarily disabling generation of weblinks for torch v2.8 & removing string literals for weblink generation (#157951) * [dynamo] temporarily disabling generation of weblinks for torch v2.8 release (#157299) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157299 Approved by: https://github.com/williamwen42 (cherry picked from commit 3ed4384f5b4bb7ae7d12298632a258385a51446e) * [dynamo] removing string literals for weblink generation (#157820) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157820 Approved by: https://github.com/williamwen42 (cherry picked from commit 9f18482d41227df3cf2248dfa54bd6601e61e1ca) --- .../fsdp/test_fully_shard_compile.py | 3 +- test/dynamo/test_error_messages.py | 50 ++----------------- test/dynamo/test_exc.py | 2 - test/dynamo/test_reorder_logs.py | 3 +- test/test_custom_ops.py | 3 +- torch/_dynamo/exc.py | 45 ++--------------- 6 files changed, 11 insertions(+), 95 deletions(-) diff --git a/test/distributed/_composable/fsdp/test_fully_shard_compile.py b/test/distributed/_composable/fsdp/test_fully_shard_compile.py index c376aa0e1aa0d..62b06450da7c8 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_compile.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_compile.py @@ -548,8 +548,7 @@ def test_compiled(): Hint: This graph break is fundamental - it is unlikely that Dynamo will ever be able to trace through your code. Consider finding a workaround. Developer debug context: call_method TensorVariable() backward () {} - - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0123""", # noqa: B950 +""", # noqa: B950 ) else: self.assertGreater(len(counters["graph_break"]), 1) diff --git a/test/dynamo/test_error_messages.py b/test/dynamo/test_error_messages.py index 5f4a1f480a8ca..2e7407683cf0e 100644 --- a/test/dynamo/test_error_messages.py +++ b/test/dynamo/test_error_messages.py @@ -61,7 +61,6 @@ def fn(): Developer debug context: aten.nonzero.default - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0036 from user code: File "test_error_messages.py", line N, in fn @@ -83,7 +82,6 @@ def fn(): Developer debug context: aten.linalg_lstsq.default - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0037 from user code: File "test_error_messages.py", line N, in fn @@ -106,7 +104,6 @@ def fn(x): Developer debug context: call_method TensorVariable() item () {} - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0124 from user code: File "test_error_messages.py", line N, in fn @@ -130,7 +127,6 @@ def fn(x): Developer debug context: aten.equal.default - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0033 from user code: File "test_error_messages.py", line N, in fn @@ -158,7 +154,6 @@ def fn(lst): Developer debug context: TensorVariable() - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0207 from user code: File "test_error_messages.py", line N, in fn @@ -183,7 +178,6 @@ def fn(it): Developer debug context: call_method 
UserDefinedObjectVariable(zip) __iter__ () {} - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0156 from user code: File "test_error_messages.py", line N, in fn @@ -211,7 +205,6 @@ def fn(x, items): Developer debug context: call_method UserDefinedObjectVariable(dict_items) __iter__ () {} - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0156 from user code: File "test_error_messages.py", line N, in fn @@ -235,7 +228,6 @@ def fn(it): Developer debug context: call_function UserDefinedObjectVariable(zip) [] {} - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0147 from user code: File "test_error_messages.py", line N, in fn @@ -259,7 +251,6 @@ def fn(obj): Developer debug context: Attempted SETUP_WITH/BEFORE_WITH on ConstantVariable(int: 3) - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0142 from user code: File "test_error_messages.py", line N, in fn @@ -287,10 +278,7 @@ def fn(x): Exception:test Traceback: File "test_error_messages.py", line N, in fn - return x + 1 - - - For more details about this graph break, please visit: None""", + return x + 1""", ) def test_unsupported_builtin(self): @@ -309,7 +297,6 @@ def fn(): Developer debug context: builtin print [] False - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0059 from user code: File "test_error_messages.py", line N, in fn @@ -335,7 +322,6 @@ def post_munge(s): Developer debug context: module: unittest.case, qualname: skip, skip reason: - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0007 from user code: File "test_error_messages.py", line N, in fn @@ -357,7 +343,6 @@ def fn(): Developer debug context: module: torch._dynamo.decorators, qualname: disable, skip reason: - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0007 from user code: File "test_error_messages.py", line N, in fn @@ -386,7 +371,6 @@ def post_munge(s): Developer debug context: qualname: skip, name: skip, filename: `case.py`, skip reason: skipped according trace_rules.lookup unittest - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0008 from user code: File "test_error_messages.py", line N, in fn @@ -408,7 +392,6 @@ def fn(): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 from user code: File "test_error_messages.py", line N, in fn @@ -429,7 +412,6 @@ def fn(): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{'msg': ConstantVariable(str: 'test graph break')}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 from user code: File "test_error_messages.py", line N, in fn @@ -451,7 +433,6 @@ def fn(): Developer debug context: module: _warnings, qualname: warn, skip reason: - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0007 from user code: File "test_error_messages.py", line N, in fn @@ -479,8 +460,7 @@ def fn(x): Hint: Consider using torch.utils._pytree - 
https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py Developer debug context: module: optree._C, qualname: PyCapsule.flatten, skip reason: - - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0007""", +""", ) @scoped_load_inline @@ -526,8 +506,7 @@ def f(x): Hint: If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`. Developer debug context: module: mylib, qualname: PyCapsule.foobar, skip reason: - - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0007""", +""", ) cpp_source = """ @@ -579,7 +558,6 @@ def fn(x, y): Developer debug context: SliceVariable start: ConstantVariable(NoneType: None), stop: TensorVariable(), step: ConstantVariable(NoneType: None) - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0038 from user code: File "test_error_messages.py", line N, in fn @@ -601,7 +579,6 @@ def fn(): Developer debug context: raised exception ExceptionVariable() - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0088 from user code: File "test_error_messages.py", line N, in fn @@ -627,7 +604,6 @@ def fn(mod): Developer debug context: Foo - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0119 from user code: File "test_error_messages.py", line N, in fn @@ -656,7 +632,6 @@ def fn(mod, x): Developer debug context: nn.Module subclass: Foo, name: attr, attribute type: module - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0161 from user code: File "test_error_messages.py", line N, in fn @@ -686,7 +661,6 @@ def fn(): Developer debug context: Active generic context managers: [GenericContextWrappingVariable(GenericCtxMgr), GenericContextWrappingVariable(GenericCtxMgr)] - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0066 from user code: File "test_error_messages.py", line N, in fn @@ -701,8 +675,7 @@ def fn(): Hint: Remove the `torch._dynamo.graph_break()` call. Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025""", +""", ) def test_load_build_class(self): @@ -723,7 +696,6 @@ class Foo: Developer debug context: - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0075 from user code: File "test_error_messages.py", line N, in fn @@ -756,7 +728,6 @@ def post_munge(s): Hint: It may be possible to write Dynamo tracing rules for this code. Please report an issue to PyTorch if you encounter this graph break often and it is causing performance issues. 
Developer debug context: GET_AITER with args (, Instruction(GET_AITER) - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0082 from user code: File "test_error_messages.py", line N, in fn @@ -787,7 +758,6 @@ def post_munge(s): Developer debug context: UserMethodVariable(.Foo.meth at 0xmem_addr>, UserDefinedObjectVariable(Foo)) - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0092 from user code: File "test_error_messages.py", line N, in fn @@ -823,7 +793,6 @@ def post_munge(s): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 User code traceback: File "test_error_messages.py", line N, in test_reconstruction_failure_gb torch.compile(fn, backend="eager")() @@ -843,7 +812,6 @@ def post_munge(s): Developer debug context: UserMethodVariable(.Foo.meth at 0xmem_addr>, UserDefinedObjectVariable(Foo)) - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0092 from user code: File "test_error_messages.py", line N, in fn @@ -872,7 +840,6 @@ def fn(x): Developer debug context: - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0087 from user code: File "test_error_messages.py", line N, in fn @@ -896,7 +863,6 @@ def fn(x): Developer debug context: attempted to jump with TensorVariable() - For more details about this graph break, please visit: None from user code: File "test_error_messages.py", line N, in fn @@ -963,7 +929,6 @@ def fn(x): Developer debug context: value: ConstantVariable(bool: False) - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0034 from user code: File "test_error_messages.py", line N, in fn @@ -1007,7 +972,6 @@ def gn(): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 from user code: File "test_error_messages.py", line N, in fn @@ -1060,7 +1024,6 @@ def gn(): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 from user code: File "test_error_messages.py", line N, in fn @@ -1096,7 +1059,6 @@ def hn(x): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 User code traceback: File "test_error_messages.py", line N, in test_nested_compile_user_frames torch.compile(fn, backend="eager")(torch.randn(3)) @@ -1210,7 +1172,6 @@ def f3(x): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 User code traceback: File "test_error_messages.py", line N, in test_graph_break_traceback_collapsed_resume_frames f1(torch.randn(3)) @@ -1300,7 +1261,6 @@ def post_munge(s): Developer debug context: .f at 0xmem_addr> - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0098 from user code: File "test_error_messages.py", line N, in outer @@ 
-1322,7 +1282,6 @@ def g(x): Developer debug context: .g at 0xmem_addr> - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0098 from user code: File "test_error_messages.py", line N, in outer @@ -1348,7 +1307,6 @@ def forward(self, x): Developer debug context: source: LocalSource(local_name='fn', is_input=True, dynamism=None, is_derefed_cell_contents=False) - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0148 from user code: File "test_error_messages.py", line N, in outer diff --git a/test/dynamo/test_exc.py b/test/dynamo/test_exc.py index ce4af812a5f81..acc3fd55f6fb0 100644 --- a/test/dynamo/test_exc.py +++ b/test/dynamo/test_exc.py @@ -43,7 +43,6 @@ def fn001(x): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 from user code: File "test_exc.py", line N, in fn001 @@ -183,7 +182,6 @@ def fn001(x): Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0025 User code traceback: File "test_exc.py", line N, in test_graph_break_log torch.compile(fn001, backend="eager")(torch.randn(1)) diff --git a/test/dynamo/test_reorder_logs.py b/test/dynamo/test_reorder_logs.py index 727f73017dbbc..84b4f00dc9d11 100644 --- a/test/dynamo/test_reorder_logs.py +++ b/test/dynamo/test_reorder_logs.py @@ -210,8 +210,7 @@ def f(x): Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph. Developer debug context: call_method TensorVariable() item () {} - - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0124""", # noqa: B950 +""", # noqa: B950 ) diff --git a/test/test_custom_ops.py b/test/test_custom_ops.py index ed2c63d1a8175..f9d231a7df851 100644 --- a/test/test_custom_ops.py +++ b/test/test_custom_ops.py @@ -1817,8 +1817,7 @@ def f(x): Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True` Developer debug context: _torch_testing.numpy_nonzero.default - - For more details about this graph break, please visit: https://compile-graph-break-site.vercel.app/gb/GB0036""", +""", ) # pre-existing problem: torch.compile(dynamic=True) will, by default, diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py index ffda65342a373..4daa72ef46470 100644 --- a/torch/_dynamo/exc.py +++ b/torch/_dynamo/exc.py @@ -26,15 +26,12 @@ - Debugging utilities for error reporting """ -import json import logging import os import re import textwrap import typing from enum import auto, Enum -from functools import lru_cache -from pathlib import Path from traceback import extract_stack, format_exc, format_list, StackSummary from typing import Any, NoReturn, Optional, TYPE_CHECKING @@ -497,42 +494,6 @@ def format_graph_break_message( return msg -@lru_cache(maxsize=1) -def _load_graph_break_registry() -> dict[str, Any]: - """ - Loads the graph break registry from JSON file with caching. 
- """ - try: - script_dir = Path(__file__).resolve().parent - registry_path = script_dir / "graph_break_registry.json" - with registry_path.open() as f: - return json.load(f) - except (FileNotFoundError, json.JSONDecodeError) as e: - log.error("Error accessing the registry file: %s", e) - return {} - - -def get_gbid_documentation_link(gb_type: str) -> Optional[str]: - """ - Retrieves the GBID documentation link for a given graph break type. - - Args: - gb_type: The graph break type to look up. - - Returns: - A string containing the documentation URL if found, otherwise None. - """ - GRAPH_BREAK_SITE_URL = "https://compile-graph-break-site.vercel.app/gb/" - - registry = _load_graph_break_registry() - - for k, v in registry.items(): - if v and v[0].get("Gb_type") == gb_type: - return f"{GRAPH_BREAK_SITE_URL}{k}" - - return "None" - - # TODO replace old unimplemented later def unimplemented_v2( gb_type: str, @@ -555,8 +516,10 @@ def unimplemented_v2( msg = format_graph_break_message(gb_type, context, explanation, hints) - documentation_link = get_gbid_documentation_link(gb_type) - msg += f"\n For more details about this graph break, please visit: {documentation_link}" + # Temporarily disabling the generation of the weblinks in error message + + # documentation_link = get_gbid_documentation_link(gb_type) + # msg += f"\n For more details about this graph break, please visit: {documentation_link}" if log_warning: log.warning(msg) From 88d04c8b977c638dcd1576a7bf7f3df6d4dda8cf Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 21 Jul 2025 11:40:17 -0400 Subject: [PATCH 41/83] [Reland] Add warning about removed sm50 and sm60 arches (#158744) --- torch/cuda/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index e3793234f2bc8..69a7d1e064d81 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -262,7 +262,9 @@ def _check_capability(): "12.9": {"min": 70, "max": 120}, } - if torch.version.cuda is not None: # on ROCm we don't want this check + if ( + torch.version.cuda is not None and torch.cuda.get_arch_list() + ): # on ROCm we don't want this check for d in range(device_count()): capability = get_device_capability(d) major = capability[0] From 3006279bcfff595bba70b738c1d09ade269ebdd4 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Mon, 21 Jul 2025 10:21:11 -0700 Subject: [PATCH 42/83] [cherry-pick][release 2.8] Update OpenBLAS commit (#151547) (#158243) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update OpenBLAS commit (#151547) Motivation: Update OpenBLAS and change build script to enable SBGEMM kernels . Update pytorch `jammy` builds for aarch64 to use `install_openblas.sh` instead of `conda_install` Link to full [TorchInductor Performance Dashboard AArch64](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Fri%2C%2006%20Jun%202025%2009%3A46%3A35%20GMT&stopTime=Fri%2C%2013%20Jun%202025%2009%3A46%3A35%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cpu%20(aarch64)&lBranch=adi/update_openblas&lCommit=0218b65bcf61971c1861cfe8bc586168b73aeb5f&rBranch=main&rCommit=9d59b516e9b3026948918e3ff8c2ef55a33d13ad) 1. This shows a promising speedup across most of the HF models in benchmark, specifically giving a significant boost to SDPA layers. 2. 
Overall torch-bench pass-rate (cpp_wrapper mode) increased `[87%, 65/75 → 96%, 72/75]` Screenshot 2025-06-20 at 17 05 15 Pull Request resolved: https://github.com/pytorch/pytorch/pull/151547 Approved by: https://github.com/malfet, https://github.com/snadampal, https://github.com/fadara01 Co-authored-by: Christopher Sidebottom Co-authored-by: Ryo Suzuki Co-authored-by: Ye Tao Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> * Update .ci/docker/common/install_conda.sh Co-authored-by: Andrey Talman * Update .ci/docker/common/install_conda.sh Co-authored-by: Andrey Talman * Update .ci/docker/common/install_conda.sh Co-authored-by: Andrey Talman * try reverting conda install --------- Co-authored-by: Aditya Tewari Co-authored-by: Christopher Sidebottom Co-authored-by: Ryo Suzuki Co-authored-by: Ye Tao Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> Co-authored-by: Andrey Talman --- .ci/docker/build.sh | 5 +++++ .ci/docker/common/install_openblas.sh | 6 +++--- .ci/docker/manywheel/build.sh | 2 +- .ci/docker/ubuntu/Dockerfile | 6 ++++++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 6624d9928cbe0..50ec4b3841c68 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -335,6 +335,8 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes + CONDA_CMAKE=yes + OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes @@ -344,6 +346,8 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes + CONDA_CMAKE=yes + OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes @@ -430,6 +434,7 @@ docker build \ --build-arg "XPU_VERSION=${XPU_VERSION}" \ --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \ --build-arg "ACL=${ACL:-}" \ + --build-arg "OPENBLAS=${OPENBLAS:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index e932ecd1cdc1a..3c795acf2220b 100644 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -4,8 +4,9 @@ set -ex cd / -git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules +git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules +OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" NUM_THREADS=128 USE_OPENMP=1 @@ -13,9 +14,8 @@ NO_SHARED=0 DYNAMIC_ARCH=1 TARGET=ARMV8 CFLAGS=-O3 +BUILD_BFLOAT16=1 " -OPENBLAS_CHECKOUT_DIR="OpenBLAS" - make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR} make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR} diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index b364d47c6c7a6..c1f9a7e0103ab 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -41,7 +41,7 @@ case ${image} in GPU_IMAGE=arm64v8/almalinux:8 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" MANY_LINUX_VERSION="2_28_aarch64" - OPENBLAS_VERSION="v0.3.29" + OPENBLAS_VERSION="v0.3.30" ;; manylinuxcxx11-abi-builder:cpu-cxx11-abi) TARGET=final diff --git a/.ci/docker/ubuntu/Dockerfile 
b/.ci/docker/ubuntu/Dockerfile index 88c2bdf70414e..27c466dd8d41d 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -147,6 +147,12 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi RUN rm install_acl.sh ENV INSTALLED_ACL ${ACL} +ARG OPENBLAS +COPY ./common/install_openblas.sh install_openblas.sh +RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi +RUN rm install_openblas.sh +ENV INSTALLED_OPENBLAS ${OPENBLAS} + # Install ccache/sccache (do this last, so we get priority in PATH) ARG SKIP_SCCACHE_INSTALL COPY ./common/install_cache.sh install_cache.sh From 45ef46b0c7c6d86d15bdcd817dfa2d7cdd5a6de6 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 21 Jul 2025 15:05:12 -0400 Subject: [PATCH 43/83] [cherry-pick][Docker builds] Move from Miniconda to Miniforge (#158370) (#158756) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Docker builds] Move from Miniconda to Miniforge (#158370) This is related to: https://www.anaconda.com/legal/terms/terms-of-service Trying to fix outage with docker builds. https://github.com/pytorch/pytorch/actions/runs/16298993712/job/46033590799 Rocm and XPU builds since they use Miniforge are not affected ``` #22 ERROR: process "/bin/sh -c bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt" did not complete successfully: exit code: 1 ------ > [base 14/42] RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt: 11.93 CondaToSNonInteractiveError: Terms of Service have not been accepted for the following channels. Please accept or remove them before proceeding: 11.93 • https://repo.anaconda.com/pkgs/main 11.93 • https://repo.anaconda.com/pkgs/r 11.93 11.93 To accept a channel's Terms of Service, run the following and replace `CHANNEL` with the channel name/URL: 11.93 ‣ conda tos accept --override-channels --channel CHANNEL ``` Hence solution is: 1. using `` conda tos accept --override-channels --channel defaults`` 2. use Miniforge instead of Miniconda. Using solution 2. Solution Tried that don't work: 1. Using ``CONDA_ALWAYS_YES = true `` 4. 
Using older version of miniconda ``` [Miniconda3-py310_25.5.1-0-Linux-x86_64.sh](https://repo.anaconda.com/miniconda/Miniconda3-py310_25.5.1-0-Linux-x86_64.sh) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158370 Approved by: https://github.com/seemethere Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com> * Remove tos --------- Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com> --- .ci/docker/common/install_conda.sh | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 3029b5967a87a..ac98da8a130cb 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -4,12 +4,8 @@ set -ex # Optionally install conda if [ -n "$ANACONDA_PYTHON_VERSION" ]; then - BASE_URL="https://repo.anaconda.com/miniconda" - CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" - if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]] || [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then - BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" # @lint-ignore - CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" - fi + BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" # @lint-ignore + CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1) MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2) @@ -21,7 +17,6 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then exit 1 ;; esac - mkdir -p /opt/conda chown jenkins:jenkins /opt/conda @@ -54,8 +49,6 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then export SYSROOT_DEP="sysroot_linux-64=2.17" fi - # Please see: https://github.com/pytorch/pytorch/pull/158370#issuecomment-3084705725 - export CONDA_PLUGINS_AUTO_ACCEPT_TOS="yes" # Install correct Python version # Also ensure sysroot is using a modern GLIBC to match system compilers as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\ From e5e8a386f088b53ff724c729e7a5b4eb97f0129e Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 21 Jul 2025 17:17:27 -0400 Subject: [PATCH 44/83] [async-TP] Turn asserts back into silent skips (#158736) [async-TP] Turn asserts back into silent skips (#158572) https://github.com/pytorch/pytorch/pull/149946 modified some checks that verify whether async-TP is "applicable" to a given collective operation in a graph. Before, the pattern-mathcing+replacement would just be skipped, but now these are asserts that fail and raise. This is causing concrete issues in some graphs where 2-dimensional device meshes are being used (e.g., TP + CP) but only one dimension has symm-mem enabled. See #158569. This PR is turning these asserts back into harmless early-exits. Note that this only needed to be done for reduce-scatters, as it was already the case for all-gathers. 
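As an illustrative sketch of the shape of this change (hypothetical helper names, not the actual fusion pass; the real hunks follow below), the matmul/reduce-scatter fusion now bails out quietly when the preconditions are not met instead of asserting:

```python
# Sketch only: dependencies are passed in rather than imported from torch,
# so this is not the real pass, just the control-flow pattern it adopts.
def maybe_fuse_matmul_reduce_scatter(group_name, is_symm_mem_enabled_for_group, apply_fusion):
    if not is_symm_mem_enabled_for_group(group_name):
        # Silently skip the fusion and leave the original collective in place,
        # rather than raising as the previous assert-based check did.
        return
    apply_fusion()
```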
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158572 Approved by: https://github.com/danielvegamyhre, https://github.com/atalman (cherry picked from commit fac0be7b9c80f20bbff1e813225dcbced7ff4d31) Co-authored-by: Luca Wehrstedt --- .../tensor/parallel/test_micro_pipeline_tp.py | 50 +++++++++++++++++++ .../_inductor/fx_passes/micro_pipeline_tp.py | 13 ++--- .../distributed/_symmetric_memory/__init__.py | 11 +++- 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py index 906b7d1a4a52b..df3e2ffb38858 100644 --- a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py +++ b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py @@ -494,5 +494,55 @@ def test_dtensor_seq_par(self, shard_dim: int): self.assertNotIn("reduce_scatter_tensor", code) +@instantiate_parametrized_tests +class MicroPipelineTP4GPUTest(TestCase): + def setUp(self): + torch._inductor.config._micro_pipeline_tp = True + + self.rank = 0 + self.world_size = 4 + torch.cuda.set_device("cuda:0") + + store = FakeStore() + dist.init_process_group( + backend="fake", + world_size=self.world_size, + rank=self.rank, + store=store, + ) + + def tearDown(self): + dist.destroy_process_group() + + @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @fresh_cache() + def test_extra_collectives(self): + device_mesh = DeviceMesh( + "cuda", + torch.arange(0, self.world_size).view(2, -1), + mesh_dim_names=("tp", "other"), + ) + + def func(inp: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> torch.Tensor: + hidden = all_gather_tensor(inp, 0, (device_mesh, 0)) @ w1.t() + full_hidden = all_gather_tensor(hidden, 0, (device_mesh, 1)) + full_hidden /= full_hidden.pow(2).sum().sqrt() + hidden = reduce_scatter_tensor(full_hidden, "avg", 0, (device_mesh, 1)) + return reduce_scatter_tensor(hidden @ w2.t(), "avg", 0, (device_mesh, 0)) + + inp = torch.rand(8, 10, device="cuda") + w1 = torch.rand(7, 10, device="cuda") + w2 = torch.rand(10, 7, device="cuda") + + with _test_mode(group_names={device_mesh["tp"].get_group().group_name}): + compiled = torch.compile(func) + code = run_and_get_triton_code(compiled, inp, w1, w2) + + self.assertIn("fused_all_gather_matmul", code) + self.assertIn("all_gather_into_tensor", code) + self.assertIn("fused_matmul_reduce_scatter", code) + self.assertIn("reduce_scatter_tensor", code) + + if __name__ == "__main__": run_tests() diff --git a/torch/_inductor/fx_passes/micro_pipeline_tp.py b/torch/_inductor/fx_passes/micro_pipeline_tp.py index af40d987f7d18..c4d935a4f8bb4 100644 --- a/torch/_inductor/fx_passes/micro_pipeline_tp.py +++ b/torch/_inductor/fx_passes/micro_pipeline_tp.py @@ -850,9 +850,11 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None: Returns boolean indicating if fusion was successful or not. 
""" - assert torch.distributed.is_available() and torch.distributed.is_nccl_available(), ( - "torch.distributed and NCCL must be available to use async tensor parallelism" - ) + if ( + not torch.distributed.is_available() + or not torch.distributed.is_nccl_available() + ): + return from torch.distributed._symmetric_memory import ( is_symm_mem_enabled_for_group, @@ -875,9 +877,8 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None: reduce_scatter.group_name, ) - assert is_symm_mem_enabled_for_group(group_name), ( - f"symmetric memory is not enabled for process group {group_name}, this is required for async TP" - ) + if not is_symm_mem_enabled_for_group(group_name): + return # Currently fused_matmul_reduce_scatter doesn't return the matmul result, # so we can't apply the fusion if the matmul result is used by multiple diff --git a/torch/distributed/_symmetric_memory/__init__.py b/torch/distributed/_symmetric_memory/__init__.py index 279beea0df940..dba9476f91a2d 100644 --- a/torch/distributed/_symmetric_memory/__init__.py +++ b/torch/distributed/_symmetric_memory/__init__.py @@ -47,10 +47,11 @@ def enable_symm_mem_for_group(group_name: str) -> None: _is_test_mode: bool = False +_mocked_group_names: Optional[set[str]] = None @contextmanager -def _test_mode() -> Generator[None, None, None]: +def _test_mode(group_names: Optional[set[str]] = None) -> Generator[None, None, None]: """ Forces ``is_symm_mem_enabled_for_group()`` to return ``True`` and the ops defined in the ``symm_mem`` namespace to use fallback implementations. @@ -58,12 +59,16 @@ def _test_mode() -> Generator[None, None, None]: The context manager is not thread safe. """ global _is_test_mode + global _mocked_group_names prev = _is_test_mode + prev_group_names = _mocked_group_names try: _is_test_mode = True + _mocked_group_names = group_names yield finally: _is_test_mode = prev + _mocked_group_names = prev_group_names def is_symm_mem_enabled_for_group(group_name: str) -> bool: @@ -73,7 +78,9 @@ def is_symm_mem_enabled_for_group(group_name: str) -> bool: Args: group_name (str): the name of the process group. 
""" - return _is_test_mode or group_name in _group_name_to_store + if _is_test_mode: + return _mocked_group_names is None or group_name in _mocked_group_names + return group_name in _group_name_to_store _group_name_to_workspace_tensor: dict[str, Optional[torch.Tensor]] = {} From a3dea7991ad0c97c7ba2365f4aa6a9a7e332ce3b Mon Sep 17 00:00:00 2001 From: Robert Hardwick Date: Mon, 21 Jul 2025 23:04:33 +0100 Subject: [PATCH 45/83] [cherry-pick] Fix AArch64 segfaults by disabling strict-aliasing in GridSamplerKernel for GCC 12 and above (#158445) Fix AArch64 grid sampler segfaults by disabling strict-aliasing gcc optimization See https://github.com/pytorch/pytorch/issues/157626 for more context (cherry picked from commit b62f4d03c7e76bfa7e25ad898232fade0111510b) --- aten/src/ATen/native/cpu/GridSamplerKernel.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 521a65c7cd948..9450b7eca9b37 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -14,6 +14,12 @@ namespace at::native { namespace { +// fixes segfaults for GCC >= 12 on some AArch64 cpus https://github.com/pytorch/pytorch/issues/157626 +#if defined(__GNUC__) && __GNUC__ >= 12 && defined(__aarch64__) +#pragma GCC push_options +#pragma GCC optimize ("no-strict-aliasing") +#endif + /** NOTE [ Grid Sample CPU Kernels ] * * Implementation of vectorized grid sample CPU kernels is divided into three @@ -1014,6 +1020,10 @@ struct ApplyGridSample= 12 && defined(__aarch64__) +#pragma GCC pop_options +#endif + // ~~~~~~~~~~~~~~~~~~ grid_sample_2d_grid_slice_iterator ~~~~~~~~~~~~~~~~~~~~~~ // Function to apply a vectorized function on a grid slice tensor (without batch // dimension). From d3960e52e9f63d23efd1065173e93e37e1c35404 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 22 Jul 2025 11:08:31 -0700 Subject: [PATCH 46/83] Pull latest Sphinx theme (#158595) (#158673) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158595 Approved by: https://github.com/albanD (cherry picked from commit 79e49efaddf3a049adbe2de839cc65d73a1edd42) --- .ci/docker/requirements-docs.txt | 2 +- docs/source/conf.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 15e8075e617f4..c3d4e644dbabf 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -4,7 +4,7 @@ sphinx==5.3.0 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering -# but it doesn't seem to work and hangs around idly. The initial thought is probably +# but it doesn't seem to work and hangs around idly. The initial thought it is probably # something related to Docker setup. 
We can investigate this later sphinxcontrib.katex==0.8.6 diff --git a/docs/source/conf.py b/docs/source/conf.py index acb2b088af727..c8315a577926c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -82,6 +82,10 @@ ] sitemap_url_scheme = "{link}" +html_additional_pages = { + "404": "404.html", +} + # build the templated autosummary files autosummary_generate = True numpydoc_show_class_members = False From 2f85ac28814377b25d75e28ec4b6ea3f81391121 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 22 Jul 2025 14:31:46 -0400 Subject: [PATCH 47/83] [Dynamo] Use proper sources for constructing dataclass defaults (#158689) [Dynamo] Use proper sources for constructing dataclass defaults (#157993) Partially fixes https://github.com/pytorch/pytorch/issues/154009 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157993 Approved by: https://github.com/williamwen42, https://github.com/anijain2305 (cherry picked from commit 89850bbc073c4e27ca51b0b205742e1d316e7097) Co-authored-by: Michael Lazos --- test/dynamo/test_misc.py | 20 ++++++++++++++++++++ torch/_dynamo/guards.py | 11 +++++++++++ torch/_dynamo/source.py | 16 ++++++++++++++++ torch/_dynamo/utils.py | 4 ++++ torch/_dynamo/variables/user_defined.py | 13 +++++++++++-- 5 files changed, 62 insertions(+), 2 deletions(-) diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index 50ede0b54656d..f7a747f6e0475 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -10411,6 +10411,26 @@ def fn(x, y): actual = fn_opt(*inps) expected = fn(*inps) + def test_nested_dataclass_reconstruct(self): + @dataclasses.dataclass(frozen=True) + class NestedDataClass: + x: int = 2 + + @dataclasses.dataclass(frozen=True) + class TestDataClass: + y: torch.Tensor + ndc: NestedDataClass = NestedDataClass() + + def fn(y): + dc = TestDataClass(y) + z = dc.y + dc.ndc.x + return z, dc + + fn_opt = torch.compile()(fn) + inps = (torch.ones(2, 2),) + actual = fn_opt(*inps) + expected = fn(*inps) + def test_frozen_dataclass_default_value(self): @dataclasses.dataclass(frozen=True) class TestDataClass: diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 77f0454a9cdce..23050946c9802 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -104,6 +104,7 @@ ChainedSource, ConstantSource, ConstDictKeySource, + DataclassFieldsSource, DefaultsSource, DictGetItemSource, DictSubclassGetItemSource, @@ -144,6 +145,7 @@ from .utils import ( builtin_dict_keys, common_constant_types, + dataclass_fields, dict_keys, get_custom_getattr, get_torch_function_mode_stack, @@ -449,6 +451,7 @@ def _get_closure_vars(): "___tuple_iterator_len": tuple_iterator_len, "___normalize_range_iter": normalize_range_iter, "___tuple_iterator_getitem": tuple_iterator_getitem, + "___dataclass_fields": dataclass_fields, "___get_torch_function_mode_stack_at": get_torch_function_mode_stack_at, "__math_isnan": math.isnan, "__numpy_isnan": None if np is None else np.isnan, @@ -1301,6 +1304,14 @@ def get_guard_manager_from_source(self, source): example_value=example_value, guard_manager_enum=guard_manager_enum, ) + elif istype(source, DataclassFieldsSource): + assert base_guard_manager + out = base_guard_manager.lambda_manager( + python_lambda=lambda x: dataclass_fields(x), + source=source_name, + example_value=example_value, + guard_manager_enum=guard_manager_enum, + ) else: raise AssertionError( f"missing guard manager builder {source} - {source.name()}" diff --git a/torch/_dynamo/source.py b/torch/_dynamo/source.py index 
2ae169f099fd9..23c4c546b3d66 100644 --- a/torch/_dynamo/source.py +++ b/torch/_dynamo/source.py @@ -723,6 +723,22 @@ def name(self): return f"___tuple_iterator_getitem({self.base.name()}, {self.index!r})" +@dataclasses.dataclass(frozen=True) +class DataclassFieldsSource(ChainedSource): + def reconstruct(self, codegen: "PyCodegen"): + codegen.add_push_null( + lambda: codegen.load_import_from(utils.__name__, "dataclass_fields") + ) + codegen(self.base) + codegen.extend_output(create_call_function(1, False)) + + def guard_source(self): + return self.base.guard_source() + + def name(self): + return f"___dataclass_fields({self.base.name()})" + + @dataclasses.dataclass(frozen=True) class TypeSource(ChainedSource): def __post_init__(self): diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index d2382ba12c17d..fbc75416984f4 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -2544,6 +2544,10 @@ def tuple_iterator_getitem(it, index): return obj[start + index] +def dataclass_fields(cls): + return torch._dynamo.disable(dataclasses.fields)(cls) + + iter_next = next diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py index 38fda3d2ffb21..9a65f0c4a99c6 100644 --- a/torch/_dynamo/variables/user_defined.py +++ b/torch/_dynamo/variables/user_defined.py @@ -29,6 +29,7 @@ import enum import functools import inspect +import itertools import random import sys import threading @@ -56,6 +57,7 @@ from ..source import ( AttrSource, CallFunctionNoArgsSource, + DataclassFieldsSource, GetItemSource, RandomValueSource, TypeSource, @@ -610,11 +612,12 @@ def call_function( return SizeVariable(tup.items) elif is_frozen_dataclass(self.value) and self.is_standard_new(): fields = dataclasses.fields(self.value) + fields_source = DataclassFieldsSource(self.source) items = list(args) items.extend([None] * (len(fields) - len(items))) default_kwargs = {} - for field, var_tracker in zip(fields, items): + for ind, field, var_tracker in zip(itertools.count(), fields, items): if var_tracker is None: if field.name in kwargs: var_tracker = kwargs[field.name] @@ -623,7 +626,13 @@ def call_function( continue if field.default is not dataclasses.MISSING: - var_tracker = VariableTracker.build(tx, field.default) + var_tracker = VariableTracker.build( + tx, + field.default, + source=AttrSource( + GetItemSource(fields_source, ind), "default" + ), + ) elif field.default_factory is not dataclasses.MISSING: factory_fn = VariableTracker.build( tx, field.default_factory From 9298444a5da4d5c48c245c4df13d0dc5a0a03b4c Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 22 Jul 2025 14:46:10 -0400 Subject: [PATCH 48/83] [cherry-pick] Unify torch.tensor and torch.ops.aten.scalar_tensor behavior (#158537) (#158655) Unify torch.tensor and torch.ops.aten.scalar_tensor behavior (#158537) Fixes #158376 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158537 Approved by: https://github.com/atalman Co-authored-by: bobrenjc93 --- aten/src/ATen/ScalarOps.cpp | 23 ++++++++++++++++++++++- test/dynamo/test_misc.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp index 693fb46e639f2..da4f7a35a2f47 100644 --- a/aten/src/ATen/ScalarOps.cpp +++ b/aten/src/ATen/ScalarOps.cpp @@ -8,7 +8,28 @@ namespace at { namespace { template inline void fill_inplace(Tensor& self, const Scalar& value_scalar) { - auto value = value_scalar.to(); + scalar_t value{}; + + if constexpr 
(std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + // relaxed float cast: allow inf similar to the torch.tensor constructor + // + // without this, we had the following divergence: + // torch.tensor(1123581321.0, dtype=torch.float16) + // => tensor(inf, dtype=torch.float16) + // torch.ops.aten.scalar_tensor.default(1123581321, dtype=torch.float16) + // => RuntimeError: value cannot be converted to type at::Half without overflow + + value = static_cast(value_scalar.to()); + } else { + value = value_scalar.to(); + } + scalar_t* dptr = static_cast(self.data_ptr()); *dptr = value; } diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index f7a747f6e0475..06ac1dc98c740 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -12975,6 +12975,38 @@ def f(actions, n_act, epsilon=0.1): y = torch.tensor(5) f(x, y) + def test_dynamic_float_scalar_tensor_coersion(self): + # Minified version of https://github.com/pytorch/pytorch/issues/158376#issuecomment-3079591367 + class Foo: + def __init__(self): + self.config = type( + "Config", (), {"pad_val": 1123581321.0, "tolerance": 1e-6} + ) + + @torch.compile(fullgraph=True) + def forward(self, input): + outputs = torch.where( + torch.abs(input - self.config.pad_val) < self.config.tolerance, + torch.tensor( + self.config.pad_val, dtype=input.dtype, device=input.device + ), + torch.tensor( + self.config.pad_val + 1, dtype=input.dtype, device=input.device + ), + ) + return outputs + + foo = Foo() + inputs = torch.randn(3, 4) + result = foo.forward(inputs) + + original_pad_val = foo.config.pad_val + foo.config.pad_val += 1.0 + result2 = foo.forward(inputs) + + # Previously would crash with: + # RuntimeError: value cannot be converted to type at::Half without overflow + devices = ("cuda", "hpu") instantiate_device_type_tests(MiscTestsDevice, globals(), only_for=devices) From 29973ffc6ab49f94db169aae51ef0e1a3c57f2e6 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 22 Jul 2025 13:44:32 -0700 Subject: [PATCH 49/83] Cherry pick PR 158746 (#158801) Fix the typos in the right nav by pulling the latest theme (#158746) This will fix broken links in the right nav. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158746 Approved by: https://github.com/malfet (cherry picked from commit 2bb684304d26804ab87103ada05b6ba63e309b59) (cherry picked from commit 0462fd4707374b28600bb6dd654ce94db57f8950) --- .ci/docker/requirements-docs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index c3d4e644dbabf..864b99a7f2a96 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -4,8 +4,8 @@ sphinx==5.3.0 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering -# but it doesn't seem to work and hangs around idly. The initial thought it is probably -# something related to Docker setup. We can investigate this later +# but it doesn't seem to work and hangs around idly. The initial thought that it is probably +# something related to Docker setup. We can investigate this later. 
sphinxcontrib.katex==0.8.6 #Description: This is used to generate PyTorch docs From d00758893d54f2517dc434900a06532f367a5fdc Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 22 Jul 2025 17:26:58 -0400 Subject: [PATCH 50/83] [MPS] Reimplement `tri[ul]` as Metal shaders (#158867) [MPS] Reimplement `tri[ul]` as Metal shaders (#157179) And add in-place flavor, as it is currently broken for non-contig tensors Pull Request resolved: https://github.com/pytorch/pytorch/pull/157179 Approved by: https://github.com/dcci (cherry picked from commit a1e4f1f98a0b9596fe52aaf2f85b0778498d5f49) Co-authored-by: Nikita Shulga --- .../native/mps/kernels/TriangularOps.metal | 114 +++++++++++++++++ .../native/mps/operations/TriangularOps.mm | 119 ++++++------------ test/test_mps.py | 5 + torch/testing/_internal/common_mps.py | 4 +- 4 files changed, 157 insertions(+), 85 deletions(-) diff --git a/aten/src/ATen/native/mps/kernels/TriangularOps.metal b/aten/src/ATen/native/mps/kernels/TriangularOps.metal index aa1093ec34d43..27ad506028488 100644 --- a/aten/src/ATen/native/mps/kernels/TriangularOps.metal +++ b/aten/src/ATen/native/mps/kernels/TriangularOps.metal @@ -1,5 +1,119 @@ #include + using namespace metal; + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +template +inline bool triul_mask(int row, int col, int k); +template <> +inline bool triul_mask(int row, int col, int k) { + return col - row >= k; +} +template <> +inline bool triul_mask(int row, int col, int k) { + return col - row <= k; +} + +template +inline IndexType compute_offs( + constant IndexType* strides, + constant uint* sizes, + uint3 pos, + int ndim) { + auto offs = pos.x * strides[0] + pos.y * strides[1]; + if (ndim < 4) { + return ndim == 3 ? offs + pos.z * strides[2] : offs; + } + auto idx = pos.z; + for (int i = 2; i < ndim; ++i) { + offs += strides[i] * (idx % sizes[i]); + idx /= sizes[i]; + } + return offs; +} + +template +kernel void triul_inplace( + device T* self, + constant IndexType* strides, + constant uint* sizes, + constant int2& k_ndim, + uint3 pos [[thread_position_in_grid]]) { + if (triul_mask(pos.y, pos.x, k_ndim.x)) { + return; + } + auto offs = compute_offs(strides, sizes, pos, k_ndim.y); + self[offs] = 0; +} + +template +kernel void triul( + device T* out, + device T* inp, + constant IndexType* out_strides, + constant IndexType* inp_strides, + constant uint* sizes, + constant int2& k_ndim, + uint3 pos [[thread_position_in_grid]]) { + auto out_offs = compute_offs(out_strides, sizes, pos, k_ndim.y); + if (!triul_mask(pos.y, pos.x, k_ndim.x)) { + out[out_offs] = 0; + return; + } + auto inp_offs = compute_offs(inp_strides, sizes, pos, k_ndim.y); + out[out_offs] = inp[inp_offs]; +} + +#define INSTANTIATE_TRIUL_KERNELS(DTYPE, IDX_TYPE) \ + template [[host_name("triu_inplace_" #IDX_TYPE "_" #DTYPE)]] kernel void \ + triul_inplace( \ + device DTYPE * self, \ + constant IDX_TYPE * strides, \ + constant uint * sizes, \ + constant int2 & k_ndim, \ + uint3 pos [[thread_position_in_grid]]); \ + template [[host_name("tril_inplace_" #IDX_TYPE "_" #DTYPE)]] kernel void \ + triul_inplace( \ + device DTYPE * self, \ + constant IDX_TYPE * strides, \ + constant uint * sizes, \ + constant int2 & k_ndim, \ + uint3 pos [[thread_position_in_grid]]); \ + template [[host_name("triu_" #IDX_TYPE "_" #DTYPE)]] kernel void \ + triul( \ + device DTYPE * out, \ + device DTYPE * inp, \ + constant IDX_TYPE * out_strides, \ + constant IDX_TYPE * inp_strides, \ + constant uint * sizes, \ + constant int2 & k_ndim, \ + 
uint3 pos [[thread_position_in_grid]]); \ + template [[host_name("tril_" #IDX_TYPE "_" #DTYPE)]] kernel void \ + triul( \ + device DTYPE * out, \ + device DTYPE * inp, \ + constant IDX_TYPE * out_strides, \ + constant IDX_TYPE * inp_strides, \ + constant uint * sizes, \ + constant int2 & k_ndim, \ + uint3 pos [[thread_position_in_grid]]) + +INSTANTIATE_TRIUL_KERNELS(float, int); +INSTANTIATE_TRIUL_KERNELS(half, int); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_TRIUL_KERNELS(bfloat, int); +#endif + +INSTANTIATE_TRIUL_KERNELS(float2, int); +INSTANTIATE_TRIUL_KERNELS(half2, int); + +INSTANTIATE_TRIUL_KERNELS(long, int); +INSTANTIATE_TRIUL_KERNELS(int, int); +INSTANTIATE_TRIUL_KERNELS(short, int); +INSTANTIATE_TRIUL_KERNELS(char, int); +INSTANTIATE_TRIUL_KERNELS(uchar, int); +INSTANTIATE_TRIUL_KERNELS(bool, int); + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // To find the max integer that does not exceed the root of an int64_t variable, diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm index 6867bafc562eb..647bac958ecae 100644 --- a/aten/src/ATen/native/mps/operations/TriangularOps.mm +++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm @@ -5,6 +5,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -26,101 +27,53 @@ #include #endif -TORCH_IMPL_FUNC(triu_mps_out) -(const Tensor& self, int64_t k, const Tensor& output) { - using namespace mps; - using CachedGraph = MPSUnaryCachedGraph; - - if (self.numel() == 0) { - return; - } - auto stream = getCurrentMPSStream(); - - @autoreleasepool { - std::string key = "triu_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k); - auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* outputTensor = nil; - auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); - - auto minusOneTensor = [mpsGraph constantWithScalar:-1 dataType:MPSDataTypeInt32]; - - if (k > 0) { - auto diagMinusOneTensor = [mpsGraph constantWithScalar:(k - 1) dataType:MPSDataTypeInt32]; - auto onesTensor = [mpsGraph constantWithScalar:1 shape:inputTensor.shape dataType:MPSDataTypeInt32]; - auto maskTensor = [mpsGraph bandPartWithTensor:onesTensor - numLowerTensor:minusOneTensor - numUpperTensor:diagMinusOneTensor - name:nil]; - outputTensor = [mpsGraph selectWithPredicateTensor:maskTensor - truePredicateTensor:[mpsGraph constantWithScalar:0 dataType:inputTensor.dataType] - falsePredicateTensor:inputTensor - name:nil]; - } else { - auto minusDiagTensor = [mpsGraph constantWithScalar:(-k) dataType:MPSDataTypeInt32]; - outputTensor = [mpsGraph bandPartWithTensor:inputTensor - numLowerTensor:minusDiagTensor - numUpperTensor:minusOneTensor - name:nil]; - } - - newCachedGraph->inputTensor_ = inputTensor; - newCachedGraph->outputTensor_ = outputTensor; - }); - - auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - runMPSGraph(stream, cachedGraph->graph(), dictionaryFromPlaceholders(selfPlaceholder), outputPlaceholder); +template +static std::vector reverse_array(const IntArrayRef& arr) { + std::vector rc(arr.size()); + for (const auto& i : c10::irange(arr.size())) { + rc[i] = arr[arr.size() - 1 - i]; } + return rc; } -TORCH_IMPL_FUNC(tril_mps_out) -(const Tensor& self, int64_t k, const Tensor& output) { +static void triu_tril_impl(const Tensor& self, int64_t k, const Tensor& out, 
const std::string& name) { using namespace mps; - using CachedGraph = MPSUnaryCachedGraph; - if (self.numel() == 0) { return; } - + auto sizes = reverse_array(self.sizes()); + auto inp_strides = reverse_array(self.strides()); + auto out_strides = reverse_array(out.strides()); + std::array k_ndim = {int(k), int(self.ndimension())}; + const bool inplace = self.is_same(out); + const auto kernel_name = + fmt::format("{}{}_{}_{}", name, inplace ? "_inplace" : "", "int", scalarToMetalTypeString(self)); + auto triuPSO = lib.getPipelineStateForFunc(kernel_name); + uint32_t max_threads_per_group = [triuPSO maxTotalThreadsPerThreadgroup]; auto stream = getCurrentMPSStream(); - - @autoreleasepool { - std::string key = "tril_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k); - auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* outputTensor = nil; - - auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); - auto minusOneTensor = [mpsGraph constantWithScalar:-1 dataType:MPSDataTypeInt32]; - - if (k >= 0) { - auto diagTensor = [mpsGraph constantWithScalar:k dataType:MPSDataTypeInt32]; - outputTensor = [mpsGraph bandPartWithTensor:inputTensor - numLowerTensor:minusOneTensor - numUpperTensor:diagTensor - name:nil]; + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto computeEncoder = stream->commandEncoder(); + [computeEncoder setComputePipelineState:triuPSO]; + if (inplace) { + mtl_setArgs(computeEncoder, self, inp_strides, sizes, k_ndim); } else { - auto negDiagMinusOneTensor = [mpsGraph constantWithScalar:(-k - 1) dataType:MPSDataTypeInt32]; - auto complementTensor = [mpsGraph bandPartWithTensor:inputTensor - numLowerTensor:negDiagMinusOneTensor - numUpperTensor:minusOneTensor - name:nil]; - auto zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:getMPSDataType(self)]; - auto mask = [mpsGraph equalWithPrimaryTensor:complementTensor secondaryTensor:zeroTensor name:nil]; - outputTensor = [mpsGraph selectWithPredicateTensor:mask - truePredicateTensor:inputTensor - falsePredicateTensor:zeroTensor - name:nil]; + mtl_setArgs(computeEncoder, out, self, out_strides, inp_strides, sizes, k_ndim); } + [computeEncoder dispatchThreads:MTLSizeMake(sizes[0], sizes[1], self.numel() / (sizes[0] * sizes[1])) + threadsPerThreadgroup:MTLSizeMake(std::min(max_threads_per_group, sizes[0]), 1, 1)]; + } + }); +} - newCachedGraph->inputTensor_ = inputTensor; - newCachedGraph->outputTensor_ = outputTensor; - }); - - auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); +TORCH_IMPL_FUNC(triu_mps_out) +(const Tensor& self, int64_t k, const Tensor& output) { + triu_tril_impl(self, k, output, "triu"); +} - runMPSGraph(stream, cachedGraph->graph(), dictionaryFromPlaceholders(selfPlaceholder), outputPlaceholder); - } +TORCH_IMPL_FUNC(tril_mps_out) +(const Tensor& self, int64_t k, const Tensor& output) { + triu_tril_impl(self, k, output, "tril"); } Tensor tril_indices_mps(int64_t row, diff --git a/test/test_mps.py b/test/test_mps.py index d6170e0793336..94087b7a8a0d8 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -7146,6 +7146,11 @@ def helper(shape, diag=0): helper((2, 8, 4, 5), diag=-1) helper((2, 8, 4, 5), diag=-2) helper((2, 8, 4, 5), diag=-3) + # Test inplace + x_mps = torch.arange(9.0, device='mps').reshape(3, 3).t().triu() + x_cpu = torch.arange(9.0, device='cpu').reshape(3, 3).t().triu() + self.assertEqual(x_cpu, 
x_mps) + self.assertEqual(x_cpu.stride(), x_mps.stride()) # Test inverse def test_inverse(self): diff --git a/torch/testing/_internal/common_mps.py b/torch/testing/_internal/common_mps.py index 5674154bac006..3e93a3552dff8 100644 --- a/torch/testing/_internal/common_mps.py +++ b/torch/testing/_internal/common_mps.py @@ -157,6 +157,8 @@ def mps_ops_modifier( "tensor_split", "transpose", "transpose_copy", + "tril", + "triu", "true_divide", "T", "unbind", @@ -283,8 +285,6 @@ def mps_ops_modifier( "trace", "trapz", "trapezoid", - "tril", - "triu", "vstack", "where", "byte", From 9176b69ba2e976fcfe3677181463e95f3e7795cb Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 22 Jul 2025 18:25:46 -0400 Subject: [PATCH 51/83] [MPS] Switch Cholesky decomp to column wise (#158237) [MPS] Switch Cholesky decomp to column wise (#157014) Everything should go thru a generalized kernels, and Metal kernels should work with the same sizes and strides as CPU or CUDA backends to avoid problems with `torch.compile` that relies on the meta kernels to tell what its ouput going to look like. To avoid returning tensors with different layout depending on whether upper parameter is true or false, templatize `factorDiagonalBlock`, `applyTRSM` and `applySYRK` to take upper/lower (actually row-wise vs column-wise) as template argument and call appropriate templates from host TODOs: - Rename upper parameter to something more sensible and add comments - Use simd_groupsize instead of hardcoded 32 everywhere Fixes https://github.com/pytorch/pytorch/issues/156658 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157014 Approved by: https://github.com/Skylion007, https://github.com/dcci ghstack dependencies: #157179 (cherry picked from commit 1c8844d9e7b2d72fb80b67ed51df4f6a1295b3b5) Co-authored-by: Nikita Shulga --- aten/src/ATen/native/BatchLinearAlgebra.cpp | 2 +- .../native/mps/kernels/LinearAlgebra.metal | 128 ++++++++++++++++-- .../native/mps/operations/LinearAlgebra.mm | 75 ++-------- aten/src/ATen/native/native_functions.yaml | 9 +- test/inductor/test_mps_basic.py | 9 ++ 5 files changed, 137 insertions(+), 86 deletions(-) diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 775e1cb04e848..cfeb67bef3bd9 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -697,7 +697,7 @@ TORCH_META_FUNC(linalg_cholesky_ex)(const Tensor& A, auto ndim = A_shape.size(); // L - auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/A.device().type() != at::kMPS); + auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/true); set_output_strided(0, A_shape, L_strides, A.options(), {}); // info diff --git a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal index f3e543f9dd732..15d46d8c8d8e1 100644 --- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal +++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal @@ -145,6 +145,28 @@ inline float blockReduceSum( return sharedScratch[0]; } +template +inline device float& get_ref(device float* A, uint row, uint col, uint N); + +template <> +inline device float& get_ref( + device float* A, + uint row, + uint col, + uint N) { + return A[row * N + col]; +} + +template <> +inline device float& get_ref( + device float* A, + uint row, + uint col, + uint N) { + return A[row + col * N]; +} + +template kernel void factorDiagonalBlock( device float* A 
[[buffer(0)]], device int* info [[buffer(1)]], @@ -171,7 +193,7 @@ kernel void factorDiagonalBlock( for (uint i = linear_tid; i < tileSize; i += group_size) { uint r = i / actSize; uint c = i % actSize; - tile[r][c] = A[batch_offset + (row0 + r) * N + (col0 + c)]; + tile[r][c] = get_ref(A + batch_offset, row0 + r, col0 + c, N); } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -244,10 +266,33 @@ kernel void factorDiagonalBlock( for (uint i = linear_tid; i < tileSize; i += group_size) { uint r = i / actSize; uint c = i % actSize; - A[batch_offset + (row0 + r) * N + (col0 + c)] = tile[r][c]; + get_ref(A + batch_offset, row0 + r, col0 + c, N) = tile[r][c]; } } +template [[host_name("factorDiagonalBlockU")]] +kernel void factorDiagonalBlock( + device float* A [[buffer(0)]], + device int* info [[buffer(1)]], + constant uint& N [[buffer(2)]], + constant uint& NB [[buffer(3)]], + constant uint& k [[buffer(4)]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 bid [[threadgroup_position_in_grid]], + uint3 tpg [[threads_per_threadgroup]]); + +template [[host_name("factorDiagonalBlockL")]] +kernel void factorDiagonalBlock( + device float* A [[buffer(0)]], + device int* info [[buffer(1)]], + constant uint& N [[buffer(2)]], + constant uint& NB [[buffer(3)]], + constant uint& k [[buffer(4)]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 bid [[threadgroup_position_in_grid]], + uint3 tpg [[threads_per_threadgroup]]); + +template kernel void applyTRSM( device float* A [[buffer(0)]], constant uint& N [[buffer(2)]], @@ -283,12 +328,12 @@ kernel void applyTRSM( for (uint i = linear_tid; i < actSize_k * actSize_k; i += group_size) { uint r = i / actSize_k; uint c = i % actSize_k; - diag[i] = A[batch_offset + (k * NB + r) * N + (k * NB + c)]; + diag[i] = get_ref(A + batch_offset, k * NB + r, k * NB + c, N); } for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) { uint r = i / actSize_k; uint c = i % actSize_k; - target[i] = A[batch_offset + (row0 + r) * N + (col0 + c)]; + target[i] = get_ref(A + batch_offset, row0 + r, col0 + c, N); } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -332,10 +377,31 @@ kernel void applyTRSM( for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) { uint r = i / actSize_k; uint c = i % actSize_k; - A[batch_offset + (row0 + r) * N + (col0 + c)] = target[i]; + get_ref(A + batch_offset, row0 + r, col0 + c, N) = target[i]; } } +template [[host_name("applyTRSMU")]] +kernel void applyTRSM( + device float* A [[buffer(0)]], + constant uint& N [[buffer(2)]], + constant uint& NB [[buffer(3)]], + constant uint& k [[buffer(4)]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 tgid [[threadgroup_position_in_grid]], + uint3 tpg [[threads_per_threadgroup]]); + +template [[host_name("applyTRSML")]] +kernel void applyTRSM( + device float* A [[buffer(0)]], + constant uint& N [[buffer(2)]], + constant uint& NB [[buffer(3)]], + constant uint& k [[buffer(4)]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 tgid [[threadgroup_position_in_grid]], + uint3 tpg [[threads_per_threadgroup]]); + +template kernel void applySYRK( device float* A [[buffer(0)]], constant uint& N [[buffer(2)]], @@ -403,17 +469,25 @@ kernel void applySYRK( // Same logic to load/store Cfrag, Afrag, Bfrag... 
simdgroup_matrix Cfrag; simdgroup_load( - Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N); + Cfrag, + &get_ref(A + batch_offset, row0 + sb_y, col0 + sb_x, N), + N, + 0, + !upper); for (uint kk = 0; kk < actSize_k; kk += 8) { simdgroup_load( - Afrag, &A[batch_offset + (row0 + sb_y) * N + (k * NB + kk)], N); + Afrag, + &get_ref(A + batch_offset, row0 + sb_y, k * NB + kk, N), + N, + 0, + !upper); simdgroup_load( Bfrag, - &A[batch_offset + (col0 + sb_x) * N + (k * NB + kk)], + &get_ref(A + batch_offset, col0 + sb_x, k * NB + kk, N), N, /* matrix_origin = */ 0, - /* transpose = */ true); + /* transpose = */ upper); simdgroup_multiply(Prod, Afrag, Bfrag); simdgroup_multiply(Prod, Prod, negative_identity); @@ -421,7 +495,11 @@ kernel void applySYRK( } simdgroup_store( - Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N); + Cfrag, + &get_ref(A + batch_offset, row0 + sb_y, col0 + sb_x, N), + N, + 0, + !upper); } } else { // Fallback for non-multiple-of-8 dimensions @@ -442,8 +520,10 @@ kernel void applySYRK( float sum = 0.0f; for (uint i = 0; i < actSize_k; i++) { - float a_val = A[batch_offset + (row0 + y) * N + k * NB + i]; - float b_val = A[batch_offset + (col0 + x) * N + k * NB + i]; + float a_val = + get_ref(A + batch_offset, row0 + y, k * NB + i, N); + float b_val = + get_ref(A + batch_offset, col0 + x, k * NB + i, N); sum = fma(a_val, b_val, sum); } sum_accumulator[y * tpg.x + x] += sum; @@ -452,13 +532,35 @@ kernel void applySYRK( threadgroup_barrier(mem_flags::mem_threadgroup); for (uint y = ty; y < actSize_j; y += tpg.y) { for (uint x = tx; x < actSize_h; x += tpg.x) { - A[batch_offset + (row0 + y) * N + col0 + x] -= + get_ref(A + batch_offset, row0 + y, col0 + x, N) -= sum_accumulator[y * tpg.x + x]; } } } } +template [[host_name("applySYRKU")]] +kernel void applySYRK( + device float* A [[buffer(0)]], + constant uint& N [[buffer(2)]], + constant uint& NB [[buffer(3)]], + constant uint& k [[buffer(4)]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 tgid [[threadgroup_position_in_grid]], + uint3 tpg [[threads_per_threadgroup]], + uint sgitg [[simdgroup_index_in_threadgroup]]); + +template [[host_name("applySYRKL")]] +kernel void applySYRK( + device float* A [[buffer(0)]], + constant uint& N [[buffer(2)]], + constant uint& NB [[buffer(3)]], + constant uint& k [[buffer(4)]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 tgid [[threadgroup_position_in_grid]], + uint3 tpg [[threads_per_threadgroup]], + uint sgitg [[simdgroup_index_in_threadgroup]]); + kernel void applyPivots( device float* P [[buffer(0)]], device const int* pivots [[buffer(1)]], diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index 9be8ca1cc6513..3cdf0021e987f 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -2,6 +2,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -1097,25 +1097,8 @@ static void lu_unpack_mps_impl(const Tensor& LU_data, } } -static void linalg_cholesky_mps_impl(const Tensor& input, - bool upper, - bool check_errors, - const Tensor& out, - const Tensor& info) { - using namespace mps; - - TORCH_CHECK(out.is_mps()); - TORCH_CHECK(input.scalar_type() == at::ScalarType::Float, "linalg.cholesky: Input tensor must be float32"); - TORCH_CHECK(input.dim() >= 2, "linalg.cholesky: Input tensor 
must be at least 2D"); - TORCH_CHECK(input.size(-2) == input.size(-1), "linalg.cholesky: Input tensor must be square"); - auto input_sizes = input.sizes(); - resize_output(out, input_sizes); - resize_output(info, {input_sizes.begin(), input_sizes.end() - 2}); - if (input.numel() == 0) { - info.zero_(); - return; - } - out.copy_(input); +static void cholesky_stub_impl(const Tensor& out, const Tensor& info, bool upper) { + auto input_sizes = out.sizes(); int64_t ndim = out.dim(); int64_t N = out.size(-1); @@ -1124,9 +1107,9 @@ static void linalg_cholesky_mps_impl(const Tensor& input, auto stream = getCurrentMPSStream(); auto device = MPSDevice::getInstance()->device(); - auto factorDiagonalPSO = lib.getPipelineStateForFunc("factorDiagonalBlock"); - auto applyTRSMPSO = lib.getPipelineStateForFunc("applyTRSM"); - auto applySYRKPSO = lib.getPipelineStateForFunc("applySYRK"); + auto factorDiagonalPSO = lib.getPipelineStateForFunc(upper ? "factorDiagonalBlockU" : "factorDiagonalBlockL"); + auto applyTRSMPSO = lib.getPipelineStateForFunc(upper ? "applyTRSMU" : "applyTRSML"); + auto applySYRKPSO = lib.getPipelineStateForFunc(upper ? "applySYRKU" : "applySYRKL"); int64_t NB = std::min(32, N); int64_t numBlocks = (N + NB - 1) / NB; @@ -1168,33 +1151,8 @@ static void linalg_cholesky_mps_impl(const Tensor& input, } }); } - int status; - if (check_errors) { - if (info_.dim() > 0) { - // batch case - for (const auto i : c10::irange(B)) { - status = info_[i].item(); - TORCH_CHECK( - status == 0, - "linalg.cholesky(): (Batch element ", - i, - "): The factorization could not be completed because the input is not positive-definite (the leading minor of order ", - status, - " is not positive-definite)."); - } - } else { - // single matrix case(no batch size) - status = info.item(); - TORCH_CHECK( - status == 0, - "linalg.cholesky(): The factorization could not be completed because the input is not positive-definite (the leading minor of order ", - status, - " is not positive-definite)."); - } - } - out.tril_(); - upper ? 
out.transpose_(ndim - 2, ndim - 1) : out; } + } // namespace mps Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, const Scalar& beta, const Scalar& alpha) { @@ -1355,23 +1313,6 @@ Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, cons return result; } -Tensor cholesky_mps(const Tensor& self, bool upper) { - auto out = at::empty_like(self, MemoryFormat::Contiguous); - cholesky_mps_out(self, upper, out); - return out; -} - -Tensor& cholesky_mps_out(const Tensor& self, bool upper, Tensor& out) { - auto info = at::empty({}, self.options().dtype(kInt)); - mps::linalg_cholesky_mps_impl(self, upper, true, out, info); - return out; -} - -TORCH_IMPL_FUNC(linalg_cholesky_ex_out_mps) -(const Tensor& self, bool upper, bool check_errors, const Tensor& L, const Tensor& info) { - mps::linalg_cholesky_mps_impl(self, upper, check_errors, L, info); -} - Tensor addbmm_mps(const Tensor& self, const Tensor& batch1, const Tensor& batch2, @@ -1460,4 +1401,6 @@ Tensor linalg_solve_triangular_mps(const Tensor& A, const Tensor& B, bool upper, TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const Tensor& result, const Tensor& info) { mps::linalg_inv_ex_out_mps_impl(A, check_errors, result, info); } + +REGISTER_DISPATCH(cholesky_stub, mps::cholesky_stub_impl) } // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index cba2d2db96334..3d0c1da1c89c0 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9476,14 +9476,12 @@ - func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU, CUDA: cholesky_out - MPS: cholesky_mps_out + CPU, CUDA, MPS: cholesky_out - func: cholesky(Tensor self, bool upper=False) -> Tensor variants: method, function dispatch: - CPU, CUDA: cholesky - MPS: cholesky_mps + CPU, CUDA, MPS: cholesky - func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -13935,8 +13933,7 @@ python_module: linalg structured: True dispatch: - CPU, CUDA: linalg_cholesky_ex_out - MPS: linalg_cholesky_ex_out_mps + CPU, CUDA, MPS: linalg_cholesky_ex_out - func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor python_module: linalg diff --git a/test/inductor/test_mps_basic.py b/test/inductor/test_mps_basic.py index 3e5e514bce0d8..da041e1a48d06 100644 --- a/test/inductor/test_mps_basic.py +++ b/test/inductor/test_mps_basic.py @@ -180,6 +180,15 @@ def fn(x, y): ), ) + def test_cholesky(self): + def fn(x): + return ( + torch.linalg.cholesky(x, upper=False), + torch.linalg.cholesky(x, upper=True), + ) + + self.common(fn, (torch.eye(64),), check_lowp=False) + class MPSBasicTestsAOTI(TestCase): def check_model(self, m, inp, dynamic_shapes=None): From 2d0385b96f0790fc62001baeee97ffc2e4b59b7b Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 22 Jul 2025 19:09:17 -0400 Subject: [PATCH 52/83] Revert "[Dynamo] Allow inlining into AO quantization modules (#152934)" (#158677) This reverts commit 20e2ca3e29ce9eb33eef17db077696222c175764. 
--- test/dynamo/test_repros.py | 15 --------------- torch/_dynamo/trace_rules.py | 1 - 2 files changed, 16 deletions(-) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index a1ea9f977835e..2dfd97b30e1d5 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -6450,21 +6450,6 @@ def inject_parameters(module, cls): with torch.no_grad(): model(x) - def test_ao_fake_quantize_tracing(self): - import torch.ao.quantization.fake_quantize - - q = torch.ao.quantization.FusedMovingAvgObsFakeQuantize() - - def fn(x): - return q(x) - - x = torch.ones(2, 2) - opt_fn = torch.compile(fn, backend="eager", fullgraph=True) - res = opt_fn(x) - eager_res = fn(x) - - self.assertEqual(res, eager_res) - def test_typed_dict(self): class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] diff --git a/torch/_dynamo/trace_rules.py b/torch/_dynamo/trace_rules.py index 2fa1c7362aa00..2139e035fd09f 100644 --- a/torch/_dynamo/trace_rules.py +++ b/torch/_dynamo/trace_rules.py @@ -3393,7 +3393,6 @@ def _module_dir(m: types.ModuleType): "torch._tensor", "torch.amp.autocast_mode", "torch.ao.nn", - "torch.ao.quantization.fake_quantize", "torch.autograd.function", "torch.backends.cuda", "torch.cuda.amp.autocast_mode", From c1f20176e839497f0e12529a38d5d1cb7468361d Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 22 Jul 2025 19:12:15 -0400 Subject: [PATCH 53/83] Move out super large one off foreach_copy test (#158880) Move out super large one off foreach_copy test (#156876) Pull Request resolved: https://github.com/pytorch/pytorch/pull/156876 Approved by: https://github.com/albanD, https://github.com/jeffdaily (cherry picked from commit 50b2069b61942e923528c94ccbbc8ab5e92c381e) Co-authored-by: Jane Xu --- test/test_foreach.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/test/test_foreach.py b/test/test_foreach.py index f1985d47f55ea..764f885898c1d 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -16,6 +16,7 @@ from torch.testing._internal.common_device_type import ( dtypes, instantiate_device_type_tests, + largeTensorTest, onlyCUDA, OpDTypes, ops, @@ -1358,8 +1359,6 @@ def test_foreach_copy_with_multi_dtypes(self, device, dtype, op): # check (a) multi_tensor_apply is called and (b) numerical parity with for-loop and Tensor.copy_ foreach_copy_ = ForeachFuncWrapper(op.inplace_variant) - tested_large_input = False - for sample in op.sample_inputs( device, dtype, noncontiguous=False, allow_higher_dtype_scalars=True ): @@ -1367,13 +1366,6 @@ def test_foreach_copy_with_multi_dtypes(self, device, dtype, op): if src_dtype == dtype: continue self_tensors = [t.clone() for t in sample.input] - if not tested_large_input: - # see https://github.com/pytorch/pytorch/issues/156261 - self_tensors.append( - torch.empty(2**31 + 1, device=device, dtype=dtype) - ) - tested_large_input = True - src_tensors = [t.to(src_dtype) for t in self_tensors] out = foreach_copy_( (self_tensors, src_tensors), is_cuda=True, expect_fastpath=True @@ -1385,6 +1377,17 @@ def test_foreach_copy_with_multi_dtypes(self, device, dtype, op): for t, ref_t in zip(out, ref_out): self.assertTrue(torch.equal(t, ref_t)) + @onlyCUDA + @largeTensorTest("40GB", device="cuda") + def test_foreach_copy_with_multi_dtypes_large_input(self): + # see https://github.com/pytorch/pytorch/issues/156261 + self_tensor = torch.empty(2**31 + 1, device="cuda", dtype=torch.float32) + src_tensor = torch.ones(2**31 + 1, device="cuda", dtype=torch.bfloat16) + + 
torch._foreach_copy_([self_tensor], [src_tensor]) + ref_out = torch.empty_like(self_tensor).copy_(src_tensor) + self.assertEqual(self_tensor, ref_out) + @requires_cuda @ops(filter(lambda op: op.name == "_foreach_copy", foreach_binary_op_db)) def test_foreach_copy_with_different_device_inputs(self, device, dtype, op): From 947a201f23450db5a65e1189599cf9039e409739 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 23 Jul 2025 10:59:41 -0400 Subject: [PATCH 54/83] [Release Only] Remove nvshmem from list of preload libraries (#158925) [Release Only] Remove nvshmem from preloadlist --- torch/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/__init__.py b/torch/__init__.py index 4b862e8699a4e..fae804c8160fb 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -355,7 +355,6 @@ def _load_global_deps() -> None: "cusparselt": "libcusparseLt.so.*[0-9]", "cusolver": "libcusolver.so.*[0-9]", "nccl": "libnccl.so.*[0-9]", - "nvshmem": "libnvshmem_host.so.*[0-9]", } # cufiile is only available on cuda 12+ # TODO: Remove once CUDA 11.8 binaries are deprecated From 360aa1733efc6c1369ee25541d75a87a95f73875 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 20:08:23 +0000 Subject: [PATCH 55/83] Use ROCm/triton and update triton.txt --- .ci/docker/ci_commit_pins/triton.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 6dc1c44507ebd..cf43cba72a42b 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -11ec6354315768a85da41032535e3b7b99c5f706 +5e5685356b9fc7b5ad9cdf4e510a1994a5b8601a From f34b83a06c1dd9409b7865f945da605eeeb3a191 Mon Sep 17 00:00:00 2001 From: Prachi Gupta Date: Fri, 25 Jul 2025 13:04:13 -0400 Subject: [PATCH 56/83] =?UTF-8?q?[release/2.8]=20[Bugfix][Inductor]=20Fix?= =?UTF-8?q?=20dependency=20list=20merged=20incorrectly=20for=20a=20custo?= =?UTF-8?q?=E2=80=A6=20(#2419)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …m op with multiple mutated inputs and None return type. (#157133) This is an attempt to fix a memory allocation issue when using `torch.compile` with a custom layernorm kernel in vllm: ```C++ // In-place fused Add and RMS Normalization. ops.def( "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, " "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); ``` We observed abnormal extra memory allocations with this op enabled using `torch.compile`: {374E9FCF-FB46-4750-8B60-D31E3ADCE00A} and without this op: {9BB08EFE-FFE3-4D06-82C0-C70BBE6ADD56} After investigation, we found that this is because the compiler considers the two buffers for the two mutated inputs `Tensor input` and `Tensor residual` should share a same dependency list, which makes it can not reuse the buffer of `Tensor input`. 
``` buf1.users = [ NodeUser(node=ExternKernelSchedulerNode(name='op2'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op9'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op13'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op20'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op24'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op31'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op35'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op42'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op46'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op53'), can_inplace=False, is_weak=False), ] buf16.users = [ NodeUser(node=ExternKernelSchedulerNode(name='op2'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op9'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op13'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op20'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op24'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op31'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op35'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op42'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op46'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op53'), can_inplace=False, is_weak=False), ] ``` ``` op13: ExternKernelSchedulerNode(FallbackKernel) op13.writes = [ StarDep(name='buf17', mode=None), StarDep(name='buf18', mode=None), StarDep(name='buf19', mode=None)] op13.unmet_dependencies = [ StarDep(name='buf13', mode=None), StarDep(name='buf16', mode=None), WeakDep(name='buf11', mutating_buf='buf18'), WeakDep(name='buf12', mutating_buf='buf18'), WeakDep(name='buf13', mutating_buf='buf18'), WeakDep(name='buf2', mutating_buf='buf18'), WeakDep(name='buf3', mutating_buf='buf18')] op13.met_dependencies = [StarDep(name='arg11_1', mode=None)] op13.outputs = [ buf17: FallbackKernel buf17.layout = NoneLayout(device=device(type='cuda', index=0), size=[0], stride=[0]) buf17.aliases = ['buf16', 'buf1'] buf17.users = [ NodeUser(node=ExternKernelSchedulerNode(name='op2'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op9'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op13'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op20'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op24'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op31'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op35'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op42'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op46'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op53'), can_inplace=False, is_weak=False), ] buf18: MutationOutput buf18.layout = NoneLayout(device=device(type='cuda', index=0), size=[0], stride=[0]) 
buf18.mutations = ['buf16'] buf18.users = [ NodeUser(node=ExternKernelSchedulerNode(name='op14'), can_inplace=False, is_weak=False), NodeUser(node=ExternKernelSchedulerNode(name='op20'), can_inplace=False, is_weak=True), NodeUser(node=ExternKernelSchedulerNode(name='op24'), can_inplace=False, is_weak=True), NodeUser(node=ExternKernelSchedulerNode(name='op31'), can_inplace=False, is_weak=True), NodeUser(node=ExternKernelSchedulerNode(name='op35'), can_inplace=False, is_weak=True), NodeUser(node=ExternKernelSchedulerNode(name='op42'), can_inplace=False, is_weak=True), NodeUser(node=ExternKernelSchedulerNode(name='op46'), can_inplace=False, is_weak=True), NodeUser(node=ExternKernelSchedulerNode(name='op53'), can_inplace=False, is_weak=True), ] buf19: MutationOutput buf19.layout = NoneLayout(device=device(type='cuda', index=0), size=[0], stride=[0]) buf19.mutations = ['buf1'] buf19.users = [NodeUser(node=ExternKernelSchedulerNode(name='op20'), can_inplace=False, is_weak=False)] ] op13.node.kernel = torch.ops._C.fused_add_rms_norm.default ``` Here we can see `buf16` shares the same dependency list with `buf1` because `buf16` and `buf1` are in the aliases list of `buf17`. This is incorrect since those two are two separate tensors. And this makes the compiler could not reuse `buf16` for subsequent ops. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157133 Approved by: https://github.com/jansel (cherry picked from commit 02724b5f649b93ef7960962bdde7a667c0893d21) Fixes #ISSUE_NUMBER Co-authored-by: charlifu --- test/dynamo/test_logging.py | 1 + test/inductor/test_auto_functionalize.py | 48 ++++++++++++++++++++++-- torch/_inductor/scheduler.py | 24 ++++++++++++ torch/_logging/_internal.py | 2 + torch/_logging/_registrations.py | 1 + 5 files changed, 72 insertions(+), 4 deletions(-) diff --git a/test/dynamo/test_logging.py b/test/dynamo/test_logging.py index 2b120349ea01a..0ff58e49008cd 100644 --- a/test/dynamo/test_logging.py +++ b/test/dynamo/test_logging.py @@ -959,6 +959,7 @@ def bar(): "autotuning", "graph_region_expansion", "hierarchical_compile", + "compute_dependencies", } for name in torch._logging._internal.log_registry.artifact_names: if name not in exclusions: diff --git a/test/inductor/test_auto_functionalize.py b/test/inductor/test_auto_functionalize.py index 6f15b493ec1bd..0cc2c9e3a7836 100644 --- a/test/inductor/test_auto_functionalize.py +++ b/test/inductor/test_auto_functionalize.py @@ -445,12 +445,17 @@ def run_aot_eager(self, f, orig_args, _dynamic=False): graph = "\n".join(log_stream.getvalue().strip().split("\n")[4:]).strip() return [aot_eager_args, result, graph] - def run_inductor(self, f, orig_args, _dynamic=False): + def run_inductor( + self, + f, + orig_args, + _dynamic=False, + log_module="torch._inductor.compile_fx", + log_function="post_grad_graphs", + ): compiled_args = pytree.tree_map_only(torch.Tensor, torch.clone, orig_args) - log_stream, ctx = logs_to_string( - "torch._inductor.compile_fx", "post_grad_graphs" - ) + log_stream, ctx = logs_to_string(log_module, log_function) result = None with ctx(): result = torch.compile( @@ -1733,6 +1738,41 @@ def f(x, w): y = f(x, w) self.assertEqual(y, x.sin()) + @torch._inductor.config.patch(enable_auto_functionalized_v2=True) + def test_scheduling_with_multiple_mutates(self): + with torch.library._scoped_library("mylib", "FRAGMENT") as lib: + torch.library.define( + "mylib::foo", + "(Tensor! x, Tensor! 
y, Tensor z) -> ()", + tags=torch.Tag.pt2_compliant_tag, + lib=lib, + ) + + @torch.library.impl("mylib::foo", "cpu", lib=lib) + @torch._dynamo.disable + def foo(x, y, z): + pass + + def func(x, w): + a = torch.empty_like(x) # buf0 + b = torch.empty_like(x) # buf1 + torch.ops.mylib.foo(a, b, x) # buf2, buf3, buf4 + c = torch.mm(a, w) # buf5 + torch.ops.mylib.foo(c, b, x) # buf6, buf7, buf8 + return c + + input = torch.rand(2, 2) + weight = torch.rand(2, 2) + [inductor_args, output, graph_inductor] = self.run_inductor( + func, + [input, weight], + False, + "torch._inductor.scheduler", + "compute_dependencies", + ) + name_to_users = eval(graph_inductor) + self.assertNotEqual(name_to_users["buf1"], name_to_users["buf5"]) + if __name__ == "__main__": from torch._inductor.test_case import run_tests diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index 687ba95e1dd1d..f855cc1de922d 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -74,6 +74,9 @@ log = logging.getLogger(__name__) fusion_log = torch._logging.getArtifactLogger(__name__, "fusion") loop_ordering_log = torch._logging.getArtifactLogger(__name__, "loop_ordering") +compute_dependencies_log = torch._logging.getArtifactLogger( + __name__, "compute_dependencies" +) PartitionType = list["BaseSchedulerNode"] @@ -2278,6 +2281,15 @@ def __add__(self, other: DedupList[T]) -> DedupList[T]: for node in self.nodes: for buf1 in node.get_outputs(): buf1_name = buf1.get_name() + # This is for handling auto functionized ops which return None + # and mutate more than 1 inputs, we shouldn't let them all + # point to the same user list since buffers in the aliases + # list might not be alias to each other. + if ( + isinstance(buf1.node.layout, ir.NoneLayout) + and len(buf1.get_aliases()) > 1 + ): + continue for buf2_name in buf1.get_aliases(): if buf1_name in name_to_users and buf2_name in name_to_users: # merge the two @@ -2445,6 +2457,18 @@ def add_user( for name in self.name_to_donated_buffer: self.name_to_donated_buffer[name].set_users(name_to_users[name].items) + # For debug logging + logbuf = IndentedBuffer() + logbuf.splice("{") + for key, value in name_to_users.items(): + with logbuf.indent(): + users = [v.get_name() for v in value.items] + logbuf.splice(f"'{key}': {users},") + logbuf.splice("}") + str = logbuf.getrawvalue().rstrip() + compute_dependencies_log.debug("BUFFER USER LIST\n") + compute_dependencies_log.debug("===== AFTER SCHEDULING =====\n%s", str) + def dead_node_elimination(self) -> None: """ Remove any nodes without users diff --git a/torch/_logging/_internal.py b/torch/_logging/_internal.py index 3821218cefec9..f56f0165b206f 100644 --- a/torch/_logging/_internal.py +++ b/torch/_logging/_internal.py @@ -252,6 +252,7 @@ def set_logs( graph_region_expansion: bool = False, inductor_metrics: bool = False, hierarchical_compile: bool = False, + compute_dependencies: bool = False, ) -> None: """ Sets the log level for individual components and toggles individual log @@ -565,6 +566,7 @@ def _set_logs(**kwargs) -> None: graph_region_expansion=graph_region_expansion, inductor_metrics=inductor_metrics, hierarchical_compile=hierarchical_compile, + compute_dependencies=compute_dependencies, ) diff --git a/torch/_logging/_registrations.py b/torch/_logging/_registrations.py index 62e5d9b7064ca..3c6f092ed4d24 100644 --- a/torch/_logging/_registrations.py +++ b/torch/_logging/_registrations.py @@ -183,6 +183,7 @@ ) register_artifact("perf_hints", "", off_by_default=True) 
register_artifact("onnx_diagnostics", "", off_by_default=True) +register_artifact("compute_dependencies", "", off_by_default=True) register_artifact( "fusion", "Detailed Inductor fusion decisions. More detailed than 'schedule'", From bbb1d6ef363818721bb173d2fb1612cabcdb52bc Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Thu, 17 Jul 2025 13:34:05 -0700 Subject: [PATCH 57/83] [release/2.8] enable py3.13 (#2366) pip installed requirements.txt and .ci/docker/requirements-ci.txt Local validation: `Successfully installed jinja2-3.1.6 lintrunner-0.12.7 mypy-1.14.0 onnxscript-0.2.2 sympy-1.13.3 tlparse-0.3.30 z3-solver-4.12.6.0` (cherry picked from commit 30508ff795cf1ab317c5beea050751e6ba4b0f16) --- .ci/docker/requirements-ci.txt | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 72811c384900c..c6490242d632e 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -162,10 +162,11 @@ pillow==11.0.0 #Pinned versions: 10.3.0 #test that import: -protobuf==5.29.4 -#Description: Google's data interchange format -#Pinned versions: 5.29.4 -#test that import: test_tensorboard.py, test/onnx/* +protobuf==3.20.2 ; python_version <= "3.12" +protobuf==4.25.1 ; python_version == "3.13" +#Description: Google’s data interchange format +#Pinned versions: 3.20.1 +#test that import: test_tensorboard.py psutil #Description: information on running processes and system utilization @@ -320,7 +321,8 @@ pywavelets==1.7.0 ; python_version >= "3.12" #Pinned versions: 1.4.1 #test that import: -lxml==5.3.0 +lxml==5.3.0 ; python_version <= "3.12" +lxml==6.0.0 ; python_version == "3.13" #Description: This is a requirement of unittest-xml-reporting # Python-3.9 binaries @@ -332,8 +334,9 @@ sympy==1.13.3 #Pinned versions: #test that import: -onnx==1.18.0 -#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal +onnx==1.16.1 ; python_version <= "3.12" +onnx==1.18.0 ; python_version == "3.13" +#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: From af2ce8836b2f0c27546d451e8d35ede6eabb167d Mon Sep 17 00:00:00 2001 From: Jack Taylor <108682042+jataylo@users.noreply.github.com> Date: Fri, 18 Jul 2025 19:27:51 +0100 Subject: [PATCH 58/83] [SWDEV-539076] Initial naive foreach autotune support (#2377) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds initial autotuning for foreach support required for https://ontrack-internal.amd.com/browse/SWDEV-539076 4x improvement for some kernels Before: triton_for_fused_18.kd 🔍 | 4.986 ms | 4.986 ms | 2.493 ms | 2 |   triton_for_fused_6.kd 🔍 | 0.098 ms | 0.098 ms | 0.049 ms | 2 |   triton_for_fused_7.kd 🔍 | 0.036 ms | 0.036 ms | 0.018 ms | 2 |   After: triton_for_fused_18.kd 🔍 | 1.273 ms | 1.273 ms | 0.636 ms | 2 |   triton_for_fused_6.kd 🔍 | 0.044 ms | 0.044 ms | 0.022 ms | 2 |   triton_for_fused_7.kd 🔍 | 0.024 ms | 0.024 ms | 0.012 ms | 2 |   (cherry picked from commit f07b7f703543935728e311b6435f7ab58da27bab) --- torch/_inductor/codegen/triton_combo_kernel.py | 2 +- torch/_inductor/runtime/triton_heuristics.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/torch/_inductor/codegen/triton_combo_kernel.py b/torch/_inductor/codegen/triton_combo_kernel.py index dc2392119cc51..94a905e4211ce 100644 --- a/torch/_inductor/codegen/triton_combo_kernel.py +++ 
b/torch/_inductor/codegen/triton_combo_kernel.py @@ -614,7 +614,7 @@ def jit_line( if heuristics == "foreach": heuristics_line = f""" @triton_heuristics.foreach( - num_warps={self.num_warps}, + filename=__file__, triton_meta={triton_meta!r}, inductor_meta={inductor_meta!r}, ) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 4c50768b7188c..54c7e83c0879b 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2779,20 +2779,29 @@ def user_autotune( ) -def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): +def foreach(triton_meta, filename=None, inductor_meta=None): """ Compile a triton foreach kernel """ + configs = [] + if disable_pointwise_autotuning(inductor_meta) and not ( + inductor_meta.get("max_autotune") or + inductor_meta.get("max_autotune_pointwise") + ): + configs.append(triton.Config({}, num_stages=1, num_warps=8)) + else: + for warps in [1, 2, 4, 8]: + configs.append(triton.Config({}, num_stages=1, num_warps=warps)) + return cached_autotune( None, - [triton.Config({}, num_stages=1, num_warps=num_warps)], + configs, triton_meta=triton_meta, inductor_meta=inductor_meta, heuristic_type=HeuristicType.TEMPLATE, filename=filename, ) - @dataclasses.dataclass class GridExpr: """Generate code for grid size expressions in launcher""" From 41956f15e648355ed70917376f112f6587a4629f Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:32:26 -0500 Subject: [PATCH 59/83] =?UTF-8?q?[release/2.7][ROCm][tunableop]=20UT=20tol?= =?UTF-8?q?erance=20increase=20for=20matmul=5Fsmall=5Fbrute=5Fforce=5F?= =?UTF-8?q?=E2=80=A6=20(#2397)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TunableOp will sometimes find a less precise solution due to the small input vectors used in this UT. Bumping up tolerance to eliminate flakiness. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158788 Approved by: https://github.com/jeffdaily (cherry picked from commit c917c63282c467ef942c99da3ce4fa57bceba603) (cherry picked from commit 35daec9357d2a111f37ed518e5833dd6b23ecd52) --- test/test_linalg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_linalg.py b/test/test_linalg.py index 108a5f590079c..e3e30c568c520 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -4762,6 +4762,7 @@ def test_matmul_small_brute_force_3d_Nd(self, device, dtype): @onlyCUDA @skipCUDAIfNotRocm # Skipping due to SM89 OOM in CI, UT doesn't do much on NV anyways @dtypes(*floating_types_and(torch.half)) + @precisionOverride({torch.float16: 1e-1}) # TunableOp may occasionally find less precise solution def test_matmul_small_brute_force_tunableop(self, device, dtype): # disable tunableop buffer rotation for all tests everywhere, it can be slow # We set the TunableOp numerical check environment variable here because it is From 0826c75380322c37afe34232756940b29893c1e6 Mon Sep 17 00:00:00 2001 From: Jack Taylor <108682042+jataylo@users.noreply.github.com> Date: Wed, 30 Jul 2025 21:19:57 +0100 Subject: [PATCH 60/83] [release/2.7] [SWDEV-543214] Reland #2416 Fix warps runtime (#2421) Relands https://github.com/ROCm/pytorch/pull/2416 with caching fix Upstream equivalent https://github.com/pytorch/pytorch/pull/159146 --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit f0aebdc31b8a4b7fa4c0b65a1ad6508e5470fe09) --- torch/_inductor/runtime/coordinate_descent_tuner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/_inductor/runtime/coordinate_descent_tuner.py b/torch/_inductor/runtime/coordinate_descent_tuner.py index 413dfaf09d061..6626c88a1e0d1 100644 --- a/torch/_inductor/runtime/coordinate_descent_tuner.py +++ b/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -3,6 +3,7 @@ import itertools import logging from typing import Callable, Optional, TYPE_CHECKING +from functools import lru_cache from .hints import TRITON_MAX_BLOCK from .runtime_utils import red_text, triton_config_to_hashable @@ -60,6 +61,7 @@ def get_config_max(self, prefix: str) -> int: size_hint = self.size_hints.get(prefix) if self.size_hints is not None else None return min(max_block, size_hint) if size_hint is not None else max_block + @lru_cache(maxsize=1) def get_warpsmax(self): # Currently, CUDA has a maximum of 1024 threads, so 32 is the max # number of warps. From af7b538f71a59b51b14b8ca9d3e29a9c9b060f21 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Thu, 31 Jul 2025 18:08:48 -0500 Subject: [PATCH 61/83] [AUTOGENERATED] [release/2.8] [ROCm] Use opportunistic fastatomics based on heuristics (#2441) --- aten/src/ATen/native/cuda/KernelUtils.cuh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh index 1696ee64eac67..89d88646ac566 100644 --- a/aten/src/ATen/native/cuda/KernelUtils.cuh +++ b/aten/src/ATen/native/cuda/KernelUtils.cuh @@ -281,6 +281,13 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd( } } + if (numel > 16 /*<-hueristic threshold*/ * 64 ) { + // well shucks, unlikely to capture same-dest atomics in a wave. + // fall back to direct fastAtomic... + fastAtomicAdd(self_ptr, index, numel, value, true); + return; + } + // not coalsced, so now let try to capture lane-matches... 
// __activemask() -- finds the set of threads in the warp that are about to perform atomicAdd // __match_any_sync() -- returns bit mask of the threads that have same dest addr From b10cd6b417a8749f5933efb7d62ad706aa2c3cf1 Mon Sep 17 00:00:00 2001 From: Jack Taylor <108682042+jataylo@users.noreply.github.com> Date: Sat, 2 Aug 2025 07:01:53 +0100 Subject: [PATCH 62/83] Update triton pin for gfx950 improvements (#2443) https://github.com/ROCm/triton/pull/846 --- .ci/docker/ci_commit_pins/triton.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index cf43cba72a42b..567536db72100 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -5e5685356b9fc7b5ad9cdf4e510a1994a5b8601a +711e2a92522e0a9921ce58ae658571ca55c49b97 From 541313352a43acb5298cacb0d8d1f45385e67ba1 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Mon, 4 Aug 2025 11:19:15 -0500 Subject: [PATCH 63/83] [AUTOGENERATED] [release/2.8] [release/2.7] [SWDEV-543214] Reland #2416 Fix warps runtime part 2 (#2455) Cherry-pick of https://github.com/ROCm/pytorch/pull/2442 Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com> --- torch/_inductor/runtime/coordinate_descent_tuner.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/torch/_inductor/runtime/coordinate_descent_tuner.py b/torch/_inductor/runtime/coordinate_descent_tuner.py index 6626c88a1e0d1..f58f4da061136 100644 --- a/torch/_inductor/runtime/coordinate_descent_tuner.py +++ b/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -63,9 +63,14 @@ def get_config_max(self, prefix: str) -> int: @lru_cache(maxsize=1) def get_warpsmax(self): - # Currently, CUDA has a maximum of 1024 threads, so 32 is the max - # number of warps. 
- return 1024 // 32 + # CUDA/ROCm has a maximum of 1024 threads per block + from torch.cuda import current_device, get_device_properties, is_available + + warp_size = ( + get_device_properties(current_device()).warp_size if is_available() else 32 + ) + + return 1024 // warp_size def cache_benchmark_result(self, config, timing): self.cached_benchmark_results[triton_config_to_hashable(config)] = timing From d6a638346278555ce8f2c23d13328b30180a056c Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:07:41 -0500 Subject: [PATCH 64/83] [AUTOGENERATED] [release/2.8] [ROCm] Limit number of values per thread for reductions on three dimensions (#2469) Cherry-pick of https://github.com/ROCm/pytorch/pull/2460 Co-authored-by: Jerry Mannil <65309407+jerrymannil@users.noreply.github.com> --- aten/src/ATen/native/cuda/Reduce.cuh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 15a572804af5f..521b467480900 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -209,6 +209,10 @@ struct ReduceConfig { int values_per_thread() const { return div_up(num_inputs, step_input); } + + int mock_values_per_thread(int parallelism) { + return div_up(num_inputs, step_input * parallelism); + } }; std::ostream& operator<<(std::ostream& out, const ReduceConfig& config); @@ -1166,8 +1170,17 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ else if (config.ctas_per_output < 16) config.ctas_per_output = 1; bool is_channel_last = iter.tensor_base(1).is_contiguous(at::MemoryFormat::ChannelsLast); - if (iter.ndim() == 3 && !reduction_on_fastest_striding_dimension && !is_channel_last) + if (iter.ndim() == 3 && !reduction_on_fastest_striding_dimension && !is_channel_last) { config.ctas_per_output = 4; + int vpt = config.values_per_thread(); + // Capping the number of values per thread to 2048 for now + // based on known use cases. + while (vpt >= 2048) { + config.ctas_per_output *= 2; + // Computes the new values per thread without side effects + vpt = config.mock_values_per_thread(config.ctas_per_output); + } + } #endif if (config.ctas_per_output > 1) { config.input_mult[2] = config.split_input(config.ctas_per_output); From 3995f1a66f928a7dad8cf0101089758b4ea2e1cc Mon Sep 17 00:00:00 2001 From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:28:09 -0500 Subject: [PATCH 65/83] [release/2.8] Define datatypes when ROCM_VERSION >= 70000 (#2470) Fixes SWDEV-543698 Cherry-picked from https://github.com/ROCm/pytorch/pull/2468 This PR fixes the errors like below: [rank7]: RuntimeError: /tmp/comgr-c3c81b/input/CompileSourceejOPx6:34:8: error: unknown type name 'uint64_t'; did you mean '__hip_internal::uint64_t'? [rank7]: 34 | if(((uint64_t) t0.data) % (4 * sizeof(half)) != 0) flag_vec4 = false; Earlier uint64_t was defined in HIP headers in std namespace. Now it is moved to __hip_internal namespace in hip headers. This change is made in ROCm 7.0. 
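For illustration only (not part of the applied diff): as I read it, narrowing the guard to `ROCM_VERSION < 70000` means ROCm 7.0+ builds stop using the ROCm-specific JIT preamble template and instead take the branch that defines the fixed-width types itself (per the patch title). A toy sketch of that gating idiom — with made-up names (`kPreamble`) and a hard-coded `ROCM_VERSION`, not the real `resource_strings.h` template — looks like this:

```C++
// Toy sketch of the version gate this patch adds; names and values here are
// illustrative, not the actual fuser preamble.
#include <cstdio>

#define USE_ROCM 1
#define ROCM_VERSION 70000  // assumed encoding: major*10000 + minor*100 + patch

#if defined(USE_ROCM) && ROCM_VERSION < 70000
// Older ROCm: the HIP headers still expose uint64_t directly, so the generated
// kernel source can rely on them.
static const char* kPreamble = "/* rely on HIP header typedefs */";
#else
// ROCm >= 7.0 (or CUDA): spell out the fixed-width type in the generated
// source so the fuser kernel does not depend on __hip_internal.
static const char* kPreamble = "typedef unsigned long long uint64_t;";
#endif

int main() {
  std::puts(kPreamble);
  return 0;
}
```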
Fixes https://ontrack-internal.amd.com/browse/SWDEV-543698 --- torch/csrc/jit/codegen/fuser/cuda/resource_strings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h index ff2ef1f2377ce..9728d27d4d79b 100644 --- a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h +++ b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h @@ -12,7 +12,7 @@ cases*/ static constexpr auto bfloat16_type_string = "__nv_bfloat16"; -#if defined(USE_ROCM) +#if defined(USE_ROCM) && ROCM_VERSION < 70000 static auto type_declarations_template = at::jit::CodeTemplate(R"( ${HalfHeader} ${BFloat16Header} From 4fe2355c949d872e42c291f0d3fc969d2393f3e8 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Thu, 7 Aug 2025 07:50:42 -0700 Subject: [PATCH 66/83] [release/2.8] Add mx fp4 support (#2472) mx fp8 is enabled though cherrypick patch from rel 2.7. This patch adds support to enable mx fp4. PYTORCH_TEST_WITH_ROCM=1 python test/test_matmul_cuda.py -k test_blockwise -v Ran 452 tests in 23.776s OK (skipped=340) Passed 112 --------- Signed-off-by: Jagadish Krishnamoorthy Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- aten/src/ATen/cuda/CUDADataType.h | 2 +- aten/src/ATen/cuda/tunable/GemmHipblaslt.h | 9 +++ aten/src/ATen/native/cuda/Blas.cpp | 11 +++ test/test_matmul_cuda.py | 87 +++++++++++++--------- 4 files changed, 71 insertions(+), 38 deletions(-) diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h index 6ee6346732fa9..fba4f855a29b0 100644 --- a/aten/src/ATen/cuda/CUDADataType.h +++ b/aten/src/ATen/cuda/CUDADataType.h @@ -90,7 +90,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) case c10::ScalarType::Float8_e5m2fnuz: return HIP_R_8F_E5M2_FNUZ; #endif -#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000) case c10::ScalarType::Float4_e2m1fn_x2: return CUDA_R_4F_E2M1; #endif diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index fe6d1161d1ba9..4c46aa736b6a0 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -85,6 +85,15 @@ constexpr hipDataType HipDataTypeFor() { return static_cast(500); } +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 70000 + return HIP_R_4F_E2M1; +#else + return static_cast(33); +#endif +} + template int GetBatchFromParams(const GemmParams* params) { return 1; diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 21e6f9f65dd70..1fc9e14189e46 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1284,6 +1284,17 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, if (use_fast_accum) { TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype."); } +#ifdef USE_ROCM + if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK(ROCM_VERSION 
>= 70000, "Float8_e5m2 is only supported for ROCm 7.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float8_e4m3fn is only supported for ROCm 7.0 and above"); + } +#endif if (bias) { TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32"); TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half, diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index 8ec832e40a163..e8ff44fd40986 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -882,6 +882,8 @@ def compute_error(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # largest power of 2 representable in `torch.float8_e4m3fn` F8E4M3_LARGEST_POW2 = 8 +# largest power of 2 representable in `torch.float4_e2m1fn_x2` +FP4E2M1FN_LARGEST_POW2 = 1.0 # max value of `torch.float8_e4m3fn` (448) F8E4M3_MAX_VAL = torch.finfo(torch.float8_e4m3fn).max # exponent bias of `torch.float8_e8m0fnu` @@ -890,14 +892,20 @@ def compute_error(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: FP4_EBITS, FP4_MBITS = 2, 1 FP4_MAX_VAL = 6.0 -def data_to_mx_scale(x, block_size): +def data_to_mx_scale(x, block_size, recipe): # simple implementation of https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf # section 6.3, not all edge cases (such as NaN) are handled/tested + if recipe == "mxfp8": + largest_pow2 = F8E4M3_LARGEST_POW2 + elif recipe == "mxfp4": + largest_pow2 = FP4E2M1FN_LARGEST_POW2 + else: + raise ValueError(f"data_to_mx_scale(): Unsupported mx recipe: {recipe}") orig_shape = x.shape x = x.reshape(-1, block_size) max_abs = torch.amax(torch.abs(x), 1) largest_p2_lt_max_abs = torch.floor(torch.log2(max_abs)) - scale_e8m0_unbiased = largest_p2_lt_max_abs - F8E4M3_LARGEST_POW2 + scale_e8m0_unbiased = largest_p2_lt_max_abs - largest_pow2 scale_e8m0_unbiased = torch.clamp(scale_e8m0_unbiased, -1 * F8E8M0_EXP_BIAS, F8E8M0_EXP_BIAS) scale_e8m0_biased = scale_e8m0_unbiased + F8E8M0_EXP_BIAS scale_e8m0_biased = scale_e8m0_biased.to(torch.uint8) @@ -1446,10 +1454,10 @@ def test_pack_uint4(self): (127, 96, 1024), (1025, 128, 96) ], name_fn=lambda mkn: f"{mkn[0]}_{mkn[1]}_{mkn[2]}") - @parametrize("recipe", ["mxfp8", "nvfp4"]) - def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, recipe) -> None: - if recipe == "nvfp4" and fast_accum: - return unittest.skip("fast_accum not supported in nvfp4 cublas gemm, skipping") + @parametrize("recipe", ["mxfp8", "mxfp4" if torch.version.hip else "nvfp4"]) + def test_blockwise_mxfp8_nvfp4_mxfp4_numerics(self, test_case_name, fast_accum, mkn, recipe) -> None: + if (recipe == "nvfp4" or recipe == "mxfp4") and fast_accum: + raise unittest.SkipTest("fast_accum not supported in nvfp4/mxfp4 cublas gemm, skipping") device = "cuda" M, K, N = mkn @@ -1457,9 +1465,10 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r if not (M % 32 == 0 and K % 32 == 0 and N % 32 == 0): raise unittest.SkipTest("Matrix dimensions must be multiples of 32 on ROCm, skipping") - if recipe == "nvfp4" and K % 32 != 0: - return unittest.skip("K must be divisible by 32 for nvfp4 cublas gemm, skipping") + if (recipe == "nvfp4" or recipe == "mxfp4") and K % 32 != 0: + raise unittest.SkipTest("K must be divisible by 32 for nvfp4/mxfp4 cublas gemm, skipping") + fp4_scaling_dtype = torch.float8_e8m0fnu if torch.version.hip else torch.float8_e4m3fn 
BLOCK_SIZE = 16 if recipe == "nvfp4" else 32 require_exact_match = True approx_match_sqnr_target = 22.0 @@ -1475,11 +1484,11 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r B = B_ref.to(torch.float8_e4m3fn) A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) - else: # nvfp4 + else: # nvfp4 # mxfp4 A = _bfloat16_to_float4_e2m1fn_x2(A_ref) B = _bfloat16_to_float4_e2m1fn_x2(B_ref) - A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) - B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) + A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) + B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) elif test_case_name == "a_ones_b_ones": A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16) @@ -1490,11 +1499,11 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r B = B_ref.to(torch.float8_e4m3fn) A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) - else: # nvfp4 + else: # nvfp4 # mxfp4 A = _bfloat16_to_float4_e2m1fn_x2(A_ref) B = _bfloat16_to_float4_e2m1fn_x2(B_ref) - A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) - B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) + A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) + B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) elif test_case_name == "a_ones_modified_b_ones": A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16) @@ -1506,11 +1515,11 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r B = B_ref.to(torch.float8_e4m3fn) A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) - else: # nvfp4 + else: # nvfp4 # mxfp4 A = _bfloat16_to_float4_e2m1fn_x2(A_ref) B = _bfloat16_to_float4_e2m1fn_x2(B_ref) - A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) - B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) + A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) + B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) elif test_case_name == "a_ones_b_ones_modified": A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16) @@ -1522,11 +1531,11 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r B = B_ref.to(torch.float8_e4m3fn) A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) - else: # nvfp4 + else: # nvfp4 # mxfp4 A = _bfloat16_to_float4_e2m1fn_x2(A_ref) B = _bfloat16_to_float4_e2m1fn_x2(B_ref) - A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) - 
B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) + A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) + B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) elif test_case_name == "a_scale_modified_b_ones": A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16) @@ -1540,11 +1549,11 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r A_ref[1][0:BLOCK_SIZE] = 4 A[1][0:BLOCK_SIZE] = 2 A_scale[1][0] = 2 - else: # nvfp4 + else: # nvfp4 # mxfp4 A = _bfloat16_to_float4_e2m1fn_x2(A_ref) B = _bfloat16_to_float4_e2m1fn_x2(B_ref) - A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) - B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) + A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) + B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) A_ref[1][0:BLOCK_SIZE] = 4 A.view(torch.uint8)[1][0:(BLOCK_SIZE // 2)] = 0b01000100 A_scale[1][0] = 2 @@ -1561,11 +1570,11 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r B_ref[1][0:BLOCK_SIZE] = 4 B[1][0:BLOCK_SIZE] = 2 B_scale[1][0] = 2 - else: # nvfp4 + else: # nvfp4 # mxfp4 A = _bfloat16_to_float4_e2m1fn_x2(A_ref) B = _bfloat16_to_float4_e2m1fn_x2(B_ref) - A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) - B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) + A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) + B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) B_ref[1][0:BLOCK_SIZE] = 4 B.view(torch.uint8)[1][0:(BLOCK_SIZE // 2)] = 0b01000100 B_scale[1][0] = 2 @@ -1585,7 +1594,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r B = B_ref.to(torch.float8_e4m3fn) A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu) - else: # nvfp4 + else: # nvfp4 # mxfp4 # scales all-ones, element data random while being exactly representable in float4_e2m1fn_x2 # generate integers in [0, 16] and cast to bfloat16 A_ref = _floatx_unpacked_to_f32( @@ -1600,8 +1609,8 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r ).bfloat16() A = _bfloat16_to_float4_e2m1fn_x2(A_ref) B = _bfloat16_to_float4_e2m1fn_x2(B_ref) - A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) - B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e4m3fn) + A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) + B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=fp4_scaling_dtype) elif test_case_name == "data_random_scales_from_data": if not K % BLOCK_SIZE == 0: @@ -1613,17 +1622,18 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r if recipe == "mxfp8": # Calculate scales based on the inputs - A_scale = data_to_mx_scale(A_ref, BLOCK_SIZE) - B_scale = data_to_mx_scale(B_ref, BLOCK_SIZE) + A_scale = data_to_mx_scale(A_ref, BLOCK_SIZE, recipe) + B_scale = 
data_to_mx_scale(B_ref, BLOCK_SIZE, recipe) max_val = F8E4M3_MAX_VAL min_val = -1 * max_val A = (A_ref.reshape(-1, BLOCK_SIZE) / A_scale.reshape(M * ceil_div(K, BLOCK_SIZE), 1).float()).reshape(M, K) A = A.clamp(min=min_val, max=max_val).to(torch.float8_e4m3fn) B = (B_ref.reshape(-1, BLOCK_SIZE) / B_scale.reshape(N * ceil_div(K, BLOCK_SIZE), 1).float()).reshape(N, K) B = B.clamp(min=min_val, max=max_val).to(torch.float8_e4m3fn) - else: # nvfp4 - A_scale = data_to_nvfp4_scale(A_ref, BLOCK_SIZE) - B_scale = data_to_nvfp4_scale(B_ref, BLOCK_SIZE) + else: # nvfp4 # mxfp4 + scale_func = data_to_mx_scale if recipe == "mxfp4" else data_to_nvfp4_scale + A_scale = scale_func(A_ref, BLOCK_SIZE, recipe if recipe == "mxfp4" else None) + B_scale = scale_func(B_ref, BLOCK_SIZE, recipe if recipe == "mxfp4" else None) max_val = FP4_MAX_VAL min_val = -1 * max_val @@ -1634,13 +1644,14 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r B = B.clamp(min=min_val, max=max_val) B = _bfloat16_to_float4_e2m1fn_x2(B) - approx_match_sqnr_target = 15.8 + approx_match_sqnr_target = 12.0 if torch.version.hip else 15.8 C_ref = A_ref @ B_ref.t() # convert to swizzled format - A_scale = to_blocked(A_scale) - B_scale = to_blocked(B_scale) + if not torch.version.hip: + A_scale = to_blocked(A_scale) + B_scale = to_blocked(B_scale) C = torch._scaled_mm( A, @@ -1657,6 +1668,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r sqnr = compute_error(C_ref, C) assert sqnr.item() > approx_match_sqnr_target + @skipIfRocm @unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM or IS_WINDOWS, mx_skip_msg) @parametrize("recipe", ["mxfp8", "nvfp4"]) def test_blockwise_mxfp8_nvfp4_error_messages(self, device, recipe) -> None: @@ -1899,6 +1911,7 @@ def test_blockwise_mxfp8_compile(self) -> None: ) torch.testing.assert_close(C, C_ref, atol=0, rtol=0) + @skipIfRocm @unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM, mx_skip_msg) def test_blockwise_nvfp4_compile(self) -> None: From 016bbef8d0abf42ad9ff56e9bfad8a98902c9400 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:56:58 -0500 Subject: [PATCH 67/83] [AUTOGENERATED] [release/2.8] [rocm7.0_internal_testing] skip test_transformer_req_grad on Navi32/Navi4x (#2464) Cherry-pick of https://github.com/ROCm/pytorch/pull/2385 Co-authored-by: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> --- test/distributed/tensor/parallel/test_tp_examples.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py index 2365bd9ffc631..9a583bb65b1fe 100644 --- a/test/distributed/tensor/parallel/test_tp_examples.py +++ b/test/distributed/tensor/parallel/test_tp_examples.py @@ -27,6 +27,8 @@ RowwiseParallel, ) from torch.distributed.tensor.parallel.input_reshard import input_reshard +from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FUSED_ATTENTION +from torch.testing._internal.common_device_type import skipIf from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, @@ -412,6 +414,7 @@ def test_transformer_training(self, is_seq_parallel, dtype: torch.dtype): + f"{str(dtype).split('.')[-1]}_" + f"thaw_{'__'.join(sorted({n.rpartition('.')[0].replace('.', '_') for n in thaw})) if thaw else 'all'}", ) + @skipIf(not PLATFORM_SUPPORTS_FUSED_ATTENTION, "Does not support fused scaled dot product attention") def 
test_transformer_req_grad(self, thaw_params, is_seq_parallel, dtype, exp_cnts): # Sample a subset of `requires_grad` patterns From 8e96f1616ec66d1333c47043f140390571660813 Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Fri, 8 Aug 2025 08:22:18 -0500 Subject: [PATCH 68/83] Update version as 2.8.0 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 11922a5ce1684..834f262953832 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -2.8.0a0 +2.8.0 From 29b4c2479ac140ebfd17b48c073c974f8185c926 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Fri, 8 Aug 2025 13:00:30 -0700 Subject: [PATCH 69/83] [release/2.8] pin requirements.txt (#2481) docker image used; registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16510_ubuntu24.04_py3.12_pytorch_release-2.8_b4af472d Keeping cmake at 3.31.4 or greater --- requirements.txt | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/requirements.txt b/requirements.txt index f65837a0097e0..36414c046f256 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,23 +1,24 @@ # Python dependencies required for development -astunparse -cmake -expecttest>=0.3.0 -filelock -fsspec -hypothesis -jinja2 -lintrunner ; platform_machine != "s390x" -networkx -ninja +astunparse==1.6.3 +cmake>=3.31.4 +expecttest==0.3.0 +filelock==3.18.0 +fsspec==2025.7.0 +hypothesis==5.35.1 +jinja2==3.1.6 +lintrunner==0.12.7 ; platform_machine != "s390x" +networkx==2.8.8 +ninja==1.11.1.3 numpy==2.0.2 ; python_version == "3.9" numpy==2.1.2 ; python_version > "3.9" -optree>=0.13.0 -packaging -psutil -pyyaml -requests +optree==0.13.0 +packaging==25.0 +psutil==7.0.0 +pyyaml==6.0.2 +requests==2.32.4 # setuptools develop deprecated on 80.0 -setuptools>=62.3.0,<80.0 -sympy>=1.13.3 -types-dataclasses -typing-extensions>=4.10.0 +# issue on Windows after >= 75.8.2 - https://github.com/pytorch/pytorch/issues/148877 +setuptools==75.8.2 +sympy==1.13.3 +types-dataclasses==0.6.6 +typing-extensions==4.14.1 From 16cac0c8847449316b4063f9e325986280240a2f Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Sun, 10 Aug 2025 19:41:41 -0500 Subject: [PATCH 70/83] [AUTOGENERATED] [release/2.8] [SWDEV-539215] - Autotune support for persistent reduction and no_x_dim removal (#2454) Cherry-pick of https://github.com/ROCm/pytorch/pull/2417 Need to resolve conflicts --------- Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com> --- test/inductor/test_combo_kernels.py | 17 ------------- .../test_torchinductor_strided_blocks.py | 25 ------------------- torch/_inductor/choices.py | 8 +++--- torch/_inductor/codegen/triton.py | 11 +++----- torch/_inductor/runtime/triton_heuristics.py | 25 ++++++++++++++----- 5 files changed, 27 insertions(+), 59 deletions(-) diff --git a/test/inductor/test_combo_kernels.py b/test/inductor/test_combo_kernels.py index a054464bf6689..b6f356e256713 100644 --- a/test/inductor/test_combo_kernels.py +++ b/test/inductor/test_combo_kernels.py @@ -296,23 +296,6 @@ def fn(a0, a1, a2, b0, b1, b2): self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8) - @requires_cuda - def test_persistent_reduction_no_x_dim(self): - def fn(x, y): - return x.sum(1), y.sum(1) - - inps = ( - torch.rand(16, 256, device="cuda"), - torch.rand(32, 256, device="cuda"), - ) - torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256) - 
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256) - out_eager = fn(*inps) - out_compiled = torch.compile(fn)(*inps) - - self.assertEqual(out_eager, out_compiled) - self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4) - @instantiate_parametrized_tests class ComboKernelDynamicShapesTests(TestCase): diff --git a/test/inductor/test_torchinductor_strided_blocks.py b/test/inductor/test_torchinductor_strided_blocks.py index 19f83a35e96d7..53308ccf7f463 100644 --- a/test/inductor/test_torchinductor_strided_blocks.py +++ b/test/inductor/test_torchinductor_strided_blocks.py @@ -706,31 +706,6 @@ def test_2d_reduction_odd_shapes( # Check the code for multiple Rn_BLOCK's self._assert_reduction_ndims(code, 2) - def test_2d_reduction_no_x_dim(self): - """ - Tests a 2D reduction without an "x" dimension. - """ - # We need a size to get no x dim. - view = self._discontiguous_tensor((2, 346), self.device) - - # Expect 1 block pointer for the input. - result, (code,) = run_and_compare( - self, - torch.prod, - view, - expected_num_block_pointers=1, - expected_num_triton_kernels=1, - config_patches=tiled_reduction_config, - ) - - # Check that there's no X dimension in the signature. - (signature_line,) = ( - line for line in code.splitlines() if line.startswith("def triton") - ) - self.assertNotIn("BLOCK", signature_line) - - # Check for 2 reduction dimensions in the body. - self._assert_reduction_ndims(code, 2) @parametrize( "size,expected_num_block_pointers,expected_num_triton_kernels,expect_fallback", diff --git a/torch/_inductor/choices.py b/torch/_inductor/choices.py index 00de22393abf3..cffe585a236a1 100644 --- a/torch/_inductor/choices.py +++ b/torch/_inductor/choices.py @@ -215,11 +215,11 @@ def want_no_x_dim(features: SIMDKernelFeatures) -> bool: Heuristic to decide if we should drop the X dimension from a persistent reduction kernel. So the [XBLOCK, RBLOCK] block becomes a [RBLOCK] block and XBLOCK is forced to be always 1. Strangely this is faster than a [1, RBLOCK] block in some cases. + + ROCm branch change: Remove want_no_x_dim for persistent reduction. + Inductor benchmarks show no perf advantage and simplifies autotune flow. """ - return ( - features.get_reduction_hint() == ReductionHint.INNER - and V.graph.sizevars.statically_known_geq(features.reduction_numel, 256) - ) + return False @staticmethod def reduction_split_factor( diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index a404abc136f52..dc8722b3cee8f 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1768,13 +1768,10 @@ def should_use_persistent_reduction(self) -> bool: ) def want_no_x_dim(self): - if ( - self.persistent_reduction - and len(self.numels) == self.num_reduction_dims + 1 - ): - if self.fixed_config: - return self.fixed_config["XBLOCK"] == 1 - return V.choices.want_no_x_dim(self.features) + """ + ROCm branch change: Remove want_no_x_dim for persistent reduction. + Inductor benchmarks show no perf advantage and simplifies autotune flow. 
+ """ return False @property diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 54c7e83c0879b..2ea6a2d467a67 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2556,6 +2556,10 @@ def _persistent_reduction_configs( rnumel = get_total_reduction_numel(size_hints) MAX_PERSISTENT_BLOCK_NUMEL = 4096 + max_autotune_enabled = not disable_pointwise_autotuning(inductor_meta) or ( + inductor_meta.get("max_autotune") + or inductor_meta.get("max_autotune_pointwise") + ) if "y" not in size_hints: configs = [ @@ -2585,18 +2589,27 @@ def _persistent_reduction_configs( if "y" in size_hints: pass # TODO(jansel): we should be able to improve these heuristics - elif reduction_hint == ReductionHint.INNER and rnumel >= 256: - configs = configs[:1] - elif reduction_hint == ReductionHint.OUTER: - configs = configs[-1:] - elif reduction_hint == ReductionHint.OUTER_TINY: - configs = [ + if not max_autotune_enabled: # Don't filter if tuning enabled + if reduction_hint == ReductionHint.INNER and rnumel >= 256: + configs = configs[:1] + elif reduction_hint == ReductionHint.OUTER: + configs = configs[-1:] + + if reduction_hint == ReductionHint.OUTER_TINY: + tiny_configs = [ triton_config_reduction( size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, rnumel, ) ] + if max_autotune_enabled: + for tconfig in tiny_configs: + if tconfig not in configs: + configs.append(tconfig) + else: + configs = tiny_configs + for c in configs: # we don't need Rn_BLOCK for persistent reduction for prefix in size_hints: From 08569178c6b55911c9eaba087ac9a207a78f827f Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Sun, 10 Aug 2025 22:00:31 -0700 Subject: [PATCH 71/83] [release/2.8] fp8: skip rowwise tests (#2477) fp8 rowwise scaling is not supported on ROCm 7.0 w/ gfx950, works on mainline. Skip the test for now. 
Signed-off-by: Jagadish Krishnamoorthy --- test/test_matmul_cuda.py | 3 +++ torch/testing/_internal/common_utils.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index e8ff44fd40986..dead0bda1c0c3 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -46,6 +46,7 @@ parametrize, run_tests, skipIfRocm, + skipIfRocmVersionAndArch, skipIfRocmVersionLessThan, TEST_CUDA, TEST_WITH_ROCM, @@ -1197,6 +1198,7 @@ def test_float8_scale_fast_accum(self, device) -> None: out_fp8_s = torch._scaled_mm(x, y, scale_a=scale_a, scale_b=scale_b, use_fast_accum=True) self.assertEqual(out_fp8, out_fp8_s) + @skipIfRocmVersionAndArch((7, 1), "gfx950") @onlyCUDA @unittest.skipIf(not PLATFORM_SUPPORTS_FP8 or IS_WINDOWS, f8_msg) @unittest.skipIf(not SM89OrLater, "rowwise implementation is currently sm89-sm100 specific") @@ -1304,6 +1306,7 @@ def test_float8_error_messages(self, device) -> None: out_dtype=torch.bfloat16, ) + @skipIfRocmVersionAndArch((7, 1), "gfx950") @unittest.skipIf(not PLATFORM_SUPPORTS_FP8 or IS_WINDOWS, f8_msg) @unittest.skipIf(not SM89OrLater, "rowwise implementation is currently sm89-sm100 specific") @parametrize("base_dtype", [torch.bfloat16]) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 052a968d51e22..58398f5287000 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -2024,6 +2024,23 @@ def wrap_fn(self, *args, **kwargs): return wrap_fn return dec_fn +def skipIfRocmVersionAndArch(version=None, arch=None): + def dec_fn(fn): + @wraps(fn) + def wrap_fn(self, *args, **kwargs): + if TEST_WITH_ROCM: + rocm_version = str(torch.version.hip) + rocm_version = rocm_version.split("-")[0] # ignore git sha + rocm_version_tuple = tuple(int(x) for x in rocm_version.split(".")) + if rocm_version_tuple is None or version is None or rocm_version_tuple < tuple(version): + prop = torch.cuda.get_device_properties(0) + if prop.gcnArchName.split(":")[0] in arch: + reason = f"ROCm {version} and {arch} combination not supported" + raise unittest.SkipTest(reason) + return fn(self, *args, **kwargs) + return wrap_fn + return dec_fn + def skipIfNotMiopenSuggestNHWC(fn): @wraps(fn) def wrapper(*args, **kwargs): From 0da7d0214736e818c0f3e9ce06c938dbee50c622 Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Tue, 12 Aug 2025 20:42:27 +0300 Subject: [PATCH 72/83] [release/2.8] update related_commit (#2490) Commit Messages: - update the param_id calculation so that it works on both CPX and SPX modes (#271) (#272) - reset parameters for FusedDenseGeluDense similar to FusedDense to make the test_gelu pass (#269) (#270) PRs: - https://github.com/ROCm/apex/pull/272 - https://github.com/ROCm/apex/pull/269 Fixes: - https://ontrack-internal.amd.com/browse/SWDEV-540029 - https://ontrack-internal.amd.com/browse/SWDEV-548434 --- related_commits | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/related_commits b/related_commits index fd2787398fc13..ace82fe2a8abd 100644 --- a/related_commits +++ b/related_commits @@ -1,5 +1,5 @@ -ubuntu|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex -centos|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex +ubuntu|pytorch|apex|release/1.8.0|3f26640cff501d67d35acf424ed2566d50949f5b|https://github.com/ROCm/apex 
+centos|pytorch|apex|release/1.8.0|3f26640cff501d67d35acf424ed2566d50949f5b|https://github.com/ROCm/apex ubuntu|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision centos|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision ubuntu|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data From f7921f4b80a1d84333521a174d5019d6a4397542 Mon Sep 17 00:00:00 2001 From: Jack Taylor <108682042+jataylo@users.noreply.github.com> Date: Tue, 12 Aug 2025 21:42:38 +0100 Subject: [PATCH 73/83] [SWDEV-539119] [release/2.8] Add fast_tanh support (#2484) Perf improvement for triton tanh --- .ci/docker/ci_commit_pins/triton.txt | 2 +- torch/_inductor/codegen/triton.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 567536db72100..23c1fdd0bf13b 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -711e2a92522e0a9921ce58ae658571ca55c49b97 +6581064e5be2ac4ab0c198c1bb0e533a1900bbe3 diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index dc8722b3cee8f..7d65354e7a2f4 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1232,7 +1232,7 @@ def tan(x): @staticmethod @maybe_upcast_float32() def tanh(x): - return f"libdevice.tanh({x})" + return f"libdevice.fast_tanhf({x})" @staticmethod @maybe_upcast_float32() From 2b292160c17ce56e08a5d57e723e6da501f47eb8 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 12 Aug 2025 16:00:40 -0500 Subject: [PATCH 74/83] [AUTOGENERATED] [release/2.8] remove extra transposes in NHWC convolutions on MIOpen (#2410) Cherry-pick of https://github.com/ROCm/pytorch/pull/2405 Co-authored-by: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> --- aten/src/ATen/native/miopen/Conv_miopen.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index f4e67e4fc307a..f9ac375c205ec 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -1200,7 +1200,7 @@ std::tuple miopen_convolution_transpose_backwa IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - Tensor grad_output = grad_output_t.contiguous(); + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { @@ -1452,7 +1452,7 @@ std::tuple miopen_depthwise_convolution_backwa IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - Tensor grad_output = grad_output_t.contiguous(); + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { From 46342727d203b1c92d8873ec8f92432c92c41913 Mon Sep 17 00:00:00 2001 From: iupaikov-amd Date: Wed, 13 Aug 2025 17:06:47 +0200 Subject: [PATCH 75/83] [release/2.8] [triton] Triton bump to fix ROCm 7.0 issues (#2498) Fixes https://github.com/ROCm/frameworks-internal/issues/13294 --- .ci/docker/ci_commit_pins/triton.txt | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 23c1fdd0bf13b..cacbdb55a7a7c 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -6581064e5be2ac4ab0c198c1bb0e533a1900bbe3 +f9e5bf54a2fe1a6262a41b27b38180cdb6fae6a2 From 0e1a3e962169c5e4c0bfdd6ef783f9529817153f Mon Sep 17 00:00:00 2001 From: Divin Honnappa Date: Fri, 15 Aug 2025 10:27:34 -0500 Subject: [PATCH 76/83] [AUTOGENERATED] [release/2.8] [ROCm] Improve reduction sum performance (#2505) Cherry-pick of https://github.com/ROCm/pytorch/pull/2492 Co-authored-by: Jerry Mannil <65309407+jerrymannil@users.noreply.github.com> --- aten/src/ATen/native/cuda/Reduce.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 521b467480900..7d1c45e785b79 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -1062,7 +1062,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ // In such case, values in each loaded vector always correspond to different outputs. if (fastest_moving_stride == sizeof(scalar_t)) { #ifdef USE_ROCM - if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1) { + if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) { #else if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) { #endif From fe840fa44a89506e495ef222a96fe5a36e6e6657 Mon Sep 17 00:00:00 2001 From: Divin Honnappa Date: Fri, 15 Aug 2025 10:38:27 -0500 Subject: [PATCH 77/83] [release/2.8] Using c10d.barrier() in test_extra_cuda_context test in test_c10d_nccl.py (#2522) Cherry-pick of https://github.com/ROCm/pytorch/pull/2447 Co-authored-by: akashveramd --- test/distributed/test_c10d_nccl.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index c02e968e23fb6..bcd6316a9d2a5 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -639,6 +639,14 @@ def _helper_test_extra_cuda_context_by_memory(self): """ device = torch.device(f"cuda:{self.rank:d}") x = torch.empty((1,), device=device) + + # We need this barrier to ensure that all nodes have completed init_process_group + # If rank=0 gets a mem snapshot before other nodes have finished init_process_group, + # then we artificially see a bump in memory usage. As per the following comment, + # we are going to be moving away from this function: + # https://github.com/pytorch/pytorch/pull/154174#discussion_r2105065931 + c10d.barrier() + # Rank 0 takes a snapshot before collective -- this snapshot should have # included rank 0's own context. 
if self.rank == 0: From d9d5b964b906db79ff592572dd57ac45977431ff Mon Sep 17 00:00:00 2001 From: Divin Honnappa Date: Fri, 15 Aug 2025 15:13:48 -0500 Subject: [PATCH 78/83] [AUTOGENERATED] [release/2.8] Change triton package name depending on rocm version (#2529) Cherry-pick of https://github.com/ROCm/pytorch/pull/2518 Co-authored-by: Ethan Wee --- .github/scripts/build_triton_wheel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 695b4a9c865a6..ec26d8cbefaf9 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -101,8 +101,12 @@ def build_triton( triton_repo = "https://github.com/openai/triton" if device == "rocm": - triton_pkg_name = "pytorch-triton-rocm" triton_repo = "https://github.com/ROCm/triton" + rocm_version = get_rocm_version() # e.g., "7.0.1" + if tuple(map(int, rocm_version.split("."))) > (7, 0, 0): + triton_pkg_name = "triton" + else: + triton_pkg_name = "pytorch-triton-rocm" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" From 608069b26d9539f7fd0d12e8ea415820303bbd1f Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:48:57 -0500 Subject: [PATCH 79/83] [AUTOGENERATED] [release/2.8] NAVI32 specific fixes (#2467) Cherry-pick of https://github.com/ROCm/pytorch/pull/2450 --------- Co-authored-by: iupaikov-amd --- test/inductor/test_flex_decoding.py | 6 +++++- test/inductor/test_max_autotune.py | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/test/inductor/test_flex_decoding.py b/test/inductor/test_flex_decoding.py index 46acd0c1cdca3..dfd6dce40b999 100644 --- a/test/inductor/test_flex_decoding.py +++ b/test/inductor/test_flex_decoding.py @@ -22,7 +22,10 @@ ) from torch.testing import FileCheck from torch.testing._internal import common_utils -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16 +from torch.testing._internal.common_cuda import ( + PLATFORM_SUPPORTS_BF16, + PLATFORM_SUPPORTS_FLASH_ATTENTION, +) from torch.testing._internal.common_device_type import ( flex_attention_supported_platform as supported_platform, instantiate_device_type_tests, @@ -1582,6 +1585,7 @@ def mask_mod(b, h, q, kv): self.assertEqual(out[:, :, M:, :].sum(), 0) @supported_platform + @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") def test_windowed_no_mask_vs_sdpa(self, device): score_mod = _generate_windowed(1000) attention = functools.partial(flex_attention, score_mod=score_mod) diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index d1c5637d82008..eba9fbaf131c5 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -37,11 +37,15 @@ ) from torch._inductor.template_heuristics import CUDAConfigHeuristic, GemmConfig from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8 +from torch.testing._internal.common_device_type import largeTensorTest from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, IS_WINDOWS, parametrize, TEST_WITH_ROCM, + MI300_ARCH, + runOnRocmArch, + skipIfXpu, ) from torch.testing._internal.logging_utils import multiple_logs_to_string from torch.utils._triton import has_triton_tma_device @@ -54,7 +58,6 @@ from torch._inductor.virtualized import V from torch.fx.experimental.proxy_tensor 
import make_fx from torch.testing import FileCheck -from torch.testing._internal.common_utils import MI300_ARCH, runOnRocmArch, skipIfXpu from torch.testing._internal.inductor_utils import ( get_func_call, get_kernel_launch, @@ -804,6 +807,8 @@ def test_conv_backend(self): self.assertIn("NoValidChoicesError", str(context.exception)) + # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks + @largeTensorTest("30 GB", device=GPU_TYPE) def test_non_contiguous_input_mm(self): """ Make sure the triton template can work with non-contiguous inputs without crash. @@ -856,6 +861,8 @@ def f(x, y): # TODO: fix accuracy failure of the triton template on XPU. # and enable this test case. @skipIfXpu + # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks + @largeTensorTest("30 GB", device=GPU_TYPE) def test_non_contiguous_input_mm_plus_mm(self): x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE) y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE) From d6007d3e8aed68aeaf995a130389a5808ca766d9 Mon Sep 17 00:00:00 2001 From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Date: Tue, 19 Aug 2025 00:10:21 -0500 Subject: [PATCH 80/83] [release/2.8] Define uint32 t when ROCM_VERSION >= 70000 (#2513) Fixes SWDEV-543698 (https://ontrack-internal.amd.com/browse/SWDEV-543698) Cherry-picked from https://github.com/ROCm/pytorch/pull/2502 This PR fixes the errors like below: ``` [rank3]: RuntimeError: The following operation failed in the TorchScript interpreter. [rank3]: Traceback of TorchScript (most recent call last): [rank3]: RuntimeError: /tmp/comgr-28f951/input/CompileSourceACC062:67:7: error: unknown type name 'uint32_t'; did you mean '__hip_internal::uint32_t'? [rank3]: 67 | uint32_t int32; [rank3]: | ^~~~~~~~ [rank3]: | __hip_internal::uint32_t ``` Earlier uint32_t was defined in HIP headers in std namespace. Now it is moved to __hip_internal namespace in hip headers. This change is made in ROCm 7.0. 
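Condensed illustration of the fix, abridged from the resource-string change below (the real code selects between two `bfloat16_support_literal` string variants at host compile time, and the typedef lives inside the HIP source that the fuser JIT-compiles; nothing here is new beyond what the diff adds):
```cpp
// ROCm >= 7.0 variant of bfloat16_support_literal (abridged): the emitted
// HIP source now carries its own alias instead of relying on the HIP headers
// to expose uint32_t at global scope.
typedef unsigned int uint32_t;

__device__ float __bfloat162float(const __nv_bfloat16 a) {
  union {
    uint32_t int32; // the use site that previously failed to compile on ROCm 7.0
    float fp32;
  } u = {uint32_t(a.__x) << 16};
  return u.fp32;
}
```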
--- .../jit/codegen/fuser/cuda/resource_strings.h | 71 ++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h index 9728d27d4d79b..0ac2c79d1e98a 100644 --- a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h +++ b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h @@ -260,7 +260,7 @@ typedef __half half; )"; #endif -#if defined(USE_ROCM) +#if defined(USE_ROCM) && ROCM_VERSION < 70000 constexpr auto bfloat16_support_literal = R"( #ifndef __align__ @@ -317,6 +317,75 @@ __device__ __nv_bfloat16 __float2bfloat16(const float a) { return val; } +__device__ float __bfloat162float(const __nv_bfloat16 a) { + union + { + uint32_t int32; + float fp32; + } u = {uint32_t(a.__x) << 16}; + return u.fp32; +} +#endif /* defined(__cplusplus) */ +)"; +#elif defined(USE_ROCM) && ROCM_VERSION >= 70000 +constexpr auto bfloat16_support_literal = + R"( +#ifndef __align__ +#define __align__(x) __attribute__((aligned(x))) +#endif + +typedef unsigned int uint32_t; + +typedef struct __align__(2) { + unsigned short x; +} +__nv_bfloat16_raw; + +#if defined(__cplusplus) +struct __align__(2) __nv_bfloat16 { + __host__ __device__ __nv_bfloat16() {} + + __host__ __device__ __nv_bfloat16& operator=(const __nv_bfloat16_raw& hr) { + __x = hr.x; + return *this; + } + + unsigned short __x; +}; + +__device__ unsigned short __internal_float2bfloat16( + const float f, + unsigned int& sign, + unsigned int& remainder) { + unsigned int x; + + x = __float_as_uint(f); + + if ((x & 0x7fffffffU) > 0x7f800000U) { + sign = 0U; + remainder = 0U; + return static_cast(0x7fffU); + } + sign = x >> 31; + remainder = x << 16; + return static_cast(x >> 16); +} + +/* Definitions of intrinsics */ +__device__ __nv_bfloat16 __float2bfloat16(const float a) { + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign; + unsigned int remainder; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || + ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; + return val; +} + __device__ float __bfloat162float(const __nv_bfloat16 a) { union { From 5d3dec1190b1b198eb1dca592995c94665fc00c5 Mon Sep 17 00:00:00 2001 From: Divin Honnappa Date: Tue, 19 Aug 2025 21:25:49 -0500 Subject: [PATCH 81/83] [AUTOGENERATED] [release/2.8] Remove tb-nightly (#2538) Cherry-pick of https://github.com/ROCm/pytorch/pull/2535 --------- Co-authored-by: Ethan Wee --- .ci/docker/requirements-ci.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index c6490242d632e..640095a068343 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -258,11 +258,6 @@ scipy==1.14.1 ; python_version > "3.9" #Pinned versions: #test that import: -tb-nightly==2.13.0a20230426 -#Description: TensorBoard -#Pinned versions: -#test that import: - # needed by torchgen utils typing-extensions>=4.10.0 #Description: type hints for python From 8ade7b5e62f2af10452a60087566946eddd0551e Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 20:08:23 +0000 Subject: [PATCH 82/83] Use ROCm/triton and update triton.txt --- .ci/docker/ci_commit_pins/triton.txt | 2 +- .github/scripts/build_triton_wheel.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index cacbdb55a7a7c..f3ae1dffcca24 100644 --- 
a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -f9e5bf54a2fe1a6262a41b27b38180cdb6fae6a2 +5e5685356b9fc7b5ad9cdf4e510a1994a5b8601a \ No newline at end of file diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index ec26d8cbefaf9..695b4a9c865a6 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -101,12 +101,8 @@ def build_triton( triton_repo = "https://github.com/openai/triton" if device == "rocm": + triton_pkg_name = "pytorch-triton-rocm" triton_repo = "https://github.com/ROCm/triton" - rocm_version = get_rocm_version() # e.g., "7.0.1" - if tuple(map(int, rocm_version.split("."))) > (7, 0, 0): - triton_pkg_name = "triton" - else: - triton_pkg_name = "pytorch-triton-rocm" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" From 52684a65990781e936fb17fd1276480392e70d5d Mon Sep 17 00:00:00 2001 From: tvukovic-amd Date: Mon, 25 Aug 2025 18:24:37 +0200 Subject: [PATCH 83/83] revert triton change --- .ci/docker/ci_commit_pins/triton.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 72f88ce2ad442..cacbdb55a7a7c 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -f9e5bf54a2fe1a6262a41b27b38180cdb6fae6a2 \ No newline at end of file +f9e5bf54a2fe1a6262a41b27b38180cdb6fae6a2