ROCm · pragupta · Nov 4, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025
diff --git a/.bc-linter.yml b/.bc-linter.yml
@@ -13,3 +13,4 @@ exclude:
   - "**/benchmarks/**"
   - "**/test_*.py"
   - "**/*_test.py"
+  - "tools/**"
@@ -195,13 +195,16 @@ case "$tag" in
     NINJA_VERSION=1.9.0
     TRITON=yes
     ;;
-  pytorch-linux-jammy-xpu-n-py3)
+  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     VISION=yes
     XPU_VERSION=2025.2
     NINJA_VERSION=1.9.0
     TRITON=yes
+    if [[ $tag =~ "benchmarks" ]]; then
+      INDUCTOR_BENCHMARKS=yes
+    fi
     ;;
   pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10

@@ -3,7 +3,7 @@
 
 set -eux
 
-ACL_VERSION=${ACL_VERSION:-"v25.02"}
+ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
 ACL_INSTALL_DIR="/acl"
 
 # Clone ACL

@@ -49,12 +49,20 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
     export SYSROOT_DEP="sysroot_linux-64=2.17"
   fi
 
+# Install correct Python version
+# Also ensure sysroot is using a modern GLIBC to match system compilers
+if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
+  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
+             python="3.14.0" \
+             ${SYSROOT_DEP} \
+             -c conda-forge
+else
   # Install correct Python version
   # Also ensure sysroot is using a modern GLIBC to match system compilers
   as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
              python="$ANACONDA_PYTHON_VERSION" \
              ${SYSROOT_DEP}
-
+fi
   # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
   # which is provided in libstdcxx 12 and up.
   conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge

@@ -40,11 +40,7 @@ EOF
 
     # Default url values
     rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
-
-    # Add amdgpu repository
     UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
 
     # Add rocm repository
     wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -

@@ -12,8 +12,8 @@ function do_install() {
 
     rocm_version_nodot=${rocm_version//./}
 
-    # https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+    # post merge of https://github.com/icl-utk-edu/magma/pull/65
+    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
     magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
 
     rocm_dir="/opt/rocm"

@@ -149,7 +149,7 @@ FROM cpu_final as rocm_final
 ARG ROCM_VERSION=6.0
 ARG PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
-ARG DEVTOOLSET_VERSION=11
+ARG DEVTOOLSET_VERSION=13
 ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
 # Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
 # below workaround helps avoid error

@@ -97,7 +97,7 @@ case ${image} in
     manylinux2_28-builder:xpu)
         TARGET=xpu_final
         GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
         MANY_LINUX_VERSION="2_28"
         ;;
     *)

@@ -136,10 +136,11 @@ numba==0.61.2 ; python_version > "3.9"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==2.0.2 ; python_version == "3.9"
-numpy==2.1.2 ; python_version > "3.9"
+numpy==2.1.2; python_version > "3.9" and python_version < "3.14"
+numpy==2.3.4; python_version >= "3.14"
 
-pandas==2.2.3
+pandas==2.2.3; python_version >= "3.9" and python_version < "3.14"
+pandas==2.3.3; python_version >= "3.14"
 
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@@ -151,7 +152,8 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py
 
-optree==0.13.0
+optree==0.13.0 ; python_version < "3.14"
+optree==0.17.0 ; python_version >= "3.14"
 #Description: A library for tree manipulation
 #Pinned versions: 0.13.0
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
@@ -249,8 +251,8 @@ scikit-image==0.22.0
 #Pinned versions: 0.20.3
 #test that import:
 
-scipy==1.13.1 ; python_version == "3.9"
-scipy==1.14.1 ; python_version > "3.9"
+scipy==1.14.1 ; python_version > "3.9" and python_version < "3.14"
+scipy==1.16.2 ; python_version >= "3.14"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
@@ -321,7 +323,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:
 
-lxml==5.3.0
+lxml==5.3.0 ; python_version < "3.14"
+lxml==6.0.2 ; python_version >= "3.14"
 #Description: This is a requirement of unittest-xml-reporting
 
 PyGithub==2.3.0
@@ -331,7 +334,9 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:
 
-onnx==1.19.1
+onnx==1.19.1 ; python_version < "3.14"
+# Unpin once Python 3.14 is supported. See  onnxruntime issue 26309.
+onnx==1.18.0 ; python_version == "3.14"
 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@@ -356,7 +361,7 @@ pwlf==2.2.1
 #test that import: test_sac_estimator.py
 
 # To build PyTorch itself
-pyyaml==6.0.2
+pyyaml==6.0.3
 pyzstd
 setuptools==78.1.1
 packaging==23.1

@@ -54,12 +54,15 @@ ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh
 
 ARG INDUCTOR_BENCHMARKS
+ARG ANACONDA_PYTHON_VERSION
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
+COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
 
 # Install XPU Dependencies
 ARG XPU_VERSION

diff --git a/.ci/lumen_cli/pyproject.toml b/.ci/lumen_cli/pyproject.toml
@@ -6,7 +6,7 @@ dependencies = [
     "GitPython==3.1.45",
     "docker==7.1.0",
     "pytest==7.3.2",
-    "uv==0.9.5"
+    "uv==0.9.6"
 ]
 
 [tool.setuptools]

diff --git a/.ci/magma-rocm/Makefile b/.ci/magma-rocm/Makefile
@@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash
 
 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.0
+DESIRED_ROCM ?= 7.1
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
@@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma-rocm/build_magma.sh
 
 .PHONY: all
+all: magma-rocm71
 all: magma-rocm70
 all: magma-rocm64
 
@@ -24,6 +25,11 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output
 
+.PHONY: magma-rocm71
+magma-rocm71: DESIRED_ROCM := 7.1
+magma-rocm71:
+	$(DOCKER_RUN)
+
 .PHONY: magma-rocm70
 magma-rocm70: DESIRED_ROCM := 7.0
 magma-rocm70:

diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh
@@ -6,8 +6,8 @@ set -eou pipefail
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 
-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# post merge of https://github.com/icl-utk-edu/magma/pull/65
+MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
 
 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
 
 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
-git clone https://github.com/jeffdaily/magma
+git clone https://github.com/icl-utk-edu/magma
 pushd magma
 git checkout ${MAGMA_VERSION}
 popd

diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
@@ -426,7 +426,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
   # export test times so that potential sharded tests that'll branch off this build will use consistent data
   # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
-  python tools/stats/export_test_times.py
+  PYTHONPATH=. python tools/stats/export_test_times.py
 fi
 # don't do this for bazel or s390x or riscv64 as they don't use sccache
 if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
@@ -337,7 +337,7 @@ test_python() {
 
 test_python_smoke() {
   # Smoke tests for H100/B200
-  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
   assert_git_not_dirty
 }
 
@@ -572,6 +572,8 @@ fi
 
 if [[ "${TEST_CONFIG}" == *cpu* ]]; then
   DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
+elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
 else
   DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
@@ -665,6 +667,8 @@ test_perf_for_dashboard() {
     device=cuda_b200
   elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
     device=rocm
+  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+    device=xpu
   fi
 
   for mode in "${modes[@]}"; do
@@ -1649,7 +1653,7 @@ test_operator_microbenchmark() {
 
   cd "${TEST_DIR}"/benchmarks/operator_benchmark
 
-  for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
+  for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
     $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
       --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
       --benchmark-name "PyTorch operator microbenchmark" --use-compile
@@ -1757,7 +1761,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   else
     # Do this after checkout_install_torchbench to ensure we clobber any
     # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
       install_torchrec_and_fbgemm
     fi
     PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"

diff --git a/.clang-tidy b/.clang-tidy
@@ -60,9 +60,11 @@ performance-*,
 readability-container-size-empty,
 readability-delete-null-pointer,
 readability-duplicate-include,
+readability-named-parameter,
 readability-misplaced-array-index,
 readability-redundant*,
 readability-simplify-subscript-expr,
+readability-static-definition-in-anonymous-namespace
 readability-string-compare,
 -readability-redundant-access-specifiers,
 -readability-redundant-control-flow,