From 3b57315b1bc85d4928ea385e6afd7e60eac99b2e Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 6 Oct 2025 16:08:40 -0400
Subject: [PATCH 1/7] [ROCm] Increase binary build timeout to 5 hours (300
 minutes) (#164770)

[ROCm] Increase binary build timeout to 5 hours (300 minutes) (#163776)

Despite narrowing down the [FBGEMM_GENAI build to gfx942](https://github.com/pytorch/pytorch/pull/162648), the nightly builds still timed out because they [didn't get enough time to finish the post-PyTorch-build steps](https://github.com/pytorch/pytorch/actions/runs/17969771026/job/51109432897).

This PR increases timeout for ROCm builds for both [libtorch ](https://github.com/pytorch/pytorch/actions/runs/17969771026)and [manywheel](https://github.com/pytorch/pytorch/actions/runs/17969771041), because both of those are close to the 4hr mark currently.

This PR is a more ROCm-targeted version of https://github.com/pytorch/pytorch/pull/162880 (which is for release/2.9 branch).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163776
Approved by: https://github.com/jeffdaily


(cherry picked from commit 0ec946a0522748332f42675a4d690ff32d773d42)

Co-authored-by: Jithun Nair <jithun.nair@amd.com>
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
---
 .../templates/linux_binary_build_workflow.yml.j2   |  3 +++
 .../generated-linux-binary-libtorch-nightly.yml    |  2 ++
 .../generated-linux-binary-manywheel-nightly.yml   | 14 ++++++++++++++
 .../generated-linux-binary-manywheel-rocm-main.yml |  1 +
 4 files changed, 20 insertions(+)

diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2
index f53472571993..bf7db5866e78 100644
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@@ -77,6 +77,9 @@ jobs:
       runs_on: linux.s390x
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       timeout-minutes: 420
+      {%- elif config["gpu_arch_type"] == "rocm" %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.24xlarge.ephemeral
diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
index f5eca8751840..bc671ae80ae2 100644
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@@ -333,6 +333,7 @@ jobs:
       LIBTORCH_CONFIG: release
       LIBTORCH_VARIANT: shared-with-deps
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: libtorch-rocm6_3-shared-with-deps-release
       build_environment: linux-binary-libtorch
     secrets:
@@ -446,6 +447,7 @@ jobs:
       LIBTORCH_CONFIG: release
       LIBTORCH_VARIANT: shared-with-deps
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: libtorch-rocm6_4-shared-with-deps-release
       build_environment: linux-binary-libtorch
     secrets:
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index c996437a3b9f..5f9eaab976a6 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -323,6 +323,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_10-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -433,6 +434,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_10-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -912,6 +914,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_11-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -1022,6 +1025,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_11-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -1501,6 +1505,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_12-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -1611,6 +1616,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_12-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -2090,6 +2096,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.13"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -2200,6 +2207,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.13"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -2679,6 +2687,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.13t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13t-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -2789,6 +2798,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.13t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13t-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -3268,6 +3278,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.14"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -3378,6 +3389,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.14"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -3857,6 +3869,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.14t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14t-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -3967,6 +3980,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.14t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14t-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
index 9593391217ac..9df4835757c4 100644
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@@ -60,6 +60,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_10-rocm6_4
       build_environment: linux-binary-manywheel-rocm
     secrets:

From d4c43070320e8892fa2965e1805db445ea4d4274 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 6 Oct 2025 16:56:06 -0400
Subject: [PATCH 2/7] Fix docker build issue after 164575 (#164779)

Fix docker build issue after 164575 (#164774)

Looks like https://github.com/pytorch/pytorch/pull/164575 introduced an issue.
The command is wrong:
```
conda install -c "whl/nightly" -y python=3.11 conda=25.7.0
```
Should be just using default conda channel:
```
conda install  -y python=3.11 conda=25.7.0
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164774
Approved by: https://github.com/Camyll

(cherry picked from commit c1f40d33c89b361a1edad17aa25cfff1ab4014fd)

Co-authored-by: atalman <atalman@fb.com>
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index f73dfcc1af3a..331cf00593cb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121
 ARG INSTALL_CHANNEL=whl/nightly
 # Automatically set by buildx
 # pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
-RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
+RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0
 
 ARG TARGETPLATFORM
 

From b015422da1fd2aa3186a88cf3ed1d2cb77c4374d Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Mon, 6 Oct 2025 19:58:36 -0400
Subject: [PATCH 3/7] fix cpp extension distributed warning spew (#164785)

fix cpp extension distributed warning spew (#162764)

With the new change we only log the warning if we're running non distributed code or if we're in rank 0. Unit testing that certain messages get printed on certain ranks only feels kinda jank so test plan is below instead

Test plan

```python
# torchrun --nproc_per_node=2 demo_fix.py

import os
import logging

logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)

import torch
if 'RANK' in os.environ:
    torch.distributed.init_process_group('nccl')

from torch.utils.cpp_extension import _get_cuda_arch_flags
_get_cuda_arch_flags()

print(f"Rank {os.environ.get('RANK', '0')} done")
```

Logs showing how how `TORCH_CUDA_ARCH_LIST`only shows up once if we explicitly set the the logging level to `logging.DEBUG`. It also improves the debug message to explain what the actual behavior will be

```
(source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py

W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814]
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
[rank0]:V0911 18:30:18.921000 1316753 pytorch/torch/utils/cpp_extension.py:2444] TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='10.0+PTX' for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.
Rank 0 done
Rank 1 done
```

But if we just use the default and comment out `logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)`

Then we get

```
(source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814]
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *****************************************
Rank 0 done
Rank 1 done
(source) [marksaroufim@devgpu005]~%
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162764
Approved by: https://github.com/ezyang, https://github.com/zou3519

(cherry picked from commit f7e83219619a05934a344ca699c33ee69d5a3642)

Co-authored-by: Mark Saroufim <marksaroufim@meta.com>
---
 torch/utils/cpp_extension.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 7202a9638756..902d2fe6ce0f 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -2418,10 +2418,6 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:
 
     # If not given or set as native, determine what's best for the GPU / CUDA version that can be found
     if not _arch_list or _arch_list == "native":
-        if not _arch_list:
-            logger.warning(
-                "TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. \n"
-                "If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.")
         arch_list = []
         # the assumption is that the extension should run on any of the currently visible cards,
         # which could be of different types - therefore all archs for visible cards should be included
@@ -2440,6 +2436,15 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:
                 arch_list.append(arch)
         arch_list = sorted(arch_list)
         arch_list[-1] += '+PTX'
+
+        if not _arch_list:
+            # Only log on rank 0 in distributed settings to avoid spam
+            if not torch.distributed.is_available() or not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+                arch_list_str = ';'.join(arch_list)
+                logger.debug(
+                    "TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='%s' "
+                    "for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.",
+                    arch_list_str)
     else:
         # Deal with lists that are ' ' separated (only deal with ';' after)
         _arch_list = _arch_list.replace(' ', ';')

From 42f0c2c970728d8933489ac247c6e091d9070ed3 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 7 Oct 2025 10:10:51 -0400
Subject: [PATCH 4/7] update the baseline data for the operator benchmark
 (#164789)

update the baseline data for the operator benchmark (#162693)

According to the results of the last four operator benchmark runs, we found that five models achieved more than a 30% improvement compared to the baseline. Therefore, we will update the operator benchmark baseline data.
We use the average results from the four runs as the new baseline for the five models.

And add a pull request trigger for the operator benchmark workflow

Benchmarking   Framework | Benchmarking   Module Name | Case Name | tag | run_backward | baseline   old | r1 | r2 | r3 | r4 | avg | speedup
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
PyTorch | add | add_M1_N1_K1_cpu | short | FALSE | 3.9497 | 2.57 | 2.54 | 2.38 | 2.31 | 2.45 | 1.61
PyTorch | functional.hardtanh | functional.hardtanh_dims(512	512)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 67.118 | 50.02 | 49.80 | 46.78 | 48.94 | 48.88 | 1.37
PyTorch | relu6 | relu6_dims(512	512)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 68.739 | 51.17 | 51.19 | 48.07 | 50.42 | 50.21 | 1.37
PyTorch | relu6 | relu6_dims(256	1024)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 69.1875 | 51.97 | 52.77 | 50.00 | 51.24 | 51.50 | 1.34
PyTorch | functional.hardtanh | functional.hardtanh_dims(256	1024)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 67.436 | 50.98 | 51.69 | 49.06 | 49.87 | 50.40 | 1.34

@chuanqi129 @huydhn @desertfire @jainapurva

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162693
Approved by: https://github.com/huydhn

(cherry picked from commit f7ea4975abb0aeb0224894f0b54b1f8fd1fa70e3)

Co-authored-by: LifengWang <lifeng.a.wang@intel.com>
---
 .github/workflows/operator_benchmark.yml             |  4 ++++
 ...ected_ci_operator_benchmark_eager_float32_cpu.csv | 12 ++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml
index dd262d31b8fc..dcdc2cd0ba24 100644
--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@@ -14,6 +14,10 @@ on:
   schedule:
     # Run at 07:00 UTC every Sunday
     - cron: 0 7 * * 0
+  pull_request:
+    paths:
+      - benchmarks/operator_benchmark/**
+      - .github/workflows/operator_benchmark.yml
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
diff --git a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
index 873f14d20127..9a7b6797e982 100644
--- a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
+++ b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
@@ -1,5 +1,5 @@
 Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time
-PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497
+PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459
 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181
 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826
 PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449
@@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547
-PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739
+PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664
-PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875
+PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728
@@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295
 PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189
@@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667
 PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333
-PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
\ No newline at end of file
+PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667

From 6f12be27709abe4e5365ec94376cb7529e219692 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 7 Oct 2025 22:33:08 -0400
Subject: [PATCH 5/7] CUDA 13.0 builds fix on Amazon Linux 2023 (#164893)

CUDA 13.0 builds fix on Amazon Linux 2023 (#164870)

During 2.9 rc testing I am seeing an issue on Amazon Linux 2023 with CUDA 13.0 builds

This is related to:
 https://github.com/pytorch/pytorch/issues/152756

Workflow: https://github.com/pytorch/test-infra/actions/runs/18324074610/job/52184079262

Error:
```
WARNING: There was an error checking the latest version of pip.
+ python3.11 .ci/pytorch/smoke_test/smoke_test.py --package torchonly
Traceback (most recent call last):
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 333, in _load_global_deps
    ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
  File "/usr/lib64/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: libcudart.so.13: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/pytorch/pytorch/.ci/pytorch/smoke_test/smoke_test.py", line 12, in <module>
    import torch
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 425, in <module>
    _load_global_deps()
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 383, in _load_global_deps
    _preload_cuda_deps(lib_folder, lib_name)
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 317, in _preload_cuda_deps
    raise ValueError(f"{lib_name} not found in the system path {sys.path}")
Traceback (most recent call last):
ValueError: libnvToolsExt.so.*[0-9] not found in the system path ['/pytorch/pytorch/.ci/pytorch/smoke_test', '/usr/lib64/python311.zip', '/usr/lib64/python3.11', '/usr/lib64/python3.11/lib-dynload', '/usr/local/lib64/python3.11/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/lib64/python3.11/site-packages', '/usr/lib/python3.11/site-packages']
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in <module>
    main()
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
    run_cmd_or_die(f"docker exec -t {container_name} /exec")
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
    raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t 7d9c5bd403cac9a9ee824d63a1d6f6057ecce89a7daa94a81617dbf8eff0ff2e /exec failed with exit code 1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164870
Approved by: https://github.com/Camyll


(cherry picked from commit 483f4e0db91166128ad8922d86dc7222338d4ecc)

Co-authored-by: atalman <atalman@fb.com>
Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
---
 torch/__init__.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/torch/__init__.py b/torch/__init__.py
index a5c072396e1d..0625ad60bfff 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -302,7 +302,7 @@ def _get_cuda_dep_paths(path: str, lib_folder: str, lib_name: str) -> list[str]:
     return nvidia_lib_paths + lib_paths
 
 
-def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
+def _preload_cuda_deps(lib_folder: str, lib_name: str, required: bool = True) -> None:  # type: ignore[valid-type]
     """Preloads cuda deps if they could not be found otherwise."""
     # Should only be called on Linux if default path resolution have failed
     assert platform.system() == "Linux", "Should only be called on Linux"
@@ -313,9 +313,10 @@ def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
         if candidate_lib_paths:
             lib_path = candidate_lib_paths[0]
             break
-    if not lib_path:
+    if not lib_path and required:
         raise ValueError(f"{lib_name} not found in the system path {sys.path}")
-    ctypes.CDLL(lib_path)
+    if lib_path:
+        ctypes.CDLL(lib_path)
 
 
 # See Note [Global dependencies]
@@ -354,8 +355,6 @@ def _load_global_deps() -> None:
     except OSError as err:
         # Can only happen for wheel with cuda libs as PYPI deps
         # As PyTorch is not purelib, but nvidia-*-cu12 is
-        from torch.version import cuda as cuda_version
-
         cuda_libs: dict[str, str] = {
             "cublas": "libcublas.so.*[0-9]",
             "cudnn": "libcudnn.so.*[0-9]",
@@ -369,7 +368,6 @@ def _load_global_deps() -> None:
             "cusparselt": "libcusparseLt.so.*[0-9]",
             "cusolver": "libcusolver.so.*[0-9]",
             "nccl": "libnccl.so.*[0-9]",
-            "nvtx": "libnvToolsExt.so.*[0-9]",
             "nvshmem": "libnvshmem_host.so.*[0-9]",
             "cufile": "libcufile.so.*[0-9]",
         }
@@ -381,6 +379,9 @@ def _load_global_deps() -> None:
             raise err
         for lib_folder, lib_name in cuda_libs.items():
             _preload_cuda_deps(lib_folder, lib_name)
+
+        # libnvToolsExt is Optional Dependency
+        _preload_cuda_deps("nvtx", "libnvToolsExt.so.*[0-9]", required=False)
         ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
 
 
From 26e023a973cc3e70f0248957c96e9e1f9d593858 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 8 Oct 2025 14:11:48 -0400
Subject: [PATCH 6/7] [MPS] Update OS version in error message (#164949)

[MPS] Update OS version in error message (#164946)

Followup after https://github.com/pytorch/pytorch/pull/159912
Fixes https://github.com/pytorch/pytorch/issues/164943

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164946
Approved by: https://github.com/Camyll

(cherry picked from commit 01f3a43462da594b65a6c9e8b46c132cd360cea9)

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
---
 aten/src/ATen/mps/EmptyTensor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp
index d858df073397..6c58de099648 100644
--- a/aten/src/ATen/mps/EmptyTensor.cpp
+++ b/aten/src/ATen/mps/EmptyTensor.cpp
@@ -12,7 +12,7 @@
 
 #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
 #define MPS_ERROR_RUNTIME_TOO_LOW \
-  "The MPS backend is supported on MacOS 13.0+.", \
+  "The MPS backend is supported on MacOS 14.0+. ", \
   "Current OS version can be queried using `sw_vers`"
 #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
   "as the MPS framework doesn't support float64. Please use float32 instead."

From 0fabc3ba44823f257e70ce397d989c8de5e362c1 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 8 Oct 2025 21:09:57 -0400
Subject: [PATCH 7/7] CUDA aarch64 12.6 and 12.8 builds fix triton constraints
 (#165022)

CUDA aarch64 12.6 and 12.8 builds fix triton constraints (#165013)

Since we have introduced CUDA aarch64 builds for all cuda versions we need to remove this constraint.
This was missed by https://github.com/pytorch/pytorch/pull/162364

Proper constraint on triton should be:
```
Requires-Dist: triton==3.5.0; platform_system == "Linux"
```

not:
```
Requires-Dist: triton==3.5.0; platform_system == "Linux" and platform_machine == "x86_64"
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165013
Approved by: https://github.com/Camyll, https://github.com/nWEIdia, https://github.com/tinglvv

(cherry picked from commit 81dbeb06f4b3eb6c56625ec25d377eb7c7c6c573)

Co-authored-by: atalman <atalman@fb.com>
---
 .circleci/scripts/binary_populate_env.sh | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index f5b949858d60..f12a3ac07517 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1
 
 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
-
-# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
-TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
-
-# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
-if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
-  TRITON_CONSTRAINT="platform_system == 'Linux'"
-fi
+TRITON_CONSTRAINT="platform_system == 'Linux'"
 
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
   TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"