From 3b57315b1bc85d4928ea385e6afd7e60eac99b2e Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 6 Oct 2025 16:08:40 -0400 Subject: [PATCH 1/7] [ROCm] Increase binary build timeout to 5 hours (300 minutes) (#164770) [ROCm] Increase binary build timeout to 5 hours (300 minutes) (#163776) Despite narrowing down the [FBGEMM_GENAI build to gfx942](https://github.com/pytorch/pytorch/pull/162648), the nightly builds still timed out because they [didn't get enough time to finish the post-PyTorch-build steps](https://github.com/pytorch/pytorch/actions/runs/17969771026/job/51109432897). This PR increases timeout for ROCm builds for both [libtorch ](https://github.com/pytorch/pytorch/actions/runs/17969771026)and [manywheel](https://github.com/pytorch/pytorch/actions/runs/17969771041), because both of those are close to the 4hr mark currently. This PR is a more ROCm-targeted version of https://github.com/pytorch/pytorch/pull/162880 (which is for release/2.9 branch). Pull Request resolved: https://github.com/pytorch/pytorch/pull/163776 Approved by: https://github.com/jeffdaily (cherry picked from commit 0ec946a0522748332f42675a4d690ff32d773d42) Co-authored-by: Jithun Nair Co-authored-by: Jeff Daily --- .../templates/linux_binary_build_workflow.yml.j2 | 3 +++ .../generated-linux-binary-libtorch-nightly.yml | 2 ++ .../generated-linux-binary-manywheel-nightly.yml | 14 ++++++++++++++ .../generated-linux-binary-manywheel-rocm-main.yml | 1 + 4 files changed, 20 insertions(+) diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index f53472571993..bf7db5866e78 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -77,6 +77,9 @@ jobs: runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 + {%- elif config["gpu_arch_type"] == "rocm" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + 
timeout-minutes: 300 {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index f5eca8751840..bc671ae80ae2 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -333,6 +333,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: libtorch-rocm6_3-shared-with-deps-release build_environment: linux-binary-libtorch secrets: @@ -446,6 +447,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: libtorch-rocm6_4-shared-with-deps-release build_environment: linux-binary-libtorch secrets: diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index c996437a3b9f..5f9eaab976a6 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -323,6 +323,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -433,6 +434,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -912,6 +914,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.11" runner_prefix: 
"${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_11-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -1022,6 +1025,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_11-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -1501,6 +1505,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_12-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -1611,6 +1616,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_12-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -2090,6 +2096,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -2200,6 +2207,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -2679,6 +2687,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13t-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -2789,6 +2798,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13t-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -3268,6 +3278,7 @@ 
jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -3378,6 +3389,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -3857,6 +3869,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14t-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -3967,6 +3980,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14t-rocm6_4 build_environment: linux-binary-manywheel secrets: diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index 9593391217ac..9df4835757c4 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -60,6 +60,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel-rocm secrets: From d4c43070320e8892fa2965e1805db445ea4d4274 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 6 Oct 2025 16:56:06 -0400 Subject: [PATCH 2/7] Fix docker build issue after 164575 (#164779) Fix docker build issue after 164575 (#164774) Looks like https://github.com/pytorch/pytorch/pull/164575 introduced an issue. 
The command is wrong: ``` conda install -c "whl/nightly" -y python=3.11 conda=25.7.0 ``` Should be just using default conda channel: ``` conda install -y python=3.11 conda=25.7.0 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164774 Approved by: https://github.com/Camyll (cherry picked from commit c1f40d33c89b361a1edad17aa25cfff1ab4014fd) Co-authored-by: atalman --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f73dfcc1af3a..331cf00593cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121 ARG INSTALL_CHANNEL=whl/nightly # Automatically set by buildx # pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574 -RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0 +RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0 ARG TARGETPLATFORM From b015422da1fd2aa3186a88cf3ed1d2cb77c4374d Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 6 Oct 2025 19:58:36 -0400 Subject: [PATCH 3/7] fix cpp extension distributed warning spew (#164785) fix cpp extension distributed warning spew (#162764) With the new change we only log the warning if we're running non distributed code or if we're in rank 0. Unit testing that certain messages get printed on certain ranks only feels kinda jank so test plan is below instead Test plan ```python # torchrun --nproc_per_node=2 demo_fix.py import os import logging logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG) import torch if 'RANK' in os.environ: torch.distributed.init_process_group('nccl') from torch.utils.cpp_extension import _get_cuda_arch_flags _get_cuda_arch_flags() print(f"Rank {os.environ.get('RANK', '0')} done") ``` Logs showing how `TORCH_CUDA_ARCH_LIST` only shows up once if we explicitly set the logging level to `logging.DEBUG`. 
It also improves the debug message to explain what the actual behavior will be ``` (source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] ***************************************** W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] ***************************************** [rank0]:V0911 18:30:18.921000 1316753 pytorch/torch/utils/cpp_extension.py:2444] TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='10.0+PTX' for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override. Rank 0 done Rank 1 done ``` But if we just use the default and comment out `logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)` Then we get ``` (source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] ***************************************** W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] ***************************************** Rank 0 done Rank 1 done (source) [marksaroufim@devgpu005]~% ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162764 Approved by: https://github.com/ezyang, https://github.com/zou3519 (cherry picked from commit f7e83219619a05934a344ca699c33ee69d5a3642) Co-authored-by: Mark Saroufim --- torch/utils/cpp_extension.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 7202a9638756..902d2fe6ce0f 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -2418,10 +2418,6 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]: # If not given or set as native, determine what's best for the GPU / CUDA version that can be found if not _arch_list or _arch_list == "native": - if not _arch_list: - logger.warning( - "TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. \n" - "If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.") arch_list = [] # the assumption is that the extension should run on any of the currently visible cards, # which could be of different types - therefore all archs for visible cards should be included @@ -2440,6 +2436,15 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]: arch_list.append(arch) arch_list = sorted(arch_list) arch_list[-1] += '+PTX' + + if not _arch_list: + # Only log on rank 0 in distributed settings to avoid spam + if not torch.distributed.is_available() or not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + arch_list_str = ';'.join(arch_list) + logger.debug( + "TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='%s' " + "for visible GPU architectures. 
Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.", + arch_list_str) else: # Deal with lists that are ' ' separated (only deal with ';' after) _arch_list = _arch_list.replace(' ', ';') From 42f0c2c970728d8933489ac247c6e091d9070ed3 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 7 Oct 2025 10:10:51 -0400 Subject: [PATCH 4/7] update the baseline data for the operator benchmark (#164789) update the baseline data for the operator benchmark (#162693) According to the results of the last four operator benchmark runs, we found that five models achieved more than a 30% improvement compared to the baseline. Therefore, we will update the operator benchmark baseline data. We use the average results from the four runs as the new baseline for the five models. And add a pull request trigger for the operator benchmark workflow Benchmarking Framework | Benchmarking Module Name | Case Name | tag | run_backward | baseline old | r1 | r2 | r3 | r4 | avg | speedup -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- PyTorch | add | add_M1_N1_K1_cpu | short | FALSE | 3.9497 | 2.57 | 2.54 | 2.38 | 2.31 | 2.45 | 1.61 PyTorch | functional.hardtanh | functional.hardtanh_dims(512 512)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 67.118 | 50.02 | 49.80 | 46.78 | 48.94 | 48.88 | 1.37 PyTorch | relu6 | relu6_dims(512 512)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 68.739 | 51.17 | 51.19 | 48.07 | 50.42 | 50.21 | 1.37 PyTorch | relu6 | relu6_dims(256 1024)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 69.1875 | 51.97 | 52.77 | 50.00 | 51.24 | 51.50 | 1.34 PyTorch | functional.hardtanh | functional.hardtanh_dims(256 1024)_contigFalse_inplaceFalse_dtypetorch.quint8 | short | FALSE | 67.436 | 50.98 | 51.69 | 49.06 | 49.87 | 50.40 | 1.34 @chuanqi129 @huydhn @desertfire @jainapurva Pull Request resolved: https://github.com/pytorch/pytorch/pull/162693 Approved by: https://github.com/huydhn (cherry picked from commit 
f7ea4975abb0aeb0224894f0b54b1f8fd1fa70e3) Co-authored-by: LifengWang --- .github/workflows/operator_benchmark.yml | 4 ++++ ...ected_ci_operator_benchmark_eager_float32_cpu.csv | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index dd262d31b8fc..dcdc2cd0ba24 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -14,6 +14,10 @@ on: schedule: # Run at 07:00 UTC every Sunday - cron: 0 7 * * 0 + pull_request: + paths: + - benchmarks/operator_benchmark/** + - .github/workflows/operator_benchmark.yml concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} diff --git a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv index 873f14d20127..9a7b6797e982 100644 --- a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -1,5 +1,5 @@ Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time -PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497 +PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826 PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449 @@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547 
-PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739 +PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664 -PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875 +PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719 PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728 @@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619 -PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613 
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295 PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189 @@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667 PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333 -PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667 \ No newline at end of file +PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667 From 6f12be27709abe4e5365ec94376cb7529e219692 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 7 Oct 2025 22:33:08 -0400 Subject: [PATCH 5/7] CUDA 13.0 builds fix on Amazon Linux 2023 (#164893) CUDA 13.0 builds fix on Amazon Linux 2023 (#164870) During 2.9 rc testing I am seeing an issue on Amazon Linux 2023 with CUDA 13.0 builds This is related to: https://github.com/pytorch/pytorch/issues/152756 Workflow: https://github.com/pytorch/test-infra/actions/runs/18324074610/job/52184079262 Error: ``` WARNING: There was an error checking the latest version of pip. 
+ python3.11 .ci/pytorch/smoke_test/smoke_test.py --package torchonly Traceback (most recent call last): File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 333, in _load_global_deps ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL) File "/usr/lib64/python3.11/ctypes/__init__.py", line 376, in __init__ self._handle = _dlopen(self._name, mode) ^^^^^^^^^^^^^^^^^^^^^^^^^ OSError: libcudart.so.13: cannot open shared object file: No such file or directory During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/pytorch/pytorch/.ci/pytorch/smoke_test/smoke_test.py", line 12, in import torch File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 425, in _load_global_deps() File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 383, in _load_global_deps _preload_cuda_deps(lib_folder, lib_name) File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 317, in _preload_cuda_deps raise ValueError(f"{lib_name} not found in the system path {sys.path}") Traceback (most recent call last): ValueError: libnvToolsExt.so.*[0-9] not found in the system path ['/pytorch/pytorch/.ci/pytorch/smoke_test', '/usr/lib64/python311.zip', '/usr/lib64/python3.11', '/usr/lib64/python3.11/lib-dynload', '/usr/local/lib64/python3.11/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/lib64/python3.11/site-packages', '/usr/lib/python3.11/site-packages'] File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in main() File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main run_cmd_or_die(f"docker exec -t {container_name} /exec") File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die raise RuntimeError(f"Command {cmd} failed 
with exit code {exit_code}") RuntimeError: Command docker exec -t 7d9c5bd403cac9a9ee824d63a1d6f6057ecce89a7daa94a81617dbf8eff0ff2e /exec failed with exit code 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164870 Approved by: https://github.com/Camyll (cherry picked from commit 483f4e0db91166128ad8922d86dc7222338d4ecc) Co-authored-by: atalman Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com> --- torch/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/torch/__init__.py b/torch/__init__.py index a5c072396e1d..0625ad60bfff 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -302,7 +302,7 @@ def _get_cuda_dep_paths(path: str, lib_folder: str, lib_name: str) -> list[str]: return nvidia_lib_paths + lib_paths -def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None: +def _preload_cuda_deps(lib_folder: str, lib_name: str, required: bool = True) -> None: # type: ignore[valid-type] """Preloads cuda deps if they could not be found otherwise.""" # Should only be called on Linux if default path resolution have failed assert platform.system() == "Linux", "Should only be called on Linux" @@ -313,9 +313,10 @@ def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None: if candidate_lib_paths: lib_path = candidate_lib_paths[0] break - if not lib_path: + if not lib_path and required: raise ValueError(f"{lib_name} not found in the system path {sys.path}") - ctypes.CDLL(lib_path) + if lib_path: + ctypes.CDLL(lib_path) # See Note [Global dependencies] @@ -354,8 +355,6 @@ def _load_global_deps() -> None: except OSError as err: # Can only happen for wheel with cuda libs as PYPI deps # As PyTorch is not purelib, but nvidia-*-cu12 is - from torch.version import cuda as cuda_version - cuda_libs: dict[str, str] = { "cublas": "libcublas.so.*[0-9]", "cudnn": "libcudnn.so.*[0-9]", @@ -369,7 +368,6 @@ def _load_global_deps() -> None: "cusparselt": "libcusparseLt.so.*[0-9]", "cusolver": 
"libcusolver.so.*[0-9]", "nccl": "libnccl.so.*[0-9]", - "nvtx": "libnvToolsExt.so.*[0-9]", "nvshmem": "libnvshmem_host.so.*[0-9]", "cufile": "libcufile.so.*[0-9]", } @@ -381,6 +379,9 @@ def _load_global_deps() -> None: raise err for lib_folder, lib_name in cuda_libs.items(): _preload_cuda_deps(lib_folder, lib_name) + + # libnvToolsExt is Optional Dependency + _preload_cuda_deps("nvtx", "libnvToolsExt.so.*[0-9]", required=False) ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL) From 26e023a973cc3e70f0248957c96e9e1f9d593858 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 8 Oct 2025 14:11:48 -0400 Subject: [PATCH 6/7] [MPS] Update OS version in error message (#164949) [MPS] Update OS version in error message (#164946) Followup after https://github.com/pytorch/pytorch/pull/159912 Fixes https://github.com/pytorch/pytorch/issues/164943 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164946 Approved by: https://github.com/Camyll (cherry picked from commit 01f3a43462da594b65a6c9e8b46c132cd360cea9) Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> --- aten/src/ATen/mps/EmptyTensor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index d858df073397..6c58de099648 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -12,7 +12,7 @@ #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" #define MPS_ERROR_RUNTIME_TOO_LOW \ - "The MPS backend is supported on MacOS 13.0+.", \ + "The MPS backend is supported on MacOS 14.0+. ", \ "Current OS version can be queried using `sw_vers`" #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \ "as the MPS framework doesn't support float64. Please use float32 instead." 
From 0fabc3ba44823f257e70ce397d989c8de5e362c1 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 8 Oct 2025 21:09:57 -0400 Subject: [PATCH 7/7] CUDA aarch64 12.6 and 12.8 builds fix triton constraints (#165022) CUDA aarch64 12.6 and 12.8 builds fix triton constraints (#165013) Since we have introduced CUDA aarch64 builds for all cuda versions we need to remove this constraint. This was missed by https://github.com/pytorch/pytorch/pull/162364 Proper constraint on triton should be: ``` Requires-Dist: triton==3.5.0; platform_system == "Linux" ``` not: ``` Requires-Dist: triton==3.5.0; platform_system == "Linux" and platform_machine == "x86_64" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165013 Approved by: https://github.com/Camyll, https://github.com/nWEIdia, https://github.com/tinglvv (cherry picked from commit 81dbeb06f4b3eb6c56625ec25d377eb7c7c6c573) Co-authored-by: atalman --- .circleci/scripts/binary_populate_env.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index f5b949858d60..f12a3ac07517 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) - -# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT -TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" - -# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries. -if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then - TRITON_CONSTRAINT="platform_system == 'Linux'" -fi +TRITON_CONSTRAINT="platform_system == 'Linux'" if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! 
"$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"