Merged
231 commits
d91269e
Revert "[ROCm] enable fastSpecializedAtomicAdd for gfx950 (#167661)"
pytorchmergebot Nov 18, 2025
57927a6
[Profiler] Deprecate export_memory_timeline method (#168036)
sraikund16 Nov 18, 2025
20cae80
`ComplexTensor` subclass (#167621)
hameerabbasi Nov 18, 2025
0e13964
[CI] Disable ET tests (again) (#168090)
malfet Nov 18, 2025
5333e51
[CUDA][Thor] Enable CUTLASS matmuls on Thor (#164836)
Aidyn-A Nov 18, 2025
d1f6dd6
distributed/debug: add an HTTP server for debugging running jobs (#16…
d4l3k Nov 18, 2025
aa22d41
[refcycle-logger] Output tensor size in the refcycle visualization (#…
czardoz Nov 18, 2025
14f370f
[xpu][test] port some distributed tensor test files for Intel GPU (#1…
wincent8 Nov 18, 2025
e3c5b78
small changes (#167852)
eellison Nov 18, 2025
4c5042b
Fix all gather bucketing fusion in of dtype casts (#167853)
eellison Nov 18, 2025
dda2cb3
Handled erased hiding nodes from dtype bucketing (#167863)
eellison Nov 18, 2025
7921c0e
[ROCm][CI] Limit caching to ROCm jammy docker images (#168088)
jithunnair-amd Nov 18, 2025
ae85307
huber_loss numerical issue (#166952)
jturney Nov 18, 2025
ebb2001
[codemod][lowrisk] Remove unused exception parameter from caffe2/torc…
r-barnes Nov 18, 2025
41999a5
Fix Tensor use_count check in VariableType.cpp (#168060)
colesbury Nov 18, 2025
e8970ba
[CI] Migrate all gcc9 jobs to gcc11 (#167933)
malfet Nov 18, 2025
dc4f3c7
[MPS] Move `elu` impl to Metal (#166903)
kurtamohler Nov 4, 2025
1efc14a
[ROCm][CI] Update concurrency setting for docker-cache-rocm.yml (#168…
jithunnair-amd Nov 19, 2025
a4e0720
typo corrected in type.cpp (#167907)
RajeshvShiyal Nov 19, 2025
a369a56
[ROCm][CI] forward fix libtorch agnostic tests (#168087)
jeffdaily Nov 19, 2025
878757c
[CI][CUDA] Unskip nvshmem triton tests (#167760)
nWEIdia Nov 19, 2025
c8d790b
[xpu][fix] Fix empty cache on mempool (#168074)
guangyey Nov 18, 2025
8f16199
Fix stable ABI to/from deprecation warnings. Add my_shape test. (#167…
pearu Nov 18, 2025
b8a3165
[2/3][XPU][feature] The implementation of MemPool for XPU (#166833)
majing921201 Nov 19, 2025
cdca10b
[AOTI] Fix a GPU memory leak caused by reference circle (#168063)
desertfire Nov 18, 2025
cea8678
[CD] Add `cuda-bindings` dependency to CUDA wheels (#167769)
malfet Nov 13, 2025
13ec55d
Update AGENTS.md (#168111)
oulgen Nov 19, 2025
65f08ee
[MPS][1/N] Fix unsupported dtypes error checking for some MPS ops (#1…
malfet Nov 19, 2025
d48cae9
Shrink binary size (#168080)
colesbury Nov 19, 2025
6fc4306
Improve build logic in activities for kineto (#167204)
guangyey Nov 6, 2025
28c7602
[vision hash update] update the pinned vision hash (#168130)
pytorchupdatebot Nov 19, 2025
f49833d
[hoo] Invoke subgraph + effect (#167231)
angelayi Nov 18, 2025
789240b
[invoke_subgraph] Don't run the graph twice when autograd enabled (#1…
angelayi Nov 18, 2025
9abc9aa
fix: use grad div factor when fsdp_degree=1 (#167178)
garrett361 Nov 19, 2025
1c0bf2a
[CUDA][Complex] Bump tolerances for `TestFFTCUDA.test_reference_nd__r…
eqy Nov 19, 2025
a5f36a8
[DTensor] Fix deadlock after fast cache clear (#168069)
zpcore Nov 19, 2025
e5a766e
[user-streams] Insert backward syncs (#167747)
mlazos Nov 18, 2025
9f94c7b
[fix] Assign CUDAEvent external member properly (#167711)
guangyey Nov 19, 2025
7a963ff
LocalTensor for random_ops tests (#166540)
dolpm Nov 19, 2025
be33b7f
[DeviceMemory] Add Basic Statistics to Device Memory in OpenReg (#166…
licy666 Nov 19, 2025
8f4dc30
Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY…
mikaylagawarecki Nov 18, 2025
a0ccd3e
Error when non stable/headeronly/shim headers are included by stable …
mikaylagawarecki Nov 18, 2025
5abb7bf
Revert "[SymmMem] Skip multicast init if any CUDA call fails (#168049)"
pytorchmergebot Nov 19, 2025
c7cf3fb
Revert "[pytree][compile] Slightly faster TreeSpec init (#168024)"
pytorchmergebot Nov 19, 2025
eefc0f8
Fix link for core maintainers request form (#168089)
albanD Nov 19, 2025
962f13f
[compile][to_local] Support Sequence-like placement user defined obje…
anijain2305 Nov 19, 2025
fb6af11
GroupNorm: include offending values in error message; add test (#167925)
abhitorch81 Nov 19, 2025
0d7ba97
[dynamo][compile time] Special case for torch.utils._pytree._get_node…
anijain2305 Nov 19, 2025
7a92839
[MPS] permute op for sparse tensors (#168154)
Isalia20 Nov 19, 2025
a097e16
Revert "Error when non stable/headeronly/shim headers are included by…
pytorchmergebot Nov 19, 2025
ce9377d
[BE] Remove erroneous `const_cast` (#168165)
malfet Nov 19, 2025
a8ccc4e
[dynamo][pytree][compile time] Specialize tree_is_leaf (#168070)
anijain2305 Nov 19, 2025
2e1821b
Support AC in default partitioner when functionalization is enabled (…
soulitzer Nov 18, 2025
acf5b20
Revert "Hide all symbols (except stable/headeronly/shim) if TORCH_STA…
pytorchmergebot Nov 19, 2025
6c02dde
Introduce missing collectives and small fixes to support local tensor…
dzmitry-huba Nov 19, 2025
f9724db
[torch.onnx.export] Fix onnx export on big endian machines (#167816)
tungld Nov 19, 2025
607e2e7
[Distributed] Fix @parametrize on unordered iterable in distributed t…
eqy Nov 19, 2025
5b35cf1
[DTensor][ops] adding aten.std.correction propagation rule (#168057)
anshul-si Nov 18, 2025
f6cde6e
Fix tensor -> scalar variant swap (#168007)
georgiaphillips Nov 19, 2025
c566552
[DebugMode] wait before hashing collectives by default (#168119)
pianpwk Nov 19, 2025
84a7a34
[FlexFlash] Specify lowering w/ new `BACKEND` kernel option (#168017)
drisspg Nov 19, 2025
3ecc137
[Caffe2] Improve AddMomentsVec and UpdateMomentsVec (#167664)
Nicoshev Nov 19, 2025
9c811b1
[export] Enable context manager returns for dynamo graph capture. (#1…
zhxchen17 Nov 19, 2025
fcc7841
[pytree][compile] Slightly faster TreeSpec init (#168024)
anijain2305 Nov 19, 2025
159aa44
Replace 2**31 with explicit int (#168046)
WongJohnson Nov 19, 2025
7bfe8b0
[codemod][lowrisk] Remove unused exception parameter from caffe2/torc…
r-barnes Nov 19, 2025
a5e9dce
[DTensor] Fix mypy on register_op_strategy (#167673)
wconstab Nov 19, 2025
c9d944b
[DTensor] Document some utils (#168113)
wconstab Nov 19, 2025
a4a5d03
Update linalg.norm to match numpy's handling of degenerate inputs (#1…
rtimpe Nov 19, 2025
90c57aa
conv: refactor for lookup table support (#167179)
coconutruben Nov 18, 2025
6461548
[vLLM] Update xformers and remove flashinfer-python (#168141)
huydhn Nov 20, 2025
cda1b8d
[FlexFlash] Blackwell fwd support (#167040)
drisspg Nov 19, 2025
a6bfe2d
Revert "[invoke_subgraph] Don't run the graph twice when autograd ena…
pytorchmergebot Nov 20, 2025
ca6175c
Revert "[hoo] Invoke subgraph + effect (#167231)"
pytorchmergebot Nov 20, 2025
771be8c
Revert "[inductor] fix the decision of inner reduction (#167697)"
pytorchmergebot Nov 20, 2025
f890837
Revert "dist: add list_keys to Store API (#167883)"
pytorchmergebot Nov 20, 2025
bc8da63
Move MemoryFormat/Layout to headeronly (#168034)
janeyx99 Nov 19, 2025
c055ebe
Change NamedTupleVariable implementation to subclass UserDefinedTuple…
morrison-turnansky Nov 20, 2025
192b96e
Revert "[AOTI] Fix a GPU memory leak caused by reference circle (#168…
pytorchmergebot Nov 20, 2025
9e9e8fa
[torch/utils/data] Update CODEOWNERS (#168172)
divyanshk Nov 20, 2025
bb4009a
[Inductor] Naive foreach autotune support (#162053)
jataylo Nov 20, 2025
c3320ed
[3.14] Add python version adjustment for frame count changes (#168190)
fxdawnn Nov 19, 2025
7a064ed
Revert "Change NamedTupleVariable implementation to subclass UserDefi…
pytorchmergebot Nov 20, 2025
34bb9c4
[AOTI] Fix unknown constant type for device-moved constants (#168138)
sevenEng Nov 20, 2025
9177d6e
[ROCm][CI] Add ROCm noble image caching to docker-cache-rocm.yml (#16…
jithunnair-amd Nov 20, 2025
9bca3c1
[ROCm][CI] Expand trunk.yml coverage for ROCm (#168162)
jithunnair-amd Nov 20, 2025
c614128
[DTensor] support Replicate -> Partial("avg") + support distribute_te…
tianyu-l Nov 20, 2025
6fa7791
Reland"Fix different seq length (#167481)" (#168144)
Microve Nov 20, 2025
25a64df
[ROCm] add torch.version.rocm, distinct from torch.version.hip (#168…
amd-sriram Nov 20, 2025
7ffa511
[Distributed] Optimize ND shard overlap detection (#167073)
mansiag05 Nov 20, 2025
21c11da
Improve OpenReg test coverage (#167819)
hipudding Nov 20, 2025
a6b6383
[ARM] Improve LLM performance & mem usage using int4-bf16 KleidiAI ke…
usamahz Nov 20, 2025
ae142ab
s390x: fix periodic tests build (#168001)
AlekseiNikiforovIBM Nov 20, 2025
6edf2aa
Revert "Improve build logic in activities for kineto (#167204)"
pytorchmergebot Nov 20, 2025
762273e
Move pointwise_scatter optimization to joint_graph stage from post_gr…
vinithakv Nov 20, 2025
bd883bb
Add basic spin linting documentation (#167227)
zklaus Nov 20, 2025
9d7f983
Add workflow regeneration to spin (#167551)
zklaus Nov 20, 2025
43acddb
Move c10/util/Deprecated.h to headeronly (#168173)
pearu Nov 19, 2025
7fff317
Revise stableivalue from/to deprecation (#168155)
pearu Nov 19, 2025
a01e8a2
[BE] Update xpu driver repo for CD used almalinux 8.10 (#157356)
chuanqi129 Nov 20, 2025
ba68238
[Inductor] Freeze layout for potentially padded strides in template a…
PaulZhang12 Nov 17, 2025
f4382d7
Fixes floor divide int min overflow issue (#166127)
arkadip-maitra Nov 20, 2025
dd89d2c
[DTensor] Document fast-path dispatch (#168192)
wconstab Nov 19, 2025
32b9260
Fixes remainder and fmod operation and makes it same as cuda (#165833)
arkadip-maitra Nov 20, 2025
29bd2dd
Fix: Remove incorrect non-negative validation for correction paramete…
parsshar-RH Nov 20, 2025
88d635c
Remove useless super() delegation (#168235)
cyyever Nov 20, 2025
f97c3fc
Re-enable ConvTranspose operator benchmarks for AArch64 (#166731)
fadara01 Oct 31, 2025
53a4b49
[Pipelining] Fix error log (#167668)
wconstab Nov 12, 2025
803d94b
Revert "[dynamo][pytree][compile time] Specialize tree_is_leaf (#1680…
pytorchmergebot Nov 20, 2025
9396e69
Revert "[dynamo][compile time] Special case for torch.utils._pytree._…
pytorchmergebot Nov 20, 2025
7bbbbca
Fix debug assertion in autograd_not_implemented_fallback.cpp (#168280)
colesbury Nov 20, 2025
2eccaf9
[submodule][inductor]Fix an AMD CPU max-autotune breakage (#168079)
desertfire Nov 18, 2025
4887c46
[ROCm] Fix HIP document url. (#168220)
jagadish-amd Nov 20, 2025
02df234
[varlen attn] batch invariance testing (#167865)
liangel-02 Nov 20, 2025
6644fd7
[inductor] make mix order reduction work with dynamic shapes (#168117)
shunting314 Nov 19, 2025
f7fc634
Revert "Allow BlockDescriptorOptions classes to be overridden In Trit…
pytorchmergebot Nov 20, 2025
b4f5472
[BE][Inductor] Move mm templates into separate files (#168179)
NikhilAPatel Nov 20, 2025
da7c609
[pytorch] Make clamp kernel branchless (#167889)
stashuk-olek Nov 20, 2025
e7a8520
Revert "Remove useless super() delegation (#168235)"
pytorchmergebot Nov 20, 2025
05b1119
Revert "conv: refactor for lookup table support (#167179)"
pytorchmergebot Nov 20, 2025
0ea545b
Add support to enable the oneDNN backend for RISC-V (#166602)
zhangfeiv0 Nov 20, 2025
c81f696
Skip _assert_scalar in default partitioner (#168289)
soulitzer Nov 20, 2025
a64613a
[doc] README add cmake prefix for non-conda env (#167714)
ai-easy-cpu Nov 20, 2025
ddde4b7
[user-streams] Refactor out event insertion for record_stream handlin…
mlazos Nov 20, 2025
2ca51b7
[user-streams] Refactor runtime estimation to reuse internal function…
mlazos Nov 20, 2025
3145177
Fix smoke test failure due to numpy import in Local Tensor (#168271)
dzmitry-huba Nov 20, 2025
0bd3f51
[3.14] Fix module 'torch' has no attribute 'f' (#168152)
azahed98 Nov 20, 2025
247f822
[Fix] Add generator and tensor variant signatures for `rand*_like()` …
KarhouTam Nov 20, 2025
4525340
[3.14] Use refcount difference for TestNumPyInterop.test_from_numpy_n…
azahed98 Nov 20, 2025
ed6d5ff
[precompile] nicer error message when caches are disabled (#168274)
bobrenjc93 Nov 20, 2025
b4f3c52
[dynamo][compile time] Special case for torch.utils._pytree._get_node…
anijain2305 Nov 20, 2025
63ce1fb
Improve build logic in activities for kineto (#167204)
guangyey Nov 20, 2025
c4a9414
overlap on non mms (#167864)
eellison Nov 20, 2025
7641553
better use of mem tracking (#168121)
eellison Nov 20, 2025
1328a02
bucketing compile time improve (#168122)
eellison Nov 20, 2025
5cb5718
Add public grouped_mm (#168298)
drisspg Nov 20, 2025
64904c2
[7/N] Use Python 3.10 typing (#167790)
cyyever Nov 21, 2025
9f10cb0
[BugFix] Fix incorrect usage of const_data_ptr in memcpy (#168233)
lingebeng Nov 21, 2025
064f80d
Smoke test numpy coverage in nightlies (#168270)
atalman Nov 21, 2025
7ebca68
[ROCm][CI] Move periodic-rocm-mi300 and inductor-rocm-mi300 to Ubuntu…
jithunnair-amd Nov 21, 2025
a7f3b10
[Full Inductor][Pytorch] Prevent decomposition and enable fallback of…
andyanwang Nov 21, 2025
2ae4b85
[3.14] Update profiler test (#168205)
rtimpe Nov 20, 2025
a60eb2d
fix philoxstate bad cast (#168310)
dolpm Nov 21, 2025
8ad78bb
Revert C++ fastpath dispatch path for DTensor (#168264)
ezyang Nov 20, 2025
d3ccb8f
Remove c10::is_pod (#166383)
cyyever Nov 21, 2025
056d263
Update numpy tests for python 3.11/3.12 (#168299)
rtimpe Nov 20, 2025
65b9892
Replace string with char for output (#168215)
cyyever Nov 21, 2025
6707dc8
Revert #154859 (#168297)
ngimel Nov 21, 2025
b026eb9
Fix EmbeddingBag when input is 2D and include_last_offset is True (#1…
BartlomiejStemborowski Nov 21, 2025
61cdf87
dist: add list_keys to Store API (#167883)
d4l3k Nov 21, 2025
6038c59
[simplefsdp] fix simplefsdp llama3 run (#168311)
ruisizhang123 Nov 21, 2025
2865672
Revert "Revert #154859 (#168297)"
pytorchmergebot Nov 21, 2025
4ee6b3d
[inductor] Use custom triton kernel subclass when available (#167456)
kundaMwiza Nov 21, 2025
8b0314d
Fix edge-data handling in cudaGraphNodeGetDependencies for CUDA 13 in…
eee4017 Nov 21, 2025
265a8bc
adding kwarg inputs handling in register sharding (#168249)
arkadip-maitra Nov 21, 2025
3b19eca
Fix cublasLtMatmul failure (#167873)
gderossi Nov 21, 2025
29eca30
Remove useless super() delegation (#168235)
cyyever Nov 21, 2025
7c57ee3
Add Pylint checks to linterrunner (#167421)
cyyever Nov 21, 2025
cf6d089
[3.14] Add check for __module__ to _SysImporter.whichmodule (#168189)
azahed98 Nov 21, 2025
c23a900
Revert "[Full Inductor][Pytorch] Prevent decomposition and enable fal…
pytorchmergebot Nov 21, 2025
d4de871
Revert #168264 + Python-side LRU cache when native op schema is not s…
ezyang Nov 21, 2025
5d34e5e
Fix unused gradient tracking to respect create_graph (#168295)
dsashidh Nov 21, 2025
a69d3cf
[BE] C++20 template instantiation adjustments (#168132)
malfet Nov 21, 2025
f6fb8dd
Use r7i.4xlarge for B200 build (#167078)
zxiiro Nov 21, 2025
008ac43
[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration. (#160174)
etaf Nov 21, 2025
7556637
[Inductor XPU GEMM] Step 2/N: Move out cutlass files from torch/_indu…
etaf Nov 21, 2025
f3b0686
Skipping few distributed tests for 2 GPU setups (#168265)
chinmaydk99 Nov 21, 2025
2d7ea6c
Add rocm-navi31 to the upload test stats file (#168359)
amdfaa Nov 21, 2025
1871a24
Add shim for getCurrentBlasHandle (#168276)
janeyx99 Nov 20, 2025
107ab1c
control_plane: add handler for WaitCounters (#167871)
d4l3k Nov 21, 2025
2f90404
[MPS] fix broadcasting issues for mul on sparse tensors (#168112)
Isalia20 Nov 21, 2025
28e8803
[MPS] enable sparse mm test (#168156)
Isalia20 Nov 21, 2025
e13220b
[CUDA] Update minimum NVIDIA driver version requirement in Green Cont…
eqy Nov 21, 2025
7717bba
Add template for add_overflows (#168035)
lucylq Nov 21, 2025
402968e
[cuDNN][TF32][DTensor][TEST] Turn off TF32 for DTensor conv test (#16…
eqy Nov 21, 2025
80b57a6
Add allgather_base and reduce_scatter_base collective implementations…
dzmitry-huba Nov 21, 2025
08bfadf
[DTensor] compute shape and offset for arbitrary _StridedShard (#168146)
weifengpy Nov 21, 2025
8f8082d
Fix memory leak test for SDPA op call (#168040)
jhavukainen Nov 21, 2025
739acb8
[dynamo, nested graph breaks] fix FOR_ITER iterator push and zip stri…
williamwen42 Nov 18, 2025
044143a
Fix `hash(Size([SymInt, ...]))` on Python 3.14+ (#168256)
guilhermeleobas Nov 20, 2025
82e9ae9
Forward fix numpy binary check after #168270 (#168374)
atalman Nov 21, 2025
b8e6823
Fix arg parser one pos arg (#163081)
cleonard530 Nov 21, 2025
9141f03
20x less memory use and 37.25% speedup in min_cut_rematerialization_p…
jmaczan Nov 21, 2025
a2d11eb
[FlexFlash] Add wiring for backwards (#168319)
drisspg Nov 21, 2025
81bfd50
Add warning for clearing profiler events at the end of each cycle (#1…
jiannanWang Nov 21, 2025
b1cd563
Revert #154859 (#168297)
ngimel Nov 21, 2025
c8b265f
[dynamo, nested graph breaks] Fix-nested-graph-break-suppression (#16…
parsshar-RH Nov 21, 2025
38b5a5e
Narrow the return type annotation in 'VariableTracker::call_obj_hasat…
krastogi-in Nov 21, 2025
d419a2f
[inductor] find benchmark scripts for r2r determinism unit test (#168…
shunting314 Nov 21, 2025
d4493c5
Add dynamic config generation for custom op autotuning (#167193)
tianrengao Nov 21, 2025
6c8c03c
Fix aot_compile typing. (#168320)
yyetim Nov 21, 2025
6d22819
[Inductor] Properly enlarge XBLOCK/set num_warps=1 for B200 inner per…
PaulZhang12 Nov 22, 2025
976abd8
[Inductor] Mix Order Reduction Heuristics (#168361)
PaulZhang12 Nov 22, 2025
5e4ca87
feat(pallas): add Pallas TPU backend (#167774)
yarongmu-google Nov 22, 2025
57d4e49
[inductor] Fix a user-defined Triton kernel output + .cpu() correctne…
desertfire Nov 21, 2025
7ec5c16
[inductor] Reduce cold compilation time caused by duplicated user-def…
desertfire Nov 21, 2025
69bcac8
[triton] Enable Triton kernel serialization for AOTI by adding dict a…
XueningXu Nov 22, 2025
24e1958
[dynamo] add torch._dynamo.set_recursion_limit to fix 3.12/3.13 Recur…
williamwen42 Nov 21, 2025
68921ac
[dynamo][guards] Log backend match recompilation reason (#168387)
anijain2305 Nov 21, 2025
95ae5a4
[dynamo][pytree][compile time] Specialize tree_is_leaf (#168070)
anijain2305 Nov 21, 2025
a9184a0
[DTensor] update redistribute_cost, add disable_graph_based_transform…
mori360 Nov 22, 2025
4909fd8
Move CUDAEvent to c10 (#158219)
guangyey Nov 21, 2025
9c5d972
[NativeRT] Fix out_t index handling in TritonKernel (#168384)
minjang Nov 22, 2025
112a4fa
Add string support for ABI stable custom ops (#168370)
janeyx99 Nov 21, 2025
9301432
Fix lints with newer triton (#168340)
jansel Nov 21, 2025
b565593
[dynamo] Add optree.tree_map microbenchmark (#168341)
jansel Nov 21, 2025
322ad30
[Flex] Fix symbolic shapes lowering (#168383)
drisspg Nov 22, 2025
a9cb5bc
[user-streams] Move some estimator utilities outside of distributed (…
mlazos Nov 22, 2025
1048ac9
Fix exit code condition for test_nan_assert (#167971)
Flamefire Nov 22, 2025
a3cc252
[dynamo] Special case handling for tree_map (#168342)
jansel Nov 22, 2025
9fa3e6e
[BugFix] Fix incorrect type hint. (#168892)
lingebeng Nov 22, 2025
2c204e6
Revert "[Inductor XPU GEMM] Step 2/N: Move out cutlass files from tor…
pytorchmergebot Nov 23, 2025
3f0d46c
Revert "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration.…
pytorchmergebot Nov 23, 2025
1f34961
Revert "[inductor] Use custom triton kernel subclass when available (…
pytorchmergebot Nov 23, 2025
4fd97b4
Revert "[dynamo] add torch._dynamo.set_recursion_limit to fix 3.12/3.…
pytorchmergebot Nov 23, 2025
a1ab3a0
[audio hash update] update the pinned audio hash (#168315)
pytorchupdatebot Nov 23, 2025
19c34dd
[dynamo] Special case handling for tree_map_only (#168365)
jansel Nov 22, 2025
d3f61c1
[dynamo] Fix local test failures for dynamo/test_repros.py (#168893)
jansel Nov 22, 2025
cb3754f
[DTensor] Refactor strategy/rule registration into dedicated module (…
wconstab Nov 20, 2025
c740e85
[BE] Delete `missing_vXXX_neon headers (#168909)
malfet Nov 23, 2025
c9c8a85
Add optimizer tests in operator microbenchmarks (#168101)
jainapurva Nov 23, 2025
9a38bb8
[CUDA] Fix truncated error messages in cudaMallocAsync Allocator (#16…
galv Nov 24, 2025
dbe6124
[tutorial] typo fix, update torch.compiler_cudagraph_trees.md (#167713)
luyaor Nov 24, 2025
7833690
Removed deprecated `split_cat_fx_passes` (#167738)
hinriksnaer Nov 24, 2025
c91c92f
Replace thrust::tie with structure binding (#168943)
cyyever Nov 24, 2025
265397e
Remove unnecessary uses of thrust::tuple (#168936)
cyyever Nov 24, 2025
f1c49c9
Checking if the input is finite before calculation in lowering of pow…
krastogi-in Nov 24, 2025
1aaedbc
[dynamo][hops] Add xfail tests for side effects (#168394)
anijain2305 Nov 22, 2025
5ff187d
[Intel GPU] Update Intel Triton commit pin (#166436)
anmyachev Nov 24, 2025
654c5fb
Revert "bucketing compile time improve (#168122)"
pytorchmergebot Nov 24, 2025
ecdea86
Merge remote-tracking branch 'upstream/main' into develop_IFU_20251124
github-actions[bot] Nov 24, 2025
14 changes: 2 additions & 12 deletions .ci/docker/build.sh
@@ -125,10 +125,10 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
+    GCC_VERSION=11
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -146,16 +146,6 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.8.1
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    ;;
   pytorch-linux-jammy-py3-clang12-onnx)
     ANACONDA_PYTHON_VERSION=3.10
     CLANG_VERSION=12
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/triton-xpu.txt
@@ -1 +1 @@
-1b0418a9a454b2b93ab8d71f40e59d2297157fae
+aa01f5c2cd4db2b7bfa53ea98a1a8dfbd6d77c92
15 changes: 7 additions & 8 deletions .ci/docker/common/install_xpu.sh
@@ -64,14 +64,13 @@ function install_ubuntu() {

 function install_rhel() {
     . /etc/os-release
-    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
-            echo "RHEL version ${VERSION_ID} not supported"
-            exit
-        fi
-    elif [[ "${ID}" == "almalinux" ]]; then
-        # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
-        VERSION_ID="8.8"
+    if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        echo "RHEL version ${VERSION_ID} not supported"
+        exit
+    fi
+    # Using testing channel for CD build
+    if [[ "${ID}" == "almalinux" ]]; then
+        XPU_DRIVER_VERSION="/testing"
     fi

     dnf install -y 'dnf-command(config-manager)'
3 changes: 3 additions & 0 deletions .ci/docker/requirements-ci.txt
@@ -397,3 +397,6 @@ scikit-build==0.18.1
 pyre-extensions==0.0.32
 tabulate==0.9.0
 #Description: These package are needed to build FBGEMM and torchrec on PyTorch CI
+
+Jinja2==3.1.6
+#Description: required for torch.distributed.debug
1 change: 0 additions & 1 deletion .ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
@@ -84,7 +84,6 @@ def __init__(self, args: Any):
         self.VLLM_TEST_WHLS_REGEX = [
             "xformers/*.whl",
             "vllm/vllm*.whl",
-            "flashinfer-python/flashinfer*.whl",
         ]

     def prepare(self):
6 changes: 4 additions & 2 deletions .ci/pytorch/test.sh
@@ -1763,12 +1763,14 @@ test_operator_microbenchmark() {
   mkdir -p "$TEST_REPORTS_DIR"
   TEST_DIR=$(pwd)

+  test_inductor_set_cpu_affinity
+
   cd benchmarks/operator_benchmark/pt_extension
-  python -m pip install .
+  python -m pip install . -v --no-build-isolation

   cd "${TEST_DIR}"/benchmarks/operator_benchmark

-  for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
+  for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv optimizer; do
     $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
       --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
       --benchmark-name "PyTorch operator microbenchmark" --use-compile
30 changes: 12 additions & 18 deletions .circleci/scripts/binary_linux_test.sh
@@ -31,23 +31,6 @@ if [[ "$PACKAGE_TYPE" != libtorch ]]; then
   export PATH="\${python_path}/bin:\$PATH"
 fi

-EXTRA_CONDA_FLAGS=""
-NUMPY_PIN=""
-PROTOBUF_PACKAGE="defaults::protobuf"
-
-if [[ "\$python_nodot" = *310* ]]; then
-  # There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
-  # we set a lower boundary here just to be safe
-  NUMPY_PIN=">=1.21.2"
-  PROTOBUF_PACKAGE="protobuf>=3.19.0"
-fi
-
-if [[ "\$python_nodot" = *39* ]]; then
-  # There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
-  # we set a lower boundary here just to be safe
-  NUMPY_PIN=">=1.20"
-fi
-
 # Move debug wheels out of the package dir so they don't get installed
 mkdir -p /tmp/debug_final_pkgs
 mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move"
@@ -66,12 +49,23 @@ fi
 if [[ "$PACKAGE_TYPE" != libtorch ]]; then
   if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
     pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
-    retry pip install -q numpy protobuf typing-extensions
+
+    # numpy tests:
+    # We test 1 version no numpy. 1 version with numpy 1.x and rest with numpy 2.x
+    if [[ "\$python_nodot" = *311* ]]; then
+      retry pip install -q numpy==1.23.5 protobuf typing-extensions
+    elif [[ "\$python_nodot" = *312* ]]; then
+      retry pip install -q protobuf typing-extensions
+    else
+      retry pip install -q numpy protobuf typing-extensions
+    fi
+
   else
     pip install "\$pkg"
     retry pip install -q numpy protobuf typing-extensions
   fi
 fi

 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
   pkg="\$(ls /final_pkgs/*-latest.zip)"
   unzip "\$pkg" -d /tmp
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/audio.txt
@@ -1 +1 @@
-ee1a1350eb37804b94334768f328144f058f14e9
+32ce8c011855adb15438ddc9bf6c139d23f8cee5
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc
+617079d944b0e72632311c30ae2bbdf1168b901e
35 changes: 5 additions & 30 deletions .github/ci_configs/vllm/Dockerfile
@@ -1,4 +1,4 @@
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12

 # BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine,
@@ -124,7 +124,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
     git clone https://github.com/facebookresearch/xformers.git

     pushd xformers
-    git checkout v0.0.32.post2
+    git checkout v0.0.33.post1
     git submodule update --init --recursive
     python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose
     popd
@@ -256,7 +256,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

-# Install build and runtime dependencies, this is needed for flashinfer install
+# Install build and runtime dependencies
 COPY requirements/build.txt requirements/build.txt
 COPY use_existing_torch.py use_existing_torch.py
 RUN python3 use_existing_torch.py
@@ -294,33 +294,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system /wheels/xformers/*.whl --verbose

-# Build FlashInfer from source
-ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-
-# TODO(elainewy): remove this once vllm commit is updated, and install flashinfer from pip
-# see https://github.com/pytorch/pytorch/pull/165274#issuecomment-3408531784
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-ARG FLASHINFER_GIT_REF="v0.2.14.post1"
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer \
-    && echo "Building FlashInfer with AOT for arches: ${torch_cuda_arch_list}" \
-    && cd flashinfer \
-    && python3 -m flashinfer.aot \
-    && python3 -m build --no-isolation --wheel --outdir ../wheels/flashinfer \
-    && cd .. \
-    && rm -rf flashinfer
-
-# Install FlashInfer
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system wheels/flashinfer/*.whl --verbose
-
 # Logging to confirm the torch versions
-RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
-RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt
+RUN pip freeze | grep -E 'torch|xformers|vllm'
+RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm' > build_summary.txt
 ################### VLLM INSTALLED IMAGE ####################


@@ -331,4 +307,3 @@ FROM scratch as export-wheels
 COPY --from=base /workspace/xformers-dist /wheels/xformers
 COPY --from=build /workspace/vllm-dist /wheels/vllm
 COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt
-COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python
4 changes: 4 additions & 0 deletions .github/scripts/generate_binary_build_matrix.py
@@ -50,6 +50,7 @@

 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
     "12.6": (
+        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
         "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
         "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
@@ -67,6 +68,7 @@
         "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
     ),
     "12.8": (
+        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
         "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
         "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
@@ -84,6 +86,7 @@
         "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
     ),
     "12.9": (
+        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
         "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
         "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
@@ -101,6 +104,7 @@
         "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
     ),
     "13.0": (
+        "cuda-bindings==13.0.3; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
         "nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
         "nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
2 changes: 1 addition & 1 deletion .github/scripts/prepare_vllm_wheels.sh
@@ -88,7 +88,7 @@ repackage_wheel() {
 ${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1

 pushd externals/vllm/wheels
-for package in xformers flashinfer-python vllm; do
+for package in xformers vllm; do
   repackage_wheel $package
 done
 popd
5 changes: 4 additions & 1 deletion .github/workflows/_linux-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ jobs:
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
DOCKER_IMAGE_S390X: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
@@ -360,10 +361,12 @@
# if for some reason cleanup action doesn't stop container
# when job is cancelled
DOCKER_SHELL_CMD="sleep 12h"
USED_IMAGE="${DOCKER_IMAGE_S390X}"
else
SHM_OPTS="--shm-size=${SHM_SIZE}"
JENKINS_USER="--user jenkins"
DOCKER_SHELL_CMD=
USED_IMAGE="${DOCKER_IMAGE}"
fi

# detached container should get cleaned up by teardown_ec2_linux
@@ -426,7 +429,7 @@
${JENKINS_USER} \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
"${USED_IMAGE}" \
${DOCKER_SHELL_CMD}
)
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
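The hunks above make the workflow launch the container from `USED_IMAGE`, which is selected per architecture: s390x runners use the caller-supplied image and a long-lived `sleep`, other runners use the calculated image. A sketch of that branch with placeholder image names (not the workflow's real values):

```shell
#!/usr/bin/env bash
# Placeholder values; the real workflow fills these from step outputs/inputs.
DOCKER_IMAGE="registry.example/ci-image:calculated"   # hypothetical
DOCKER_IMAGE_S390X="registry.example/ci-image:s390x"  # hypothetical
RUNNER_ARCH="s390x"

if [ "$RUNNER_ARCH" = "s390x" ]; then
  # Long sleep so the container survives until teardown even if the job
  # is cancelled before the cleanup action runs.
  DOCKER_SHELL_CMD="sleep 12h"
  USED_IMAGE="$DOCKER_IMAGE_S390X"
else
  SHM_OPTS="--shm-size=2g"
  JENKINS_USER="--user jenkins"
  DOCKER_SHELL_CMD=""
  USED_IMAGE="$DOCKER_IMAGE"
fi

echo "$USED_IMAGE"
```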
8 changes: 4 additions & 4 deletions .github/workflows/attention_op_microbenchmark.yml
@@ -23,7 +23,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
@@ -39,7 +39,7 @@
needs: attn-microbenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
secrets: inherit
@@ -51,7 +51,7 @@
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
@@ -66,7 +66,7 @@
needs: opmicrobenchmark-build-b200
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
2 changes: 1 addition & 1 deletion .github/workflows/b200-distributed.yml
@@ -37,7 +37,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
runner: linux.r7i.4xlarge
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
2 changes: 1 addition & 1 deletion .github/workflows/b200-symm-mem.yml
@@ -37,7 +37,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
runner: linux.r7i.4xlarge
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
6 changes: 3 additions & 3 deletions .github/workflows/docker-builds.yml
@@ -52,8 +52,7 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.11-clang12,
@@ -75,7 +74,8 @@
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
pytorch-linux-jammy-py3-clang12-executorch,
# TODO: Re-enable me when docker pin update happens
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]