From 13e472a3bca6121446e9911e3b6f1970f9a64f90 Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Tue, 5 Aug 2025 15:48:46 +0000
Subject: [PATCH 1/8] Fix issues with merge conflicts

- Fix the tensorpipe branch for ROCm
- External/aotriton.cmake: remove use of __AOTRITON_VER_WITH_COMMIT
- macros/Export.h: remove TORCH_HIP_CPP_API/TORCH_HIP_API as CUDA ones get hipified and converted correctly (need to upstream this)
- CUDALoops.cuh: Bad merge
- Blas.cpp: remove MX patch
- cuda_vectorized_test.cu: remove ROCmloops specific test, this was removed in rocm7.0_internal_testing branch

I had incorrectly addressed the merge conflicts when merging with upstream
---
 .gitmodules                                |  3 +-
 aten/src/ATen/native/cuda/Blas.cpp         | 21 ++----------
 aten/src/ATen/native/cuda/CUDALoops.cuh    | 38 ----------------------
 aten/src/ATen/test/cuda_vectorized_test.cu | 17 ----------
 cmake/External/aotriton.cmake              |  2 +-
 third_party/tensorpipe                     |  2 +-
 torch/headeronly/macros/Export.h           | 16 +++------
 7 files changed, 10 insertions(+), 89 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 88a3fd091a881..74a0b360a8af0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -81,7 +81,8 @@
 [submodule "third_party/tensorpipe"]
 	ignore = dirty
 	path = third_party/tensorpipe
-	url = https://github.com/pytorch/tensorpipe.git
+	url = https://github.com/ROCm/tensorpipe.git
+	branch = tp_rocm_60
 [submodule "third_party/cudnn_frontend"]
 	path = third_party/cudnn_frontend
 	url = https://github.com/NVIDIA/cudnn-frontend.git
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index ef144e9d99c38..40d39b3c7b606 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -1205,7 +1205,6 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(
 } // namespace
 
-
 // Computes matrix multiply + bias while applying scaling to input and output matrices
 // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
 // If output matrix type is 16 or 32-bit type, scale_result is not applied.
@@ -1362,25 +1361,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
     else {
      TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn);
     }
-    // Until more than bf16 is supported
+    // Until more than bf16 is supported.
    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
-        "hipblaslt rowwise _scaled_mm only supports BFloat16 output");
-  }
-  else if (scaling_choice == ScalingType::BlockWise) {
-#if ROCM_VERSION >= 70000
-    TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-        "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
-
-    TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
-        mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
-        "Matrix dimensions must be multiples of 32 for block-wise scaling");
-
-    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
-        out.scalar_type() == ScalarType::Half,
-        "Block-wise scaling only supports BFloat16 or Half output types");
-#else
-    TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
-#endif
+        "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
   }
 #endif
 
diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh
index 3767fe1d9a88b..9b104a7966363 100644
--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@@ -531,44 +531,6 @@ __global__ void elementwise_kernel(int N, func_t f) {
   }
 }
 
-#ifdef USE_ROCM
-template <int nt, int vt, typename func_t>
-C10_LAUNCH_BOUNDS_2(nt, 4)
-__global__ void elementwise_kernel_manual_unroll(int N, func_t f) {
-  int tid = threadIdx.x;
-  int nv = nt * vt;
-  int idx = nv * blockIdx.x + tid;
-  if ((idx + nt*(vt-1)) < N) {
-    f(idx, true);
-  } else {
-#pragma unroll
-    for (int i = 0; i < vt; i++) {
-      if (idx < N) {
-        f(idx, false);
-        idx += nt;
-      }
-    }
-  }
-}
-
-template <int nt, int vt, typename func_t>
-C10_LAUNCH_BOUNDS_2(nt, 4)
-__global__ void elementwise_kernel_strided(int N, func_t f) {
-  int tid = threadIdx.x;
-  int idx = nt * vt * blockIdx.x + tid;
-  int step = nt * vt * gridDim.x;
-  while (idx < N) {
-#pragma unroll
-    for (int i = 0; i < vt; i++) {
-      if ((idx + nt * i) < N) {
-        f(idx + nt * i);
-      }
-    }
-    idx += step;
-  }
-}
-#endif
-
 template <int nt, int vt, typename func_t>
 static void launch_legacy_kernel(int64_t N, const func_t& f) {
   TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu
index 7ba7bcb99bce1..236753c94d37b 100644
--- a/aten/src/ATen/test/cuda_vectorized_test.cu
+++ b/aten/src/ATen/test/cuda_vectorized_test.cu
@@ -27,23 +27,6 @@ void reset_buffers() {
   }
 }
 
-#if defined(USE_ROCM) && !defined(_WIN32)
-TEST(TestLoops, HasSameArgTypes) {
-  // This is a compile-time unit test. If this file compiles without error,
-  // then the test passes and during runtime, we just need to return.
-  using namespace at::native::modern::detail;
-  using func1_t = int (*)(float, float);
-  using func2_t = int (*)(bool, float, float);
-  using func3_t = int (*)(float);
-  using func4_t = int (*)();
-  static_assert(has_same_arg_types<func1_t>::value, "func1_t has the same argument types");
-  static_assert(!has_same_arg_types<func2_t>::value, "func2_t does not have the same argument types");
-  static_assert(has_same_arg_types<func3_t>::value, "func3_t has the same argument types");
-  static_assert(has_same_arg_types<func4_t>::value, "func4_t has the same argument types");
-  return;
-}
-#endif
-
 TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
   char *ptr = reinterpret_cast<char *>(buffer1);
 
diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake
index 172521a592c3a..54564e42c9023 100644
--- a/cmake/External/aotriton.cmake
+++ b/cmake/External/aotriton.cmake
@@ -81,7 +81,7 @@ if(NOT __AOTRITON_INCLUDED)
     list(GET __AOTRITON_MANYLINUX_LIST ${__AOTRITON_ROCM_INDEX} __AOTRITON_MANYLINUX)
     set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
     string(CONCAT __AOTRITON_FILE "aotriton-"
-                  "${__AOTRITON_VER_WITH_COMMIT}-${__AOTRITON_MANYLINUX}"
+                  "${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}"
                   "_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}"
                   "-shared.tar.${__AOTRITON_Z}")
     string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/"  # @lint-ignore
diff --git a/third_party/tensorpipe b/third_party/tensorpipe
index 135ba25f6be99..c1dec474c59ec 160000
--- a/third_party/tensorpipe
+++ b/third_party/tensorpipe
@@ -1 +1 @@
-Subproject commit 135ba25f6be9991ebfe83d41d268d9c3d4cc5c5b
+Subproject commit c1dec474c59ec8cbb629cfe992977062de08b9bd
diff --git a/torch/headeronly/macros/Export.h b/torch/headeronly/macros/Export.h
index 8dd25419efb4e..a241439c86bca 100644
--- a/torch/headeronly/macros/Export.h
+++ b/torch/headeronly/macros/Export.h
@@ -100,10 +100,10 @@
 #define TORCH_API C10_IMPORT
 #endif
 
-// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API
+// You may be wondering why we have TORCH_HIP_CPP_API and TORCH_HIP_API
 // belonging to the same library instead of just one TORCH_CUDA_API. Well, it
-// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_CUDA_CPP_API
-// and TORCH_CUDA_CU_API are artifacts of when we needed a split build to
+// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_HIP_CPP_API
+// and TORCH_HIP_API are artifacts of when we needed a split build to
 // avoid relocation marker linking errors. The context is as follows:
 //
 // Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
@@ -130,14 +130,6 @@
 #define TORCH_CUDA_CU_API C10_IMPORT
 #endif
 
-#if defined(TORCH_HIP_BUILD_MAIN_LIB)
-#define TORCH_HIP_CPP_API C10_EXPORT
-#define TORCH_HIP_API C10_EXPORT
-#else
-#define TORCH_HIP_CPP_API C10_IMPORT
-#define TORCH_HIP_API C10_IMPORT
-#endif
-
 #if defined(TORCH_XPU_BUILD_MAIN_LIB)
 #define TORCH_XPU_API C10_EXPORT
 #else
@@ -145,7 +137,7 @@
 #endif
 
 // Enums only need to be exported on windows for non-CUDA files
-#if defined(_WIN32) && defined(__CUDACC__)
+#if defined(_WIN32) && defined(__HIPCC__)
 #define C10_API_ENUM C10_API
 #else
 #define C10_API_ENUM

From 7bd7047ad708ae240f538c3159fe2d5a5ab49a60 Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Wed, 6 Aug 2025 20:27:29 +0000
Subject: [PATCH 2/8] Update triton commit and related_commits files

- Update triton commit to point to ToT of release/internal/3.4.x
- Update related_commits file with ToT of respective repos
---
 .ci/docker/ci_commit_pins/triton.txt |  2 +-
 related_commits                      | 22 ++++++++++------------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
index 60c896b80c8f4..567536db72100 100644
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
-f7888497a1eb9e98d4c07537f0d0bcfe180d1363
+711e2a92522e0a9921ce58ae658571ca55c49b97
diff --git a/related_commits b/related_commits
index 95ebcf476dbce..71aeaf72071b9 100644
--- a/related_commits
+++ b/related_commits
@@ -1,12 +1,10 @@
-ubuntu|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
-centos|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
-ubuntu|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
-centos|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
-ubuntu|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
-centos|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
-ubuntu|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
-centos|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
-ubuntu|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
-centos|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
-ubuntu|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
-centos|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
+ubuntu|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
+centos|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
+ubuntu|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
+centos|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
+ubuntu|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
+centos|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
+ubuntu|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
+centos|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
+ubuntu|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao
+centos|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao

From 9025071507f9ca4ba1f9bfecf9c7630e0100930b Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Wed, 6 Aug 2025 21:28:44 +0000
Subject: [PATCH 3/8] Revert "CONSOLIDATED COMMITS: Enable tensorpipe with hip_basic backend"

This reverts commit 550bc776de086c3cdd8103da42d90574fc764223.
---
 .gitmodules                                         |  3 +-
 cmake/Dependencies.cmake                            | 12 ++----------
 third_party/tensorpipe                              |  2 +-
 torch/csrc/distributed/rpc/tensorpipe_cuda.cpp      |  6 +-----
 torch/testing/_internal/distributed/rpc/rpc_test.py |  1 -
 5 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 74a0b360a8af0..88a3fd091a881 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -81,8 +81,7 @@
 [submodule "third_party/tensorpipe"]
 	ignore = dirty
 	path = third_party/tensorpipe
-	url = https://github.com/ROCm/tensorpipe.git
-	branch = tp_rocm_60
+	url = https://github.com/pytorch/tensorpipe.git
 [submodule "third_party/cudnn_frontend"]
 	path = third_party/cudnn_frontend
 	url = https://github.com/NVIDIA/cudnn-frontend.git
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index a514924589a04..d11915fe43147 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1159,14 +1159,6 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
     set(TP_USE_CUDA ON CACHE BOOL "" FORCE)
     set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
   endif()
-  if(USE_ROCM)
-    add_compile_options(-D__HIP_PLATFORM_AMD__=1)
-    set(TP_USE_ROCM ON CACHE BOOL "" FORCE)
-    set(TP_ENABLE_HIP_IPC OFF CACHE BOOL "" FORCE)
-    set(TP_ENABLE_HIP_XTH OFF CACHE BOOL "" FORCE)
-    set(TP_ENABLE_HIP_GDR OFF CACHE BOOL "" FORCE)
-    set(TP_ENABLE_IBV OFF CACHE BOOL "" FORCE)
-  endif()
   set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
   add_compile_options(-DTORCH_USE_LIBUV)
   include_directories(BEFORE SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/tensorpipe/third_party/libuv/include)
@@ -1192,9 +1184,9 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
     if(USE_CUDA)
       list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda)
     elseif(USE_ROCM)
-      message(WARNING "TensorPipe is supported on ROCm")
+      message(WARNING "TensorPipe doesn't yet support ROCm") # Not yet...
-      list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
+      # list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
     endif()
   endif()
 endif()
diff --git a/third_party/tensorpipe b/third_party/tensorpipe
index c1dec474c59ec..dacda0567d9f2 160000
--- a/third_party/tensorpipe
+++ b/third_party/tensorpipe
@@ -1 +1 @@
-Subproject commit c1dec474c59ec8cbb629cfe992977062de08b9bd
+Subproject commit dacda0567d9f23d4bc503e1c4f84aa65f33ac38a
diff --git a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp
index 97fb18cac83bb..03b43184d143b 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp
@@ -1,7 +1,7 @@
 #include
 #include
 
-#if defined(USE_TENSORPIPE)
+#if defined(USE_TENSORPIPE) && !defined(USE_ROCM)
 
 #include
 #include
@@ -50,8 +50,6 @@ C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_gdr, makeCudaGdrChannel)
 
 #endif
 
-#if TENSORPIPE_HAS_CUDA_XTH_CHANNEL
-
 std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
   auto context = tensorpipe::channel::cuda_xth::create();
   return std::make_unique<ChannelRegistration>(
@@ -61,8 +59,6 @@ std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
 // The cuda_xth channel supports same-process GPU-to-GPU comm
 C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_xth, makeCudaXthChannel)
 
-#endif
-
 std::unique_ptr<ChannelRegistration> makeCudaBasicChannel() {
   auto context = tensorpipe::channel::cuda_basic::create(
       tensorpipe::channel::basic::create());
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index ea7c0d125c82c..631a32bb4fcdf 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -37,7 +37,6 @@
     captured_output,
     skip_if_lt_x_gpu,
     tp_transports,
-    skip_if_rocm,
 )
 from torch.testing._internal.common_utils import (
     get_cycles_per_ms,

From 301dc4677dd48bce4043953cb39e6f30db2a63de Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Fri, 8 Aug 2025 16:05:41 +0000
Subject: [PATCH 4/8] Export.h: dehipify this file

---
 torch/headeronly/macros/Export.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch/headeronly/macros/Export.h b/torch/headeronly/macros/Export.h
index a241439c86bca..2222baedb4b1b 100644
--- a/torch/headeronly/macros/Export.h
+++ b/torch/headeronly/macros/Export.h
@@ -100,10 +100,10 @@
 #define TORCH_API C10_IMPORT
 #endif
 
-// You may be wondering why we have TORCH_HIP_CPP_API and TORCH_HIP_API
+// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_API
 // belonging to the same library instead of just one TORCH_CUDA_API. Well, it
-// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_HIP_CPP_API
-// and TORCH_HIP_API are artifacts of when we needed a split build to
+// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_CUDA_CPP_API
+// and TORCH_CUDA_API are artifacts of when we needed a split build to
 // avoid relocation marker linking errors. The context is as follows:
 //
 // Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
@@ -137,7 +137,7 @@
 #endif
 
 // Enums only need to be exported on windows for non-CUDA files
-#if defined(_WIN32) && defined(__HIPCC__)
+#if defined(_WIN32) && defined(__CUDACC__)
 #define C10_API_ENUM C10_API
 #else
 #define C10_API_ENUM

From 9ce6b1a1254bbd7630eb5773d23bb36d631f5db7 Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Fri, 8 Aug 2025 16:58:01 +0000
Subject: [PATCH 5/8] Update requirements-ci with upstream and rocm/release/2.8

---
 .ci/docker/requirements-ci.txt | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index feac690887a46..5cdcaa3d81a33 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -110,10 +110,8 @@ ninja==1.11.1.3
 #Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 
-numba==0.49.0 ; python_version < "3.9"
-numba==0.55.2 ; python_version == "3.9"
-numba==0.55.2 ; python_version == "3.10"
-numba==0.60.0 ; python_version == "3.12"
+numba==0.60.0 ; python_version == "3.9"
+numba==0.61.2 ; python_version > "3.9"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
@@ -135,6 +133,7 @@ numpy==2.0.2 ; python_version == "3.9"
 numpy==2.1.2 ; python_version > "3.9"
 
 pandas==2.2.3
+
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
 #Pinned versions: 1.9.0
@@ -163,10 +162,11 @@ pillow==11.0.0
 #Pinned versions: 10.3.0
 #test that import:
 
-protobuf==5.29.4
-#Description: Google's data interchange format
-#Pinned versions: 5.29.4
-#test that import: test_tensorboard.py, test/onnx/*
+protobuf==3.20.2 ; python_version <= "3.12"
+protobuf==4.25.1 ; python_version == "3.13"
+#Description: Google’s data interchange format
+#Pinned versions: 3.20.1
+#test that import: test_tensorboard.py
 
 psutil
 #Description: information on running processes and system utilization
 #Pinned versions:
@@ -263,11 +263,6 @@ tb-nightly==2.13.0a20230426
 #Pinned versions:
 #test that import:
 
-tlparse==0.3.30
-#Description: parse logs produced by torch.compile
-#Pinned versions:
-#test that import: dynamo/test_structured_trace.py
-
 # needed by torchgen utils
 typing-extensions>=4.10.0
 #Description: type hints for python
@@ -309,7 +304,7 @@ pytest-cpp==2.3.0
 #Pinned versions: 2.3.0
 #test that import:
 
-z3-solver==4.15.1.0
+z3-solver==4.12.6.0
 #Description: The Z3 Theorem Prover Project
 #Pinned versions:
 #test that import:
@@ -326,7 +321,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:
 
-lxml==5.3.0
+lxml==5.3.0 ; python_version <= "3.12"
+lxml==6.0.0 ; python_version == "3.13"
 #Description: This is a requirement of unittest-xml-reporting
 
 # Python-3.9 binaries
@@ -338,8 +334,9 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:
 
-onnx==1.18.0
-#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
+onnx==1.16.1 ; python_version <= "3.12"
+onnx==1.18.0 ; python_version == "3.13"
+#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:

From 45f8d8e47f2355e2092954447e24221de11cbbf9 Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Fri, 8 Aug 2025 21:16:15 +0000
Subject: [PATCH 6/8] Use newer versions from upstream

---
 .ci/docker/requirements-ci.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 5cdcaa3d81a33..5d32a375ca7d7 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -168,6 +168,7 @@ protobuf==4.25.1 ; python_version == "3.13"
 #Pinned versions: 3.20.1
 #test that import: test_tensorboard.py
 
+
 psutil
 #Description: information on running processes and system utilization
 #Pinned versions:
@@ -304,7 +305,7 @@ pytest-cpp==2.3.0
 #Pinned versions: 2.3.0
 #test that import:
 
-z3-solver==4.12.6.0
+z3-solver==4.15.1.0
 #Description: The Z3 Theorem Prover Project
 #Pinned versions:
 #test that import:
@@ -334,9 +335,9 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:
 
-onnx==1.16.1 ; python_version <= "3.12"
-onnx==1.18.0 ; python_version == "3.13"
-#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
+onnx==1.18.0
+#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
+
 #Pinned versions:
 #test that import:

From 44451cdaf470d4de5e4e38fd5b3afcec9350565c Mon Sep 17 00:00:00 2001
From: Prachi Gupta
Date: Fri, 8 Aug 2025 21:17:44 +0000
Subject: [PATCH 7/8] Fix protobuf version

---
 .ci/docker/requirements-ci.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 5d32a375ca7d7..9a7747095710a 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -162,10 +162,9 @@ pillow==11.0.0
 #Pinned versions: 10.3.0
 #test that import:
 
-protobuf==3.20.2 ; python_version <= "3.12"
-protobuf==4.25.1 ; python_version == "3.13"
+protobuf==5.29.4
 #Description: Google’s data interchange format
-#Pinned versions: 3.20.1
+#Pinned versions: 5.29.4
 #test that import: test_tensorboard.py
 

From 1ee30ac39628b611d22a8fdf331e07b4a697ec5a Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Fri, 8 Aug 2025 16:20:29 -0500
Subject: [PATCH 8/8] typo

---
 .ci/docker/requirements-ci.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 9a7747095710a..f12be92975054 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -163,9 +163,9 @@ pillow==11.0.0
 #test that import:
 
 protobuf==5.29.4
-#Description: Google’s data interchange format
+#Description: Google's data interchange format
 #Pinned versions: 5.29.4
-#test that import: test_tensorboard.py
+#test that import: test_tensorboard.py, test/onnx/*
 
 psutil