From b3ac1f4ede76903139e2026f0bb62537c3e383d8 Mon Sep 17 00:00:00 2001 From: Xinya Zhang Date: Wed, 2 Jul 2025 13:42:26 -0500 Subject: [PATCH 1/3] Enable FA/ME UT on gfx950 for ROCM >= 6.5 --- torch/testing/_internal/common_cuda.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 6fc5b1bd55b90..6394a85392cd7 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -48,6 +48,9 @@ def CDNA2OrLater(): def evaluate_platform_supports_flash_attention(): if TEST_WITH_ROCM: arch_list = ["gfx90a", "gfx942", "gfx1100"] + version = _get_torch_rocm_version() + if version >= (6, 5): + arch_list += ["gfx950"] return evaluate_gfx_arch_within(arch_list) if TEST_CUDA: return not IS_WINDOWS and SM80OrLater @@ -56,6 +59,9 @@ def evaluate_platform_supports_flash_attention(): def evaluate_platform_supports_efficient_attention(): if TEST_WITH_ROCM: arch_list = ["gfx90a", "gfx942", "gfx1100"] + version = _get_torch_rocm_version() + if version >= (6, 5): + arch_list += ["gfx950"] return evaluate_gfx_arch_within(arch_list) if TEST_CUDA: return True From d3450a16c42eb1b664d2236569dfc7ae3f217552 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Fri, 20 Jun 2025 21:19:59 +0000 Subject: [PATCH 2/3] remove warpSize usage on host side --- aten/src/ATen/native/cuda/SoftMax.cu | 11 ++++++----- aten/src/ATen/native/cuda/TensorTopK.cu | 4 ++++ aten/src/ATen/native/cuda/block_reduce.cuh | 6 ++++++ c10/macros/Macros.h | 16 +++++++++++++++- third_party/composable_kernel | 2 +- .../csrc/distributed/c10d/CUDASymmetricMemory.cu | 6 +++--- .../distributed/c10d/CUDASymmetricMemoryOps.cu | 2 +- 7 files changed, 36 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 9273f31ce4116..f3d62316eeb93 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -177,15 +177,16 @@ inline dim3 SoftMaxForward_getBlockSize(uint64_t dim_size) { uint64_t block_size = 1; uint64_t max_block_size = std::min(dim_size, static_cast(max_threads)); - // We need a block size that is a multiple of C10_WARP_SIZE in order + // We need a block size that is a multiple of at::cuda::warp_size() in order // to perform block size reductions using warp shuffle instructions. // Since max_threads is also a multiple of C10_WARPS_SIZE we do not // risk creating a block size larger than the limit. 
- if (max_block_size % C10_WARP_SIZE == 0) { + int warp_size = at::cuda::warp_size(); + if (max_block_size % warp_size == 0) { block_size = max_block_size; } else { - block_size = (max_block_size / C10_WARP_SIZE + 1) * C10_WARP_SIZE; + block_size = (max_block_size / warp_size + 1) * warp_size; } return dim3(block_size); @@ -859,7 +860,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); @@ -895,7 +896,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index 103b360bcb868..49086c42cd4a2 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -439,8 +439,12 @@ __global__ void computeBlockwiseWithinKCounts( warp_counts[warp] = count; } __syncthreads(); +#ifdef USE_ROCM + CUDA_KERNEL_ASSERT(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE); +#else static_assert(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE, "Assuming only 1 warp is needed for final reduction"); +#endif if (warp != 0) { return; } diff --git a/aten/src/ATen/native/cuda/block_reduce.cuh b/aten/src/ATen/native/cuda/block_reduce.cuh index df757a11761bb..112c6ab952574 100644 --- a/aten/src/ATen/native/cuda/block_reduce.cuh +++ b/aten/src/ATen/native/cuda/block_reduce.cuh @@ -14,7 +14,13 @@ constexpr int kCUDABlockReduceNumThreads = 512; // of which reduces C10_WARP_SIZE elements. So, at most // C10_WARP_SIZE**2 elements can be reduced at a time. // NOTE: This is >= the max block size on current hardware anyway (1024). +// ROCm NOTE: C10_WARP_SIZE should only be used inside device functions, +// and kCUDABlockReduceMaxThreads is a host-side variable. +#ifdef USE_ROCM +static const int kCUDABlockReduceMaxThreads = at::cuda::warp_size() * at::cuda::warp_size(); +#else constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; +#endif // Sums `val` across all threads in a warp. // diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index ab6f2b38cf6be..6434d6bad105a 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -330,7 +330,21 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #endif #if defined(USE_ROCM) -#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) +// C10_WARP_SIZE is only allowed for device code. +// Host code _must_ use at::cuda::warp_size() +// HIP header used to define warpSize as a constexpr that was either 32 or 64 +// depending on the target device, and then always set it to 64 for host code. +// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we +// set it to something unreasonable to trigger obvious host code errors. 
+#if defined(__HIP_DEVICE_COMPILE__) +#if defined(__GFX9__) +static constexpr int C10_WARP_SIZE = 64; +#else // __GFX9__ +static constexpr int C10_WARP_SIZE = 32; +#endif // __GFX9__ +#else +static constexpr int C10_WARP_SIZE = 1; +#endif // __HIP_DEVICE_COMPILE__ #else #define C10_WARP_SIZE 32 #endif diff --git a/third_party/composable_kernel b/third_party/composable_kernel index 8086bbe3a78d9..df6023e305f38 160000 --- a/third_party/composable_kernel +++ b/third_party/composable_kernel @@ -1 +1 @@ -Subproject commit 8086bbe3a78d931eb96fe12fdc014082e18d18d3 +Subproject commit df6023e305f389bbf7249b0c4414e649f3ad6598 diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu index e297d85f0d555..a7ab6891d0c62 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu @@ -436,7 +436,7 @@ static __global__ void barrier_kernel( void CUDASymmetricMemory::barrier(int channel) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - barrier_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + barrier_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), channel, rank_, @@ -458,7 +458,7 @@ static __global__ void put_signal_kernel( void CUDASymmetricMemory::put_signal(int dst_rank, int channel) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - put_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + put_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), dst_rank, channel, @@ -482,7 +482,7 @@ static __global__ void wait_signal_kernel( void CUDASymmetricMemory::wait_signal(int src_rank, int channel) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - wait_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + wait_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), src_rank, channel, diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu index cedcca2c97612..e5342bab4cd70 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu +++ b/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu @@ -64,7 +64,7 @@ void init_elementwise_launch_config( num_blocks = 1; num_threads = at::round_up( at::ceil_div(numel_per_split, numel_per_thread), - static_cast(C10_WARP_SIZE)); + static_cast(at::cuda::warp_size())); } else { num_blocks = std::min( at::ceil_div( From 3c1c9e87f8386753925a7f67dbcc852071f8aa93 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Wed, 25 Jun 2025 15:10:51 -0700 Subject: [PATCH 3/3] [rocm7.0_internal_testing] Prevent static initialization of at::cuda::warp_size() (#2293) Fixes SWDEV-540240, SWDEV-540309, SWDEV-539989 ``` ... ``` https://github.com/ROCm/pytorch/commit/80cca7006d94df97ee932fd5903ed20c08c2eb34 created a static global variable that used `at::cuda::warp_size()` to initialize its value, which needs GPUs to be visible to query device properties. However, GPUs are not present on CPU-only build systems. Convert static variable into a static function, thus preventing static initialization. 
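For reference, the before/after pattern in `aten/src/ATen/native/cuda/block_reduce.cuh` (condensed from the hunk below; callers switch from `kCUDABlockReduceMaxThreads` to `kCUDABlockReduceMaxThreads()`):

```cpp
// Before: evaluated during static initialization, which needs a visible GPU
// to query device properties -- this fails on CPU-only build machines.
static const int kCUDABlockReduceMaxThreads =
    at::cuda::warp_size() * at::cuda::warp_size();

// After: evaluated lazily at the call site, so no GPU is required at load time.
static int kCUDABlockReduceMaxThreads() {
  return at::cuda::warp_size() * at::cuda::warp_size();
}
```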
http://rocm-ci.amd.com/job/pyt_whl_docker_mainline/1461/artifact/build_artifacts.txt/*view*/ Ran microbenchmark to confirm basic functionality: ``` root@ubb4-rack-22:/var/lib/jenkins/pytorch-micro-benchmarking# python3 micro_benchmarking_pytorch.py --network resnet50 INFO: running forward and backward for warmup. INFO: running the benchmark.. OK: finished running benchmark.. --------------------SUMMARY-------------------------- Microbenchmark for network : resnet50 Num devices: 1 Dtype: FP32 Mini batch size [img] : 64 Time per mini-batch : 0.10158218145370483 Throughput [img/sec] : 630.0317544289736= ``` --- aten/src/ATen/native/cuda/Embedding.cu | 2 +- aten/src/ATen/native/cuda/MultinomialKernel.cu | 2 +- aten/src/ATen/native/cuda/TensorModeKernel.cu | 2 +- aten/src/ATen/native/cuda/block_reduce.cuh | 8 ++++++-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index b8fb51304e4b0..5a02d199ed6b0 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -369,7 +369,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 && - num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads, + num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(), "BlockReduceSum requires all warps be active"); const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr(); dim3 grid = unique_indices.numel(); diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index 65770e40a8b2b..8132e7df57b51 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -86,7 +86,7 @@ void renormRows(Tensor& t) { TORCH_CHECK(props != nullptr); int numSM = props->multiProcessorCount; const int64_t maxThreads = std::min( - props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads); + props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads()); int warp_size = at::cuda::warp_size(); dim3 grid(rows < numSM * 4 ? rows : numSM * 4); diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cu b/aten/src/ATen/native/cuda/TensorModeKernel.cu index 4764b078c050b..0c97ab742103f 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cu +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cu @@ -207,7 +207,7 @@ void handle_fused_mode( constexpr int num_threads = size / 2; int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads % warp_size == 0 && - num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, ""); + num_threads <= cuda_utils::kCUDABlockReduceMaxThreads(), ""); const auto memsize = (sizeof(scalar_t) * size) + (2 * size * sizeof(unsigned int)); compute_mode diff --git a/aten/src/ATen/native/cuda/block_reduce.cuh b/aten/src/ATen/native/cuda/block_reduce.cuh index 112c6ab952574..fc44b0f4a9da0 100644 --- a/aten/src/ATen/native/cuda/block_reduce.cuh +++ b/aten/src/ATen/native/cuda/block_reduce.cuh @@ -17,9 +17,13 @@ constexpr int kCUDABlockReduceNumThreads = 512; // ROCm NOTE: C10_WARP_SIZE should only be used inside device functions, // and kCUDABlockReduceMaxThreads is a host-side variable. 
#ifdef USE_ROCM -static const int kCUDABlockReduceMaxThreads = at::cuda::warp_size() * at::cuda::warp_size(); +static int kCUDABlockReduceMaxThreads() { + return at::cuda::warp_size() * at::cuda::warp_size(); +} #else -constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; +constexpr int kCUDABlockReduceMaxThreads() { + return C10_WARP_SIZE * C10_WARP_SIZE; +} #endif // Sums `val` across all threads in a warp.
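Usage sketch (not part of the patches above): under the Macros.h policy introduced in patch 2, host code must query the warp size at runtime via at::cuda::warp_size(), while device code may keep using the C10_WARP_SIZE constant. The kernel and launcher names below are hypothetical and only illustrate the host/device split.

```cpp
#include <ATen/cuda/CUDAContext.h>

__global__ void fill_first_warp(float* out) {
  // Device code: C10_WARP_SIZE is still a valid compile-time constant here.
  if (threadIdx.x < C10_WARP_SIZE) {
    out[threadIdx.x] = 1.0f;
  }
}

void launch_fill_first_warp(float* out) {
  // Host code: query the current device's warp size at runtime instead of
  // relying on C10_WARP_SIZE, which is deliberately unusable on the host.
  int warp_size = at::cuda::warp_size();
  fill_first_warp<<<1, warp_size, 0, at::cuda::getCurrentCUDAStream()>>>(out);
}
```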