35 changes: 35 additions & 0 deletions aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -223,6 +223,41 @@ __device__ __forceinline__ void fastAtomicAdd(
}
}


#ifdef USE_ROCM
// This function implements a committed store.
// Upon returning, the store is committed to global memory.
// This is useful in avoiding the need for fences.
template <typename T>
__device__ inline void cmtdStore(void* address, T value) {
  int constexpr num_long_per_val = sizeof(value)/sizeof(long);
  int constexpr num_int_per_val = sizeof(value)/sizeof(int);
  int constexpr num_short_per_val = sizeof(value)/sizeof(short);
  int constexpr num_char_per_val = sizeof(value)/sizeof(char);
  union pnr { T v;
              long l[num_long_per_val];
              int i[num_int_per_val];
              short s[num_short_per_val];
              char c[num_char_per_val]; }
      _pnr = {.v = value };
  if constexpr (num_long_per_val*sizeof(long) == sizeof(value))
    for (int i=0; i<num_long_per_val; i++)
      __hip_atomic_store(reinterpret_cast<long *>(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_int_per_val*sizeof(int) == sizeof(value))
    for (int i=0; i<num_int_per_val; i++)
      __hip_atomic_store(reinterpret_cast<int *>(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_short_per_val*sizeof(short) == sizeof(value))
    for (int i=0; i<num_short_per_val; i++)
      __hip_atomic_store(reinterpret_cast<short *>(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_char_per_val*sizeof(char) == sizeof(value))
    for (int i=0; i<num_char_per_val; i++)
      __hip_atomic_store(reinterpret_cast<char *>(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
  asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
}
#endif

#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
// This function implements warp-level opportunistic fastatomics
// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd.
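For illustration only (this sketch is not part of the diff): a minimal example, assuming a ROCm build, of how a kernel could use the new cmtdStore helper to publish per-thread partial results without a trailing fence. The kernel and buffer names below are hypothetical.

#ifdef USE_ROCM
// Hypothetical kernel: each thread stages a float4 partial result.
// cmtdStore splits the 16-byte value into the widest integer type that evenly
// divides it (two 64-bit relaxed stores here), then waits on s_waitcnt vmcnt(0),
// so the data is committed to global memory when the call returns and no
// __threadfence()/release fence is needed before signaling other blocks.
__global__ void stage_partials(float4* staging, const float4* partials) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  cmtdStore(&staging[idx], partials[idx]);
  // ... follow with an atomic flag/semaphore update; no fence required first.
}
#endif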
14 changes: 8 additions & 6 deletions aten/src/ATen/native/cuda/Reduce.cuh
@@ -18,6 +18,7 @@
#include <thrust/pair.h>

#include <ATen/native/cuda/jit_utils.h>
#include <ATen/native/cuda/KernelUtils.cuh>

namespace at::native {

@@ -796,22 +797,23 @@ struct ReduceOp {
    bool should_store = config.should_store(output_idx);
    if (should_store) {
      index_t offset = config.staging_memory_offset(blockIdx.y);
+#ifndef USE_ROCM
      reduce_buffer[offset] = value;
-#ifdef USE_ROCM
-      __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); // make sure writes are globally visible
+#else // [CMTSTRS]
+      // In architectures with split caches, global fences are costly.
+      // Here we preempt need for fences by committing stores to global memory.
+      cmtdStore(&reduce_buffer[offset], value);
#endif
    }

-#ifndef USE_ROCM
+#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
    __threadfence(); // make sure writes are globally visible
#endif
    __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
    bool is_last_block_done = mark_block_finished();

    if (is_last_block_done) {
-#ifdef USE_ROCM
-      __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); // complete the acquire pattern after release
-#else
+#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
      __threadfence(); // complete the acquire pattern after atomic
#endif
      for (auto &v : value) {
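To make the intent of the Reduce.cuh change concrete, here is a simplified, illustrative sketch (not code from this PR; staging, semaphore, and n_blocks are hypothetical names) contrasting the fence-based release/acquire pattern on the generic path with the committed-store path tagged [CMTSTRS]:

// Illustrative only: each block publishes one float partial, and the last
// block to arrive combines all partials.
__device__ void publish_then_combine(float* staging, int* semaphore,
                                     float partial, int n_blocks) {
  if (threadIdx.x == 0) {
#ifdef USE_ROCM
    // ROCm path: the store is already committed to global memory on return,
    // so no release fence is needed before the semaphore update below.
    cmtdStore(&staging[blockIdx.x], partial);
#else
    // Generic path: plain store followed by an explicit release fence.
    staging[blockIdx.x] = partial;
    __threadfence();
#endif
  }
  __syncthreads();

  bool is_last_block = false;
  if (threadIdx.x == 0) {
    // The last block to increment the semaphore performs the final combine.
    is_last_block = (atomicAdd(semaphore, 1) == n_blocks - 1);
  }
  if (is_last_block) {
#ifndef USE_ROCM
    __threadfence();  // acquire side: make the other blocks' stores visible
#endif
    float total = 0;
    for (int i = 0; i < n_blocks; ++i) {
      total += staging[i];  // safe: every partial is already globally visible
    }
    // ... write `total` to the output ...
  }
}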