diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh
index 5bdb3f6cc67d4..75fdd6922a8bd 100644
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -223,6 +223,41 @@ __device__ __forceinline__ void fastAtomicAdd(
   }
 }
 
+
+#ifdef USE_ROCM
+// This function implements a committed store.
+// Upon returning, the store is committed to global memory.
+// This is useful in avoiding the need for fences.
+template <typename T>
+__device__ inline void cmtdStore(void* address, T value) {
+  int constexpr num_long_per_val = sizeof(value)/sizeof(long);
+  int constexpr num_int_per_val = sizeof(value)/sizeof(int);
+  int constexpr num_short_per_val = sizeof(value)/sizeof(short);
+  int constexpr num_char_per_val = sizeof(value)/sizeof(char);
+  union pnr { T v;
+              long l[num_long_per_val];
+              int i[num_int_per_val];
+              short s[num_short_per_val];
+              char c[num_char_per_val]; }
+      _pnr = {.v = value };
+  if constexpr (num_long_per_val*sizeof(long) == sizeof(value))
+    for (int i=0; i<num_long_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<long*>(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  else if constexpr (num_int_per_val*sizeof(int) == sizeof(value))
+    for (int i=0; i<num_int_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<int*>(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  else if constexpr (num_short_per_val*sizeof(short) == sizeof(value))
+    for (int i=0; i<num_short_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<short*>(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  else if constexpr (num_char_per_val*sizeof(char) == sizeof(value))
+    for (int i=0; i<num_char_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<char*>(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+  asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+#endif
+
 #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
 // This function implements warp-level opportunistic fastatomics
 // To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd.
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index e67adff4a4d98..9914ba3a01564 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -18,6 +18,7 @@
 #include <utility>
 
 #include <thrust/pair.h>
+#include <ATen/native/cuda/KernelUtils.cuh>
 
 namespace at::native {
 
@@ -796,22 +797,23 @@ struct ReduceOp {
     bool should_store = config.should_store(output_idx);
     if (should_store) {
       index_t offset = config.staging_memory_offset(blockIdx.y);
+#ifndef USE_ROCM
       reduce_buffer[offset] = value;
-#ifdef USE_ROCM
-      __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); // make sure writes are globally visible
+#else // [CMTSTRS]
+      // In architectures with split caches, global fences are costly.
+      // Here we preempt the need for fences by committing stores to global memory.
+      cmtdStore(&reduce_buffer[offset], value);
 #endif
     }
-#ifndef USE_ROCM
+#ifndef USE_ROCM // skip fence if stores are committed [CMTSTRS]
     __threadfence(); // make sure writes are globally visible
 #endif
     __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
 
     bool is_last_block_done = mark_block_finished();
 
     if (is_last_block_done) {
-#ifdef USE_ROCM
-      __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); // complete the acquire pattern after release
-#else
+#ifndef USE_ROCM // skip fence if stores are committed [CMTSTRS]
       __threadfence(); // complete the acquire pattern after atomic
 #endif
       for (auto &v : value) {
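
Reviewer note (not part of the patch): the sketch below is a self-contained HIP kernel that hand-inlines the 4-byte case of cmtdStore() for a single float, to make the Reduce.cuh hunks easier to review. The kernel name block_sum, the partial/ticket buffers and the 256-thread launch are illustrative assumptions, not code from this PR.

    // Illustrative sketch only: a simplified inter-block reduction that uses a
    // committed store for its staging slot instead of a plain store followed by
    // __threadfence(). Assumes blocks of 256 threads and a single launch.
    #include <hip/hip_runtime.h>

    __device__ unsigned int ticket = 0;  // counts blocks that finished staging

    __global__ void block_sum(const float* in, float* partial, float* out, int n) {
      __shared__ float smem[256];

      // Per-block tree reduction of a grid-stride slice into smem[0].
      float v = 0.f;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        v += in[i];
      smem[threadIdx.x] = v;
      __syncthreads();
      for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
      }

      if (threadIdx.x == 0) {
        // Committed store of the 4-byte partial result: a relaxed agent-scope
        // store, then s_waitcnt vmcnt(0) so this wavefront only proceeds once
        // the write has drained out to agent-visible memory. This stands in for
        //     partial[blockIdx.x] = smem[0];
        //     __threadfence();
        // just as cmtdStore() replaces the store + fence pair in Reduce.cuh.
        __hip_atomic_store(reinterpret_cast<int*>(&partial[blockIdx.x]),
                           __float_as_int(smem[0]),
                           __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
        __atomic_signal_fence(__ATOMIC_SEQ_CST);
        asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
        __atomic_signal_fence(__ATOMIC_SEQ_CST);

        // The last block to arrive sums everyone's partials. Because each
        // producer committed its store before its ticket increment, no acquire
        // fence is issued here, mirroring the #ifndef USE_ROCM branch above.
        if (atomicAdd(&ticket, 1u) == gridDim.x - 1) {
          float total = 0.f;
          for (unsigned int b = 0; b < gridDim.x; ++b)
            total += partial[b];
          *out = total;
        }
      }
    }

Launched as, say, block_sum<<<dim3(grid), dim3(256)>>>(in, partial, out, n), the sketch's point is the tradeoff the added comments describe: only the storing wavefront stalls on s_waitcnt, instead of paying for the agent-scope release/acquire fence pair the old USE_ROCM path used on split-cache GPUs.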