35 changes: 35 additions & 0 deletions aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -223,6 +223,41 @@ __device__ __forceinline__ void fastAtomicAdd(
}
}


#ifdef USE_ROCM
// This function implements a committed store.
// Upon returning, the store is committed to global memory.
// This is useful in avoiding the need for fences.
template <typename T>
__device__ inline void cmtdStore(void* address, T value) {
  int constexpr num_long_per_val = sizeof(value)/sizeof(long);
  int constexpr num_int_per_val = sizeof(value)/sizeof(int);
  int constexpr num_short_per_val = sizeof(value)/sizeof(short);
  int constexpr num_char_per_val = sizeof(value)/sizeof(char);
  union pnr { T v;
              long l[num_long_per_val];
              int i[num_int_per_val];
              short s[num_short_per_val];
              char c[num_char_per_val]; }
      _pnr = {.v = value };
  if constexpr (num_long_per_val*sizeof(long) == sizeof(value))
    for (int i=0; i<num_long_per_val; i++)
      __hip_atomic_store(reinterpret_cast<long *>(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_int_per_val*sizeof(int) == sizeof(value))
    for (int i=0; i<num_int_per_val; i++)
      __hip_atomic_store(reinterpret_cast<int *>(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_short_per_val*sizeof(short) == sizeof(value))
    for (int i=0; i<num_short_per_val; i++)
      __hip_atomic_store(reinterpret_cast<short *>(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_char_per_val*sizeof(char) == sizeof(value))
    for (int i=0; i<num_char_per_val; i++)
      __hip_atomic_store(reinterpret_cast<char *>(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
  asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
}
#endif

#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
// This function implements warp-level opportunistic fastatomics
// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd.
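For illustration only (this sketch is not part of the diff): a minimal example, assuming a ROCm build, of how a kernel could use the new cmtdStore helper to publish per-thread partial results without a trailing fence. The kernel and buffer names below are hypothetical.

#ifdef USE_ROCM
// Hypothetical kernel: each thread stages a float4 partial result.
// cmtdStore splits the 16-byte value into the widest integer type that evenly
// divides it (two 64-bit relaxed stores here), then waits on s_waitcnt vmcnt(0),
// so the data is committed to global memory when the call returns and no
// __threadfence()/release fence is needed before signaling other blocks.
__global__ void stage_partials(float4* staging, const float4* partials) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  cmtdStore(&staging[idx], partials[idx]);
  // ... follow with an atomic flag/semaphore update; no fence required first.
}
#endif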
14 changes: 8 additions & 6 deletions aten/src/ATen/native/cuda/Reduce.cuh
@@ -18,6 +18,7 @@
#include <thrust/pair.h>

#include <ATen/native/cuda/jit_utils.h>
#include <ATen/native/cuda/KernelUtils.cuh>

namespace at::native {

@@ -796,22 +797,23 @@ struct ReduceOp {
    bool should_store = config.should_store(output_idx);
    if (should_store) {
      index_t offset = config.staging_memory_offset(blockIdx.y);
+#ifndef USE_ROCM
      reduce_buffer[offset] = value;
-#ifdef USE_ROCM
-      __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); // make sure writes are globally visible
+#else // [CMTSTRS]
+      // In architectures with split caches, global fences are costly.
+      // Here we preempt need for fences by committing stores to global memory.
+      cmtdStore(&reduce_buffer[offset], value);
#endif
    }

-#ifndef USE_ROCM
+#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
    __threadfence(); // make sure writes are globally visible
#endif
    __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
    bool is_last_block_done = mark_block_finished();

    if (is_last_block_done) {
-#ifdef USE_ROCM
-      __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); // complete the acquire pattern after release
-#else
+#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
      __threadfence(); // complete the acquire pattern after atomic
#endif
      for (auto &v : value) {
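To make the intent of the Reduce.cuh change concrete, here is a simplified, illustrative sketch (not code from this PR; staging, semaphore, and n_blocks are hypothetical names) contrasting the fence-based release/acquire pattern on the generic path with the committed-store path tagged [CMTSTRS]:

// Illustrative only: each block publishes one float partial, and the last
// block to arrive combines all partials.
__device__ void publish_then_combine(float* staging, int* semaphore,
                                     float partial, int n_blocks) {
  if (threadIdx.x == 0) {
#ifdef USE_ROCM
    // ROCm path: the store is already committed to global memory on return,
    // so no release fence is needed before the semaphore update below.
    cmtdStore(&staging[blockIdx.x], partial);
#else
    // Generic path: plain store followed by an explicit release fence.
    staging[blockIdx.x] = partial;
    __threadfence();
#endif
  }
  __syncthreads();

  bool is_last_block = false;
  if (threadIdx.x == 0) {
    // The last block to increment the semaphore performs the final combine.
    is_last_block = (atomicAdd(semaphore, 1) == n_blocks - 1);
  }
  if (is_last_block) {
#ifndef USE_ROCM
    __threadfence();  // acquire side: make the other blocks' stores visible
#endif
    float total = 0;
    for (int i = 0; i < n_blocks; ++i) {
      total += staging[i];  // safe: every partial is already globally visible
    }
    // ... write `total` to the output ...
  }
}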