Skip to content

Commit

Permalink
Set the CUDA dot kernel to use a number of blocks derived from device properties
Browse files Browse the repository at this point in the history
This aligns with the approach implemented in the other programming models (SYCL 1.2.1 and HIP).

Cherry-picks the CUDA updates from lmeadows in #122
  • Loading branch information
tomdeakin committed Oct 6, 2023
1 parent 92fed70 commit 9954b7d
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 11 deletions.
26 changes: 16 additions & 10 deletions src/cuda/CUDAStream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,22 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
#endif
array_size = ARRAY_SIZE;


// Query device for sensible dot kernel block count
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_index);
check_error();
dot_num_blocks = props.multiProcessorCount * 4;

// Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
sums = (T*)malloc(sizeof(T) * dot_num_blocks);

size_t array_bytes = sizeof(T);
array_bytes *= ARRAY_SIZE;
size_t total_bytes = array_bytes * 3;
size_t total_bytes = array_bytes * 4;
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;

// Check buffers fit on the device
cudaDeviceProp props;
cudaGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < total_bytes)
throw std::runtime_error("Device does not have enough memory for all 3 buffers");

Expand All @@ -72,21 +78,21 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
check_error();
cudaMallocManaged(&d_c, array_bytes);
check_error();
cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T));
check_error();
#elif defined(PAGEFAULT)
d_a = (T*)malloc(array_bytes);
d_b = (T*)malloc(array_bytes);
d_c = (T*)malloc(array_bytes);
d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS);
d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
#else
cudaMalloc(&d_a, array_bytes);
check_error();
cudaMalloc(&d_b, array_bytes);
check_error();
cudaMalloc(&d_c, array_bytes);
check_error();
cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
cudaMalloc(&d_sum, dot_num_blocks*sizeof(T));
check_error();
#endif
}
Expand Down Expand Up @@ -267,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
template <class T>
T CUDAStream<T>::dot()
{
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
dot_kernel<<<dot_num_blocks, TBSIZE>>>(d_a, d_b, d_sum, array_size);
check_error();

#if defined(MANAGED) || defined(PAGEFAULT)
cudaDeviceSynchronize();
check_error();
#else
cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost);
cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
#endif

T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
for (int i = 0; i < dot_num_blocks; i++)
{
#if defined(MANAGED) || defined(PAGEFAULT)
sum += d_sum[i];
Expand Down
3 changes: 2 additions & 1 deletion src/cuda/CUDAStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#define IMPLEMENTATION_STRING "CUDA"

#define TBSIZE 1024
#define DOT_NUM_BLOCKS 1024

template <class T>
class CUDAStream : public Stream<T>
Expand All @@ -34,6 +33,8 @@ class CUDAStream : public Stream<T>
T *d_c;
T *d_sum;

// Number of blocks for dot kernel
int dot_num_blocks;

public:

Expand Down

0 comments on commit 9954b7d

Please sign in to comment.