
Add kernel launch helper utility (+saxpy demo) and merge in octotiger changes

Add a demo that computes saxpy on the GPU and launches it using
the cudaLaunchKernel function via the stream helper.

Tested for compatibility with latest Octotiger at Hackathon.
biddisco committed Oct 6, 2019
1 parent 708072c commit 8300df46456e18fe10c749863e2aa1912fbac03b
@@ -37,7 +37,7 @@ if(HPX_WITH_CUDA)
set(data_copy_CUDA ON)
set(hello_compute_CUDA ON)
set(cuda_future_CUDA OFF)
-set(cuda_future_CUDA_SOURCE trivial_demo)
+set(cuda_future_CUDA_SOURCE saxpy trivial_demo)
set(partitioned_vector_CUDA ON)
set(partitioned_vector_FLAGS COMPONENT_DEPENDENCIES partitioned_vector)
endif()
@@ -52,8 +52,10 @@ foreach(example_program ${example_programs})
endif()

if(${example_program}_CUDA_SOURCE)
-set(sources
-${sources} ${${example_program}_CUDA_SOURCE}.cu)
+foreach(src ${${example_program}_CUDA_SOURCE})
+set(sources
+${sources} ${src}.cu)
+endforeach()
endif()

source_group("Source Files" FILES ${sources})
@@ -38,10 +38,75 @@
template <typename T>
extern void cuda_trivial_kernel(T, cudaStream_t stream);

extern __global__
void saxpy(int n, float a, float *x, float *y);
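// (the kernel definition lives in saxpy.cu; this extern declaration lets the
// host code take the kernel's address to pass to cudaLaunchKernel)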

// -------------------------------------------------------------------------
int test_saxpy(hpx::compute::util::cuda_future_helper &helper)
{
int N = 1 << 20;

// host arrays
std::vector<float> h_A(N);
std::vector<float> h_B(N);

float *d_A, *d_B;
hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_A, N*sizeof(float)));

hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_B, N*sizeof(float)));

// init host data
for (int idx = 0; idx < N; idx++)
{
h_A[idx] = 1.0f;
h_B[idx] = 2.0f;
}

// copy both arrays from the CPU to the GPU, putting both copies onto the
// stream; no need to get a future back yet
helper.memcpy_apply(
d_A, h_A.data(), N*sizeof(float), cudaMemcpyHostToDevice);
helper.memcpy_apply(
d_B, h_B.data(), N*sizeof(float), cudaMemcpyHostToDevice);

unsigned int threads = 256;
unsigned int blocks = (N + threads - 1) / threads;
float ratio = 2.0f;

// now launch a kernel on the stream
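// cudaLaunchKernel takes the kernel arguments as an array of pointers, one
// per kernel parameter, in declaration order (int n, float a, float *x, float *y)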
void *args[] = { &N, &ratio, &d_A, &d_B };
helper.device_launch_apply(&saxpy, dim3(blocks), dim3(threads), args, 0);

// finally, perform a copy from the GPU back to the CPU, all on the same stream,
// and grab a future that becomes ready when this completes
auto cuda_future = helper.memcpy_async(h_B.data(), d_B,
N * sizeof(float), cudaMemcpyDeviceToHost);

// we can add a continuation to the memcpy future, so that when the
// memory copy completes, we can do new things ...
cuda_future.then([&](hpx::future<void> &&f){
std::cout << "saxpy completed on GPU, checking results" << std::endl;
float max_error = 0.0f;
for (int jdx = 0; jdx < N; jdx++)
{
max_error = (std::max)(max_error, std::abs(h_B[jdx] - 4.0f));
}
std::cout << "Max Error: " << max_error << std::endl;
}).get();

// the .get() in the line above is important: without it, this function
// returns, the task above goes out of scope, and the references it holds
// are invalidated.

return 0;
}
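// -------------------------------------------------------------------------
// For reference - a sketch, not part of this commit - the helper calls above
// boil down to roughly this sequence of plain CUDA runtime calls on a raw
// cudaStream_t:
//
//   cudaMemcpyAsync(d_A, h_A.data(), N*sizeof(float),
//       cudaMemcpyHostToDevice, stream);
//   cudaMemcpyAsync(d_B, h_B.data(), N*sizeof(float),
//       cudaMemcpyHostToDevice, stream);
//   void *args[] = { &N, &ratio, &d_A, &d_B };
//   cudaLaunchKernel(reinterpret_cast<void const*>(&saxpy),
//       dim3(blocks), dim3(threads), args, 0, stream);
//   cudaMemcpyAsync(h_B.data(), d_B, N*sizeof(float),
//       cudaMemcpyDeviceToHost, stream);
//
// with the final memcpy's completion surfaced as an hpx::future rather than
// a blocking cudaStreamSynchronize(stream).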

// -------------------------------------------------------------------------
int hpx_main(hpx::program_options::variables_map& vm)
{
std::size_t device = vm["device"].as<std::size_t>();
//
unsigned int seed = (unsigned int)std::time(nullptr);
if (vm.count("seed"))
@@ -66,7 +131,9 @@ int hpx_main(hpx::program_options::variables_map& vm)
auto f = helper.async(fn, d);
f.then([](hpx::future<void> &&f) {
std::cout << "trivial kernel completed \n";
-});
+}).get();

test_saxpy(helper);

return hpx::finalize();
}
@@ -177,10 +177,49 @@ namespace hpx { namespace compute { namespace util
cuda_future_helper(const cuda_future_helper& other) = delete;
cuda_future_helper operator=(const cuda_future_helper& other) = delete;

// -------------------------------------------------------------------------
// launch a kernel on our stream - this does not require a C++-wrapped
// invoke call of the CUDA kernel, but must be called with the args that would
// otherwise be passed to cudaLaunchKernel - minus the stream arg, which
// the helper class will provide. This function does not return a future.
// Typically, one must pass ...
// const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem
template <typename R, typename... Params, typename... Args>
R device_launch_apply(R(*cuda_kernel)(Params...), Args &&... args)
{
// make sure we run on the correct device
cuda_error(cudaSetDevice(target_.native_handle().get_device()));
// launch the kernel directly on the GPU
cuda_error(
cudaLaunchKernel(reinterpret_cast<void const*>(cuda_kernel),
std::forward<Args>(args)..., stream_));
}
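// Example usage (a sketch reusing the saxpy demo's names): pass the typed
// kernel pointer first, then exactly the cudaLaunchKernel arguments minus
// the stream:
//   void *args[] = { &N, &ratio, &d_A, &d_B };
//   helper.device_launch_apply(&saxpy, dim3(blocks), dim3(threads), args, 0);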

// -------------------------------------------------------------------------
// launch a kernel on our stream - this does not require a C++-wrapped
// invoke call of the CUDA kernel, but must be called with the args that would
// otherwise be passed to cudaLaunchKernel - minus the stream arg, which
// the helper class will provide.
// This function returns a future that will become ready when the task
// completes, allowing integration of GPU kernels with
// hpx::futures and the tasking DAG.
//
// Typically, for cudaLaunchKernel one must pass ...
// const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem
template <typename... Args>
hpx::future<void> device_launch_async(Args&&... args)
{
// make sure we run on the correct device
cuda_error(cudaSetDevice(target_.native_handle().get_device()));
// launch the kernel directly on the GPU
cuda_error(cudaLaunchKernel(std::forward<Args>(args)..., stream_));
return get_future();
}
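// Note (a sketch of the mechanism, not necessarily this class's exact code):
// get_future() is what bridges the stream and HPX. A typical implementation
// makes an hpx::promise ready once everything queued on stream_ so far has
// completed, e.g. by registering a callback such as
//   cudaStreamAddCallback(stream_, stream_callback, promise_ptr, 0);
// or by recording a cudaEvent_t and polling it from an HPX task.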

// -------------------------------------------------------------------------
// launch a kernel on our stream and return a future that will become ready
// when the task completes; this allows integration of GPU kernels with
-// hpx::futuresa and the tasking DAG.
+// hpx::futures and the tasking DAG.
template <typename R, typename... Params, typename... Args>
hpx::future<void> async(R(*cuda_kernel)(Params...), Args &&... args) {
// make sure we run on the correct device
@@ -215,35 +254,35 @@ namespace hpx { namespace compute { namespace util
}

// -------------------------------------------------------------------------
-// utility function for copying to/from the GPU, async and apply versions
+// utility function for setting memory on the GPU, async and apply versions
template <typename... Args>
-hpx::future<void> copy_async(Args&&... args) {
-return async(cudaMemcpyAsync, std::forward<Args>(args)...);
+hpx::future<void> memset_async(Args&&... args) {
+return async(cudaMemsetAsync, std::forward<Args>(args)...);
}

template <typename... Args>
-auto copy_apply(Args&&... args)
+auto memset_apply(Args&&... args)
#if !defined(HPX_HAVE_CXX14_RETURN_TYPE_DEDUCTION)
--> decltype(apply(cudaMemcpyAsync, std::forward<Args>(args)...))
+-> decltype(apply(cudaMemsetAsync, std::forward<Args>(args)...))
#endif
{
-return apply(cudaMemcpyAsync, std::forward<Args>(args)...);
+return apply(cudaMemsetAsync, std::forward<Args>(args)...);
}

// -------------------------------------------------------------------------
-// utility function for setting memory on the GPU, async and apply versions
+// utility function for memory copies to/from the GPU, async and apply versions
template <typename... Args>
-hpx::future<void> memset_async(Args&&... args) {
-return async(cudaMemsetAsync, std::forward<Args>(args)...);
+hpx::future<void> memcpy_async(Args&&... args) {
+return async(cudaMemcpyAsync, std::forward<Args>(args)...);
}

template <typename... Args>
-auto memset_apply(Args&&... args)
+auto memcpy_apply(Args&&... args)
#if !defined(HPX_HAVE_CXX14_RETURN_TYPE_DEDUCTION)
--> decltype(apply(cudaMemsetAsync, std::forward<Args>(args)...))
+-> decltype(apply(cudaMemcpyAsync, std::forward<Args>(args)...))
#endif
{
-return apply(cudaMemsetAsync, std::forward<Args>(args)...);
+return apply(cudaMemcpyAsync, std::forward<Args>(args)...);
}
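// Example usage (sketch): zero a device buffer without blocking, then copy
// it back and attach a continuation to the returned future:
//   helper.memset_apply(d_A, 0, N*sizeof(float));
//   helper.memcpy_async(h_A.data(), d_A, N*sizeof(float),
//           cudaMemcpyDeviceToHost)
//       .then([](hpx::future<void> &&f){ /* h_A is now valid */ });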

// -------------------------------------------------------------------------
@@ -0,0 +1,14 @@
// Copyright (c) 2018 John Biddiscombe
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include "cuda_runtime.h"

__global__
void saxpy(int n, float a, float *x, float *y)
{
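// one element per thread: compute this thread's global index; the i < n
// guard below makes it safe to round the grid size up at launch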
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) y[i] = a * x[i] + y[i];
}

@@ -12,7 +12,7 @@ __global__ void trivial_kernel(T val) {
printf("hello from gpu with value %f\n", val);
}

-// here is a wrapper that can call the kernel from C++ outsied of the .cu file
+// here is a wrapper that can call the kernel from C++ outside of the .cu file
template <typename T>
void cuda_trivial_kernel(T t, cudaStream_t stream)
{
