Add new cuda kernel synchronization with hpx::future demo #3292

Merged: 1 commit, May 4, 2018

16 changes: 12 additions & 4 deletions examples/compute/cuda/CMakeLists.txt
@@ -9,6 +9,7 @@ set(example_programs)
if(HPX_WITH_CUDA)
set(example_programs ${example_programs}
cublas_matmul
cuda_future
data_copy
hello_compute
)
@@ -25,9 +26,11 @@ if(HPX_WITH_CUDA)
set(cublas_matmul_FLAGS
DEPENDENCIES ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})

set(cublas_matmul_CUDA Off)
set(data_copy_CUDA On)
set(hello_compute_CUDA On)
set(cublas_matmul_CUDA OFF)
set(data_copy_CUDA ON)
set(hello_compute_CUDA ON)
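# cuda_future builds its main .cpp with the host compiler (CUDA OFF);
# only the trivial_demo.cu kernels named via _CUDA_SOURCE go through nvcc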
set(cuda_future_CUDA OFF)
set(cuda_future_CUDA_SOURCE trivial_demo)
set(partitioned_vector_CUDA ON)
set(partitioned_vector_FLAGS COMPONENT_DEPENDENCIES partitioned_vector)
endif()
@@ -41,6 +44,12 @@ foreach(example_program ${example_programs})
${example_program}.cpp)
endif()

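# an example that defines <example>_CUDA_SOURCE gets that .cu file
# appended to its source list so it is compiled alongside the .cpp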
if(${example_program}_CUDA_SOURCE)
message("got ${${example_program}_CUDA_SOURCE} " ${${example_program}_CUDA_SOURCE})
set(sources
${sources} ${${example_program}_CUDA_SOURCE}.cu)
endif()

source_group("Source Files" FILES ${sources})

# add example executable
@@ -60,4 +69,3 @@ foreach(example_program ${example_programs})
add_hpx_pseudo_dependencies(examples.compute.cuda.${example_program}
${example_program}_exe)
endforeach()

147 changes: 47 additions & 100 deletions examples/compute/cuda/cublas_matmul.cpp
@@ -1,4 +1,4 @@
// Copyright (c) 2017 John Biddiscombe
// Copyright (c) 2017-2018 John Biddiscombe
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -50,91 +50,72 @@
#include <sstream>
#include <utility>
#include <vector>

const char *_cudaGetErrorEnum(cublasStatus_t error);
//
#include "cuda_future_helper.h"
//
std::mt19937 gen;

// -------------------------------------------------------------------------
// a simple cublas wrapper helper object that can be used to synchronize
// cublas calls with an hpx future.
// -------------------------------------------------------------------------
template<typename T>
struct cublas_helper
struct cublas_helper : hpx::compute::util::cuda_future_helper
{
public:
using future_type = hpx::future<void>;

#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
using allocator_type = typename hpx::compute::cuda::allocator<T>;
using vector_type = typename hpx::compute::vector<T, allocator_type>;
#endif

// construct a cublas stream
cublas_helper(std::size_t device=0) : target_(device) {
cublas_helper(std::size_t device=0) : hpx::compute::util::cuda_future_helper(device) {
handle_ = 0;
stream_ = target_.native_handle().get_stream();
cublas_error(cublasCreate(&handle_));
hpx::compute::util::cublas_error(cublasCreate(&handle_));
}

cublas_helper(cublas_helper& other) = delete;
cublas_helper(const cublas_helper& other) = delete;
cublas_helper operator=(const cublas_helper& other) = delete;

~cublas_helper() {
cublas_error(cublasDestroy(handle_));
hpx::compute::util::cublas_error(cublasDestroy(handle_));
}

// This is a simple wrapper for any cublas call; pass in the same arguments
// that you would use for the cublas call itself, minus the cublas handle,
// which the wrapper supplies for you
template <typename Func, typename ...Args>
void operator()(Func && cublas_function, Args&&... args)
{
// -------------------------------------------------------------------------
// launch a cuBlas function and return a future that will become ready
// when the task completes; this allows integration of GPU kernels with
// hpx::futures and the tasking DAG.
template <typename R, typename... Params, typename... Args>
hpx::future<void> async(R(*cublas_function)(Params...), Args &&... args) {
// make sure we run on the correct device
cuda_error(cudaSetDevice(target_.native_handle().get_device()));

hpx::compute::util::cuda_error(cudaSetDevice(target_.native_handle().get_device()));
// make sure this operation takes place on our stream
cublas_error(cublasSetStream(handle_, stream_));

hpx::compute::util::cublas_error(cublasSetStream(handle_, stream_));
// insert the cublas handle in the arg list and call the cublas function
cublas_error(cublas_function(handle_, std::forward<Args>(args)...));
hpx::compute::util::detail::async_helper<R, Params...> helper;
helper(cublas_function, handle_, std::forward<Args>(args)...);
return get_future();
}

template <typename ...Args>
void copy_async(Args&&... args)
{
// This is a simple wrapper for any cublas call; pass in the same arguments
// that you would use for the cublas call itself, minus the cublas handle,
// which the wrapper supplies for you
template <typename R, typename... Params, typename... Args>
R apply(R(*cublas_function)(Params...), Args &&... args) {
// make sure we run on the correct device
cuda_error(cudaSetDevice(target_.native_handle().get_device()));

// insert the cuda stream in the arg list and call the cuda memcpy
cuda_error(cudaMemcpyAsync (std::forward<Args>(args)..., stream_));
hpx::compute::util::cuda_error(cudaSetDevice(target_.native_handle().get_device()));
// make sure this operation takes place on our stream
hpx::compute::util::cublas_error(cublasSetStream(handle_, stream_));
// insert the cublas handle in the arg list and call the cublas function
hpx::compute::util::detail::async_helper<R, Params...> helper;
return helper(cublas_function, handle_, std::forward<Args>(args)...);
}

// get the future to synchronize this cublas stream with
future_type get_future() { return target_.get_future(); }

// return a copy of the cublas handle
cublasHandle_t handle() { return handle_; }

// return a reference to the compute::cuda object owned by this class
hpx::compute::cuda::target & target() { return target_; }

static void cublas_error(cublasStatus_t err) {
if (err != CUBLAS_STATUS_SUCCESS) {
std::stringstream temp;
temp << "cublas function returned error code " << _cudaGetErrorEnum(err);
throw std::runtime_error(temp.str());
}
}

static void cuda_error(cudaError_t err) {
if (err != cudaSuccess) {
std::stringstream temp;
temp << "cuda function returned error code " << cudaGetErrorString(err);
throw std::runtime_error(temp.str());
}
}
cublasHandle_t get_handle() { return handle_; }

private:
cublasHandle_t handle_;
cudaStream_t stream_;
hpx::compute::cuda::target target_;
};
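
// For orientation, a minimal usage sketch of the reworked helper; this is
// not part of the diff, it assumes the cublas_helper defined above is in
// scope (plus the <vector> and cublas headers this file already includes),
// and it mirrors the calls made in matrixMultiply further down.
void sgemm_sketch()
{
    constexpr int N = 4;
    std::vector<float> h_A(N * N, 1.f), h_B(N * N, 2.f), h_C(N * N, 0.f);
    float alpha = 1.f, beta = 0.f;

    cublas_helper<float> cublas(0);   // cublas handle + stream on device 0

    float *d_A, *d_B, *d_C;
    hpx::compute::util::cuda_error(cudaMalloc((void **) &d_A, N*N*sizeof(float)));
    hpx::compute::util::cuda_error(cudaMalloc((void **) &d_B, N*N*sizeof(float)));
    hpx::compute::util::cuda_error(cudaMalloc((void **) &d_C, N*N*sizeof(float)));

    // queue the host->device copies on the helper's stream (fire-and-forget)
    cublas.copy_apply(d_A, h_A.data(), N*N*sizeof(float), cudaMemcpyHostToDevice);
    cublas.copy_apply(d_B, h_B.data(), N*N*sizeof(float), cudaMemcpyHostToDevice);

    // async() returns an hpx::future<void> that becomes ready once the
    // stream has drained past the gemm; apply() is the fire-and-forget twin
    auto done = cublas.async(&cublasSgemm, CUBLAS_OP_N, CUBLAS_OP_N,
        N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
    done.get();

    // copy the result back, then release the device buffers
    cublas.copy_async(h_C.data(), d_C, N*N*sizeof(float),
        cudaMemcpyDeviceToHost).get();
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}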

// -------------------------------------------------------------------------
@@ -202,36 +183,6 @@ compare_L2_err(const float *reference, const float *data,
return result;
}

// -------------------------------------------------------------------------
// not all of these are supported by all cuda/cublas versions
// comment them out if they cause compiler errors
const char *_cudaGetErrorEnum(cublasStatus_t error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}

// -------------------------------------------------------------------------
// Run a simple test matrix multiply using CUBLAS
// -------------------------------------------------------------------------
@@ -257,8 +208,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

// create a cublas helper object we'll use to futurize the cuda events
cublas_helper<T> cublas(device);

using cublas_future = typename cublas_helper<T>::future_type;

#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
// for convenience
@@ -286,27 +236,27 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

#else
T *d_A, *d_B, *d_C;
cublas_helper<T>::cuda_error(
hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_A, size_A*sizeof(T)));

cublas_helper<T>::cuda_error(
hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_B, size_B*sizeof(T)));

cublas_helper<T>::cuda_error(
hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_C, size_C*sizeof(T)));

// adding async copy operations into the stream before cublas calls puts
// the copies in the queue before the matrix operations.
cublas.copy_async(
cublas.copy_apply(
d_A, h_A.data(), size_A*sizeof(T), cudaMemcpyHostToDevice);

cublas.copy_async(
auto copy_future = cublas.copy_async(
d_B, h_B.data(), size_B*sizeof(T), cudaMemcpyHostToDevice);

// we can call get_future multiple times on the cublas helper.
// Each call returns a new future that will be set ready when the stream
// event for this point is triggered.
auto copy_future = cublas.get_future().then([](cublas_future &&f){
copy_future.then([](cublas_future &&f){
std::cout << "The async host->device copy operation completed" << std::endl;
});

@@ -321,7 +271,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it
hpx::util::high_resolution_timer t1;
//
std::cout << "calling CUBLAS...\n";
cublas(
auto fut = cublas.async(
&cublasSgemm,
CUBLAS_OP_N, CUBLAS_OP_N,
matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
@@ -332,7 +282,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it
d_C, matrix_size.uiWA);

// wait until the operation completes
cublas.get_future().get();
fut.get();

double us1 = t1.elapsed_microseconds();
std::cout << "warmup: elapsed_microseconds " << us1 << std::endl;
@@ -343,7 +293,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

hpx::util::high_resolution_timer t2;
for (std::size_t j=0; j<iterations; j++) {
cublas(
cublas.apply(
&cublasSgemm,
CUBLAS_OP_N, CUBLAS_OP_N,
matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
@@ -358,12 +308,9 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

#ifndef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
// when the matrix operations complete, copy the result to the host
cublas.copy_async(
auto copy_finished = cublas.copy_async(
h_CUBLAS.data(), d_C, size_C*sizeof(T), cudaMemcpyDeviceToHost);

// and get another future when the copy back is done
auto copy_finished = cublas.get_future();

#endif

// attach a continuation to the cublas future
93 changes: 93 additions & 0 deletions examples/compute/cuda/cuda_future.cpp
@@ -0,0 +1,93 @@
// Copyright (c) 2018 John Biddiscombe
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#define BOOST_NO_CXX11_ALLOCATOR
//
#include <hpx/hpx.hpp>
#include <hpx/hpx_init.hpp>
#include <hpx/include/parallel_copy.hpp>
#include <hpx/include/parallel_for_each.hpp>
#include <hpx/include/parallel_for_loop.hpp>
#include <hpx/include/parallel_executors.hpp>
#include <hpx/include/parallel_executor_parameters.hpp>
//
#include <hpx/include/compute.hpp>
#include <hpx/compute/cuda/target.hpp>
// CUDA runtime
#include <cuda_runtime.h>
//
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sstream>
#include <utility>
#include <vector>
//
#include "cuda_future_helper.h"

// -------------------------------------------------------------------------
// This example uses the normal C++ compiler to compile all the HPX code,
// while the cuda functions go in their own .cu file and are compiled with
// nvcc; the two are never mixed in one translation unit.
// Declare the functions we are importing - note that the template
// instantiations must be present in the .cu file and compiled so that we
// can link to them
template <typename T>
extern void cuda_trivial_kernel(T, cudaStream_t stream);
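
// The declaration above is only a promise to the linker; the .cu file has
// to provide the definition and its explicit instantiations. A plausible
// sketch of trivial_demo.cu follows, assuming a kernel that simply prints
// its argument (the kernel body and the __global__ name are assumptions;
// only the launcher signature is fixed by the declaration above):
//
//   // trivial_demo.cu - hedged sketch, compiled by nvcc
//   #include <cstdio>
//   #include <cuda_runtime.h>
//
//   // a trivial kernel that prints the value it was given (assumed body)
//   template <typename T>
//   __global__ void trivial_kernel(T val)
//   {
//       printf("hello from gpu, value %f\n", static_cast<double>(val));
//   }
//
//   // host-side launcher matching the extern declaration above
//   template <typename T>
//   void cuda_trivial_kernel(T t, cudaStream_t stream)
//   {
//       trivial_kernel<<<1, 1, 0, stream>>>(t);
//   }
//
//   // explicit instantiations for the types the demo uses, so this plain
//   // C++ translation unit can link against them
//   template void cuda_trivial_kernel<float>(float, cudaStream_t);
//   template void cuda_trivial_kernel<double>(double, cudaStream_t);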

// -------------------------------------------------------------------------
int hpx_main(boost::program_options::variables_map& vm)
{
std::size_t device = vm["device"].as<std::size_t>();
//
unsigned int seed = (unsigned int)std::time(nullptr);
if (vm.count("seed"))
seed = vm["seed"].as<unsigned int>();

std::cout << "using seed: " << seed << std::endl;
std::srand(seed);

hpx::compute::cuda::target target(device);
//
hpx::compute::util::cuda_future_helper helper(device);
helper.print_local_targets();
//
float testf = 2.345;
cuda_trivial_kernel(testf, helper.get_stream());

double testd = 5.678;
cuda_trivial_kernel(testd, helper.get_stream());

auto fn = &cuda_trivial_kernel<double>;
double d = 3.1415;
auto f = helper.async(fn, d);
f.then([](hpx::future<void> &&f) {
std::cout << "trivial kernel completed \n";
});

return hpx::finalize();
}

// -------------------------------------------------------------------------
int main(int argc, char **argv)
{
printf("[HPX Cuda future] - Starting...\n");

using namespace boost::program_options;
options_description cmdline("usage: " HPX_APPLICATION_STRING " [options]");
cmdline.add_options()
( "device",
boost::program_options::value<std::size_t>()->default_value(0),
"Device to use")
( "iterations",
boost::program_options::value<std::size_t>()->default_value(30),
"iterations")
("seed,s",
boost::program_options::value<unsigned int>(),
"the random number generator seed to use for this run")
;

return hpx::init(cmdline, argc, argv);
}