Add new cuda kernel synchronization with hpx::future demo #3292

Merged: 1 commit, May 4, 2018

16 changes: 12 additions & 4 deletions examples/compute/cuda/CMakeLists.txt
@@ -9,6 +9,7 @@ set(example_programs)
if(HPX_WITH_CUDA)
set(example_programs ${example_programs}
cublas_matmul
cuda_future
data_copy
hello_compute
)
@@ -25,9 +26,11 @@ if(HPX_WITH_CUDA)
set(cublas_matmul_FLAGS
DEPENDENCIES ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})

set(cublas_matmul_CUDA Off)
set(data_copy_CUDA On)
set(hello_compute_CUDA On)
set(cublas_matmul_CUDA OFF)
set(data_copy_CUDA ON)
set(hello_compute_CUDA ON)
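# cuda_future builds its main .cpp with the host compiler (CUDA OFF);
# only the trivial_demo.cu kernels named via _CUDA_SOURCE go through nvcc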
set(cuda_future_CUDA OFF)
set(cuda_future_CUDA_SOURCE trivial_demo)
set(partitioned_vector_CUDA ON)
set(partitioned_vector_FLAGS COMPONENT_DEPENDENCIES partitioned_vector)
endif()
@@ -41,6 +44,12 @@ foreach(example_program ${example_programs})
${example_program}.cpp)
endif()

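# an example that defines <example>_CUDA_SOURCE gets that .cu file
# appended to its source list so it is compiled alongside the .cpp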
if(${example_program}_CUDA_SOURCE)
message("got ${${example_program}_CUDA_SOURCE} " ${${example_program}_CUDA_SOURCE})
set(sources
${sources} ${${example_program}_CUDA_SOURCE}.cu)
endif()

source_group("Source Files" FILES ${sources})

# add example executable
@@ -60,4 +69,3 @@ foreach(example_program ${example_programs})
add_hpx_pseudo_dependencies(examples.compute.cuda.${example_program}
${example_program}_exe)
endforeach()

147 changes: 47 additions & 100 deletions examples/compute/cuda/cublas_matmul.cpp
@@ -1,4 +1,4 @@
// Copyright (c) 2017 John Biddiscombe
// Copyright (c) 2017-2018 John Biddiscombe
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -50,91 +50,72 @@
#include <sstream>
#include <utility>
#include <vector>

const char *_cudaGetErrorEnum(cublasStatus_t error);
//
#include "cuda_future_helper.h"
//
std::mt19937 gen;

// -------------------------------------------------------------------------
// a simple cublas wrapper helper object that can be used to synchronize
// cublas calls with an hpx future.
// -------------------------------------------------------------------------
template<typename T>
struct cublas_helper
struct cublas_helper : hpx::compute::util::cuda_future_helper
{
public:
using future_type = hpx::future<void>;

#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
using allocator_type = typename hpx::compute::cuda::allocator<T>;
using vector_type = typename hpx::compute::vector<T, allocator_type>;
#endif

// construct a cublas stream
cublas_helper(std::size_t device=0) : target_(device) {
cublas_helper(std::size_t device=0) : hpx::compute::util::cuda_future_helper(device) {
handle_ = 0;
stream_ = target_.native_handle().get_stream();
cublas_error(cublasCreate(&handle_));
hpx::compute::util::cublas_error(cublasCreate(&handle_));
}

cublas_helper(cublas_helper& other) = delete;
cublas_helper(const cublas_helper& other) = delete;
cublas_helper operator=(const cublas_helper& other) = delete;

~cublas_helper() {
cublas_error(cublasDestroy(handle_));
hpx::compute::util::cublas_error(cublasDestroy(handle_));
}

// This is a simple wrapper for any cublas call; pass in the same arguments
// that you would use for the cublas call itself, minus the cublas handle,
// which the wrapper supplies for you
template <typename Func, typename ...Args>
void operator()(Func && cublas_function, Args&&... args)
{
// -------------------------------------------------------------------------
// launch a cuBlas function and return a future that will become ready
// when the task completes; this allows integration of GPU kernels with
// hpx::futures and the tasking DAG.
template <typename R, typename... Params, typename... Args>
hpx::future<void> async(R(*cublas_function)(Params...), Args &&... args) {
// make sure we run on the correct device
cuda_error(cudaSetDevice(target_.native_handle().get_device()));

hpx::compute::util::cuda_error(cudaSetDevice(target_.native_handle().get_device()));
// make sure this operation takes place on our stream
cublas_error(cublasSetStream(handle_, stream_));

hpx::compute::util::cublas_error(cublasSetStream(handle_, stream_));
// insert the cublas handle in the arg list and call the cublas function
cublas_error(cublas_function(handle_, std::forward<Args>(args)...));
hpx::compute::util::detail::async_helper<R, Params...> helper;
helper(cublas_function, handle_, std::forward<Args>(args)...);
return get_future();
}

template <typename ...Args>
void copy_async(Args&&... args)
{
// This is a simple wrapper for any cublas call; pass in the same arguments
// that you would use for the cublas call itself, minus the cublas handle,
// which the wrapper supplies for you
template <typename R, typename... Params, typename... Args>
R apply(R(*cublas_function)(Params...), Args &&... args) {
// make sure we run on the correct device
cuda_error(cudaSetDevice(target_.native_handle().get_device()));

// insert the cuda stream in the arg list and call the cuda memcpy
cuda_error(cudaMemcpyAsync (std::forward<Args>(args)..., stream_));
hpx::compute::util::cuda_error(cudaSetDevice(target_.native_handle().get_device()));
// make sure this operation takes place on our stream
hpx::compute::util::cublas_error(cublasSetStream(handle_, stream_));
// insert the cublas handle in the arg list and call the cublas function
hpx::compute::util::detail::async_helper<R, Params...> helper;
return helper(cublas_function, handle_, std::forward<Args>(args)...);
}

// get the future to synchronize this cublas stream with
future_type get_future() { return target_.get_future(); }

// return a copy of the cublas handle
cublasHandle_t handle() { return handle_; }

// return a reference to the compute::cuda object owned by this class
hpx::compute::cuda::target & target() { return target_; }

static void cublas_error(cublasStatus_t err) {
if (err != CUBLAS_STATUS_SUCCESS) {
std::stringstream temp;
temp << "cublas function returned error code " << _cudaGetErrorEnum(err);
throw std::runtime_error(temp.str());
}
}

static void cuda_error(cudaError_t err) {
if (err != cudaSuccess) {
std::stringstream temp;
temp << "cuda function returned error code " << cudaGetErrorString(err);
throw std::runtime_error(temp.str());
}
}
cublasHandle_t get_handle() { return handle_; }

private:
cublasHandle_t handle_;
cudaStream_t stream_;
hpx::compute::cuda::target target_;
};
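
// For orientation, a minimal usage sketch of the reworked helper; this is
// not part of the diff, it assumes the cublas_helper defined above is in
// scope (plus the <vector> and cublas headers this file already includes),
// and it mirrors the calls made in matrixMultiply further down.
void sgemm_sketch()
{
    constexpr int N = 4;
    std::vector<float> h_A(N * N, 1.f), h_B(N * N, 2.f), h_C(N * N, 0.f);
    float alpha = 1.f, beta = 0.f;

    cublas_helper<float> cublas(0);   // cublas handle + stream on device 0

    float *d_A, *d_B, *d_C;
    hpx::compute::util::cuda_error(cudaMalloc((void **) &d_A, N*N*sizeof(float)));
    hpx::compute::util::cuda_error(cudaMalloc((void **) &d_B, N*N*sizeof(float)));
    hpx::compute::util::cuda_error(cudaMalloc((void **) &d_C, N*N*sizeof(float)));

    // queue the host->device copies on the helper's stream (fire-and-forget)
    cublas.copy_apply(d_A, h_A.data(), N*N*sizeof(float), cudaMemcpyHostToDevice);
    cublas.copy_apply(d_B, h_B.data(), N*N*sizeof(float), cudaMemcpyHostToDevice);

    // async() returns an hpx::future<void> that becomes ready once the
    // stream has drained past the gemm; apply() is the fire-and-forget twin
    auto done = cublas.async(&cublasSgemm, CUBLAS_OP_N, CUBLAS_OP_N,
        N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
    done.get();

    // copy the result back, then release the device buffers
    cublas.copy_async(h_C.data(), d_C, N*N*sizeof(float),
        cudaMemcpyDeviceToHost).get();
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}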

// -------------------------------------------------------------------------
@@ -202,36 +183,6 @@ compare_L2_err(const float *reference, const float *data,
return result;
}

// -------------------------------------------------------------------------
// not all of these are supported by all cuda/cublas versions
// comment them out if they cause compiler errors
const char *_cudaGetErrorEnum(cublasStatus_t error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}

// -------------------------------------------------------------------------
// Run a simple test matrix multiply using CUBLAS
// -------------------------------------------------------------------------
@@ -257,8 +208,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

// create a cublas helper object we'll use to futurize the cuda events
cublas_helper<T> cublas(device);

using cublas_future = typename cublas_helper<T>::future_type;

#ifdef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
// for convenience
@@ -286,27 +236,27 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

#else
T *d_A, *d_B, *d_C;
cublas_helper<T>::cuda_error(
hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_A, size_A*sizeof(T)));

cublas_helper<T>::cuda_error(
hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_B, size_B*sizeof(T)));

cublas_helper<T>::cuda_error(
hpx::compute::util::cuda_error(
cudaMalloc((void **) &d_C, size_C*sizeof(T)));

// adding async copy operations into the stream before cublas calls puts
// the copies in the queue before the matrix operations.
cublas.copy_async(
cublas.copy_apply(
d_A, h_A.data(), size_A*sizeof(T), cudaMemcpyHostToDevice);

cublas.copy_async(
auto copy_future = cublas.copy_async(
d_B, h_B.data(), size_B*sizeof(T), cudaMemcpyHostToDevice);

// we can call get_future multiple times on the cublas helper.
// Each call returns a new future that will be set ready when the stream
// event for this point is triggered.
auto copy_future = cublas.get_future().then([](cublas_future &&f){
copy_future.then([](cublas_future &&f){
std::cout << "The async host->device copy operation completed" << std::endl;
});

@@ -321,7 +271,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it
hpx::util::high_resolution_timer t1;
//
std::cout << "calling CUBLAS...\n";
cublas(
auto fut = cublas.async(
&cublasSgemm,
CUBLAS_OP_N, CUBLAS_OP_N,
matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
@@ -332,7 +282,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it
d_C, matrix_size.uiWA);

// wait until the operation completes
cublas.get_future().get();
fut.get();

double us1 = t1.elapsed_microseconds();
std::cout << "warmup: elapsed_microseconds " << us1 << std::endl;
@@ -343,7 +293,7 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

hpx::util::high_resolution_timer t2;
for (std::size_t j=0; j<iterations; j++) {
cublas(
cublas.apply(
&cublasSgemm,
CUBLAS_OP_N, CUBLAS_OP_N,
matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
@@ -358,12 +308,9 @@ void matrixMultiply(sMatrixSize &matrix_size, std::size_t device, std::size_t it

#ifndef HPX_CUBLAS_DEMO_WITH_ALLOCATOR
// when the matrix operations complete, copy the result to the host
cublas.copy_async(
auto copy_finished = cublas.copy_async(
h_CUBLAS.data(), d_C, size_C*sizeof(T), cudaMemcpyDeviceToHost);

// and get another future when the copy back is done
auto copy_finished = cublas.get_future();

#endif

// attach a continuation to the cublas future
93 changes: 93 additions & 0 deletions examples/compute/cuda/cuda_future.cpp
@@ -0,0 +1,93 @@
// Copyright (c) 2018 John Biddiscombe
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#define BOOST_NO_CXX11_ALLOCATOR
//
#include <hpx/hpx.hpp>
#include <hpx/hpx_init.hpp>
#include <hpx/include/parallel_copy.hpp>
#include <hpx/include/parallel_for_each.hpp>
#include <hpx/include/parallel_for_loop.hpp>
#include <hpx/include/parallel_executors.hpp>
#include <hpx/include/parallel_executor_parameters.hpp>
//
#include <hpx/include/compute.hpp>
#include <hpx/compute/cuda/target.hpp>
// CUDA runtime
#include <cuda_runtime.h>
//
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sstream>
#include <utility>
#include <vector>
//
#include "cuda_future_helper.h"

// -------------------------------------------------------------------------
// This example uses the normal C++ compiler to compile all the HPX code,
// while the cuda functions go in their own .cu file and are compiled with
// nvcc; the two are never mixed in one translation unit.
// Declare the functions we are importing - note that the template
// instantiations must be present in the .cu file and compiled so that we
// can link to them
template <typename T>
extern void cuda_trivial_kernel(T, cudaStream_t stream);
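
// The declaration above is only a promise to the linker; the .cu file has
// to provide the definition and its explicit instantiations. A plausible
// sketch of trivial_demo.cu follows, assuming a kernel that simply prints
// its argument (the kernel body and the __global__ name are assumptions;
// only the launcher signature is fixed by the declaration above):
//
//   // trivial_demo.cu - hedged sketch, compiled by nvcc
//   #include <cstdio>
//   #include <cuda_runtime.h>
//
//   // a trivial kernel that prints the value it was given (assumed body)
//   template <typename T>
//   __global__ void trivial_kernel(T val)
//   {
//       printf("hello from gpu, value %f\n", static_cast<double>(val));
//   }
//
//   // host-side launcher matching the extern declaration above
//   template <typename T>
//   void cuda_trivial_kernel(T t, cudaStream_t stream)
//   {
//       trivial_kernel<<<1, 1, 0, stream>>>(t);
//   }
//
//   // explicit instantiations for the types the demo uses, so this plain
//   // C++ translation unit can link against them
//   template void cuda_trivial_kernel<float>(float, cudaStream_t);
//   template void cuda_trivial_kernel<double>(double, cudaStream_t);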

// -------------------------------------------------------------------------
int hpx_main(boost::program_options::variables_map& vm)
{
std::size_t device = vm["device"].as<std::size_t>();
//
unsigned int seed = (unsigned int)std::time(nullptr);
if (vm.count("seed"))
seed = vm["seed"].as<unsigned int>();

std::cout << "using seed: " << seed << std::endl;
std::srand(seed);

hpx::compute::cuda::target target(device);
//
hpx::compute::util::cuda_future_helper helper(device);
helper.print_local_targets();
//
float testf = 2.345;
cuda_trivial_kernel(testf, helper.get_stream());

double testd = 5.678;
cuda_trivial_kernel(testd, helper.get_stream());

auto fn = &cuda_trivial_kernel<double>;
double d = 3.1415;
auto f = helper.async(fn, d);
f.then([](hpx::future<void> &&f) {
std::cout << "trivial kernel completed \n";
});

return hpx::finalize();
}

// -------------------------------------------------------------------------
int main(int argc, char **argv)
{
printf("[HPX Cuda future] - Starting...\n");

using namespace boost::program_options;
options_description cmdline("usage: " HPX_APPLICATION_STRING " [options]");
cmdline.add_options()
( "device",
boost::program_options::value<std::size_t>()->default_value(0),
"Device to use")
( "iterations",
boost::program_options::value<std::size_t>()->default_value(30),
"iterations")
("seed,s",
boost::program_options::value<unsigned int>(),
"the random number generator seed to use for this run")
;

return hpx::init(cmdline, argc, argv);
}