diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 96569fe2a9476..4646ccae20c0e 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -79,28 +79,16 @@ set(ARROW_GPU_SRCS set(ARROW_GPU_SHARED_LINK_LIBS arrow_shared + ${CUDA_LIBRARIES} + ${CUDA_CUDA_LIBRARY} ) -add_library(arrow_gpu_objlib OBJECT - ${ARROW_GPU_SRCS} +ADD_ARROW_LIB(arrow_gpu + SOURCES ${ARROW_GPU_SRCS} + SHARED_LINK_FLAGS "" + SHARED_LINK_LIBS ${ARROW_GPU_SHARED_LINK_LIBS} + STATIC_LINK_LIBS "" ) -set_property(TARGET arrow_gpu_objlib PROPERTY POSITION_INDEPENDENT_CODE 1) - -if (ARROW_BUILD_SHARED) - cuda_add_library(arrow_gpu_shared SHARED $) - install(TARGETS arrow_gpu_shared - RUNTIME DESTINATION bin - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) -endif() - -if (ARROW_BUILD_STATIC) - add_library(arrow_gpu_static STATIC $) - install(TARGETS arrow_gpu_static - RUNTIME DESTINATION bin - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) -endif() install(FILES cuda_common.h diff --git a/cpp/src/arrow/gpu/cuda-benchmark.cc b/cpp/src/arrow/gpu/cuda-benchmark.cc index 9936ac674ec9b..26395ea9bf181 100644 --- a/cpp/src/arrow/gpu/cuda-benchmark.cc +++ b/cpp/src/arrow/gpu/cuda-benchmark.cc @@ -38,10 +38,10 @@ static void CudaBufferWriterBenchmark(benchmark::State& state, const int64_t tot CudaDeviceManager* manager; ABORT_NOT_OK(CudaDeviceManager::GetInstance(&manager)); std::shared_ptr context; - ABORT_NOT_OK(manager->CreateContext(kGpuNumber, &context)); + ABORT_NOT_OK(manager->GetContext(kGpuNumber, &context)); std::shared_ptr device_buffer; - ABORT_NOT_OK(context->Allocate(total_bytes, &device_buffer)); + ABORT_NOT_OK(AllocateCudaBuffer(total_bytes, context, &device_buffer)); CudaBufferWriter writer(device_buffer); if (buffer_size > 0) { diff --git a/cpp/src/arrow/gpu/cuda-test.cc b/cpp/src/arrow/gpu/cuda-test.cc index aec25522e9208..6bf61a37a5e97 100644 --- a/cpp/src/arrow/gpu/cuda-test.cc +++ b/cpp/src/arrow/gpu/cuda-test.cc @@ -35,7 +35,7 @@ class TestCudaBufferBase : public ::testing::Test { public: void SetUp() { ASSERT_OK(CudaDeviceManager::GetInstance(&manager_)); - ASSERT_OK(manager_->CreateContext(kGpuNumber, &context_)); + ASSERT_OK(manager_->GetContext(kGpuNumber, &context_)); } protected: @@ -53,7 +53,7 @@ class TestCudaBuffer : public TestCudaBufferBase { TEST_F(TestCudaBuffer, Allocate) { const int64_t kSize = 100; std::shared_ptr buffer; - ASSERT_OK(context_->Allocate(kSize, &buffer)); + ASSERT_OK(AllocateCudaBuffer(kSize, context_, &buffer)); ASSERT_EQ(kSize, buffer->size()); } @@ -68,7 +68,7 @@ void AssertCudaBufferEquals(const CudaBuffer& buffer, const uint8_t* host_data, TEST_F(TestCudaBuffer, CopyFromHost) { const int64_t kSize = 1000; std::shared_ptr device_buffer; - ASSERT_OK(context_->Allocate(kSize, &device_buffer)); + ASSERT_OK(AllocateCudaBuffer(kSize, context_, &device_buffer)); std::shared_ptr host_buffer; ASSERT_OK(test::MakeRandomBytePoolBuffer(kSize, default_memory_pool(), &host_buffer)); @@ -86,7 +86,7 @@ class TestCudaBufferWriter : public TestCudaBufferBase { } void Allocate(const int64_t size) { - ASSERT_OK(context_->Allocate(size, &device_buffer_)); + ASSERT_OK(AllocateCudaBuffer(size, context_, &device_buffer_)); writer_.reset(new CudaBufferWriter(device_buffer_)); } @@ -198,7 +198,7 @@ TEST_F(TestCudaBufferReader, Basics) { std::shared_ptr device_buffer; const int64_t size = 1000; - ASSERT_OK(context_->Allocate(size, &device_buffer)); + ASSERT_OK(AllocateCudaBuffer(size, context_, &device_buffer)); std::shared_ptr buffer; ASSERT_OK(test::MakeRandomBytePoolBuffer(1000, default_memory_pool(), &buffer)); diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index d323eb7c0226b..dd0b29b6d4b67 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -21,9 +21,14 @@ #include #include #include +#include +#include #include +#include "arrow/gpu/cuda_common.h" +#include "arrow/gpu/cuda_memory.h" + namespace arrow { namespace gpu { @@ -39,7 +44,7 @@ class CudaContext::CudaContextImpl { Status Init(const CudaDevice& device) { device_ = device; - CU_RETURN_NOT_OK(cuCtxCreate(context_, 0, device_.handle)); + CU_RETURN_NOT_OK(cuCtxCreate(&context_, 0, device_.handle)); is_open_ = true; return Status::OK(); } @@ -58,22 +63,23 @@ class CudaContext::CudaContextImpl { CU_RETURN_NOT_OK(cuCtxSetCurrent(context_)); CUdeviceptr data; - CU_RETURN_NOT_OK(cuMemAlloc(&data, nbytes)); + CU_RETURN_NOT_OK(cuMemAlloc(&data, static_cast(nbytes))); *out = reinterpret_cast(data); return Status::OK(); } Status CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes) { CU_RETURN_NOT_OK(cuCtxSetCurrent(context_)); - CU_RETURN_NOT_OK(cuMemcpyDtoH(reinterpret_cast(dst), - src, nbytes)); - return Statsu::OK(); + CU_RETURN_NOT_OK(cuMemcpyHtoD(reinterpret_cast(dst), + reinterpret_cast(src), + static_cast(nbytes))); + return Status::OK(); } Status CopyDeviceToHost(uint8_t* dst, const uint8_t* src, int64_t nbytes) { CU_RETURN_NOT_OK(cuCtxSetCurrent(context_)); - CU_RETURN_NOT_OK(cuMemcpyHtoD(src, reinterpret_cast(src), - nbytes)); + CU_RETURN_NOT_OK(cuMemcpyDtoH(dst, reinterpret_cast(src), + static_cast(nbytes))); return Status::OK(); } @@ -113,7 +119,8 @@ class CudaDeviceManager::CudaDeviceManagerImpl { Status AllocateHost(int64_t nbytes, uint8_t** out) { CU_RETURN_NOT_OK(cuMemHostAlloc(reinterpret_cast(out), - nbytes, CU_MEMHOSTALLOC_PORTABLE)); + static_cast(nbytes), + CU_MEMHOSTALLOC_PORTABLE)); host_bytes_allocated_ += nbytes; return Status::OK(); } @@ -134,9 +141,16 @@ class CudaDeviceManager::CudaDeviceManagerImpl { return Status::OK(); } - Status CreateContext(int device_number, std::shared_ptr* out) { - *out = std::shared_ptr(new CudaContext()); - return (*out)->impl_->Init(devices_[i]); + Status GetContext(int device_number, std::shared_ptr* out) { + auto it = contexts_.find(device_number); + if (it == contexts_.end()) { + auto ctx = std::shared_ptr(new CudaContext()); + RETURN_NOT_OK(ctx->impl_->Init(devices_[device_number])); + contexts_[device_number] = *out = ctx; + } else { + *out = it->second; + } + return Status::OK(); } int num_devices() const { return num_devices_; } @@ -145,6 +159,9 @@ class CudaDeviceManager::CudaDeviceManagerImpl { int num_devices_; std::vector devices_; + // device_number -> CudaContext + std::unordered_map> contexts_; + int host_bytes_allocated_; }; @@ -152,6 +169,8 @@ CudaDeviceManager::CudaDeviceManager() { impl_.reset(new CudaDeviceManagerImpl()); } +std::unique_ptr CudaDeviceManager::instance_ = nullptr; + Status CudaDeviceManager::GetInstance(CudaDeviceManager** manager) { if (!instance_) { instance_.reset(new CudaDeviceManager()); @@ -161,21 +180,21 @@ Status CudaDeviceManager::GetInstance(CudaDeviceManager** manager) { return Status::OK(); } -Status CudaDeviceManager::Create(int device_number, - std::shared_ptr* out) { - return impl_->Create(device_number, out); +Status CudaDeviceManager::GetContext(int device_number, + std::shared_ptr* out) { + return impl_->GetContext(device_number, out); } Status CudaDeviceManager::AllocateHost(int64_t nbytes, std::shared_ptr* out) { - uint8_t* data; + uint8_t* data = nullptr; RETURN_NOT_OK(impl_->AllocateHost(nbytes, &data)); - *out = std::shared_ptr(data, nbytes); + *out = std::make_shared(data, nbytes); return Status::OK(); } Status CudaDeviceManager::FreeHost(uint8_t* data, int64_t nbytes) { - return impl_->FreeHost(data, nbytes)); + return impl_->FreeHost(data, nbytes); } int CudaDeviceManager::num_devices() const { @@ -185,8 +204,14 @@ int CudaDeviceManager::num_devices() const { // ---------------------------------------------------------------------- // CudaContext public API -Status CudaContext::Allocate(int64_t nbytes, std::shared_ptr* out) { - return impl_->AllocateHost(nbytes, out); +CudaContext::CudaContext() { + impl_.reset(new CudaContextImpl()); +} + +CudaContext::~CudaContext() {} + +Status CudaContext::Allocate(int64_t nbytes, uint8_t** out) { + return impl_->Allocate(nbytes, out); } Status CudaContext::CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes) { diff --git a/cpp/src/arrow/gpu/cuda_context.h b/cpp/src/arrow/gpu/cuda_context.h index 052d1e374ffd8..c062673467448 100644 --- a/cpp/src/arrow/gpu/cuda_context.h +++ b/cpp/src/arrow/gpu/cuda_context.h @@ -37,8 +37,8 @@ class ARROW_EXPORT CudaDeviceManager { public: static Status GetInstance(CudaDeviceManager** manager); - /// \brief Create a CUDA driver context for a particular device - Status CreateContext(int gpu_number, std::shared_ptr* ctx); + /// \brief Get the CUDA driver context for a particular device + Status GetContext(int gpu_number, std::shared_ptr* ctx); Status AllocateHost(int64_t nbytes, std::shared_ptr* buffer); Status FreeHost(uint8_t* data, int64_t nbytes); @@ -46,7 +46,8 @@ class ARROW_EXPORT CudaDeviceManager { int num_devices() const; private: - std::unique_ptr instance_; + CudaDeviceManager(); + static std::unique_ptr instance_; class CudaDeviceManagerImpl; std::unique_ptr impl_; @@ -67,7 +68,7 @@ class ARROW_EXPORT CudaContext { Status CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes); Status CopyDeviceToHost(uint8_t* dst, const uint8_t* src, int64_t nbytes); - Status Allocate(int64_t nbytes, std::shared_ptr* buffer); + Status Allocate(int64_t nbytes, uint8_t** out); Status Free(uint8_t* device_ptr, int64_t nbytes); int64_t bytes_allocated() const; diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index bf77ec1abfed3..fe89aa5cb63c5 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -55,9 +55,12 @@ Status CudaBuffer::CopyFromHost(const int64_t position, const uint8_t* data, } Status AllocateCudaBuffer(const int64_t size, - std::shared_ptr& context, + const std::shared_ptr& context, std::shared_ptr* out) { - return context->Allocate(size, out); + uint8_t* data = nullptr; + RETURN_NOT_OK(context->Allocate(size, &data)); + *out = std::make_shared(data, size, context); + return Status::OK(); } CudaHostBuffer::~CudaHostBuffer() {