Skip to content

Commit

Permalink
Draft SerializeRecordBatch for CUDA
Browse files Browse the repository at this point in the history
Change-Id: I8dd313ac4e1cc0c01fdbe760bcae325a55ec8818
  • Loading branch information
wesm committed Aug 28, 2017
1 parent 84e4525 commit 591aceb
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 37 deletions.
2 changes: 2 additions & 0 deletions cpp/src/arrow/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ find_package(CUDA REQUIRED)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})

set(ARROW_GPU_SRCS
cuda_arrow_ipc.cc
cuda_context.cc
cuda_memory.cc
)
Expand All @@ -46,6 +47,7 @@ ADD_ARROW_LIB(arrow_gpu

install(FILES
cuda_api.h
cuda_arrow_ipc.h
cuda_context.h
cuda_memory.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu")
Expand Down
37 changes: 37 additions & 0 deletions cpp/src/arrow/gpu/cuda-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,43 @@ TEST_F(TestCudaBuffer, CopyFromHost) {
AssertCudaBufferEquals(*device_buffer, host_buffer->data(), kSize);
}

// IPC only supported on Linux
// Use the standard __linux__ macro (defined by GCC/Clang in all modes; the
// older __linux spelling is a non-conforming extension)
#if defined(__linux__)

// Round-trips a device buffer through the CUDA IPC machinery:
// export -> serialize handle -> deserialize handle -> open -> compare.
// DISABLED_ because genuinely sharing the handle requires a second process;
// this only exercises the single-process API flow.
TEST_F(TestCudaBuffer, DISABLED_ExportForIpc) {
  // For this test to work, a second process needs to be spawned
  const int64_t kSize = 1000;
  std::shared_ptr<CudaBuffer> device_buffer;
  ASSERT_OK(context_->Allocate(kSize, &device_buffer));

  // Fill the device buffer with random host data
  std::shared_ptr<PoolBuffer> host_buffer;
  ASSERT_OK(test::MakeRandomBytePoolBuffer(kSize, default_memory_pool(), &host_buffer));
  ASSERT_OK(device_buffer->CopyFromHost(0, host_buffer->data(), kSize));

  // Export for IPC and serialize the handle into a host buffer
  std::unique_ptr<CudaIpcMemHandle> ipc_handle;
  ASSERT_OK(device_buffer->ExportForIpc(&ipc_handle));

  std::shared_ptr<Buffer> serialized_handle;
  ASSERT_OK(ipc_handle->Serialize(default_memory_pool(), &serialized_handle));

  // Deserialize IPC handle and open the exported device memory
  std::unique_ptr<CudaIpcMemHandle> ipc_handle2;
  ASSERT_OK(CudaIpcMemHandle::FromBuffer(serialized_handle->data(), &ipc_handle2));

  std::shared_ptr<CudaBuffer> ipc_buffer;
  ASSERT_OK(context_->OpenIpcBuffer(*ipc_handle2, &ipc_buffer));

  ASSERT_EQ(kSize, ipc_buffer->size());

  // Copy the imported buffer back to the host and compare the HOST copy with
  // the original data. (Previously this compared ipc_buffer->data() — a raw
  // device pointer that is not dereferenceable from the CPU, so memcmp on it
  // was undefined behavior.)
  std::shared_ptr<MutableBuffer> ipc_data;
  ASSERT_OK(AllocateBuffer(default_memory_pool(), kSize, &ipc_data));
  ASSERT_OK(ipc_buffer->CopyToHost(0, kSize, ipc_data->mutable_data()));
  ASSERT_EQ(0, std::memcmp(ipc_data->data(), host_buffer->data(), kSize));
}

#endif

class TestCudaBufferWriter : public TestCudaBufferBase {
public:
void SetUp() { TestCudaBufferBase::SetUp(); }
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/gpu/cuda_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#ifndef ARROW_GPU_CUDA_API_H
#define ARROW_GPU_CUDA_API_H

#include "arrow/gpu/cuda_arrow_ipc.h"
#include "arrow/gpu/cuda_context.h"
#include "arrow/gpu/cuda_memory.h"
#include "arrow/gpu/cuda_version.h"
Expand Down
34 changes: 14 additions & 20 deletions cpp/src/arrow/gpu/cuda_ipc.h → cpp/src/arrow/gpu/cuda_arrow_ipc.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_GPU_CUDA_MEMORY_H
#define ARROW_GPU_CUDA_MEMORY_H
#ifndef ARROW_GPU_CUDA_ARROW_IPC_H
#define ARROW_GPU_CUDA_ARROW_IPC_H

#include <cstdint>
#include <memory>
Expand All @@ -25,31 +25,25 @@
#include "arrow/status.h"
#include "arrow/util/visibility.h"

#include "arrow/cuda_memory.h"

namespace arrow {

class RecordBatch;

namespace gpu {

/// \brief Write record batch message to GPU device memory
///
///
ARROW_EXPORT
SerializeRecordBatch(const RecordBatch& batch, CudaContext* ctx,
std::shared_ptr<CudaBuffer>* out);
class CudaBuffer;
class CudaContext;

/// \brief Write record batch to pre-allocated GPU device memory
///
/// \param[in] batch the record batch to write
/// \param[in] out the CudaBufferWriter to write the output to
/// \brief Write record batch message to GPU device memory
/// \param[in] batch record batch to write
/// \param[in] ctx CudaContext to allocate device memory from
/// \param[out] out the returned device buffer which contains the record batch message
/// \return Status
///
/// The CudaBufferWriter must have enough pre-allocated space to accommodate
/// the record batch. You can use arrow::ipc::GetRecordBatchSize to compute
/// this
ARROW_EXPORT
SerializeRecordBatch(const RecordBatch& batch, CudaBufferWriter* out);
Status SerializeRecordBatch(const RecordBatch& batch, CudaContext* ctx,
std::shared_ptr<CudaBuffer>* out);

} // namespace gpu
} // namespace arrow

#endif // ARROW_GPU_CUDA_MEMORY_H
#endif // ARROW_GPU_CUDA_ARROW_IPC_H
2 changes: 1 addition & 1 deletion cpp/src/arrow/gpu/cuda_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace gpu {
if (ret != CUDA_SUCCESS) { \
std::stringstream ss; \
ss << "Cuda Driver API call in " << __FILE__ << " at line " << __LINE__ \
<< " failed: " << #STMT; \
<< " failed with code " << ret << ": " << #STMT; \
return Status::IOError(ss.str()); \
} \
} while (0)
Expand Down
31 changes: 28 additions & 3 deletions cpp/src/arrow/gpu/cuda_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@ class CudaContext::CudaContextImpl {
return Status::OK();
}

// Create a CUDA IPC memory handle for `data`, a device pointer belonging to
// this context, and wrap it in a CudaIpcMemHandle for the caller.
Status ExportIpcBuffer(uint8_t* data, std::unique_ptr<CudaIpcMemHandle>* handle) {
// The driver requires this context to be current before issuing IPC calls
CU_RETURN_NOT_OK(cuCtxSetCurrent(context_));
CUipcMemHandle cu_handle;
CU_RETURN_NOT_OK(cuIpcGetMemHandle(&cu_handle, reinterpret_cast<CUdeviceptr>(data)));
// NOTE(review): passing the address of a stack-local handle — presumably the
// CudaIpcMemHandle constructor copies its contents; confirm in cuda_memory.cc
*handle = std::unique_ptr<CudaIpcMemHandle>(new CudaIpcMemHandle(&cu_handle));
return Status::OK();
}

Status OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle, uint8_t** out) {
CU_RETURN_NOT_OK(cuCtxSetCurrent(context_));
auto handle = reinterpret_cast<const CUipcMemHandle*>(ipc_handle.handle());
Expand Down Expand Up @@ -151,12 +159,17 @@ class CudaDeviceManager::CudaDeviceManagerImpl {
return Status::OK();
}

// Construct and initialize a brand-new (uncached) context for a device.
// Unlike GetContext, the result is not stored in the contexts_ map.
Status CreateNewContext(int device_number, std::shared_ptr<CudaContext>* out) {
  // CudaContext's constructor is private, so allocate via explicit new
  std::shared_ptr<CudaContext> context(new CudaContext());
  *out = context;
  return context->impl_->Init(devices_[device_number]);
}

Status GetContext(int device_number, std::shared_ptr<CudaContext>* out) {
auto it = contexts_.find(device_number);
if (it == contexts_.end()) {
auto ctx = std::shared_ptr<CudaContext>(new CudaContext());
RETURN_NOT_OK(ctx->impl_->Init(devices_[device_number]));
contexts_[device_number] = *out = ctx;
std::shared_ptr<CudaContext> new_context;
RETURN_NOT_OK(CreateNewContext(device_number, &new_context));
contexts_[device_number] = *out = new_context;
} else {
*out = it->second;
}
Expand Down Expand Up @@ -193,6 +206,11 @@ Status CudaDeviceManager::GetContext(int device_number,
return impl_->GetContext(device_number, out);
}

// Public wrapper; forwards to CudaDeviceManagerImpl::CreateNewContext, which
// builds a fresh context rather than returning the cached one
Status CudaDeviceManager::CreateNewContext(int device_number,
std::shared_ptr<CudaContext>* out) {
return impl_->CreateNewContext(device_number, out);
}

Status CudaDeviceManager::AllocateHost(int64_t nbytes,
std::shared_ptr<CudaHostBuffer>* out) {
uint8_t* data = nullptr;
Expand Down Expand Up @@ -221,6 +239,11 @@ Status CudaContext::Allocate(int64_t nbytes, std::shared_ptr<CudaBuffer>* out) {
return Status::OK();
}

// Create an IPC memory handle for `data`, a device pointer owned by this
// context; forwards to the pimpl
Status CudaContext::ExportIpcBuffer(uint8_t* data,
std::unique_ptr<CudaIpcMemHandle>* handle) {
return impl_->ExportIpcBuffer(data, handle);
}

// Copy `nbytes` from host memory `src` into device memory `dst`; forwards to
// the pimpl
Status CudaContext::CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes) {
return impl_->CopyHostToDevice(dst, src, nbytes);
}
Expand All @@ -229,6 +252,8 @@ Status CudaContext::CopyDeviceToHost(uint8_t* dst, const uint8_t* src, int64_t n
return impl_->CopyDeviceToHost(dst, src, nbytes);
}

Status CudaContext::Close() { return impl_->Close(); }

// Free device memory previously obtained from Allocate; forwards to the pimpl
Status CudaContext::Free(uint8_t* device_ptr, int64_t nbytes) {
return impl_->Free(device_ptr, nbytes);
}
Expand Down
10 changes: 8 additions & 2 deletions cpp/src/arrow/gpu/cuda_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,14 @@ class ARROW_EXPORT CudaDeviceManager {
public:
static Status GetInstance(CudaDeviceManager** manager);

/// \brief Get the CUDA driver context for a particular device
/// \brief Get the shared CUDA driver context for a particular device
Status GetContext(int gpu_number, std::shared_ptr<CudaContext>* ctx);

/// \brief Create a new context for a given device number
///
/// In general code will use GetContext
Status CreateNewContext(int gpu_number, std::shared_ptr<CudaContext>* ctx);

Status AllocateHost(int64_t nbytes, std::shared_ptr<CudaHostBuffer>* buffer);

Status FreeHost(uint8_t* data, int64_t nbytes);
Expand All @@ -63,7 +68,7 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext
public:
~CudaContext();

Status Destroy();
Status Close();

/// \brief Allocate CUDA memory on GPU device for this context
/// \param[in] nbytes number of bytes
Expand All @@ -83,6 +88,7 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext
private:
CudaContext();

Status ExportIpcBuffer(uint8_t* data, std::unique_ptr<CudaIpcMemHandle>* handle);
Status CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes);
Status CopyDeviceToHost(uint8_t* dst, const uint8_t* src, int64_t nbytes);
Status Free(uint8_t* device_ptr, int64_t nbytes);
Expand Down
7 changes: 2 additions & 5 deletions cpp/src/arrow/gpu/cuda_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,8 @@ Status CudaBuffer::ExportForIpc(std::unique_ptr<CudaIpcMemHandle>* handle) {
if (is_ipc_) {
return Status::Invalid("Buffer has already been exported for IPC");
}
CUipcMemHandle cu_handle;
CU_RETURN_NOT_OK(
cuIpcGetMemHandle(&cu_handle, reinterpret_cast<CUdeviceptr>(mutable_data_)));
is_ipc_ = true;
*handle = std::unique_ptr<CudaIpcMemHandle>(new CudaIpcMemHandle(&cu_handle));
RETURN_NOT_OK(context_->ExportIpcBuffer(mutable_data_, handle));
own_data_ = false;
return Status::OK();
}

Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/gpu/cuda_memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,16 @@ class ARROW_EXPORT CudaIpcMemHandle {
/// \return Status
Status Serialize(MemoryPool* pool, std::shared_ptr<Buffer>* out) const;

const void* handle() const;

private:
explicit CudaIpcMemHandle(const void* handle);

struct CudaIpcMemHandleImpl;
std::unique_ptr<CudaIpcMemHandleImpl> impl_;

const void* handle() const;

friend CudaBuffer;
friend CudaContext;
};

/// \class CudaBufferReader
Expand Down
13 changes: 9 additions & 4 deletions cpp/src/arrow/ipc/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -901,14 +901,19 @@ Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
RETURN_NOT_OK(AllocateBuffer(pool, size, &buffer));

io::FixedSizeBufferWriter stream(buffer);
int32_t metadata_length = 0;
int64_t body_length = 0;
RETURN_NOT_OK(WriteRecordBatch(batch, 0, &stream, &metadata_length, &body_length, pool,
kMaxNestingDepth, true));
RETURN_NOT_OK(SerializeRecordBatch(batch, pool, &stream));
*out = buffer;
return Status::OK();
}

// Serialize a record batch directly to an OutputStream.
// The metadata/body lengths are required out-parameters of WriteRecordBatch
// but are not surfaced by this convenience overload.
Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
io::OutputStream* out) {
int32_t metadata_size = 0;
int64_t body_size = 0;
return WriteRecordBatch(batch, 0, out, &metadata_size, &body_size, pool,
kMaxNestingDepth, true);
}

Status SerializeSchema(const Schema& schema, MemoryPool* pool,
std::shared_ptr<Buffer>* out) {
std::shared_ptr<io::BufferOutputStream> stream;
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/arrow/ipc/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,18 @@ ARROW_EXPORT
Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
std::shared_ptr<Buffer>* out);

/// \brief Write record batch to OutputStream
///
/// \param[in] batch the record batch to write
/// \param[in] out the OutputStream to write the output to
/// \return Status
///
/// If writing to pre-allocated memory, you can use
/// arrow::ipc::GetRecordBatchSize to compute how much space is required
ARROW_EXPORT
Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
io::OutputStream* out);

/// \brief Serialize schema using stream writer as a sequence of one or more
/// IPC messages
///
Expand Down

0 comments on commit 591aceb

Please sign in to comment.