From 8e3d1f4ad5ab0e2022879634ec95e2d76bfeb136 Mon Sep 17 00:00:00 2001 From: Egor Burkov Date: Sat, 31 Mar 2018 23:20:33 +0300 Subject: [PATCH] Introduce Torch-compatible GPU memory allocator --- demo/cuda/resize.lua | 50 +++++++++++++++++++++++++++++++------ include/CUDACommon.hpp | 18 +++++++++++++- init.lua | 5 ++++ src/CUDACommon.cpp | 56 ++++++++++++++++++++++++++++++++++++++++-- src/cudaobjdetect.cpp | 2 +- src/cudaoptflow.cpp | 7 +++--- 6 files changed, 124 insertions(+), 14 deletions(-) diff --git a/demo/cuda/resize.lua b/demo/cuda/resize.lua index 5563f9f..17327bb 100644 --- a/demo/cuda/resize.lua +++ b/demo/cuda/resize.lua @@ -1,17 +1,53 @@ require 'cutorch' + local cv = require 'cv' -require 'cv.cudawarping' -require 'cv.highgui' -require 'cv.imgcodecs' +require 'cv.cudawarping' -- cv.cuda.resize +require 'cv.imgproc' -- cv.resize +require 'cv.highgui' -- cv.imshow +require 'cv.imgcodecs' -- cv.imread if not arg[1] then print('Usage: `th demo/cuda/resize.lua path-to-image`') print('Now using demo/data/lena.jpg') + print('') end -local img = cv.imread {arg[1] or 'demo/data/lena.jpg', cv.IMREAD_COLOR} -local imgCUDA = img:float():cuda() / 255 -local resized = cv.cuda.resize{imgCUDA, {1024, 768}} +local img = cv.imread {arg[1] or 'demo/data/lena.jpg', cv.IMREAD_COLOR}:float() / 255 +local imgCUDA = img:cuda() + +require 'xlua' +local numIterations, dsize = 2000, {1024, 768} + +local resized = cv.resize{img, dsize} +local resizedCUDA = torch.CudaTensor(resized:size()) + +print(('Doing `cv.resize{}` (CPU) %d times (OpenCV\'s number of threads is %d):') + :format(numIterations, cv.getNumThreads{})) +local timer = torch.Timer() + +for iter = 1,numIterations do + cv.resize{img, dsize, dst=resized} + if iter % 100 == 0 then xlua.progress(iter, numIterations) end +end +local timeCPU = timer:time().real + +print(('Doing `cv.cuda.resize{}` (GPU) %d times:'):format(numIterations)) +timer:reset() + +for iter = 1,numIterations do + cv.cuda.resize{imgCUDA, dsize, dst=resizedCUDA} + if iter % 100 == 0 then xlua.progress(iter, numIterations) end +end +cutorch.synchronize() +local timeGPU = timer:time().real + +-- a technical test to check if Tensor freeing works without errors +for iter = 1,40 do + local _ = cv.cuda.resize{imgCUDA, dsize} +end +collectgarbage() -cv.imshow{"Resized to 1024x768", resized:float()} +local title = + ("Lena resized to 1024x768 by your GPU (%.3f times faster than CPU)"):format(timeCPU / timeGPU) +cv.imshow{title, resizedCUDA:float()} cv.waitKey{0} diff --git a/include/CUDACommon.hpp b/include/CUDACommon.hpp index 9cebc41..b07ff70 100644 --- a/include/CUDACommon.hpp +++ b/include/CUDACommon.hpp @@ -15,6 +15,21 @@ struct cutorchInfo { THCState *state; }; +/**************** A custom allocator that uses Torch memory management for Mats ****************/ + +class TorchCompatibleAllocator: public cuda::GpuMat::Allocator { +public: + THCState *cutorchState; + + bool allocate(cuda::GpuMat* mat, int rows, int cols, size_t elemSize); + void free(cuda::GpuMat* mat); +}; + +extern "C" +void initAllocatorCUDA(cutorchInfo info); + +/****************************************** GpuMatT ********************************************/ + class GpuMatT { public: cuda::GpuMat mat; @@ -43,8 +58,9 @@ GpuMatT TensorWrapper::toGpuMatT() { return retval; } -/************************ Fake OpenCV/CUDA classes *************************/ +/************* Fake "custom memory stack impl for OpenCV" to use cutorch streams *************/ +// Description below class FakeMemoryPool; class FakeMemoryStack; class FakeStackAllocator; diff --git a/init.lua b/init.lua index 73415a9..92f1d96 100644 --- a/init.lua +++ b/init.lua @@ -31,6 +31,7 @@ struct TensorArray { }; void initAllocator(); +void initAllocatorCUDA(struct cutorchInfo info); void *malloc(size_t size); void free(void *ptr); @@ -333,6 +334,10 @@ end --- ***************** Tensor <=> Mat conversion ***************** C.initAllocator() +if CUDACommon_C then + cv.cuda = cv.cuda or require 'cv._env_cuda' + CUDACommon_C.initAllocatorCUDA(cv.cuda._info()) +end local tensor_CV_code_by_letter = { [ 66] = cv.CV_8U , -- B : Byte diff --git a/src/CUDACommon.cpp b/src/CUDACommon.cpp index cdb8ecf..1ff5959 100644 --- a/src/CUDACommon.cpp +++ b/src/CUDACommon.cpp @@ -1,5 +1,57 @@ #include +bool TorchCompatibleAllocator::allocate(cuda::GpuMat* mat, int rows, int cols, size_t elemSize) { + + if (rows * cols == 0) { + THError("You tried to allocate a Tensor with zero rows or columns"); + return false; + } + + // See https://github.com/torch/cutorch/blob/master/lib/THC/generic/THCStorage.c#L69 + THCState *state = this->cutorchState; + const THCDeviceAllocator *allocator = state->cudaDeviceAllocator; + void *allocatorContext = state->cudaDeviceAllocator->state; + + THCHeapUpdate(state, rows * cols * elemSize); + cudaError_t err = (*allocator->malloc)( + allocatorContext, + (void **) &(mat->data), + rows * cols * elemSize, + THCState_getCurrentStream(state)); + + if (err != cudaSuccess) { + THCHeapUpdate(state, -rows * cols * elemSize); + THCudaCheck(err); + return false; + } + THCudaCheck(err); + + mat->step = elemSize * cols; + mat->refcount = (int*) cv::fastMalloc(sizeof(int)); + + return true; +} + +void TorchCompatibleAllocator::free(cuda::GpuMat* mat) { + // See https://github.com/torch/cutorch/blob/master/lib/THC/generic/THCStorage.c#L180 + THCState *state = this->cutorchState; + const THCDeviceAllocator *allocator = state->cudaDeviceAllocator; + void *allocatorContext = state->cudaDeviceAllocator->state; + + THCHeapUpdate(state, -mat->step * mat->rows); + THCudaCheck((*allocator->free)(allocatorContext, mat->data)); + + cv::fastFree(mat->refcount); +} + +static TorchCompatibleAllocator torchCompatibleAllocator; + +extern "C" +void initAllocatorCUDA(cutorchInfo info) { + torchCompatibleAllocator.cutorchState = info.state; + cuda::GpuMat::setDefaultAllocator(&torchCompatibleAllocator); +} + GpuMatT::GpuMatT(cuda::GpuMat & mat) { this->mat = mat; this->tensor = nullptr; @@ -158,7 +210,7 @@ std::vector TensorArray::toGpuMatList() { return retval; } -/************************ Fake OpenCV/CUDA classes *************************/ +/************* Fake "custom memory stack impl for OpenCV" to use cutorch streams *************/ FakeDefaultDeviceInitializer initializer; @@ -346,4 +398,4 @@ cuda::Stream & prepareStream(cutorchInfo info) { cuda::setDevice(info.deviceID - 1); fakeStream.impl_ = cv::makePtr(THCState_getCurrentStream(info.state)); return *reinterpret_cast(&fakeStream); -} \ No newline at end of file +} diff --git a/src/cudaobjdetect.cpp b/src/cudaobjdetect.cpp index 1283b61..ba8463d 100644 --- a/src/cudaobjdetect.cpp +++ b/src/cudaobjdetect.cpp @@ -292,7 +292,7 @@ struct TensorWrapper CascadeClassifier_detectMultiScaleCuda( GpuMatT objectsMat = objects.toGpuMatT(); cuda::GpuMat imageMat = image.toGpuMat(); cuda::GpuMat imageByte; - imageMat.convertTo(imageByte, CV_8U, 255.0); // Sorry guys :( + imageMat.convertTo(imageByte, CV_8U, 255.0); // Sorry guys :( #160 ptr->detectMultiScale(imageByte, objectsMat, prepareStream(info)); return TensorWrapper(objectsMat, info.state); } diff --git a/src/cudaoptflow.cpp b/src/cudaoptflow.cpp index 2053c2f..3b70bc5 100644 --- a/src/cudaoptflow.cpp +++ b/src/cudaoptflow.cpp @@ -5,9 +5,10 @@ struct TensorWrapper DenseOpticalFlow_calcCuda(struct cutorchInfo info, struct DenseOpticalFlowPtr ptr, struct TensorWrapper I0, struct TensorWrapper I1, struct TensorWrapper flow) { - cuda::GpuMat retval = flow.toGpuMat(); - ptr->calc(I0.toGpuMat(), I1.toGpuMat(), retval, prepareStream(info)); - return TensorWrapper(retval, info.state); + GpuMatT flowMat = flow.toGpuMatT(); + ptr->calc(I0.toGpuMat(), I1.toGpuMat(), flowMat, prepareStream(info)); + + return TensorWrapper(flowMat, info.state); } extern "C"