Skip to content

Commit

Permalink
Introduce Torch-compatible GPU memory allocator
Browse files Browse the repository at this point in the history
  • Loading branch information
shrubb committed Mar 31, 2018
1 parent cc4dc2d commit 8e3d1f4
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 14 deletions.
50 changes: 43 additions & 7 deletions demo/cuda/resize.lua
@@ -1,17 +1,53 @@
require 'cutorch'

local cv = require 'cv'
require 'cv.cudawarping' -- cv.cuda.resize
require 'cv.imgproc'     -- cv.resize
require 'cv.highgui'     -- cv.imshow
require 'cv.imgcodecs'   -- cv.imread

-- Demo: compare CPU cv.resize against GPU cv.cuda.resize on one image.
-- Usage: `th demo/cuda/resize.lua path-to-image` (falls back to the bundled Lena).
if not arg[1] then
print('Usage: `th demo/cuda/resize.lua path-to-image`')
print('Now using demo/data/lena.jpg')
print('')
end

-- Load the image, normalize to [0, 1] floats, and keep both a CPU and a GPU copy.
local img = cv.imread {arg[1] or 'demo/data/lena.jpg', cv.IMREAD_COLOR}:float() / 255
local imgCUDA = img:cuda()

require 'xlua'
local numIterations, dsize = 2000, {1024, 768}

-- Preallocate destination buffers so the timed loops measure resizing only,
-- not per-iteration allocation.
local resized = cv.resize{img, dsize}
local resizedCUDA = torch.CudaTensor(resized:size())

print(('Doing `cv.resize{}` (CPU) %d times (OpenCV\'s number of threads is %d):')
:format(numIterations, cv.getNumThreads{}))
local timer = torch.Timer()

for iter = 1,numIterations do
cv.resize{img, dsize, dst=resized}
if iter % 100 == 0 then xlua.progress(iter, numIterations) end
end
local timeCPU = timer:time().real

print(('Doing `cv.cuda.resize{}` (GPU) %d times:'):format(numIterations))
timer:reset()

for iter = 1,numIterations do
cv.cuda.resize{imgCUDA, dsize, dst=resizedCUDA}
if iter % 100 == 0 then xlua.progress(iter, numIterations) end
end
-- GPU kernels launch asynchronously; synchronize before reading the timer
-- so timeGPU reflects completed work.
cutorch.synchronize()
local timeGPU = timer:time().real

-- a technical test to check if Tensor freeing works without errors
-- (exercises the Torch-compatible GpuMat allocator's free path)
for iter = 1,40 do
local _ = cv.cuda.resize{imgCUDA, dsize}
end
collectgarbage()

cv.imshow{"Resized to 1024x768", resized:float()}
local title =
("Lena resized to 1024x768 by your GPU (%.3f times faster than CPU)"):format(timeCPU / timeGPU)
cv.imshow{title, resizedCUDA:float()}
cv.waitKey{0}
18 changes: 17 additions & 1 deletion include/CUDACommon.hpp
Expand Up @@ -15,6 +15,21 @@ struct cutorchInfo {
THCState *state;
};

/**************** A custom allocator that uses Torch memory management for Mats ****************/

// GpuMat allocator that routes device allocations through cutorch's
// THCDeviceAllocator, so OpenCV GpuMats and Torch CUDA tensors share one
// memory manager (definitions in src/CUDACommon.cpp).
class TorchCompatibleAllocator: public cuda::GpuMat::Allocator {
public:
// The cutorch state providing the device allocator and current stream;
// set once by initAllocatorCUDA() before any GpuMat allocation.
THCState *cutorchState;

// cuda::GpuMat::Allocator interface: allocate/free device memory for `mat`.
bool allocate(cuda::GpuMat* mat, int rows, int cols, size_t elemSize);
void free(cuda::GpuMat* mat);
};

// Installs the Torch-compatible allocator as OpenCV's default GpuMat
// allocator. extern "C" so it can be called from Lua via FFI.
extern "C"
void initAllocatorCUDA(cutorchInfo info);

/****************************************** GpuMatT ********************************************/

class GpuMatT {
public:
cuda::GpuMat mat;
Expand Down Expand Up @@ -43,8 +58,9 @@ GpuMatT TensorWrapper::toGpuMatT() {
return retval;
}

/************************ Fake OpenCV/CUDA classes *************************/
/************* Fake "custom memory stack impl for OpenCV" to use cutorch streams *************/

// Description below
class FakeMemoryPool;
class FakeMemoryStack;
class FakeStackAllocator;
Expand Down
5 changes: 5 additions & 0 deletions init.lua
Expand Up @@ -31,6 +31,7 @@ struct TensorArray {
};

void initAllocator();
void initAllocatorCUDA(struct cutorchInfo info);

This comment has been minimized.

Copy link
@spateux

spateux Apr 10, 2018

We have a problem when installing the torch-opencv wrapper against a non-GPU build of OpenCV.
I guess this is due to line 34 of init.lua, which declares a CUDA-specific function outside of any CUDA-specific code path. (At runtime we get an error saying that LuaJIT can't find the initAllocatorCUDA symbol.)

This comment has been minimized.

Copy link
@shrubb

shrubb Apr 10, 2018

Author Contributor

Fixed, thanks. Should now work automatically, but you can always explicitly run WITH_CUDA=OFF luarocks install cv.


void *malloc(size_t size);
void free(void *ptr);
Expand Down Expand Up @@ -333,6 +334,10 @@ end
--- ***************** Tensor <=> Mat conversion *****************

-- Install the Torch-compatible Mat allocators.
C.initAllocator()
-- Only touch the CUDA allocator when the CUDA library was actually built and
-- loaded; presumably CUDACommon_C is nil in non-CUDA builds, where calling
-- initAllocatorCUDA would fail with an unresolved symbol -- TODO confirm.
if CUDACommon_C then
cv.cuda = cv.cuda or require 'cv._env_cuda'
CUDACommon_C.initAllocatorCUDA(cv.cuda._info())
end

local tensor_CV_code_by_letter = {
[ 66] = cv.CV_8U , -- B : Byte
Expand Down
56 changes: 54 additions & 2 deletions src/CUDACommon.cpp
@@ -1,5 +1,57 @@
#include <CUDACommon.hpp>

// Allocates device memory for `mat` through cutorch's device allocator, on
// cutorch's current stream, keeping cutorch's heap accounting in sync.
// Returns true on success. OpenCV's GpuMat machinery initializes *refcount
// itself after a successful allocate().
bool TorchCompatibleAllocator::allocate(cuda::GpuMat* mat, int rows, int cols, size_t elemSize) {

if (rows * cols == 0) {
// NOTE(review): THError normally does not return (longjmp/throw), so the
// `return false` below is likely unreachable -- kept as a safety net.
THError("You tried to allocate a Tensor with zero rows or columns");
return false;
}

// Compute the byte count in size_t space up front: the original expression
// `rows * cols * elemSize` evaluates `rows * cols` in int, which can
// overflow for large matrices before promotion to size_t.
const size_t numBytes = static_cast<size_t>(rows) * static_cast<size_t>(cols) * elemSize;

// See https://github.com/torch/cutorch/blob/master/lib/THC/generic/THCStorage.c#L69
THCState *state = this->cutorchState;
const THCDeviceAllocator *allocator = state->cudaDeviceAllocator;
void *allocatorContext = state->cudaDeviceAllocator->state;

// Record the allocation in cutorch's heap tracker before making it.
THCHeapUpdate(state, numBytes);
cudaError_t err = (*allocator->malloc)(
allocatorContext,
(void **) &(mat->data),
numBytes,
THCState_getCurrentStream(state));

if (err != cudaSuccess) {
// Roll back the heap accounting (negate in signed space, not unsigned),
// then let THCudaCheck report the error.
THCHeapUpdate(state, -static_cast<ptrdiff_t>(numBytes));
THCudaCheck(err);
return false;
}
THCudaCheck(err);

// Rows are stored contiguously with no padding, matching how Torch
// tensors view the same buffer.
mat->step = elemSize * cols;
mat->refcount = (int*) cv::fastMalloc(sizeof(int));

return true;
}

// Releases device memory previously obtained from cutorch's device allocator
// in allocate(), and undoes the corresponding heap accounting.
void TorchCompatibleAllocator::free(cuda::GpuMat* mat) {
// Mirrors https://github.com/torch/cutorch/blob/master/lib/THC/generic/THCStorage.c#L180
THCState *const thcState = this->cutorchState;
const THCDeviceAllocator *const deviceAlloc = thcState->cudaDeviceAllocator;

// step * rows is the full byte count, since allocate() never pads rows.
THCHeapUpdate(thcState, -(mat->step * mat->rows));
// Hand the buffer back to cutorch; fail loudly on any CUDA error.
THCudaCheck((*deviceAlloc->free)(deviceAlloc->state, mat->data));

// The refcount slot came from cv::fastMalloc in allocate().
cv::fastFree(mat->refcount);
}

// Single file-scope allocator instance; it must outlive every GpuMat that
// was allocated through it, hence static storage duration.
static TorchCompatibleAllocator torchCompatibleAllocator;

// Binds the allocator to the given cutorch state and installs it as the
// process-wide default GpuMat allocator. Called once from init.lua (only
// when the CUDA part of the library is loaded). extern "C" for Lua FFI.
extern "C"
void initAllocatorCUDA(cutorchInfo info) {
torchCompatibleAllocator.cutorchState = info.state;
cuda::GpuMat::setDefaultAllocator(&torchCompatibleAllocator);
}

GpuMatT::GpuMatT(cuda::GpuMat & mat) {
this->mat = mat;
this->tensor = nullptr;
Expand Down Expand Up @@ -158,7 +210,7 @@ std::vector<cv::cuda::GpuMat> TensorArray::toGpuMatList() {
return retval;
}

/************************ Fake OpenCV/CUDA classes *************************/
/************* Fake "custom memory stack impl for OpenCV" to use cutorch streams *************/

FakeDefaultDeviceInitializer initializer;

Expand Down Expand Up @@ -346,4 +398,4 @@ cuda::Stream & prepareStream(cutorchInfo info) {
cuda::setDevice(info.deviceID - 1);
fakeStream.impl_ = cv::makePtr<FakeStreamImpl>(THCState_getCurrentStream(info.state));
return *reinterpret_cast<cuda::Stream *>(&fakeStream);
}
}
2 changes: 1 addition & 1 deletion src/cudaobjdetect.cpp
Expand Up @@ -292,7 +292,7 @@ struct TensorWrapper CascadeClassifier_detectMultiScaleCuda(
GpuMatT objectsMat = objects.toGpuMatT();
cuda::GpuMat imageMat = image.toGpuMat();
cuda::GpuMat imageByte;
imageMat.convertTo(imageByte, CV_8U, 255.0); // Sorry guys :(
imageMat.convertTo(imageByte, CV_8U, 255.0); // Sorry guys :( #160
ptr->detectMultiScale(imageByte, objectsMat, prepareStream(info));
return TensorWrapper(objectsMat, info.state);
}
Expand Down
7 changes: 4 additions & 3 deletions src/cudaoptflow.cpp
Expand Up @@ -5,9 +5,10 @@ struct TensorWrapper DenseOpticalFlow_calcCuda(struct cutorchInfo info,
struct DenseOpticalFlowPtr ptr, struct TensorWrapper I0, struct TensorWrapper I1,
struct TensorWrapper flow)
{
cuda::GpuMat retval = flow.toGpuMat();
ptr->calc(I0.toGpuMat(), I1.toGpuMat(), retval, prepareStream(info));
return TensorWrapper(retval, info.state);
GpuMatT flowMat = flow.toGpuMatT();
ptr->calc(I0.toGpuMat(), I1.toGpuMat(), flowMat, prepareStream(info));

return TensorWrapper(flowMat, info.state);
}

extern "C"
Expand Down

0 comments on commit 8e3d1f4

Please sign in to comment.