forked from RTKConsortium/RTK
-
Notifications
You must be signed in to change notification settings - Fork 2
/
rtkCudaUtilities.cu
243 lines (208 loc) · 8.75 KB
/
rtkCudaUtilities.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
/*=========================================================================
*
* Copyright RTK Consortium
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*=========================================================================*/
#include "rtkCudaUtilities.hcu"
#include <cublas_v2.h>
std::vector<int>
GetListOfCudaDevices()
{
std::vector<int> deviceList;
int deviceCount;
struct cudaDeviceProp properties;
if (cudaGetDeviceCount(&deviceCount) == cudaSuccess)
{
for (int device = 0; device < deviceCount; ++device)
{
cudaGetDeviceProperties(&properties, device);
if (properties.major != 9999) /* 9999 means emulation only */
deviceList.push_back(device);
}
}
if (deviceList.size() < 1)
itkGenericExceptionMacro(<< "No CUDA device available");
return deviceList;
}
std::pair<int, int>
GetCudaComputeCapability(int device)
{
struct cudaDeviceProp properties;
if (cudaGetDeviceProperties(&properties, device) != cudaSuccess)
itkGenericExceptionMacro(<< "Invalid CUDA device");
return std::make_pair(properties.major, properties.minor);
}
size_t
GetFreeGPUGlobalMemory(int device)
{
// The return result of cuda utility methods are stored in a CUresult
CUresult result;
// create cuda context
CUcontext cudaContext;
result = cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, device);
if (result != CUDA_SUCCESS)
{
itkGenericExceptionMacro(<< "Could not create context on this CUDA device");
}
// get the amount of free memory on the graphics card
size_t free;
size_t total;
result = cuMemGetInfo(&free, &total);
if (result != CUDA_SUCCESS)
{
itkGenericExceptionMacro(<< "Could not obtain information on free memory on this CUDA device");
}
cuCtxDestroy_v2(cudaContext);
return free;
}
__host__ void
prepareScalarTextureObject(int size[3],
float * dev_ptr,
cudaArray *& threeDArray,
cudaTextureObject_t & tex,
const bool isProjections,
const bool isLinear,
const cudaTextureAddressMode texAddressMode)
{
// create texture object
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeElementType;
for (int component = 0; component < 3; component++)
texDesc.addressMode[component] = texAddressMode;
if (isLinear)
texDesc.filterMode = cudaFilterModeLinear;
else
texDesc.filterMode = cudaFilterModePoint;
static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaExtent volExtent = make_cudaExtent(size[0], size[1], size[2]);
// Allocate an intermediate memory space to extract the components of the input volume
float * singleComponent;
int numel = size[0] * size[1] * size[2];
cudaMalloc(&singleComponent, numel * sizeof(float));
CUDA_CHECK_ERROR;
// Copy image data to arrays. The tricky part is the make_cudaPitchedPtr.
// The best way to understand it is to read
// https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
// Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays
if (isProjections)
cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent, cudaArrayLayered);
else
cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent);
CUDA_CHECK_ERROR;
// Fill it with the current singleComponent
cudaMemcpy3DParms CopyParams = {};
CopyParams.srcPtr = make_cudaPitchedPtr(dev_ptr, size[0] * sizeof(float), size[0], size[1]);
CUDA_CHECK_ERROR;
CopyParams.dstArray = threeDArray;
CopyParams.extent = volExtent;
CopyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&CopyParams);
CUDA_CHECK_ERROR;
// Fill in the texture object with all this information
resDesc.res.array.array = threeDArray;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
CUDA_CHECK_ERROR;
}
__host__ void
prepareVectorTextureObject(int size[3],
const float * dev_ptr,
std::vector<cudaArray *> & componentArrays,
const unsigned int nComponents,
std::vector<cudaTextureObject_t> & tex,
const bool isProjections,
const cudaTextureAddressMode texAddressMode)
{
componentArrays.resize(nComponents);
tex.resize(nComponents);
// Create CUBLAS context
cublasHandle_t handle;
cublasCreate(&handle);
// create texture object
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeElementType;
for (int component = 0; component < 3; component++)
texDesc.addressMode[component] = texAddressMode;
texDesc.filterMode = cudaFilterModeLinear;
static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaExtent volExtent = make_cudaExtent(size[0], size[1], size[2]);
// Allocate an intermediate memory space to extract the components of the input volume
float * singleComponent;
int numel = size[0] * size[1] * size[2];
cudaMalloc(&singleComponent, numel * sizeof(float));
CUDA_CHECK_ERROR;
float one = 1.0;
// Copy image data to arrays. The tricky part is the make_cudaPitchedPtr.
// The best way to understand it is to read
// https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
for (unsigned int component = 0; component < nComponents; component++)
{
// Reset the intermediate memory
cudaMemset((void *)singleComponent, 0, numel * sizeof(float));
// Fill it with the current component
const float * pComponent = dev_ptr + component;
cublasSaxpy(handle, numel, &one, pComponent, nComponents, singleComponent, 1);
// Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays
if (isProjections)
cudaMalloc3DArray(&componentArrays[component], &channelDesc, volExtent, cudaArrayLayered);
else
cudaMalloc3DArray(&componentArrays[component], &channelDesc, volExtent);
CUDA_CHECK_ERROR;
// Fill it with the current singleComponent
cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms();
CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]);
CUDA_CHECK_ERROR;
CopyParams.dstArray = componentArrays[component];
CopyParams.extent = volExtent;
CopyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&CopyParams);
CUDA_CHECK_ERROR;
// Fill in the texture object with all this information
resDesc.res.array.array = componentArrays[component];
cudaCreateTextureObject(&tex[component], &resDesc, &texDesc, NULL);
CUDA_CHECK_ERROR;
}
// Intermediate memory is no longer needed
cudaFree(singleComponent);
// Destroy CUBLAS context
cublasDestroy(handle);
}
__host__ void
prepareGeometryTextureObject(int length,
const float * geometry,
float *& dev_geom,
cudaTextureObject_t & tex_geom,
const unsigned int nParam)
{
// copy geometry matrix to device, bind the matrix to the texture
cudaMalloc((void **)&dev_geom, length * nParam * sizeof(float));
CUDA_CHECK_ERROR;
cudaMemcpy(dev_geom, geometry, length * nParam * sizeof(float), cudaMemcpyHostToDevice);
CUDA_CHECK_ERROR;
// create texture object
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = dev_geom;
resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
resDesc.res.linear.desc.x = 32; // bits per channel
resDesc.res.linear.sizeInBytes = length * nParam * sizeof(float);
cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeElementType;
cudaCreateTextureObject(&tex_geom, &resDesc, &texDesc, NULL);
CUDA_CHECK_ERROR;
}