Skip to content

Commit

Permalink
Merge pull request #4421 from pwang234/cuda-fix
Browse files Browse the repository at this point in the history
fix grid dim bug for eval_multi_multi_UBspline_3d_c_vgl_kernel
  • Loading branch information
ye-luo authored Jan 27, 2023
2 parents fa62bd0 + b903bd0 commit c101a9f
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 26 deletions.
8 changes: 4 additions & 4 deletions src/einspline/multi_bspline_cuda_c_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ eval_multi_multi_UBspline_3d_c_vgl_kernel(float const * __restrict__ pos,
__shared__ float ab[96];
__shared__ float a[12], b[12], c[12];
__shared__ float G[3][3], GGt[3][3];
int ir = blockIdx.y;
int ir = blockIdx.x;
float rx = pos[3*ir+0];
float ry = pos[3*ir+1];
float rz = pos[3*ir+2];
Expand Down Expand Up @@ -512,7 +512,7 @@ eval_multi_multi_UBspline_3d_c_vgl_kernel(float const * __restrict__ pos,
float h00 = 0.0f, h01 = 0.0f, h02 = 0.0f;
float h11 = 0.0f, h12 = 0.0f, h22 = 0.0f;
float v = 0.0f;
int off = blockIdx.x * blockDim.x + threadIdx.x;
int off = blockIdx.y * blockDim.x + threadIdx.x;
if (off < 2*N)
{
int stride_x = strides.x;
Expand Down Expand Up @@ -687,7 +687,7 @@ eval_multi_multi_UBspline_3d_c_vgl_cuda (multi_UBspline_3d_c_cuda *spline,
int num_splines=spline->num_splines;
int threadsPerBlock = max(64,min(32*((2*num_splines+31)/32),256));
dim3 dimBlock(threadsPerBlock);
dim3 dimGrid((2 * num_splines + dimBlock.x - 1) / dimBlock.x, num);
dim3 dimGrid(num, (2 * num_splines + dimBlock.x - 1) / dimBlock.x);
eval_multi_multi_UBspline_3d_c_vgl_kernel<<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(pos_d, (float*)spline->coefs, Linv_d, (float**)vals_d,
(float**)grad_lapl_d, spline->gridInv, spline->dim,
Expand All @@ -712,7 +712,7 @@ eval_multi_multi_UBspline_3d_c_vgl_cudasplit (multi_UBspline_3d_c_cuda *spline,
int num_splines=spline->num_split_splines;
int threadsPerBlock = max(64,min(32*((2*num_splines+31)/32),256));
dim3 dimBlock(threadsPerBlock);
dim3 dimGrid((2 * num_splines + dimBlock.x - 1) / dimBlock.x, num);
dim3 dimGrid(num, (2 * num_splines + dimBlock.x - 1) / dimBlock.x);
eval_multi_multi_UBspline_3d_c_vgl_kernel<<<dimGrid,dimBlock,0,s>>>
(pos_d, coefs, Linv_d, (float**)vals_d,
(float**)grad_lapl_d, spline->gridInv, spline->dim,
Expand Down
12 changes: 4 additions & 8 deletions src/einspline/multi_bspline_cuda_d_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,9 +397,9 @@ eval_multi_multi_UBspline_3d_d_vgl_kernel
double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride, int spline_offset)
{
int block = blockIdx.x;
int ir = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int block = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ double *myval, *mygrad_lapl;
__shared__ double3 r;
Expand Down Expand Up @@ -546,9 +546,7 @@ eval_multi_multi_UBspline_3d_d_vgl_cuda
double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (spline->num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_d_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride, 0);
Expand All @@ -563,9 +561,7 @@ eval_multi_multi_UBspline_3d_d_vgl_cudasplit
{
int num_splines=spline->num_split_splines;
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(num_splines/SPLINE_BLOCK_SIZE, num);
if (num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_d_vgl_kernel<<<dimGrid,dimBlock,0,s>>>
(pos_d, spline->gridInv, coefs, Linv_d, vals_d,
grad_lapl_d, spline->dim, spline->stride, num_splines, row_stride, device_nr*num_splines*num);
Expand Down
8 changes: 3 additions & 5 deletions src/einspline/multi_bspline_cuda_s_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -523,9 +523,9 @@ eval_multi_multi_UBspline_3d_s_vgl_kernel
float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride)
{
int block = blockIdx.x;
int ir = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int block = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ float *myval, *mygrad_lapl;
__shared__ float3 r;
Expand Down Expand Up @@ -680,9 +680,7 @@ eval_multi_multi_UBspline_3d_s_vgl_cuda
float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (spline->num_splines + SPLINE_BLOCK_SIZE -1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_s_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
Expand Down
14 changes: 5 additions & 9 deletions src/einspline/multi_bspline_cuda_z_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,9 @@ eval_multi_multi_UBspline_3d_z_vgl_kernel
double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride, int spline_offset)
{
int block = blockIdx.x;
int ir = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int block = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ double *myval, *mygrad_lapl;
__shared__ double3 r;
Expand Down Expand Up @@ -444,9 +444,7 @@ eval_multi_multi_UBspline_3d_z_vgl_cuda
double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (2*spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (2 * spline->num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_z_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, (double*)spline->coefs, Linv_d, (double**)vals_d,
(double**)grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride, 0);
Expand All @@ -459,11 +457,9 @@ eval_multi_multi_UBspline_3d_z_vgl_cudasplit
double *vals_d[], double *grad_lapl_d[], int num, int row_stride,
double *coefs, int device_nr, cudaStream_t s)
{
int num_splines=spline->num_split_splines;
int num_splines = spline->num_split_splines;
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(2*num_splines/SPLINE_BLOCK_SIZE, num);
if (2*num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (2 * num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_z_vgl_kernel<<<dimGrid,dimBlock,0,s>>>
(pos_d, spline->gridInv, coefs, Linv_d, (double**)vals_d,
(double**)grad_lapl_d, spline->dim, spline->stride, num_splines, row_stride, 2*device_nr*num_splines*num);
Expand Down

0 comments on commit c101a9f

Please sign in to comment.