Minor common-fixes for HIP (#345)
* Use scratchBufferHip

* minor fix

* remove additional variable use

* Add CHECK_RETURN_STATUS to hip API

* handle fix
r-abishek committed May 7, 2024
1 parent 9dcae9d commit 77e14ef
Showing 14 changed files with 126 additions and 146 deletions.
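Most of the changes below follow one of three patterns: wrap raw HIP runtime calls in CHECK_RETURN_STATUS so a failing hipMemcpy no longer goes unnoticed, replace per-call hipMalloc/hipFree pairs with the handle's preallocated scratchBufferHip device memory, and use the shared LOCAL_THREADS_X/Y/Z constants instead of per-function localThreads_* variables. The CHECK_RETURN_STATUS definition itself is not part of this diff; a minimal sketch of the usual shape of such a macro, under that assumption, is:

    // Hypothetical sketch only -- the repository's actual CHECK_RETURN_STATUS macro
    // lives elsewhere and may differ. Callers here return RppStatus, so the sketch
    // converts a HIP failure into RPP_ERROR.
    #include <hip/hip_runtime.h>
    #include <cstdio>

    #define CHECK_RETURN_STATUS(call)                                      \
        do                                                                 \
        {                                                                  \
            hipError_t status = (call);                                    \
            if (status != hipSuccess)                                      \
            {                                                              \
                printf("HIP error %s at %s:%d\n",                          \
                       hipGetErrorString(status), __FILE__, __LINE__);     \
                return RPP_ERROR;                                          \
            }                                                              \
        } while (0)

With a macro of that shape, a call such as hipMemcpy(...) that previously discarded its hipError_t result becomes CHECK_RETURN_STATUS(hipMemcpy(...)), as in copy.hpp, spatter.hpp, and warp_affine.hpp below.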
6 changes: 3 additions & 3 deletions src/modules/hip/hip_tensor_arithmetic_operations.hpp
@@ -22,13 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

-#ifndef HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
-#define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
+#ifndef HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP
+#define HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP

#include "kernel/fused_multiply_add_scalar.hpp"
#include "kernel/add_scalar.hpp"
#include "kernel/subtract_scalar.hpp"
#include "kernel/multiply_scalar.hpp"
#include "kernel/magnitude.hpp"

-#endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
+#endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP
2 changes: 1 addition & 1 deletion src/modules/hip/kernel/copy.hpp
@@ -58,7 +58,7 @@ RppStatus hip_exec_copy_tensor(T *srcPtr,
{
if (srcDescPtr->layout == dstDescPtr->layout)
{
-hipMemcpy(dstPtr, srcPtr, dstDescPtr->n * dstDescPtr->strides.nStride * sizeof(T), hipMemcpyDeviceToDevice);
+CHECK_RETURN_STATUS(hipMemcpy(dstPtr, srcPtr, dstDescPtr->n * dstDescPtr->strides.nStride * sizeof(T), hipMemcpyDeviceToDevice));
}
else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
{
9 changes: 1 addition & 8 deletions src/modules/hip/kernel/gamma_correction.hpp
@@ -227,9 +227,7 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr,
int globalThreads_y = handle.GetBatchSize();
int globalThreads_z = 1;

-float *gammaLUT;
-hipMalloc(&gammaLUT, 256 * handle.GetBatchSize() * sizeof(Rpp32f));

+Rpp32f *gammaLUT = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
hipLaunchKernelGGL(gamma_correction_lut_compute,
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
@@ -238,9 +236,6 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr,
gammaLUT,
handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem);




globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
globalThreads_y = dstDescPtr->h;
globalThreads_z = handle.GetBatchSize();
@@ -307,7 +302,5 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr,
}
}

-hipFree(&gammaLUT);

return RPP_SUCCESS;
}
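The gamma-correction path above no longer hipMallocs a fresh device LUT (256 floats per image in the batch) and no longer frees it at the end; it reuses the handle's preallocated scratchBufferHip device memory instead. Incidentally, the removed cleanup called hipFree(&gammaLUT), passing the address of the host pointer rather than the device pointer itself. A minimal sketch of the two lifetimes, with assumed names:

    // Sketch only: "scratchBufferHipFloatMem" stands in for
    // handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem and is assumed to be
    // at least 256 * batchSize floats of device memory owned by the handle.
    #include <hip/hip_runtime.h>
    #include <cstddef>

    // Old pattern: allocate and free device memory on every call.
    hipError_t makeGammaLutOldStyle(float **lut, size_t batchSize)
    {
        return hipMalloc(lut, 256 * batchSize * sizeof(float)); // caller must hipFree(*lut)
    }

    // New pattern: hand back a view into scratch memory the handle already owns,
    // so there is nothing to allocate or free per call.
    float* makeGammaLutNewStyle(float *scratchBufferHipFloatMem)
    {
        return scratchBufferHipFloatMem;
    }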
86 changes: 40 additions & 46 deletions src/modules/hip/kernel/gaussian_filter.hpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/modules/hip/kernel/noise_salt_and_pepper.hpp
@@ -279,7 +279,7 @@ RppStatus hip_exec_salt_and_pepper_noise_tensor(T *srcPtr,

Rpp32u *xorwowSeedStream;
xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1];
-hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice);
+CHECK_RETURN_STATUS(hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice));

if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
{
2 changes: 1 addition & 1 deletion src/modules/hip/kernel/noise_shot.hpp
@@ -320,7 +320,7 @@ RppStatus hip_exec_shot_noise_tensor(T *srcPtr,

Rpp32u *xorwowSeedStream;
xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1];
-hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice);
+CHECK_RETURN_STATUS(hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice));

if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
{
19 changes: 8 additions & 11 deletions src/modules/hip/kernel/ricap.hpp
@@ -174,18 +174,15 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
if (roiType == RpptRoiType::LTRB)
hip_exec_roi_converison_ltrb_to_xywh(roiPtrInputCropRegion, handle);

-int localThreads_x = LOCAL_THREADS_X;
-int localThreads_y = LOCAL_THREADS_Y;
-int localThreads_z = LOCAL_THREADS_Z;
int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
int globalThreads_y = dstDescPtr->h;
int globalThreads_z = handle.GetBatchSize();

if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
{
hipLaunchKernelGGL(ricap_pkd_tensor,
-dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
@@ -199,8 +196,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
{
hipLaunchKernelGGL(ricap_pln_tensor,
-dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
@@ -217,8 +214,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
{
hipLaunchKernelGGL(ricap_pkd3_pln3_tensor,
-dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
@@ -233,8 +230,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
{
globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
hipLaunchKernelGGL(ricap_pln3_pkd3_tensor,
-dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
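In the ricap.hpp hunks above (and in water.hpp below), the per-function localThreads_x/y/z variables are dropped and the launches use the HIP backend's shared LOCAL_THREADS_X/Y/Z constants directly, so every kernel takes its block shape from one place. The constants' values are not shown in this commit; the removed water.hpp locals used 16, 16, and 1, which the sketch below assumes:

    // Sketch only: LOCAL_THREADS_X/Y/Z are defined elsewhere in the backend; the
    // values here are assumed from the localThreads_* variables this commit removes.
    #include <hip/hip_runtime.h>
    #include <cmath>

    #define LOCAL_THREADS_X 16 // assumed
    #define LOCAL_THREADS_Y 16 // assumed
    #define LOCAL_THREADS_Z 1  // assumed

    // Grid dimensions: enough LOCAL_THREADS_X x LOCAL_THREADS_Y x LOCAL_THREADS_Z
    // blocks to cover the global work size in each dimension.
    dim3 launchGrid(int globalThreads_x, int globalThreads_y, int globalThreads_z)
    {
        return dim3(std::ceil((float)globalThreads_x / LOCAL_THREADS_X),
                    std::ceil((float)globalThreads_y / LOCAL_THREADS_Y),
                    std::ceil((float)globalThreads_z / LOCAL_THREADS_Z));
    }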
4 changes: 2 additions & 2 deletions src/modules/hip/kernel/spatter.hpp
@@ -243,8 +243,8 @@ RppStatus hip_exec_spatter_tensor(T *srcPtr,
float *spatterMaskPtr, *spatterMaskInvPtr;
spatterMaskPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
spatterMaskInvPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem + maskSize;
-hipMemcpy(spatterMaskPtr, spatterMask, maskSizeFloat, hipMemcpyHostToDevice);
-hipMemcpy(spatterMaskInvPtr, spatterMaskInv, maskSizeFloat, hipMemcpyHostToDevice);
+CHECK_RETURN_STATUS(hipMemcpy(spatterMaskPtr, spatterMask, maskSizeFloat, hipMemcpyHostToDevice));
+CHECK_RETURN_STATUS(hipMemcpy(spatterMaskInvPtr, spatterMaskInv, maskSizeFloat, hipMemcpyHostToDevice));

if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
{
2 changes: 1 addition & 1 deletion src/modules/hip/kernel/warp_affine.hpp
@@ -330,7 +330,7 @@ RppStatus hip_exec_warp_affine_tensor(T *srcPtr,
int globalThreads_z = handle.GetBatchSize();

float *affineTensorPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
-hipMemcpy(affineTensorPtr, affineTensor, 6 * handle.GetBatchSize() * sizeof(float), hipMemcpyHostToDevice);
+CHECK_RETURN_STATUS(hipMemcpy(affineTensorPtr, affineTensor, 6 * handle.GetBatchSize() * sizeof(float), hipMemcpyHostToDevice));

if (interpolationType == RpptInterpolationType::BILINEAR)
{
19 changes: 8 additions & 11 deletions src/modules/hip/kernel/water.hpp
@@ -222,18 +222,15 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
if (roiType == RpptRoiType::XYWH)
hip_exec_roi_converison_xywh_to_ltrb(roiTensorPtrSrc, handle);

-int localThreads_x = 16;
-int localThreads_y = 16;
-int localThreads_z = 1;
int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
int globalThreads_y = dstDescPtr->h;
int globalThreads_z = handle.GetBatchSize();

if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
{
hipLaunchKernelGGL(water_pkd_tensor,
-dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
@@ -251,8 +248,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
{
hipLaunchKernelGGL(water_pln_tensor,
-dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
@@ -273,8 +270,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
{
hipLaunchKernelGGL(water_pkd3_pln3_tensor,
-dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
@@ -293,8 +290,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
{
globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
hipLaunchKernelGGL(water_pln3_pkd3_tensor,
-dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-dim3(localThreads_x, localThreads_y, localThreads_z),
+dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
11 changes: 5 additions & 6 deletions src/modules/rppt_tensor_effects_augmentations.cpp
@@ -895,7 +895,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr,
{
#ifdef HIP_COMPILE
RpptROI roiTensorPtrSrcHost[dstDescPtr->n];
-hipMemcpy(roiTensorPtrSrcHost, roiTensorPtrSrc, dstDescPtr->n * sizeof(RpptROI), hipMemcpyDeviceToHost);
+CHECK_RETURN_STATUS(hipMemcpy(roiTensorPtrSrcHost, roiTensorPtrSrc, dstDescPtr->n * sizeof(RpptROI), hipMemcpyDeviceToHost));
if (roiType == RpptRoiType::XYWH)
{
for(int i = 0; i < dstDescPtr->n; i++)
@@ -1011,7 +1011,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr,

RpptXorwowState *d_xorwowInitialStatePtr;
d_xorwowInitialStatePtr = (RpptXorwowState *) rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
-hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowState), hipMemcpyHostToDevice);
+CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowState), hipMemcpyHostToDevice));

if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
{
@@ -1096,7 +1096,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr,

RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr;
d_xorwowInitialStatePtr = (RpptXorwowStateBoxMuller *) rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
-hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice);
+CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice));

if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
{
@@ -1451,9 +1451,8 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr,
#ifdef HIP_COMPILE
if(srcDescPtr->n == 1) // BatchSize should always be greater than 1
return RPP_ERROR;
-Rpp32u* permutationHipTensor;
-hipMalloc(&permutationHipTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n);
-hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice);
+Rpp32u *permutationHipTensor = reinterpret_cast<Rpp32u*>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice));

if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1)
|| (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1)