diff --git a/include/rppdefs.h b/include/rppdefs.h
index 82539e139..d35f82bfd 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -134,7 +134,7 @@ typedef enum
     /*! \brief Out of bound source ROI \ingroup group_rppdefs */
     RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17,
     /*! \brief src and dst layout mismatch \ingroup group_rppdefs */
-    RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18,
+    RPP_ERROR_LAYOUT_MISMATCH = -18,
     /*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
     RPP_ERROR_INVALID_CHANNELS = -19
 } RppStatus;
diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
index 09e4cbd56..4e5f412db 100644
--- a/include/rppt_tensor_audio_augmentations.h
+++ b/include/rppt_tensor_audio_augmentations.h
@@ -60,7 +60,7 @@ extern "C" {
  * \retval RPP_SUCCESS Successful completion.
  * \retval RPP_ERROR* Unsuccessful completion.
  */
-RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32f *detectedIndexTensor, Rpp32f *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
+RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
 
 /*! \brief To Decibels augmentation on HOST backend
  * \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values
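With this change, non-silent region detection reports its outputs as integer sample positions (Rpp32s) rather than floats. A minimal host-side sketch of the updated call, assuming srcPtr, srcDescPtr, and the RPP handle are already set up elsewhere; the buffer names and numeric values below are illustrative, not part of this patch:

// Hedged usage sketch for the revised NSR detection signature (values illustrative).
Rpp32s srcLength[1] = {44100};                 // valid samples per batch entry
Rpp32s detectedIndex[1], detectionLength[1];   // outputs are now Rpp32s, matching the new API
RppStatus status = rppt_non_silent_region_detection_host(srcPtr, srcDescPtr, srcLength,
                                                         detectedIndex, detectionLength,
                                                         -60.0f, 2048, 0.0f, 8192, handle);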
diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h
index 695c3252d..8e846a41b 100644
--- a/include/rppt_tensor_geometric_augmentations.h
+++ b/include/rppt_tensor_geometric_augmentations.h
@@ -448,38 +448,42 @@ RppStatus rppt_phase_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc
 * \details This function performs slice augmentation on a generic 4D tensor.
 * Slice augmentation involves selecting a region of interest (ROI) from the source tensor
 * and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 datatypes.
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstGenericDescPtr destination tensor descriptor
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] srcPtr source tensor memory in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions)
+ * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions)
+ * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true
+ * \param [in] enablePadding boolean flag to specify if padding is enabled or not
+ * \param [in] roiTensor ROI data in HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2)
 * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
 * \return A \ref RppStatus enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
- * \ingroup group_tensor_geometric
 */
-RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle);
 
 #ifdef GPU_SUPPORT
 /*! \brief Slice augmentation GPU
 * \details This function performs slice augmentation on a generic 4D tensor.
 * Slice augmentation involves selecting a region of interest (ROI) from the source tensor
 * and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 datatypes.
- * \param[in] srcPtr source tensor in HIP memory
- * \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor in HIP memory
- * \param[in] dstGenericDescPtr destination tensor descriptor
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] srcPtr source tensor memory in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions)
+ * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions)
+ * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true
+ * \param [in] enablePadding boolean flag to specify if padding is enabled or not
+ * \param [in] roiTensor ROI data in pinned/HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2)
 * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
 * \return A \ref RppStatus enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
- * \ingroup group_tensor_geometric
 */
-RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle);
 #endif // GPU_SUPPORT
 
 /*! \brief Crop and Patch augmentation on HOST backend for a NCHW/NHWC layout tensor
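The declarations above replace the ROI-struct interface with flat per-dimension tensors. Per image, anchorTensor and shapeTensor hold numDims entries each, and roiTensor holds numDims begin values followed by numDims lengths (this layout is what the kernels later in this patch decode). A hedged packing sketch for a 3-dimensional (H, W, C) input; every numeric value is illustrative:

// Hedged packing sketch for batchSize = 2, numDims = 3 (H, W, C); values illustrative.
Rpp32s anchor[] = {10, 20, 0,   0, 0, 0};   // slice start per dim, per image
Rpp32s shape[]  = {64, 64, 3,  32, 48, 3};  // slice extent per dim, per image
Rpp32u roi[]    = {0, 0, 0, 224, 224, 3,    // image 0: begin x numDims, length x numDims
                   0, 0, 0, 224, 224, 3};   // image 1: begin x numDims, length x numDims
Rpp8u fill = 0;
rppt_slice_host(srcPtr, srcGenericDescPtr, dstPtr, dstGenericDescPtr,
                anchor, shape, &fill, true /*enablePadding*/, roi, handle);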
@@ -539,15 +543,15 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt
 Support added for f32 -> f32 and u8 -> u8 datatypes.
 * \image html input150x150x4.gif Sample Input
 * \image html geometric_augmentations_flip_150x150x4.gif Sample Output
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
 * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
 * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
 * \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in HOST memory, of size batchSize, with depthTensor[i] = 0/1)
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
 * \return A \ref RppStatus enumeration.
 * \retval RPP_SUCCESS Successful completion.
@@ -562,15 +566,15 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes
 Support added for f32 -> f32 and u8 -> u8 datatypes.
 * \image html input150x150x4.gif Sample Input
 * \image html geometric_augmentations_flip_150x150x4.gif Sample Output
- * \param[in] srcPtr source tensor in HIP memory
- * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
- * \param[out] dstPtr destination tensor in HIP memory
- * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
 * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
 * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
 * \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with depthTensor[i] = 0/1)
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
 * \return A \ref RppStatus enumeration.
 * \retval RPP_SUCCESS Successful completion.
diff --git a/src/modules/cpu/kernel/non_silent_region_detection.hpp b/src/modules/cpu/kernel/non_silent_region_detection.hpp
index 74dffb18e..39d9e6940 100644
--- a/src/modules/cpu/kernel/non_silent_region_detection.hpp
+++ b/src/modules/cpu/kernel/non_silent_region_detection.hpp
@@ -95,8 +95,8 @@ Rpp32f getSquare(Rpp32f &value)
 RppStatus non_silent_region_detection_host_tensor(Rpp32f *srcPtr,
                                                   RpptDescPtr srcDescPtr,
                                                   Rpp32s *srcLengthTensor,
-                                                  Rpp32f *detectedIndexTensor,
-                                                  Rpp32f *detectionLengthTensor,
+                                                  Rpp32s *detectedIndexTensor,
+                                                  Rpp32s *detectionLengthTensor,
                                                   Rpp32f cutOffDB,
                                                   Rpp32s windowLength,
                                                   Rpp32f referencePower,
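The slice kernels that follow all derive their copy extents the same way, and the rule is small enough to state standalone: copy at most `shape` elements per dimension, clamped to what remains past `anchor`, and pad only when the request overruns the input. A self-contained sketch (function names here are illustrative, not from the patch):

#include <algorithm>

// Copy extent per dimension: at most `shape` elements, clamped to what remains past `anchor`.
int validExtent(int anchor, int shape, int length) { return std::min(shape, length - anchor); }

// Padding is needed only when the requested slice runs past the input's length.
bool needsPadding(int anchor, int shape, int length) { return (anchor + shape) > length; }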
#include "rpp_cpu_simd.hpp" #include "rpp_cpu_common.hpp" -RppStatus slice_f32_f32_host_tensor(Rpp32f *srcPtr, - RpptGenericDescPtr srcGenericDescPtr, - Rpp32f *dstPtr, - RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, - RppLayoutParams layoutParams, - rpp::Handle& handle) +template +RppStatus slice_host_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T* fillValue, + bool enablePadding, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) { - RpptROI3D roiDefault; - if(srcGenericDescPtr->layout==RpptLayout::NCDHW) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; - else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims omp_set_dynamic(0); #pragma omp parallel for num_threads(numThreads) for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { - RpptROI3D roi; - RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; - compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + T *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; - Rpp32f *srcPtrImage, *dstPtrImage; - srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; - dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + T *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrTemp; - Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; - Rpp32f *srcPtrChannel, *dstPtrChannel; - dstPtrChannel = dstPtrImage; + // get the starting address of length values from roiTensor + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); - // Slice without fused output-layout toggle (NCDHW -> NCDHW) - if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + if (numDims == 4) { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - - Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f); - for(int c = 0; c < layoutParams.channelParam; c++) + // order of dims + Rpp32s dimsOrder[3]; + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + } + else { - Rpp32f *srcPtrDepth, *dstPtrDepth; - srcPtrDepth = srcPtrChannel; - dstPtrDepth = dstPtrChannel; + dimsOrder[0] = 0; // depth + dimsOrder[1] = 1; // height + dimsOrder[2] = 2; // width + } + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier; + 
Rpp32u copyLengthInBytes = bufferLength * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) || + ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]])); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); + + // slice without fused output-layout toggle (NCDHW -> NCDHW) + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * srcGenericDescPtr->strides[3]) + (anchor[3] * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) + { + T *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < maxDepth; i++) + { + T *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < maxHeight; j++) + { + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // slice without fused output-layout toggle (NDHWC -> NDHWC) + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier); + T *srcPtrDepth = srcPtrChannel; + T *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < maxDepth; i++) { - Rpp32f *srcPtrRow, *dstPtrRow; + T *srcPtrRow, *dstPtrRow; srcPtrRow = srcPtrDepth; dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + for(int j = 0; j < maxHeight; j++) { memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); - - srcPtrRow += srcGenericDescPtr->strides[3]; - dstPtrRow += dstGenericDescPtr->strides[3]; + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; } - srcPtrDepth += srcGenericDescPtr->strides[2]; - dstPtrDepth += dstGenericDescPtr->strides[2]; + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; } - - srcPtrChannel += srcGenericDescPtr->strides[1]; - dstPtrChannel += srcGenericDescPtr->strides[1]; } } - // Slice without fused output-layout toggle (NDHWC -> NDHWC) - else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + else if (numDims == 3) { - Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f); - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - Rpp32f *srcPtrDepth = srcPtrChannel; - Rpp32f *dstPtrDepth = dstPtrChannel; - - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // order of dims + Rpp32s dimsOrder[2]; + if (dstGenericDescPtr->layout == RpptLayout::NCHW) { - Rpp32f *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) - { - memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); - - srcPtrRow += 
srcGenericDescPtr->strides[2]; - dstPtrRow += dstGenericDescPtr->strides[2]; - } - srcPtrDepth += srcGenericDescPtr->strides[1]; - dstPtrDepth += dstGenericDescPtr->strides[1]; + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + } + else + { + dimsOrder[0] = 0; // height + dimsOrder[1] = 1; // width } - } - } - - return RPP_SUCCESS; -} - -RppStatus slice_u8_u8_host_tensor(Rpp8u *srcPtr, - RpptGenericDescPtr srcGenericDescPtr, - Rpp8u *dstPtr, - RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, - RppLayoutParams layoutParams, - rpp::Handle& handle) -{ - RpptROI3D roiDefault; - if(srcGenericDescPtr->layout==RpptLayout::NCDHW) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; - else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; - Rpp32u numThreads = handle.GetNumThreads(); - - omp_set_dynamic(0); -#pragma omp parallel for num_threads(numThreads) - for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) - { - RpptROI3D roi; - RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; - compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); - Rpp8u *srcPtrImage, *dstPtrImage; - srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; - dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier; + Rpp32u copyLengthInBytes = bufferLength * sizeof(T); - Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; - Rpp8u *srcPtrChannel, *dstPtrChannel; - dstPtrChannel = dstPtrImage; + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); - // Slice without fused output-layout toggle (NCDHW -> NCDHW) - if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) - { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - for(int c = 0; c < layoutParams.channelParam; c++) + // slice without fused output-layout toggle (NCHW -> NCHW) + if (dstGenericDescPtr->layout == RpptLayout::NCHW) { - Rpp8u *srcPtrDepth, *dstPtrDepth; - srcPtrDepth = srcPtrChannel; - dstPtrDepth = dstPtrChannel; - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) { - Rpp8u *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + T *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) { - memcpy(dstPtrRow, srcPtrRow, bufferLength * 
sizeof(Rpp8u)); - srcPtrRow += srcGenericDescPtr->strides[3]; - dstPtrRow += dstGenericDescPtr->strides[3]; + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; } - srcPtrDepth += srcGenericDescPtr->strides[2]; - dstPtrDepth += dstGenericDescPtr->strides[2]; + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; } - srcPtrChannel += srcGenericDescPtr->strides[1]; - dstPtrChannel += srcGenericDescPtr->strides[1]; } - } - // Slice without fused output-layout toggle (NDHWC -> NDHWC) - else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) - { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - Rpp8u *srcPtrDepth = srcPtrChannel; - Rpp8u *dstPtrDepth = dstPtrChannel; - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // slice without fused output-layout toggle (NHWC -> NHWC) + else if (dstGenericDescPtr->layout == RpptLayout::NHWC) { - Rpp8u *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * layoutParams.bufferMultiplier); + T *srcPtrRow = srcPtrChannel; + T *dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) { - memcpy(dstPtrRow, srcPtrRow, bufferLength * sizeof(Rpp8u)); - srcPtrRow += srcGenericDescPtr->strides[2]; - dstPtrRow += dstGenericDescPtr->strides[2]; + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[1]; } - srcPtrDepth += srcGenericDescPtr->strides[1]; - dstPtrDepth += dstGenericDescPtr->strides[1]; } } + else if (numDims == 2) + { + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + anchor[1]; + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + Rpp32u copyLengthInBytes = maxWidth * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[0] + shape[0]) > length[0]) || + ((anchor[1] + shape[1]) > length[1]); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); + + T *srcPtrRow = srcPtrChannel; + T *dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) + { + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[1]; + } + } + else if (numDims == 1) + { + srcPtrChannel = srcPtrTemp + anchor[0]; + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + Rpp32u copyLengthInBytes = maxLength * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[0] + shape[0]) > length[0]); + if (needPadding && enablePadding) + std::fill(dstPtrTemp, dstPtrTemp + dstGenericDescPtr->strides[0] - 1, *fillValue); + memcpy(dstPtrChannel, srcPtrChannel, copyLengthInBytes); + } } return RPP_SUCCESS; diff --git a/src/modules/hip/kernel/slice.hpp b/src/modules/hip/kernel/slice.hpp index 8deb52bbb..c1b7a6c41 100644 --- a/src/modules/hip/kernel/slice.hpp +++ b/src/modules/hip/kernel/slice.hpp @@ -2,24 +2,79 @@ #include #include 
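Every layout branch in the host kernel above reduces to the same strided row-copy loop; only the stride indices differ between layouts. A standalone sketch of that pattern, with hypothetical names (stride counts are in elements, the copy length in bytes, mirroring the code above):

#include <cstring>

// Copy `rows` rows of `rowBytes` bytes each, advancing src/dst by their own row strides.
template <typename T>
void copyRows(T *dst, const T *src, int rows, size_t rowBytes, size_t srcStride, size_t dstStride)
{
    for (int r = 0; r < rows; r++)
        memcpy(dst + r * dstStride, src + r * srcStride, rowBytes);
}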
"rpp_hip_common.hpp" +template +__global__ void fill_value_ncdhw_hip_tensor(T *dstPtr, + uint3 dstStridesCDH, + int channels, + uint3 dstDimsDHW, + T *fillValue) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z)) + { + return; + } + + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + d_float8 val_f8; + val_f8.f4[0] = (float4)(*fillValue); + val_f8.f4[1] = val_f8.f4[0]; + for(int c = 0; c < channels; c++) + { + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + dstIdx += dstStridesCDH.x; + } +} + + +template +__global__ void fill_value_ndhwc_hip_tensor(T *dstPtr, + uint2 dstStridesDH, + uint3 dstDimsDHW, + T *fillValue) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z)) + { + return; + } + + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + d_float24 val_f24; + val_f24.f4[0] = (float4)(*fillValue); + val_f24.f4[1] = val_f24.f4[0]; + val_f24.f4[2] = val_f24.f4[0]; + val_f24.f4[3] = val_f24.f4[0]; + val_f24.f4[4] = val_f24.f4[0]; + val_f24.f4[5] = val_f24.f4[0]; + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + + template __global__ void slice_ncdhw_hip_tensor(T *srcPtr, uint3 srcStridesCDH, T *dstPtr, uint3 dstStridesCDH, int channels, - RpptROI3DPtr roiGenericSrc) + uint3 validShapeDHW) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim - if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth)) + if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z)) { return; } - uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x); + uint srcIdx = (id_z * srcStridesCDH.y) + (id_y * srcStridesCDH.z) + id_x; uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; d_float8 val_f8; @@ -32,77 +87,439 @@ __global__ void slice_ncdhw_hip_tensor(T *srcPtr, } } + template __global__ void slice_ndhwc_hip_tensor(T *srcPtr, uint2 srcStridesDH, T *dstPtr, uint2 dstStridesDH, - RpptROI3DPtr roiGenericSrc) + uint3 validShapeDHW) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim - if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth)) + if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z)) { return; } - uint srcIdx = ((id_z + 
roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x) * 3; - uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + uint srcIdx = (id_z * srcStridesDH.x) + (id_y * srcStridesDH.y) + (id_x * 3); + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + (id_x * 3); d_float24 val_f24; rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); } +template +RppStatus hip_exec_fill_value_tensor(T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T *fillValue, + Rpp32u *roiTensor, + rpp::Handle& handle, + Rpp32u numDims) +{ + if (numDims == 4) + { + // set the dimsOrder and globalthreads values required for NDHWC layout + Rpp32s dimsOrder[3] = {0, 1, 2}; + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + // change the dimsOrder and globalthreads values if layout is NCDHW + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + } + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + + // checking if padding is required + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) || + ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]])); + + // if needPadding is set, launch kernel for filling the padded region with fill value specified + if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + make_uint3(maxDepth, maxHeight, maxWidth), + fillValue); + } + else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), 
ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + make_uint3(maxDepth, maxHeight, maxWidth), + fillValue); + } + } + } + else if (numDims == 3) + { + // set the dimsOrder and globalthreads values required for NHWC layout + Rpp32s dimsOrder[2] = {0, 1}; + int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + // change the dimsOrder and globalthreads values if layout is NCHW + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + globalThreads_z = 1; + } + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + + // check if padding is needed + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]])); + + // launch kernel for filling the padded region with fill value specified + if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCHW) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]), + dstGenericDescPtr->dims[1], + make_uint3(1, shape[1], shape[2]), + fillValue); + } + else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NHWC) + { + hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(1, dstGenericDescPtr->strides[1]), + make_uint3(1, maxHeight, maxWidth), + fillValue); + } + } + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length 
= reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + + // check if padding is needed + bool needPadding = (((anchor[0] + shape[0]) > length[0]) || + ((anchor[1] + shape[1]) > length[1])); + + // launch kernel for filling the padded region with fill value specified + if (needPadding) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(0, 0, dstGenericDescPtr->strides[1]), + 1, + make_uint3(1, shape[0], shape[1]), + fillValue); + } + } + } + else if (numDims == 1) + { + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = 1; + int globalThreads_z = 1; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + + // check if padding is needed + bool needPadding = ((anchor[0] + shape[0]) > length[0]); + + // launch kernel for filling the padded region with fill value specified + if (needPadding) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, 1, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(0, 0, 1), + 1, + make_uint3(1, 1, shape[0]), + fillValue); + } + } + } + + return RPP_SUCCESS; +} + template RppStatus hip_exec_slice_tensor(T *srcPtr, RpptGenericDescPtr srcGenericDescPtr, T *dstPtr, RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T *fillValue, + bool enablePadding, + Rpp32u *roiTensor, rpp::Handle& handle) { - if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + + /* if enabledPadding is set to true, launch kernel to fill the output buffers with fill value specified. 
+ This will be only done if shapeTensor[d] > roiTensor[d] where d is the dimension*/ + if (enablePadding) + { + hip_exec_fill_value_tensor(dstPtr, + dstGenericDescPtr, + anchorTensor, + shapeTensor, + fillValue, + roiTensor, + handle, + numDims); + hipStreamSynchronize(handle.GetStream()); + } + + if(numDims == 4) { - int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel - int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) - int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + // set the dimsOrder and globalthreads values required for NDHWC layout + Rpp32s dimsOrder[3] = {0, 1, 2}; + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + // change the dimsOrder and globalthreads values if layout is NCDHW + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + } for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2] * srcGenericDescPtr->strides[3] + anchor[3]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtrTemp, + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + make_uint3(maxDepth, maxHeight, maxWidth)); + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + 
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtrTemp, + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + make_uint3(maxDepth, maxHeight, maxWidth)); + } + } + } + else if (numDims == 3) + { + // set the dimsOrder and globalthreads values required for NHWC layout + Rpp32s dimsOrder[2] = {0, 1}; + int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + // change the dimsOrder and globalthreads values if layout is NCHW + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + globalThreads_z = 1; + } + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint3(srcGenericDescPtr->strides[1], 0, srcGenericDescPtr->strides[2]), + dstPtrTemp, + make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]), + dstGenericDescPtr->dims[1], + make_uint3(1, maxHeight, maxWidth)); + } + else if (dstGenericDescPtr->layout == RpptLayout::NHWC) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint2(1, srcGenericDescPtr->strides[1]), + dstPtrTemp, + make_uint2(1, dstGenericDescPtr->strides[1]), + make_uint3(1, maxHeight, maxWidth)); + } + } + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = 
&shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[2] + anchor[1]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), 0, handle.GetStream(), - srcPtr + (batchCount * srcGenericDescPtr->strides[0]), - make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), - dstPtr + (batchCount * dstGenericDescPtr->strides[0]), - make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), - dstGenericDescPtr->dims[1], - &roiGenericPtrSrc[batchCount]); + srcPtrTemp, + make_uint3(0, 0, srcGenericDescPtr->strides[1]), + dstPtrTemp, + make_uint3(0, 0, dstGenericDescPtr->strides[1]), + 1, + make_uint3(1, maxHeight, maxWidth)); } } - else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + else if (numDims == 1) { - int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel - int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) - int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) - + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = 1; + int globalThreads_z = 1; for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { - hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + dim3(LOCAL_THREADS_X, 1, 1), 0, handle.GetStream(), - srcPtr + (batchCount * srcGenericDescPtr->strides[0]), - make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), - dstPtr + (batchCount * dstGenericDescPtr->strides[0]), - make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), - &roiGenericPtrSrc[batchCount]); + srcPtrTemp, + make_uint3(0, 0, 1), + dstPtrTemp, + make_uint3(0, 0, 1), + 1, + make_uint3(1, 1, maxLength)); } } diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index b629d1590..e20211ec1 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -32,8 +32,8 @@ SOFTWARE. 
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
index b629d1590..e20211ec1 100644
--- a/src/modules/rppt_tensor_audio_augmentations.cpp
+++ b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -32,8 +32,8 @@ SOFTWARE.
 RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr,
                                                 RpptDescPtr srcDescPtr,
                                                 Rpp32s *srcLengthTensor,
-                                                Rpp32f *detectedIndexTensor,
-                                                Rpp32f *detectionLengthTensor,
+                                                Rpp32s *detectedIndexTensor,
+                                                Rpp32s *detectionLengthTensor,
                                                 Rpp32f cutOffDB,
                                                 Rpp32s windowLength,
                                                 Rpp32f referencePower,
diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp
index 45a0d5221..d758aa676 100644
--- a/src/modules/rppt_tensor_geometric_augmentations.cpp
+++ b/src/modules/rppt_tensor_geometric_augmentations.cpp
@@ -1050,43 +1050,54 @@
 RppStatus rppt_slice_host(RppPtr_t srcPtr,
                           RpptGenericDescPtr srcGenericDescPtr,
                           RppPtr_t dstPtr,
                           RpptGenericDescPtr dstGenericDescPtr,
-                          RpptROI3DPtr roiGenericPtrSrc,
-                          RpptRoi3DType roiType,
+                          Rpp32s *anchorTensor,
+                          Rpp32s *shapeTensor,
+                          RppPtr_t fillValue,
+                          bool enablePadding,
+                          Rpp32u *roiTensor,
                           rppHandle_t rppHandle)
 {
+    if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
+    if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+    if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH;
+
     RppLayoutParams layoutParams;
     if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
         layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
     else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
         layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
-
-    if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
-    if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
-    if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
-    if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
-    if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+    else if ((srcGenericDescPtr->layout == RpptLayout::NCHW) && (dstGenericDescPtr->layout == RpptLayout::NCHW))
+        layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+    else if ((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC))
+        layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]);
 
     if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
     {
-        slice_f32_f32_host_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
-                                  srcGenericDescPtr,
-                                  (Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
-                                  dstGenericDescPtr,
-                                  roiGenericPtrSrc,
-                                  roiType,
-                                  layoutParams,
-                                  rpp::deref(rppHandle));
+        slice_host_tensor(reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                          srcGenericDescPtr,
+                          reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+                          dstGenericDescPtr,
+                          anchorTensor,
+                          shapeTensor,
+                          static_cast<Rpp32f *>(fillValue),
+                          enablePadding,
+                          roiTensor,
+                          layoutParams,
+                          rpp::deref(rppHandle));
     }
     else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
     {
-        slice_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
-                                srcGenericDescPtr,
-                                static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
-                                dstGenericDescPtr,
-                                roiGenericPtrSrc,
-                                roiType,
-                                layoutParams,
-                                rpp::deref(rppHandle));
+        slice_host_tensor(static_cast<Rpp8u *>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+                          srcGenericDescPtr,
+                          static_cast<Rpp8u *>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+                          dstGenericDescPtr,
+                          anchorTensor,
+                          shapeTensor,
+                          static_cast<Rpp8u *>(fillValue),
+                          enablePadding,
+                          roiTensor,
+                          layoutParams,
+                          rpp::deref(rppHandle));
     }
 
     return RPP_SUCCESS;
@@ -1848,24 +1859,29 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr,
                          RpptGenericDescPtr srcGenericDescPtr,
                          RppPtr_t dstPtr,
                          RpptGenericDescPtr dstGenericDescPtr,
-                         RpptROI3DPtr roiGenericPtrSrc,
-                         RpptRoi3DType roiType,
+                         Rpp32s *anchorTensor,
+                         Rpp32s *shapeTensor,
+                         RppPtr_t fillValue,
+                         bool enablePadding,
+                         Rpp32u *roiTensor,
                          rppHandle_t rppHandle)
 {
 #ifdef HIP_COMPILE
-    if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
-    if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
-    if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
     if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
     if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+    if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH;
 
     if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
     {
-        hip_exec_slice_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+        hip_exec_slice_tensor(reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(srcPtr) + srcGenericDescPtr->offsetInBytes),
                               srcGenericDescPtr,
-                              (Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+                              reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(dstPtr) + dstGenericDescPtr->offsetInBytes),
                               dstGenericDescPtr,
-                              roiGenericPtrSrc,
+                              anchorTensor,
+                              shapeTensor,
+                              static_cast<Rpp32f *>(fillValue),
+                              enablePadding,
+                              roiTensor,
                               rpp::deref(rppHandle));
     }
     else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
@@ -1874,7 +1890,11 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr,
                              srcGenericDescPtr,
                              static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
                              dstGenericDescPtr,
-                             roiGenericPtrSrc,
+                             anchorTensor,
+                             shapeTensor,
+                             static_cast<Rpp8u *>(fillValue),
+                             enablePadding,
+                             roiTensor,
                              rpp::deref(rppHandle));
     }
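Both entry points now validate the datatypes first and return the renamed layout status when source and destination disagree. A hedged caller-side sketch of handling that path (variable names follow the test code below; the error text is illustrative):

#include <cstdio>

RppStatus status = rppt_slice_gpu(d_input, srcGenericDescPtr, d_output, dstGenericDescPtr,
                                  anchorTensor, shapeTensor, &fillValue, enablePadding,
                                  roiTensor, handle);
if (status == RPP_ERROR_LAYOUT_MISMATCH)
    fprintf(stderr, "slice: source and destination layouts must match\n");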
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index f6c9276ef..e7b880433 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -338,6 +338,14 @@ int main(int argc, char **argv)
         CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
     }
 
+    // create generic descriptor and params in case of slice
+    RpptGenericDesc descriptor3D;
+    RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
+    Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+    Rpp32u *roiTensor = NULL;
+    if(testCase == 90)
+        set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize);
+
     // Allocate hip memory for src/dst
     CHECK_RETURN_STATUS(hipMalloc(&d_input, inputBufferSize));
     CHECK_RETURN_STATUS(hipMalloc(&d_output, outputBufferSize));
@@ -348,6 +356,7 @@ int main(int argc, char **argv)
     if(testCase == 82)
         CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI)));
 
+    // create cropRoi and patchRoi in case of crop_and_patch
     RpptROI *cropRoi, *patchRoi;
     if(testCase == 33)
     {
@@ -1137,6 +1146,28 @@
             break;
         }
+        case 90:
+        {
+            testCaseName = "slice";
+            Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims
+            if(anchorTensor == NULL)
+                CHECK_RETURN_STATUS(hipHostMalloc(&anchorTensor, batchSize * numDims * sizeof(Rpp32s)));
+            if(shapeTensor == NULL)
+                CHECK_RETURN_STATUS(hipHostMalloc(&shapeTensor, batchSize * numDims * sizeof(Rpp32s)));
+            if(roiTensor == NULL)
+                CHECK_RETURN_STATUS(hipHostMalloc(&roiTensor, batchSize * numDims * 2 * sizeof(Rpp32u)));
+            bool enablePadding = false;
+            auto fillValue = 0;
+            init_slice(descriptorPtr3D, roiTensorPtrSrc, roiTensor, anchorTensor, shapeTensor);
+
+            startWallTime = omp_get_wtime();
+            if((inputBitDepth == 0 || inputBitDepth == 2) && srcDescPtr->layout == dstDescPtr->layout)
+                rppt_slice_gpu(d_input, descriptorPtr3D, d_output, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
+            else
+                missingFuncFlag = 1;
+
+            break;
+        }
         default:
             missingFuncFlag = 1;
             break;
@@ -1232,6 +1263,42 @@ int main(int argc, char **argv)
             refFile.close();
         }
 
+        // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values
+        // for output display and comparison purposes
+        if (testCase == 90)
+        {
+            if (dstDescPtr->layout == RpptLayout::NCHW)
+            {
+                if (dstDescPtr->c == 3)
+                {
+                    for(int i = 0; i < batchSize; i++)
+                    {
+                        int idx1 = i * 3;
+                        dstImgSizes[i].height = shapeTensor[idx1 + 1];
+                        dstImgSizes[i].width = shapeTensor[idx1 + 2];
+                    }
+                }
+                else
+                {
+                    for(int i = 0; i < batchSize; i++)
+                    {
+                        int idx1 = i * 2;
+                        dstImgSizes[i].height = shapeTensor[idx1];
+                        dstImgSizes[i].width = shapeTensor[idx1 + 1];
+                    }
+                }
+            }
+            else if (dstDescPtr->layout == RpptLayout::NHWC)
+            {
+                for(int i = 0; i < batchSize; i++)
+                {
+                    int idx1 = i * 3;
+                    dstImgSizes[i].height = shapeTensor[idx1];
+                    dstImgSizes[i].width = shapeTensor[idx1 + 1];
+                }
+            }
+        }
+
         /*Compare the output of the function with golden outputs only if
         1.QA Flag is set
         2.input bit depth 0 (Input U8 && Output U8)
@@ -1294,6 +1361,12 @@ int main(int argc, char **argv)
     }
     if (reductionTypeCase)
         CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr));
+    if(anchorTensor != NULL)
+        CHECK_RETURN_STATUS(hipHostFree(anchorTensor));
+    if(shapeTensor != NULL)
+        CHECK_RETURN_STATUS(hipHostFree(shapeTensor));
+    if(roiTensor != NULL)
+        CHECK_RETURN_STATUS(hipHostFree(roiTensor));
     free(input);
     free(input_second);
     free(output);
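The allocation sizes in case 90 follow directly from the generic descriptor. Spelled out for an illustrative batchSize of 2 with a 3-channel 2D input (so numDims = 3):

    // anchorTensor, shapeTensor : 2 * 3     = 6  Rpp32s values (one entry per dimension, per image)
    // roiTensor                 : 2 * 3 * 2 = 12 Rpp32u values (per-image begins, then per-image lengths)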
diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
index b39f461cc..44f2dea72 100644
--- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
@@ -146,6 +146,10 @@ int main(int argc, char * argv[])
     void *pinnedMemArgs;
     CHECK_RETURN_STATUS(hipHostMalloc(&pinnedMemArgs, 2 * noOfFiles * sizeof(Rpp32f)));
 
+    // arguments required for slice
+    Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+    Rpp32u *roiTensor = NULL;
+
     rppHandle_t handle;
     hipStream_t stream;
     CHECK_RETURN_STATUS(hipStreamCreate(&stream));
@@ -254,12 +258,21 @@ int main(int argc, char * argv[])
         case 1:
         {
             testCaseName = "slice";
+            if(anchorTensor == NULL)
+                CHECK_RETURN_STATUS(hipHostMalloc(&anchorTensor, batchSize * 4 * sizeof(Rpp32s)));
+            if(shapeTensor == NULL)
+                CHECK_RETURN_STATUS(hipHostMalloc(&shapeTensor, batchSize * 4 * sizeof(Rpp32s)));
+            if(roiTensor == NULL)
+                CHECK_RETURN_STATUS(hipHostMalloc(&roiTensor, batchSize * 8 * sizeof(Rpp32u)));
+            bool enablePadding = false;
+            auto fillValue = 0;
+            init_slice_voxel(descriptorPtr3D, roiGenericSrcPtr, roiTensor, anchorTensor, shapeTensor);
 
             startWallTime = omp_get_wtime();
             if (inputBitDepth == 0)
-                rppt_slice_gpu(d_inputU8, descriptorPtr3D, d_outputU8, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+                rppt_slice_gpu(d_inputU8, descriptorPtr3D, d_outputU8, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
             else if(inputBitDepth == 2)
-                rppt_slice_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+                rppt_slice_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
             else
                 missingFuncFlag = 1;
 
@@ -413,6 +426,39 @@ int main(int argc, char * argv[])
             outputF32[i] = static_cast<Rpp32f>(outputU8[i]);
     }
 
+    // if test case is slice and qaFlag is set, update the ROI with shapeTensor values
+    // for output display and comparison purposes
+    if(testCase == 1)
+    {
+        // update the roi for comparison with the shapeTensor values
+        if (descriptorPtr3D->layout == RpptLayout::NCDHW)
+        {
+            for(int i = 0; i < batchSize; i++)
+            {
+                int idx1 = i * 4;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1 + 1];
+                roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 2];
+                roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 3];
+            }
+        }
+        else if(descriptorPtr3D->layout == RpptLayout::NDHWC)
+        {
+            for(int i = 0; i < batchSize; i++)
+            {
+                int idx1 = i * 4;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1];
+                roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 1];
+                roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 2];
+            }
+        }
+    }
+
     /*Compare the output of the function with golden outputs only if
     1.QA Flag is set
     2.input bit depth 2 (F32)*/
@@ -502,6 +548,12 @@ int main(int argc, char * argv[])
     CHECK_RETURN_STATUS(hipHostFree(pinnedMemArgs));
     CHECK_RETURN_STATUS(hipFree(d_inputF32));
     CHECK_RETURN_STATUS(hipFree(d_outputF32));
+    if(anchorTensor != NULL)
+        CHECK_RETURN_STATUS(hipHostFree(anchorTensor));
+    if(shapeTensor != NULL)
+        CHECK_RETURN_STATUS(hipHostFree(shapeTensor));
+    if(roiTensor != NULL)
+        CHECK_RETURN_STATUS(hipHostFree(roiTensor));
     if(inputBitDepth == 0)
     {
         if(inputU8 != NULL)
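On the voxel path every sample exposes four sliceable dimensions, so anchors and shapes carry 4 entries per sample and the ROI carries 8. A worked example of what init_slice_voxel produces for one NCDHW sample whose source ROI is a full 1x64x128x128 volume (sizes illustrative; D/H/W are halved, channels are kept):

    Rpp32u roiTensor[8]    = {0, 0, 0, 0, 1, 64, 128, 128}; // begins (c, z, y, x), then lengths (c, d, h, w)
    Rpp32s anchorTensor[4] = {0, 0, 0, 0};
    Rpp32s shapeTensor[4]  = {1, 32, 64, 64};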
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index 76fb7d15c..28db84bee 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -39,7 +39,7 @@
 outFolderPath = os.getcwd()
 buildFolderPath = os.getcwd()
 caseMin = 0
-caseMax = 89
+caseMax = 90
 
 # Get a list of log files based on a flag for preserving output
 def get_log_file_list(preserveOutput):
@@ -327,7 +327,7 @@ def rpp_test_suite_parser_and_validator():
     subprocess.run(["make", "-j16"], cwd=".")    # nosec
 
 # List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90']
 
 # Create folders based on testType and profilingOption
 if testType == 1 and profilingOption == "YES":
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index 19aa5452f..15a94095d 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -339,6 +339,15 @@ int main(int argc, char **argv)
         }
     }
 
+    // create generic descriptor and params in case of slice
+    RpptGenericDesc descriptor3D;
+    RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
+    Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+    Rpp32u *roiTensor = NULL;
+    if(testCase == 90)
+        set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize);
+
+    // create cropRoi and patchRoi in case of crop_and_patch
     RpptROI *cropRoi, *patchRoi;
     if(testCase == 33)
     {
@@ -1171,6 +1180,30 @@
             break;
         }
+        case 90:
+        {
+            testCaseName = "slice";
+            Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims
+            if(anchorTensor == NULL)
+                anchorTensor = static_cast<Rpp32s*>(calloc(batchSize * numDims, sizeof(Rpp32s)));
+            if(shapeTensor == NULL)
+                shapeTensor = static_cast<Rpp32s*>(calloc(batchSize * numDims, sizeof(Rpp32s)));
+            if(roiTensor == NULL)
+                roiTensor = static_cast<Rpp32u*>(calloc(batchSize * numDims * 2, sizeof(Rpp32u)));
+            bool enablePadding = false;
+            auto fillValue = 0;
+            init_slice(descriptorPtr3D, roiTensorPtrSrc, roiTensor, anchorTensor, shapeTensor);
+
+            startWallTime = omp_get_wtime();
+            startCpuTime = clock();
+
+            if((inputBitDepth == 0 || inputBitDepth == 2) && srcDescPtr->layout == dstDescPtr->layout)
+                rppt_slice_host(input, descriptorPtr3D, output, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
+            else
+                missingFuncFlag = 1;
+
+            break;
+        }
         default:
             missingFuncFlag = 1;
             break;
@@ -1268,6 +1301,42 @@ int main(int argc, char **argv)
             refFile.close();
         }
 
+        // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values
+        // for output display and comparison purposes
+        if (testCase == 90)
+        {
+            if (dstDescPtr->layout == RpptLayout::NCHW)
+            {
+                if (dstDescPtr->c == 3)
+                {
+                    for(int i = 0; i < batchSize; i++)
+                    {
+                        int idx1 = i * 3;
+                        dstImgSizes[i].height = shapeTensor[idx1 + 1];
+                        dstImgSizes[i].width = shapeTensor[idx1 + 2];
+                    }
+                }
+                else
+                {
+                    for(int i = 0; i < batchSize; i++)
+                    {
+                        int idx1 = i * 2;
+                        dstImgSizes[i].height = shapeTensor[idx1];
+                        dstImgSizes[i].width = shapeTensor[idx1 + 1];
+                    }
+                }
+            }
+            else if (dstDescPtr->layout == RpptLayout::NHWC)
+            {
+                for(int i = 0; i < batchSize; i++)
+                {
+                    int idx1 = i * 3;
+                    dstImgSizes[i].height = shapeTensor[idx1];
+                    dstImgSizes[i].width = shapeTensor[idx1 + 1];
+                }
+            }
+        }
+
         /*Compare the output of the function with golden outputs only if
         1.QA Flag is set
         2.input bit depth 0 (Input U8 && Output U8)
@@ -1324,6 +1393,12 @@ int main(int argc, char **argv)
     free(roiTensorPtrSrc);
     free(roiTensorPtrDst);
     free(dstImgSizes);
+    if(anchorTensor != NULL)
+        free(anchorTensor);
+    if(shapeTensor != NULL)
+        free(shapeTensor);
+    if(roiTensor != NULL)
+        free(roiTensor);
     free(input);
     free(inputu8);
     free(inputu8Second);
diff --git a/utilities/test_suite/HOST/Tensor_host_audio.cpp b/utilities/test_suite/HOST/Tensor_host_audio.cpp
index 35ce3a752..ac05bcc90 100644
--- a/utilities/test_suite/HOST/Tensor_host_audio.cpp
+++ b/utilities/test_suite/HOST/Tensor_host_audio.cpp
@@ -109,6 +109,19 @@ int main(int argc, char **argv)
     maxDstChannels = 1;
     set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes);
 
+    // create generic descriptor in case of slice
+    RpptGenericDesc descriptor3D;
+    RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
+    if(testCase == 5)
+    {
+        descriptorPtr3D->numDims = 2;
+        descriptorPtr3D->offsetInBytes = 0;
+        descriptorPtr3D->dataType = RpptDataType::F32;
+        descriptorPtr3D->dims[0] = batchSize;
+        descriptorPtr3D->dims[1] = maxSrcWidth;
+        descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1];
+    }
+
     // set buffer sizes for src/dst
     iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n;
     oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n;
@@ -145,8 +158,8 @@ int main(int argc, char **argv)
         case 0:
         {
             testCaseName = "non_silent_region_detection";
-            Rpp32f detectedIndex[batchSize];
-            Rpp32f detectionLength[batchSize];
+            Rpp32s detectedIndex[batchSize];
+            Rpp32s detectionLength[batchSize];
             Rpp32f cutOffDB = -60.0;
             Rpp32s windowLength = 2048;
             Rpp32f referencePower = 0.0f;
@@ -265,6 +278,32 @@ int main(int argc, char **argv)
             break;
         }
 
+        case 5:
+        {
+            testCaseName = "slice";
+            Rpp32u nDim = 1; // testing for 1D slice
+            auto fillValue = 0;
+            bool enablePadding = true;
+            Rpp32u roiTensor[batchSize * nDim * 2];
+            Rpp32s anchorTensor[batchSize * nDim];
+            Rpp32s shapeTensor[batchSize * nDim];
+
+            // 1D slice arguments
+            for (int i = 0; i < batchSize; i++)
+            {
+                int idx = i * nDim * 2;
+                roiTensor[idx] = 10;
+                roiTensor[idx + 1] = srcLengthTensor[i];
+                anchorTensor[i] = 10;
+                shapeTensor[i] = dstDims[i].width = srcLengthTensor[i] / 2;
+                dstDims[i].height = 1;
+            }
+
+            startWallTime = omp_get_wtime();
+            rppt_slice_host(inputf32, descriptorPtr3D, outputf32, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
+
+            break;
+        }
         case 6:
         {
             testCaseName = "resample";
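In the 1D audio case the slice arguments collapse to one begin/length pair per clip. For an illustrative clip with srcLengthTensor[i] = 16000, the loop above yields:

    // roi    = {10, 16000}  -> valid source samples start at 10
    // anchor = 10, shape = 8000
    // output[0 .. 7999] = input[10 .. 8009]; anchor + shape stays inside the roi,
    // so fillValue is never consulted even though enablePadding is true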
diff --git a/utilities/test_suite/HOST/Tensor_voxel_host.cpp b/utilities/test_suite/HOST/Tensor_voxel_host.cpp
index 968f29b54..89fddf2c6 100644
--- a/utilities/test_suite/HOST/Tensor_voxel_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_voxel_host.cpp
@@ -140,6 +140,10 @@ int main(int argc, char * argv[])
     void *pinnedMemArgs;
     pinnedMemArgs = calloc(2 * noOfFiles , sizeof(Rpp32f));
 
+    // arguments required for slice
+    Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+    Rpp32u *roiTensor = NULL;
+
     // Set the number of threads to be used by OpenMP pragma for RPP batch processing on host.
     // If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size
     Rpp32u numThreads = 0;
@@ -241,11 +245,21 @@ int main(int argc, char * argv[])
         case 1:
         {
             testCaseName = "slice";
+            if(anchorTensor == NULL)
+                anchorTensor = static_cast<Rpp32s*>(calloc(batchSize * 4, sizeof(Rpp32s)));
+            if(shapeTensor == NULL)
+                shapeTensor = static_cast<Rpp32s*>(calloc(batchSize * 4, sizeof(Rpp32s)));
+            if(roiTensor == NULL)
+                roiTensor = static_cast<Rpp32u*>(calloc(batchSize * 8, sizeof(Rpp32u)));
+            bool enablePadding = false;
+            auto fillValue = 0;
+            init_slice_voxel(descriptorPtr3D, roiGenericSrcPtr, roiTensor, anchorTensor, shapeTensor);
+
             startWallTime = omp_get_wtime();
             if(inputBitDepth == 0)
-                rppt_slice_host(inputU8, descriptorPtr3D, outputU8, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+                rppt_slice_host(inputU8, descriptorPtr3D, outputU8, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
             else if(inputBitDepth == 2)
-                rppt_slice_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+                rppt_slice_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
             else
                 missingFuncFlag = 1;
 
@@ -393,6 +407,39 @@ int main(int argc, char * argv[])
             outputF32[i] = static_cast<Rpp32f>(outputU8[i]);
     }
 
+    // if test case is slice and qaFlag is set, update the ROI with shapeTensor values
+    // for output display and comparison purposes
+    if(testCase == 1)
+    {
+        // update the roi for comparison with the shapeTensor values
+        if (descriptorPtr3D->layout == RpptLayout::NCDHW)
+        {
+            for(int i = 0; i < batchSize; i++)
+            {
+                int idx1 = i * 4;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1 + 1];
+                roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 2];
+                roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 3];
+            }
+        }
+        else if(descriptorPtr3D->layout == RpptLayout::NDHWC)
+        {
+            for(int i = 0; i < batchSize; i++)
+            {
+                int idx1 = i * 4;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+                roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1];
+                roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 1];
+                roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 2];
+            }
+        }
+    }
+
     /*Compare the output of the function with golden outputs only if
     1.QA Flag is set
     2.input bit depth 2 (F32)*/
@@ -481,6 +528,12 @@ int main(int argc, char * argv[])
     free(outputF32);
     free(roiGenericSrcPtr);
     free(pinnedMemArgs);
+    if(anchorTensor != NULL)
+        free(anchorTensor);
+    if(shapeTensor != NULL)
+        free(shapeTensor);
+    if(roiTensor != NULL)
+        free(roiTensor);
     if(inputBitDepth == 0)
     {
         if(inputU8 != NULL)
diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py
index ddde7db9c..8bedd5044 100644
--- a/utilities/test_suite/HOST/runTests.py
+++ b/utilities/test_suite/HOST/runTests.py
@@ -40,7 +40,7 @@
 outFolderPath = os.getcwd()
 buildFolderPath = os.getcwd()
 caseMin = 0
-caseMax = 89
+caseMax = 90
 
 # Get a list of log files based on a flag for preserving output
 def get_log_file_list(preserveOutput):
@@ -283,7 +283,7 @@ def rpp_test_suite_parser_and_validator():
     subprocess.run(["make", "-j16"], cwd=".")    # nosec
 
 # List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90']
 
 print("\n\n\n\n\n")
 print("##########################################################################################")
diff --git a/utilities/test_suite/REFERENCE_OUTPUT/slice/slice_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/slice/slice_u8_Tensor.bin
new file mode 100644
index 000000000..ac379dc79
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/slice/slice_u8_Tensor.bin differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin
new file mode 100644
index 000000000..10b45a6ff
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice.bin
deleted file mode 100644
index 94802b30b..000000000
Binary files a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice.bin and /dev/null differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin
index 94802b30b..f51ec113c 100644
Binary files a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin differ
diff --git a/utilities/test_suite/rpp_test_suite_audio.h b/utilities/test_suite/rpp_test_suite_audio.h
index 4367df17c..8ff5815c0 100644
--- a/utilities/test_suite/rpp_test_suite_audio.h
+++ b/utilities/test_suite/rpp_test_suite_audio.h
@@ -41,6 +41,7 @@ std::map<int, string> audioAugmentationMap =
     {2, "pre_emphasis_filter"},
     {3, "down_mixing"},
     {4, "spectrogram"},
+    {5, "slice"},
     {6, "resample"}
 };
 
@@ -231,7 +232,7 @@ void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dst
     free(refOutput);
 }
 
-void verify_non_silent_region_detection(float *detectedIndex, float *detectionLength, string testCase, int bs, vector<string> audioNames, string dst)
+void verify_non_silent_region_detection(int *detectedIndex, int *detectionLength, string testCase, int bs, vector<string> audioNames, string dst)
 {
     int fileMatch = 0;
     for (int i = 0; i < bs; i++)
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index 3ec123d7f..ea3117041 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -106,6 +106,7 @@ std::map<int, string> augmentationMap =
     {87, "tensor_sum"},
     {88, "tensor_min"},
     {89, "tensor_max"},
+    {90, "slice"}
 };
 
 // Golden outputs for Tensor min Kernel
@@ -456,6 +457,43 @@ inline void set_generic_descriptor(RpptGenericDescPtr descriptorPtr3D, int noOfI
     descriptorPtr3D->strides[4] = 1;
 }
 
+// sets generic descriptor dimensions and strides of src/dst for slice functionality
+inline void set_generic_descriptor_slice(RpptDescPtr srcDescPtr, RpptGenericDescPtr descriptorPtr3D, int batchSize)
+{
+    descriptorPtr3D->offsetInBytes = 0;
+    descriptorPtr3D->dataType = srcDescPtr->dataType;
+    descriptorPtr3D->layout = srcDescPtr->layout;
+    if(srcDescPtr->c == 3)
+    {
+        descriptorPtr3D->numDims = 4;
+        descriptorPtr3D->dims[0] = batchSize;
+        if (srcDescPtr->layout == RpptLayout::NHWC)
+        {
+            descriptorPtr3D->dims[1] = srcDescPtr->h;
+            descriptorPtr3D->dims[2] = srcDescPtr->w;
+            descriptorPtr3D->dims[3] = srcDescPtr->c;
+        }
+        else
+        {
+            descriptorPtr3D->dims[1] = srcDescPtr->c;
+            descriptorPtr3D->dims[2] = srcDescPtr->h;
+            descriptorPtr3D->dims[3] = srcDescPtr->w;
+        }
+        descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1] * descriptorPtr3D->dims[2] * descriptorPtr3D->dims[3];
+        descriptorPtr3D->strides[1] = descriptorPtr3D->dims[2] * descriptorPtr3D->dims[3];
+        descriptorPtr3D->strides[2] = descriptorPtr3D->dims[3];
+    }
+    else
+    {
+        descriptorPtr3D->numDims = 3;
+        descriptorPtr3D->dims[0] = batchSize;
+        descriptorPtr3D->dims[1] = srcDescPtr->h;
+        descriptorPtr3D->dims[2] = srcDescPtr->w;
+        descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1] * descriptorPtr3D->dims[2];
+        descriptorPtr3D->strides[1] = descriptorPtr3D->dims[2];
+    }
+}
+
 // sets descriptor dimensions and strides of src/dst
 inline void set_descriptor_dims_and_strides(RpptDescPtr descPtr, int noOfImages, int maxHeight, int maxWidth, int numChannels, int offsetInBytes)
 {
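The stride computation in set_generic_descriptor_slice is plain row-major arithmetic. For a 224x224 RGB source described as NCHW, for instance, it yields (sizes illustrative):

    // dims    = {batchSize, 3, 224, 224}
    // strides = {3 * 224 * 224, 224 * 224, 224}   // elements to step per unit of N, C, and H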
@@ -1295,4 +1333,60 @@ void inline init_ricap(int width, int height, int batchSize, Rpp32u *permutation
     roiPtrInputCropRegion[1].xywhROI = {randrange(0, part0Width - 8), randrange(0, height - part0Height), width - part0Width, part0Height};
     roiPtrInputCropRegion[2].xywhROI = {randrange(0, width - part0Width - 8), randrange(0, part0Height), part0Width, height - part0Height};
     roiPtrInputCropRegion[3].xywhROI = {randrange(0, part0Width - 8), randrange(0, part0Height), width - part0Width, height - part0Height};
-}
\ No newline at end of file
+}
+
+// initialize the roi, anchor and shape values required for slice
+void init_slice(RpptGenericDescPtr descriptorPtr3D, RpptROIPtr roiPtrSrc, Rpp32u *roiTensor, Rpp32s *anchorTensor, Rpp32s *shapeTensor)
+{
+    if(descriptorPtr3D->numDims == 4)
+    {
+        if (descriptorPtr3D->layout == RpptLayout::NCHW)
+        {
+            for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+            {
+                int idx1 = i * 3;
+                int idx2 = i * 6;
+                roiTensor[idx2] = anchorTensor[idx1] = 0;
+                roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiPtrSrc[i].xywhROI.xy.y;
+                roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = roiPtrSrc[i].xywhROI.xy.x;
+                roiTensor[idx2 + 3] = descriptorPtr3D->dims[1];
+                roiTensor[idx2 + 4] = roiPtrSrc[i].xywhROI.roiHeight;
+                roiTensor[idx2 + 5] = roiPtrSrc[i].xywhROI.roiWidth;
+                shapeTensor[idx1] = roiTensor[idx2 + 3];
+                shapeTensor[idx1 + 1] = roiTensor[idx2 + 4] / 2;
+                shapeTensor[idx1 + 2] = roiTensor[idx2 + 5] / 2;
+            }
+        }
+        else if(descriptorPtr3D->layout == RpptLayout::NHWC)
+        {
+            for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+            {
+                int idx1 = i * 3;
+                int idx2 = i * 6;
+                roiTensor[idx2] = anchorTensor[idx1] = roiPtrSrc[i].xywhROI.xy.y;
+                roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiPtrSrc[i].xywhROI.xy.x;
+                roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = 0;
+                roiTensor[idx2 + 3] = roiPtrSrc[i].xywhROI.roiHeight;
+                roiTensor[idx2 + 4] = roiPtrSrc[i].xywhROI.roiWidth;
+                roiTensor[idx2 + 5] = descriptorPtr3D->dims[3];
+                shapeTensor[idx1] = roiTensor[idx2 + 3] / 2;
+                shapeTensor[idx1 + 1] = roiTensor[idx2 + 4] / 2;
+                shapeTensor[idx1 + 2] = roiTensor[idx2 + 5];
+            }
+        }
+    }
+    if(descriptorPtr3D->numDims == 3)
+    {
+        for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+        {
+            int idx1 = i * 2;
+            int idx2 = i * 4;
+            roiTensor[idx2] = anchorTensor[idx1] = roiPtrSrc[i].xywhROI.xy.y;
+            roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiPtrSrc[i].xywhROI.xy.x;
+            roiTensor[idx2 + 2] = roiPtrSrc[i].xywhROI.roiHeight;
+            roiTensor[idx2 + 3] = roiPtrSrc[i].xywhROI.roiWidth;
+            shapeTensor[idx1] = roiTensor[idx2 + 2] / 2;
+            shapeTensor[idx1 + 1] = roiTensor[idx2 + 3] / 2;
+        }
+    }
+}
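As a concrete check of the indexing in init_slice: for image i = 0 in an NCHW batch whose source ROI is a full 3x150x200 region (x = 0, y = 0; sizes illustrative), the function writes:

    // roiTensor[0..5]    = {0, 0, 0, 3, 150, 200}   // begins (c, y, x), then lengths (c, h, w)
    // anchorTensor[0..2] = {0, 0, 0}
    // shapeTensor[0..2]  = {3, 75, 100}             // channels kept, height and width halved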
diff --git a/utilities/test_suite/rpp_test_suite_voxel.h b/utilities/test_suite/rpp_test_suite_voxel.h
index c24fc29a8..77de34f3b 100644
--- a/utilities/test_suite/rpp_test_suite_voxel.h
+++ b/utilities/test_suite/rpp_test_suite_voxel.h
@@ -242,6 +242,51 @@ inline string set_function_type(int layoutType, int pln1OutTypeCase, int outputF
     return funcType;
 }
 
+// initialize the roi, anchor and shape values required for slice
+void init_slice_voxel(RpptGenericDescPtr descriptorPtr3D, RpptROI3D *roiGenericSrcPtr, Rpp32u *roiTensor, Rpp32s *anchorTensor, Rpp32s *shapeTensor)
+{
+    if (descriptorPtr3D->layout == RpptLayout::NCDHW)
+    {
+        for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+        {
+            int idx1 = i * 4;
+            int idx2 = i * 8;
+            roiTensor[idx2] = anchorTensor[idx1] = 0;
+            roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiGenericSrcPtr[i].xyzwhdROI.xyz.z;
+            roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = roiGenericSrcPtr[i].xyzwhdROI.xyz.y;
+            roiTensor[idx2 + 3] = anchorTensor[idx1 + 3] = roiGenericSrcPtr[i].xyzwhdROI.xyz.x;
+            roiTensor[idx2 + 4] = descriptorPtr3D->dims[1];
+            roiTensor[idx2 + 5] = roiGenericSrcPtr[i].xyzwhdROI.roiDepth;
+            roiTensor[idx2 + 6] = roiGenericSrcPtr[i].xyzwhdROI.roiHeight;
+            roiTensor[idx2 + 7] = roiGenericSrcPtr[i].xyzwhdROI.roiWidth;
+            shapeTensor[idx1] = roiTensor[idx2 + 4];
+            shapeTensor[idx1 + 1] = roiTensor[idx2 + 5] / 2;
+            shapeTensor[idx1 + 2] = roiTensor[idx2 + 6] / 2;
+            shapeTensor[idx1 + 3] = roiTensor[idx2 + 7] / 2;
+        }
+    }
+    else if(descriptorPtr3D->layout == RpptLayout::NDHWC)
+    {
+        for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+        {
+            int idx1 = i * 4;
+            int idx2 = i * 8;
+            roiTensor[idx2] = anchorTensor[idx1] = roiGenericSrcPtr[i].xyzwhdROI.xyz.z;
+            roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiGenericSrcPtr[i].xyzwhdROI.xyz.y;
+            roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = roiGenericSrcPtr[i].xyzwhdROI.xyz.x;
+            roiTensor[idx2 + 3] = anchorTensor[idx1 + 3] = 0;
+            roiTensor[idx2 + 4] = roiGenericSrcPtr[i].xyzwhdROI.roiDepth;
+            roiTensor[idx2 + 5] = roiGenericSrcPtr[i].xyzwhdROI.roiHeight;
+            roiTensor[idx2 + 6] = roiGenericSrcPtr[i].xyzwhdROI.roiWidth;
+            roiTensor[idx2 + 7] = descriptorPtr3D->dims[4];
+            shapeTensor[idx1] = roiTensor[idx2 + 4] / 2;
+            shapeTensor[idx1 + 1] = roiTensor[idx2 + 5] / 2;
+            shapeTensor[idx1 + 2] = roiTensor[idx2 + 6] / 2;
+            shapeTensor[idx1 + 3] = roiTensor[idx2 + 7];
+        }
+    }
+}
+
 // reads nifti-1 header file
 static int read_nifti_header_file(char* const header_file, nifti_1_header &niftiHeader)
 {
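The NDHWC branch differs from NCDHW only in where the channel slot sits: channels come last, their anchor is pinned to 0, and their shape entry keeps the full channel count while D/H/W are halved. For one sample with an illustrative 64x128x128x2 (DHWC) source ROI:

    // roiTensor[0..7]   = {0, 0, 0, 0, 64, 128, 128, 2}   // begins (z, y, x, c), then lengths (d, h, w, c)
    // shapeTensor[0..3] = {32, 64, 64, 2}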