diff --git a/include/rppdefs.h b/include/rppdefs.h
index 82539e139..d35f82bfd 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -134,7 +134,7 @@ typedef enum
/*! \brief Out of bound source ROI \ingroup group_rppdefs */
RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17,
/*! \brief src and dst layout mismatch \ingroup group_rppdefs */
- RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18,
+ RPP_ERROR_LAYOUT_MISMATCH = -18,
/*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
RPP_ERROR_INVALID_CHANNELS = -19
} RppStatus;
diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
index 09e4cbd56..4e5f412db 100644
--- a/include/rppt_tensor_audio_augmentations.h
+++ b/include/rppt_tensor_audio_augmentations.h
@@ -60,7 +60,7 @@ extern "C" {
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32f *detectedIndexTensor, Rpp32f *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
+RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
/*! \brief To Decibels augmentation on HOST backend
* \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values
diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h
index 695c3252d..8e846a41b 100644
--- a/include/rppt_tensor_geometric_augmentations.h
+++ b/include/rppt_tensor_geometric_augmentations.h
@@ -448,38 +448,42 @@ RppStatus rppt_phase_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc
* \details This function performs slice augmentation on a generic 4D tensor.
* Slice augmentation involves selecting a region of interest (ROI) from the source tensor
* and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes.
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstGenericDescPtr destination tensor descriptor
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] srcPtr source tensor memory in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions)
+ * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions)
+ * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true
+ * \param [in] enablePadding boolean flag to specify if padding is enabled or not
+ * \param [in] roiTensor roi data in HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
- * \ingroup group_tensor_geometric
*/
-RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
/*! \brief Slice augmentation GPU
* \details This function performs slice augmentation on a generic 4D tensor.
* Slice augmentation involves selecting a region of interest (ROI) from the source tensor
* and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes.
- * \param[in] srcPtr source tensor in HIP memory
- * \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor in HIP memory
- * \param[in] dstGenericDescPtr destination tensor descriptor
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] srcPtr source tensor memory in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions)
+ * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions)
+ * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true
+ * \param [in] enablePadding boolean flag to specify if padding is enabled or not
+ * \param [in] roiTensor roi data in pinned/HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
- * \ingroup group_tensor_geometric
*/
-RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
/*! \brief Crop and Patch augmentation on HOST backend for a NCHW/NHWC layout tensor
@@ -539,15 +543,15 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt
Support added for f32 -> f32 and u8 -> u8 dataypes.
* \image html input150x150x4.gif Sample Input
* \image html geometric_augmentations_flip_150x150x4.gif Sample Output
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
* \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in HOST memory, of size batchSize, with depthTensor[i] = 0/1)
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
@@ -562,15 +566,15 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes
Support added for f32 -> f32 and u8 -> u8 dataypes.
* \image html input150x150x4.gif Sample Input
* \image html geometric_augmentations_flip_150x150x4.gif Sample Output
- * \param[in] srcPtr source tensor in HIP memory
- * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
- * \param[out] dstPtr destination tensor in HIP memory
- * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
* \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with depthTensor[i] = 0/1)
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
diff --git a/src/modules/cpu/kernel/non_silent_region_detection.hpp b/src/modules/cpu/kernel/non_silent_region_detection.hpp
index 74dffb18e..39d9e6940 100644
--- a/src/modules/cpu/kernel/non_silent_region_detection.hpp
+++ b/src/modules/cpu/kernel/non_silent_region_detection.hpp
@@ -95,8 +95,8 @@ Rpp32f getSquare(Rpp32f &value)
RppStatus non_silent_region_detection_host_tensor(Rpp32f *srcPtr,
RpptDescPtr srcDescPtr,
Rpp32s *srcLengthTensor,
- Rpp32f *detectedIndexTensor,
- Rpp32f *detectionLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
Rpp32f cutOffDB,
Rpp32s windowLength,
Rpp32f referencePower,
diff --git a/src/modules/cpu/kernel/slice.hpp b/src/modules/cpu/kernel/slice.hpp
index c451b67b4..37c3097c9 100644
--- a/src/modules/cpu/kernel/slice.hpp
+++ b/src/modules/cpu/kernel/slice.hpp
@@ -26,184 +26,213 @@ SOFTWARE.
#include "rpp_cpu_simd.hpp"
#include "rpp_cpu_common.hpp"
-RppStatus slice_f32_f32_host_tensor(Rpp32f *srcPtr,
- RpptGenericDescPtr srcGenericDescPtr,
- Rpp32f *dstPtr,
- RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
- RppLayoutParams layoutParams,
- rpp::Handle& handle)
+template <typename T>
+RppStatus slice_host_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ T* fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
{
- RpptROI3D roiDefault;
- if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
- else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
omp_set_dynamic(0);
#pragma omp parallel for num_threads(numThreads)
for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
{
- RpptROI3D roi;
- RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
- compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+ T *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
- Rpp32f *srcPtrImage, *dstPtrImage;
- srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
- dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ T *srcPtrChannel, *dstPtrChannel;
+ dstPtrChannel = dstPtrTemp;
- Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
- Rpp32f *srcPtrChannel, *dstPtrChannel;
- dstPtrChannel = dstPtrImage;
+ // get the starting address of length values from roiTensor
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
- // Slice without fused output-layout toggle (NCDHW -> NCDHW)
- if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ if (numDims == 4)
{
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
-
- Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f);
- for(int c = 0; c < layoutParams.channelParam; c++)
+ // order of dims
+ Rpp32s dimsOrder[3];
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ dimsOrder[0] = 1; // depth
+ dimsOrder[1] = 2; // height
+ dimsOrder[2] = 3; // width
+ }
+ else
{
- Rpp32f *srcPtrDepth, *dstPtrDepth;
- srcPtrDepth = srcPtrChannel;
- dstPtrDepth = dstPtrChannel;
+ dimsOrder[0] = 0; // depth
+ dimsOrder[1] = 1; // height
+ dimsOrder[2] = 2; // width
+ }
+ Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]);
+ Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier;
+ Rpp32u copyLengthInBytes = bufferLength * sizeof(T);
+
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) ||
+ ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]]));
+ if (needPadding && enablePadding)
+ std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue);
+
+ // slice without fused output-layout toggle (NCDHW -> NCDHW)
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * srcGenericDescPtr->strides[3]) + (anchor[3] * layoutParams.bufferMultiplier);
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ T *srcPtrDepth, *dstPtrDepth;
+ srcPtrDepth = srcPtrChannel;
+ dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < maxDepth; i++)
+ {
+ T *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < maxHeight; j++)
+ {
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[3];
+ dstPtrRow += dstGenericDescPtr->strides[3];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[2];
+ dstPtrDepth += dstGenericDescPtr->strides[2];
+ }
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += srcGenericDescPtr->strides[1];
+ }
+ }
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ // slice without fused output-layout toggle (NDHWC -> NDHWC)
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier);
+ T *srcPtrDepth = srcPtrChannel;
+ T *dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < maxDepth; i++)
{
- Rpp32f *srcPtrRow, *dstPtrRow;
+ T *srcPtrRow, *dstPtrRow;
srcPtrRow = srcPtrDepth;
dstPtrRow = dstPtrDepth;
-
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ for(int j = 0; j < maxHeight; j++)
{
memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
-
- srcPtrRow += srcGenericDescPtr->strides[3];
- dstPtrRow += dstGenericDescPtr->strides[3];
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
}
- srcPtrDepth += srcGenericDescPtr->strides[2];
- dstPtrDepth += dstGenericDescPtr->strides[2];
+ srcPtrDepth += srcGenericDescPtr->strides[1];
+ dstPtrDepth += dstGenericDescPtr->strides[1];
}
-
- srcPtrChannel += srcGenericDescPtr->strides[1];
- dstPtrChannel += srcGenericDescPtr->strides[1];
}
}
- // Slice without fused output-layout toggle (NDHWC -> NDHWC)
- else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ else if (numDims == 3)
{
- Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f);
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
- Rpp32f *srcPtrDepth = srcPtrChannel;
- Rpp32f *dstPtrDepth = dstPtrChannel;
-
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ // order of dims
+ Rpp32s dimsOrder[2];
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
{
- Rpp32f *srcPtrRow, *dstPtrRow;
- srcPtrRow = srcPtrDepth;
- dstPtrRow = dstPtrDepth;
-
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
- {
- memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
-
- srcPtrRow += srcGenericDescPtr->strides[2];
- dstPtrRow += dstGenericDescPtr->strides[2];
- }
- srcPtrDepth += srcGenericDescPtr->strides[1];
- dstPtrDepth += dstGenericDescPtr->strides[1];
+ dimsOrder[0] = 1; // height
+ dimsOrder[1] = 2; // width
+ }
+ else
+ {
+ dimsOrder[0] = 0; // height
+ dimsOrder[1] = 1; // width
}
- }
- }
-
- return RPP_SUCCESS;
-}
-
-RppStatus slice_u8_u8_host_tensor(Rpp8u *srcPtr,
- RpptGenericDescPtr srcGenericDescPtr,
- Rpp8u *dstPtr,
- RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
- RppLayoutParams layoutParams,
- rpp::Handle& handle)
-{
- RpptROI3D roiDefault;
- if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
- else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
- Rpp32u numThreads = handle.GetNumThreads();
-
- omp_set_dynamic(0);
-#pragma omp parallel for num_threads(numThreads)
- for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
- {
- RpptROI3D roi;
- RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
- compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
- Rpp8u *srcPtrImage, *dstPtrImage;
- srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
- dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier;
+ Rpp32u copyLengthInBytes = bufferLength * sizeof(T);
- Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
- Rpp8u *srcPtrChannel, *dstPtrChannel;
- dstPtrChannel = dstPtrImage;
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = ((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]);
+ if (needPadding && enablePadding)
+ std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue);
- // Slice without fused output-layout toggle (NCDHW -> NCDHW)
- if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
- {
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
- for(int c = 0; c < layoutParams.channelParam; c++)
+ // slice without fused output-layout toggle (NCHW -> NCHW)
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
{
- Rpp8u *srcPtrDepth, *dstPtrDepth;
- srcPtrDepth = srcPtrChannel;
- dstPtrDepth = dstPtrChannel;
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier);
+ for(int c = 0; c < layoutParams.channelParam; c++)
{
- Rpp8u *srcPtrRow, *dstPtrRow;
- srcPtrRow = srcPtrDepth;
- dstPtrRow = dstPtrDepth;
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ T *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+ for(int j = 0; j < maxHeight; j++)
{
- memcpy(dstPtrRow, srcPtrRow, bufferLength * sizeof(Rpp8u));
- srcPtrRow += srcGenericDescPtr->strides[3];
- dstPtrRow += dstGenericDescPtr->strides[3];
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
}
- srcPtrDepth += srcGenericDescPtr->strides[2];
- dstPtrDepth += dstGenericDescPtr->strides[2];
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += srcGenericDescPtr->strides[1];
}
- srcPtrChannel += srcGenericDescPtr->strides[1];
- dstPtrChannel += srcGenericDescPtr->strides[1];
}
- }
- // Slice without fused output-layout toggle (NDHWC -> NDHWC)
- else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
- {
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
- Rpp8u *srcPtrDepth = srcPtrChannel;
- Rpp8u *dstPtrDepth = dstPtrChannel;
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ // slice without fused output-layout toggle (NHWC -> NHWC)
+ else if (dstGenericDescPtr->layout == RpptLayout::NHWC)
{
- Rpp8u *srcPtrRow, *dstPtrRow;
- srcPtrRow = srcPtrDepth;
- dstPtrRow = dstPtrDepth;
-
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * layoutParams.bufferMultiplier);
+ T *srcPtrRow = srcPtrChannel;
+ T *dstPtrRow = dstPtrChannel;
+ for(int j = 0; j < maxHeight; j++)
{
- memcpy(dstPtrRow, srcPtrRow, bufferLength * sizeof(Rpp8u));
- srcPtrRow += srcGenericDescPtr->strides[2];
- dstPtrRow += dstGenericDescPtr->strides[2];
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[1];
+ dstPtrRow += dstGenericDescPtr->strides[1];
}
- srcPtrDepth += srcGenericDescPtr->strides[1];
- dstPtrDepth += dstGenericDescPtr->strides[1];
}
}
+ else if (numDims == 2)
+ {
+ srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + anchor[1];
+ Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]);
+ Rpp32u copyLengthInBytes = maxWidth * sizeof(T);
+
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = ((anchor[0] + shape[0]) > length[0]) ||
+ ((anchor[1] + shape[1]) > length[1]);
+ if (needPadding && enablePadding)
+ std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue);
+
+ T *srcPtrRow = srcPtrChannel;
+ T *dstPtrRow = dstPtrChannel;
+ for(int j = 0; j < maxHeight; j++)
+ {
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[1];
+ dstPtrRow += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if (numDims == 1)
+ {
+ srcPtrChannel = srcPtrTemp + anchor[0];
+ Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u copyLengthInBytes = maxLength * sizeof(T);
+
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = ((anchor[0] + shape[0]) > length[0]);
+ if (needPadding && enablePadding)
+ std::fill(dstPtrTemp, dstPtrTemp + dstGenericDescPtr->strides[0] - 1, *fillValue);
+ memcpy(dstPtrChannel, srcPtrChannel, copyLengthInBytes);
+ }
}
return RPP_SUCCESS;
diff --git a/src/modules/hip/kernel/slice.hpp b/src/modules/hip/kernel/slice.hpp
index 8deb52bbb..c1b7a6c41 100644
--- a/src/modules/hip/kernel/slice.hpp
+++ b/src/modules/hip/kernel/slice.hpp
@@ -2,24 +2,79 @@
#include <hip/hip_runtime.h>
#include "rpp_hip_common.hpp"
+template <typename T>
+__global__ void fill_value_ncdhw_hip_tensor(T *dstPtr,
+ uint3 dstStridesCDH,
+ int channels,
+ uint3 dstDimsDHW,
+ T *fillValue)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z))
+ {
+ return;
+ }
+
+ uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
+ d_float8 val_f8;
+ val_f8.f4[0] = (float4)(*fillValue);
+ val_f8.f4[1] = val_f8.f4[0];
+ for(int c = 0; c < channels; c++)
+ {
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8);
+ dstIdx += dstStridesCDH.x;
+ }
+}
+
+
+template <typename T>
+__global__ void fill_value_ndhwc_hip_tensor(T *dstPtr,
+ uint2 dstStridesDH,
+ uint3 dstDimsDHW,
+ T *fillValue)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z))
+ {
+ return;
+ }
+
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+ d_float24 val_f24;
+ val_f24.f4[0] = (float4)(*fillValue);
+ val_f24.f4[1] = val_f24.f4[0];
+ val_f24.f4[2] = val_f24.f4[0];
+ val_f24.f4[3] = val_f24.f4[0];
+ val_f24.f4[4] = val_f24.f4[0];
+ val_f24.f4[5] = val_f24.f4[0];
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
+}
+
+
template <typename T>
__global__ void slice_ncdhw_hip_tensor(T *srcPtr,
uint3 srcStridesCDH,
T *dstPtr,
uint3 dstStridesCDH,
int channels,
- RpptROI3DPtr roiGenericSrc)
+ uint3 validShapeDHW)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
- if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth))
+ if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z))
{
return;
}
- uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x);
+ uint srcIdx = (id_z * srcStridesCDH.y) + (id_y * srcStridesCDH.z) + id_x;
uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
d_float8 val_f8;
@@ -32,77 +87,439 @@ __global__ void slice_ncdhw_hip_tensor(T *srcPtr,
}
}
+
template <typename T>
__global__ void slice_ndhwc_hip_tensor(T *srcPtr,
uint2 srcStridesDH,
T *dstPtr,
uint2 dstStridesDH,
- RpptROI3DPtr roiGenericSrc)
+ uint3 validShapeDHW)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
- if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth))
+ if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z))
{
return;
}
- uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x) * 3;
- uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+ uint srcIdx = (id_z * srcStridesDH.x) + (id_y * srcStridesDH.y) + (id_x * 3);
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + (id_x * 3);
d_float24 val_f24;
rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24);
rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
}
+template
+RppStatus hip_exec_fill_value_tensor(T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ T *fillValue,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle,
+ Rpp32u numDims)
+{
+ if (numDims == 4)
+ {
+ // set the dimsOrder and globalthreads values required for NDHWC layout
+ Rpp32s dimsOrder[3] = {0, 1, 2};
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ // change the dimsOrder and globalthreads values if layout is NCDHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ dimsOrder[0] = 1; // depth
+ dimsOrder[1] = 2; // height
+ dimsOrder[2] = 3; // width
+ globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+ }
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]);
+
+ // checking if padding is required
+ bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) ||
+ ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]]));
+
+ // if needPadding is set, launch kernel for filling the padded region with fill value specified
+ if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(maxDepth, maxHeight, maxWidth),
+ fillValue);
+ }
+ else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ make_uint3(maxDepth, maxHeight, maxWidth),
+ fillValue);
+ }
+ }
+ }
+ else if (numDims == 3)
+ {
+ // set the dimsOrder and globalthreads values required for NHWC layout
+ Rpp32s dimsOrder[2] = {0, 1};
+ int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+
+ // change the dimsOrder and globalthreads values if layout is NCHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ dimsOrder[0] = 1; // height
+ dimsOrder[1] = 2; // width
+ globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ globalThreads_z = 1;
+ }
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+
+ // check if padding is needed
+ bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]));
+
+ // launch kernel for filling the padded region with fill value specified
+ if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(1, shape[1], shape[2]),
+ fillValue);
+ }
+ else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NHWC)
+ {
+ hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(1, dstGenericDescPtr->strides[1]),
+ make_uint3(1, maxHeight, maxWidth),
+ fillValue);
+ }
+ }
+ }
+ else if (numDims == 2)
+ {
+ // NHW
+ int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]);
+
+ // check if padding is needed
+ bool needPadding = (((anchor[0] + shape[0]) > length[0]) ||
+ ((anchor[1] + shape[1]) > length[1]));
+
+ // launch kernel for filling the padded region with fill value specified
+ if (needPadding)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(0, 0, dstGenericDescPtr->strides[1]),
+ 1,
+ make_uint3(1, shape[0], shape[1]),
+ fillValue);
+ }
+ }
+ }
+ else if (numDims == 1)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = 1;
+ int globalThreads_z = 1;
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]);
+
+ // check if padding is needed
+ bool needPadding = ((anchor[0] + shape[0]) > length[0]);
+
+ // launch kernel for filling the padded region with fill value specified
+ if (needPadding)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, 1, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(0, 0, 1),
+ 1,
+ make_uint3(1, 1, shape[0]),
+ fillValue);
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
template
RppStatus hip_exec_slice_tensor(T *srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
T *dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ T *fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
rpp::Handle& handle)
{
- if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+
+ /* if enablePadding is set to true, launch kernel to fill the output buffers with fill value specified.
+ This will be only done if shapeTensor[d] > roiTensor[d] where d is the dimension*/
+ if (enablePadding)
+ {
+ hip_exec_fill_value_tensor(dstPtr,
+ dstGenericDescPtr,
+ anchorTensor,
+ shapeTensor,
+ fillValue,
+ roiTensor,
+ handle,
+ numDims);
+ hipStreamSynchronize(handle.GetStream());
+ }
+
+ if(numDims == 4)
{
- int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
- int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
- int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+ // set the dimsOrder and globalthreads values required for NDHWC layout
+ Rpp32s dimsOrder[3] = {0, 1, 2};
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ // change the dimsOrder and globalthreads values if layout is NCDHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ dimsOrder[0] = 1; // depth
+ dimsOrder[1] = 2; // height
+ dimsOrder[2] = 3; // width
+ globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+ }
for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
{
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]);
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2] * srcGenericDescPtr->strides[3] + anchor[3];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
+ dstPtrTemp,
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(maxDepth, maxHeight, maxWidth));
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtrTemp,
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ make_uint3(maxDepth, maxHeight, maxWidth));
+ }
+ }
+ }
+ else if (numDims == 3)
+ {
+ // set the dimsOrder and globalthreads values required for NHWC layout
+ Rpp32s dimsOrder[2] = {0, 1};
+ int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+
+ // change the dimsOrder and globalthreads values if layout is NCHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ dimsOrder[0] = 1; // height
+ dimsOrder[1] = 2; // width
+ globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ globalThreads_z = 1;
+ }
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint3(srcGenericDescPtr->strides[1], 0, srcGenericDescPtr->strides[2]),
+ dstPtrTemp,
+ make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(1, maxHeight, maxWidth));
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NHWC)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint2(1, srcGenericDescPtr->strides[1]),
+ dstPtrTemp,
+ make_uint2(1, dstGenericDescPtr->strides[1]),
+ make_uint3(1, maxHeight, maxWidth));
+ }
+ }
+ }
+ else if (numDims == 2)
+ {
+ // NHW
+ int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]);
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[2] + anchor[1];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+
hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
- dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
0,
handle.GetStream(),
- srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
- make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
- dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
- make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
- dstGenericDescPtr->dims[1],
- &roiGenericPtrSrc[batchCount]);
+ srcPtrTemp,
+ make_uint3(0, 0, srcGenericDescPtr->strides[1]),
+ dstPtrTemp,
+ make_uint3(0, 0, dstGenericDescPtr->strides[1]),
+ 1,
+ make_uint3(1, maxHeight, maxWidth));
}
}
- else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ else if (numDims == 1)
{
- int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
- int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
- int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
-
+ int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = 1;
+ int globalThreads_z = 1;
for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
{
- hipLaunchKernelGGL(slice_ndhwc_hip_tensor,
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+ Rpp32s *length = reinterpret_cast(&roi[numDims]);
+ Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]);
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+
+ hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
- dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ dim3(LOCAL_THREADS_X, 1, 1),
0,
handle.GetStream(),
- srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
- make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
- dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
- make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
- &roiGenericPtrSrc[batchCount]);
+ srcPtrTemp,
+ make_uint3(0, 0, 1),
+ dstPtrTemp,
+ make_uint3(0, 0, 1),
+ 1,
+ make_uint3(1, 1, maxLength));
}
}
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
index b629d1590..e20211ec1 100644
--- a/src/modules/rppt_tensor_audio_augmentations.cpp
+++ b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -32,8 +32,8 @@ SOFTWARE.
RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
Rpp32s *srcLengthTensor,
- Rpp32f *detectedIndexTensor,
- Rpp32f *detectionLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
Rpp32f cutOffDB,
Rpp32s windowLength,
Rpp32f referencePower,
diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp
index 45a0d5221..d758aa676 100644
--- a/src/modules/rppt_tensor_geometric_augmentations.cpp
+++ b/src/modules/rppt_tensor_geometric_augmentations.cpp
@@ -1050,43 +1050,54 @@ RppStatus rppt_slice_host(RppPtr_t srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
RppPtr_t dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ RppPtr_t fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
rppHandle_t rppHandle)
{
+ if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH;
+
RppLayoutParams layoutParams;
if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
-
- if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
- if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
- if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
- if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
- if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+ else if ((srcGenericDescPtr->layout == RpptLayout::NCHW) && (dstGenericDescPtr->layout == RpptLayout::NCHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]);
if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
{
- slice_f32_f32_host_tensor((Rpp32f*) (static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes),
- srcGenericDescPtr,
- (Rpp32f*) (static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes),
- dstGenericDescPtr,
- roiGenericPtrSrc,
- roiType,
- layoutParams,
- rpp::deref(rppHandle));
+ slice_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ anchorTensor,
+ shapeTensor,
+ static_cast(fillValue),
+ enablePadding,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
}
else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
{
- slice_u8_u8_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes,
- srcGenericDescPtr,
- static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes,
- dstGenericDescPtr,
- roiGenericPtrSrc,
- roiType,
- layoutParams,
- rpp::deref(rppHandle));
+ slice_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ anchorTensor,
+ shapeTensor,
+ static_cast(fillValue),
+ enablePadding,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
}
return RPP_SUCCESS;
@@ -1848,24 +1859,29 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
RppPtr_t dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ RppPtr_t fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
rppHandle_t rppHandle)
{
#ifdef HIP_COMPILE
- if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
- if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
- if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH;
if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
{
- hip_exec_slice_tensor((Rpp32f*) (static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ hip_exec_slice_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes),
srcGenericDescPtr,
- (Rpp32f*) (static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes),
dstGenericDescPtr,
- roiGenericPtrSrc,
+ anchorTensor,
+ shapeTensor,
+ static_cast(fillValue),
+ enablePadding,
+ roiTensor,
rpp::deref(rppHandle));
}
else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
@@ -1874,7 +1890,11 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr,
srcGenericDescPtr,
static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes,
dstGenericDescPtr,
- roiGenericPtrSrc,
+ anchorTensor,
+ shapeTensor,
+ static_cast(fillValue),
+ enablePadding,
+ roiTensor,
rpp::deref(rppHandle));
}
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index f6c9276ef..e7b880433 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -338,6 +338,14 @@ int main(int argc, char **argv)
CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
}
+ // create generic descriptor and params in case of slice
+ RpptGenericDesc descriptor3D;
+ RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
+ Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+ Rpp32u *roiTensor = NULL;
+ if(testCase == 90)
+ set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize);
+
// Allocate hip memory for src/dst
CHECK_RETURN_STATUS(hipMalloc(&d_input, inputBufferSize));
CHECK_RETURN_STATUS(hipMalloc(&d_output, outputBufferSize));
@@ -348,6 +356,7 @@ int main(int argc, char **argv)
if(testCase == 82)
CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI)));
+ // create cropRoi and patchRoi in case of crop_and_patch
RpptROI *cropRoi, *patchRoi;
if(testCase == 33)
{
@@ -1137,6 +1146,28 @@ int main(int argc, char **argv)
break;
}
+ case 90:
+ {
+ testCaseName = "slice";
+ Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims
+ if(anchorTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&anchorTensor, batchSize * numDims * sizeof(Rpp32s)));
+ if(shapeTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&shapeTensor, batchSize * numDims * sizeof(Rpp32s)));
+ if(roiTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&roiTensor, batchSize * numDims * 2 * sizeof(Rpp32u)));
+ bool enablePadding = false;
+ auto fillValue = 0;
+ init_slice(descriptorPtr3D, roiTensorPtrSrc, roiTensor, anchorTensor, shapeTensor);
+
+ startWallTime = omp_get_wtime();
+ if((inputBitDepth == 0 || inputBitDepth == 2) && srcDescPtr->layout == dstDescPtr->layout)
+ rppt_slice_gpu(d_input, descriptorPtr3D, d_output, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
missingFuncFlag = 1;
break;
@@ -1232,6 +1263,42 @@ int main(int argc, char **argv)
refFile.close();
}
+ // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values
+ // for output display and comparison purposes
+ if (testCase == 90)
+ {
+ if (dstDescPtr->layout == RpptLayout::NCHW)
+ {
+ if (dstDescPtr->c == 3)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 3;
+ dstImgSizes[i].height = shapeTensor[idx1 + 1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 2];
+ }
+ }
+ else
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 2;
+ dstImgSizes[i].height = shapeTensor[idx1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 1];
+ }
+ }
+ }
+ else if (dstDescPtr->layout == RpptLayout::NHWC)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 3;
+ dstImgSizes[i].height = shapeTensor[idx1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 1];
+ }
+ }
+ }
+
/*Compare the output of the function with golden outputs only if
1.QA Flag is set
2.input bit depth 0 (Input U8 && Output U8)
@@ -1294,6 +1361,12 @@ int main(int argc, char **argv)
}
if (reductionTypeCase)
CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr));
+ if(anchorTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(anchorTensor));
+ if(shapeTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(shapeTensor));
+ if(roiTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(roiTensor));
free(input);
free(input_second);
free(output);
diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
index b39f461cc..44f2dea72 100644
--- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
@@ -146,6 +146,10 @@ int main(int argc, char * argv[])
void *pinnedMemArgs;
CHECK_RETURN_STATUS(hipHostMalloc(&pinnedMemArgs, 2 * noOfFiles * sizeof(Rpp32f)));
+ // arguments required for slice
+ Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+ Rpp32u *roiTensor = NULL;
+
rppHandle_t handle;
hipStream_t stream;
CHECK_RETURN_STATUS(hipStreamCreate(&stream));
@@ -254,12 +258,21 @@ int main(int argc, char * argv[])
case 1:
{
testCaseName = "slice";
+ if(anchorTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&anchorTensor, batchSize * 4 * sizeof(Rpp32s)));
+ if(shapeTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&shapeTensor, batchSize * 4 * sizeof(Rpp32s)));
+ if(roiTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&roiTensor, batchSize * 8 * sizeof(Rpp32u)));
+ bool enablePadding = false;
+ auto fillValue = 0;
+ init_slice_voxel(descriptorPtr3D, roiGenericSrcPtr, roiTensor, anchorTensor, shapeTensor);
startWallTime = omp_get_wtime();
if (inputBitDepth == 0)
- rppt_slice_gpu(d_inputU8, descriptorPtr3D, d_outputU8, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+ rppt_slice_gpu(d_inputU8, descriptorPtr3D, d_outputU8, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
else if(inputBitDepth == 2)
- rppt_slice_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+ rppt_slice_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
else
missingFuncFlag = 1;
@@ -413,6 +426,39 @@ int main(int argc, char * argv[])
             outputF32[i] = static_cast<Rpp32f>(outputU8[i]);
}
+ // if test case is slice and qaFlag is set, update the ROI with shapeTensor values
+ // for output display and comparison purposes
+ if(testCase == 1)
+ {
+        // update the roi for comparison with the shapeTensor values
+ if (descriptorPtr3D->layout == RpptLayout::NCDHW)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 4;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1 + 1];
+ roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 2];
+ roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 3];
+ }
+ }
+ else if(descriptorPtr3D->layout == RpptLayout::NDHWC)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 4;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1];
+ roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 1];
+ roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 2];
+ }
+ }
+ }
+
/*Compare the output of the function with golden outputs only if
1.QA Flag is set
2.input bit depth 2 (F32)*/
@@ -502,6 +548,12 @@ int main(int argc, char * argv[])
CHECK_RETURN_STATUS(hipHostFree(pinnedMemArgs));
CHECK_RETURN_STATUS(hipFree(d_inputF32));
CHECK_RETURN_STATUS(hipFree(d_outputF32));
+ if(anchorTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(anchorTensor));
+ if(shapeTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(shapeTensor));
+ if(roiTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(roiTensor));
if(inputBitDepth == 0)
{
if(inputU8 != NULL)
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index 76fb7d15c..28db84bee 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -39,7 +39,7 @@
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
caseMin = 0
-caseMax = 89
+caseMax = 90
# Get a list of log files based on a flag for preserving output
def get_log_file_list(preserveOutput):
@@ -327,7 +327,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.run(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90']
# Create folders based on testType and profilingOption
if testType == 1 and profilingOption == "YES":
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index 19aa5452f..15a94095d 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -339,6 +339,15 @@ int main(int argc, char **argv)
}
}
+ // create generic descriptor and params in case of slice
+ RpptGenericDesc descriptor3D;
+ RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
+ Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+ Rpp32u *roiTensor = NULL;
+ if(testCase == 90)
+ set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize);
+
+ // create cropRoi and patchRoi in case of crop_and_patch
RpptROI *cropRoi, *patchRoi;
if(testCase == 33)
{
@@ -1171,6 +1180,30 @@ int main(int argc, char **argv)
break;
}
+ case 90:
+ {
+ testCaseName = "slice";
+ Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims
+ if(anchorTensor == NULL)
+                anchorTensor = static_cast<Rpp32s *>(calloc(batchSize * numDims, sizeof(Rpp32s)));
+ if(shapeTensor == NULL)
+                shapeTensor = static_cast<Rpp32s *>(calloc(batchSize * numDims, sizeof(Rpp32s)));
+ if(roiTensor == NULL)
+                roiTensor = static_cast<Rpp32u *>(calloc(batchSize * numDims * 2, sizeof(Rpp32u)));
+ bool enablePadding = false;
+ auto fillValue = 0;
+ init_slice(descriptorPtr3D, roiTensorPtrSrc, roiTensor, anchorTensor, shapeTensor);
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+
+ if((inputBitDepth == 0 || inputBitDepth == 2) && srcDescPtr->layout == dstDescPtr->layout)
+ rppt_slice_host(input, descriptorPtr3D, output, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
missingFuncFlag = 1;
break;
@@ -1268,6 +1301,42 @@ int main(int argc, char **argv)
refFile.close();
}
+ // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values
+    // for output display and comparison purposes
+ if (testCase == 90)
+ {
+ if (dstDescPtr->layout == RpptLayout::NCHW)
+ {
+ if (dstDescPtr->c == 3)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 3;
+ dstImgSizes[i].height = shapeTensor[idx1 + 1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 2];
+ }
+ }
+ else
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 2;
+ dstImgSizes[i].height = shapeTensor[idx1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 1];
+ }
+ }
+ }
+ else if (dstDescPtr->layout == RpptLayout::NHWC)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 3;
+ dstImgSizes[i].height = shapeTensor[idx1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 1];
+ }
+ }
+ }
+
/*Compare the output of the function with golden outputs only if
1.QA Flag is set
2.input bit depth 0 (Input U8 && Output U8)
@@ -1324,6 +1393,12 @@ int main(int argc, char **argv)
free(roiTensorPtrSrc);
free(roiTensorPtrDst);
free(dstImgSizes);
+ if(anchorTensor != NULL)
+ free(anchorTensor);
+ if(shapeTensor != NULL)
+ free(shapeTensor);
+ if(roiTensor != NULL)
+ free(roiTensor);
free(input);
free(inputu8);
free(inputu8Second);
diff --git a/utilities/test_suite/HOST/Tensor_host_audio.cpp b/utilities/test_suite/HOST/Tensor_host_audio.cpp
index 35ce3a752..ac05bcc90 100644
--- a/utilities/test_suite/HOST/Tensor_host_audio.cpp
+++ b/utilities/test_suite/HOST/Tensor_host_audio.cpp
@@ -109,6 +109,19 @@ int main(int argc, char **argv)
maxDstChannels = 1;
set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes);
+ // create generic descriptor in case of slice
+ RpptGenericDesc descriptor3D;
+ RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
+ if(testCase == 4)
+ {
+ descriptorPtr3D->numDims = 2;
+ descriptorPtr3D->offsetInBytes = 0;
+ descriptorPtr3D->dataType = RpptDataType::F32;
+ descriptorPtr3D->dims[0] = batchSize;
+ descriptorPtr3D->dims[1] = maxSrcWidth;
+ descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1];
+ }
+
// set buffer sizes for src/dst
iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n;
oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n;
@@ -145,8 +158,8 @@ int main(int argc, char **argv)
case 0:
{
testCaseName = "non_silent_region_detection";
- Rpp32f detectedIndex[batchSize];
- Rpp32f detectionLength[batchSize];
+ Rpp32s detectedIndex[batchSize];
+ Rpp32s detectionLength[batchSize];
Rpp32f cutOffDB = -60.0;
Rpp32s windowLength = 2048;
Rpp32f referencePower = 0.0f;
@@ -265,6 +278,32 @@ int main(int argc, char **argv)
break;
}
+ case 5:
+ {
+ testCaseName = "slice";
+ Rpp32u nDim = 1; // testing for 1D slice
+ auto fillValue = 0;
+ bool enablePadding = true;
+ Rpp32u roiTensor[batchSize * nDim * 2];
+ Rpp32s anchorTensor[batchSize * nDim];
+ Rpp32s shapeTensor[batchSize * nDim];
+
+ // 1D slice arguments
+ for (int i = 0; i < batchSize; i++)
+ {
+ int idx = i * nDim * 2;
+ roiTensor[idx] = 10;
+ roiTensor[idx + 1] = srcLengthTensor[i];
+ anchorTensor[i] = 10;
+ shapeTensor[i] = dstDims[i].width = srcLengthTensor[i] / 2;
+ dstDims[i].height = 1;
+ }
+
+ startWallTime = omp_get_wtime();
+ rppt_slice_host(inputf32, descriptorPtr3D, outputf32, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
+
+ break;
+ }
case 6:
{
testCaseName = "resample";
diff --git a/utilities/test_suite/HOST/Tensor_voxel_host.cpp b/utilities/test_suite/HOST/Tensor_voxel_host.cpp
index 968f29b54..89fddf2c6 100644
--- a/utilities/test_suite/HOST/Tensor_voxel_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_voxel_host.cpp
@@ -140,6 +140,10 @@ int main(int argc, char * argv[])
void *pinnedMemArgs;
pinnedMemArgs = calloc(2 * noOfFiles , sizeof(Rpp32f));
+ // arguments required for slice
+ Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+ Rpp32u *roiTensor = NULL;
+
// Set the number of threads to be used by OpenMP pragma for RPP batch processing on host.
// If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size
Rpp32u numThreads = 0;
@@ -241,11 +245,21 @@ int main(int argc, char * argv[])
case 1:
{
testCaseName = "slice";
+ if(anchorTensor == NULL)
+                anchorTensor = static_cast<Rpp32s *>(calloc(batchSize * 4, sizeof(Rpp32s)));
+ if(shapeTensor == NULL)
+                shapeTensor = static_cast<Rpp32s *>(calloc(batchSize * 4, sizeof(Rpp32s)));
+ if(roiTensor == NULL)
+                roiTensor = static_cast<Rpp32u *>(calloc(batchSize * 8, sizeof(Rpp32u)));
+ bool enablePadding = false;
+ auto fillValue = 0;
+ init_slice_voxel(descriptorPtr3D, roiGenericSrcPtr, roiTensor, anchorTensor, shapeTensor);
+
startWallTime = omp_get_wtime();
if(inputBitDepth == 0)
- rppt_slice_host(inputU8, descriptorPtr3D, outputU8, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+ rppt_slice_host(inputU8, descriptorPtr3D, outputU8, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
else if(inputBitDepth == 2)
- rppt_slice_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, roiGenericSrcPtr, roiTypeSrc, handle);
+ rppt_slice_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
else
missingFuncFlag = 1;
@@ -393,6 +407,39 @@ int main(int argc, char * argv[])
             outputF32[i] = static_cast<Rpp32f>(outputU8[i]);
}
+ // if test case is slice and qaFlag is set, update the ROI with shapeTensor values
+ // for output display and comparison purposes
+ if(testCase == 1)
+ {
+        // update the roi for comparison with the shapeTensor values
+ if (descriptorPtr3D->layout == RpptLayout::NCDHW)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 4;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1 + 1];
+ roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 2];
+ roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 3];
+ }
+ }
+ else if(descriptorPtr3D->layout == RpptLayout::NDHWC)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 4;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.x = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.y = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.xyz.z = 0;
+ roiGenericSrcPtr[i].xyzwhdROI.roiDepth = shapeTensor[idx1];
+ roiGenericSrcPtr[i].xyzwhdROI.roiHeight = shapeTensor[idx1 + 1];
+ roiGenericSrcPtr[i].xyzwhdROI.roiWidth = shapeTensor[idx1 + 2];
+ }
+ }
+ }
+
/*Compare the output of the function with golden outputs only if
1.QA Flag is set
2.input bit depth 2 (F32)*/
@@ -481,6 +528,12 @@ int main(int argc, char * argv[])
free(outputF32);
free(roiGenericSrcPtr);
free(pinnedMemArgs);
+ if(anchorTensor != NULL)
+ free(anchorTensor);
+ if(shapeTensor != NULL)
+ free(shapeTensor);
+ if(roiTensor != NULL)
+ free(roiTensor);
if(inputBitDepth == 0)
{
if(inputU8 != NULL)
diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py
index ddde7db9c..8bedd5044 100644
--- a/utilities/test_suite/HOST/runTests.py
+++ b/utilities/test_suite/HOST/runTests.py
@@ -40,7 +40,7 @@
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
caseMin = 0
-caseMax = 89
+caseMax = 90
# Get a list of log files based on a flag for preserving output
def get_log_file_list(preserveOutput):
@@ -283,7 +283,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.run(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90']
print("\n\n\n\n\n")
print("##########################################################################################")
diff --git a/utilities/test_suite/REFERENCE_OUTPUT/slice/slice_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/slice/slice_u8_Tensor.bin
new file mode 100644
index 000000000..ac379dc79
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/slice/slice_u8_Tensor.bin differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin
new file mode 100644
index 000000000..10b45a6ff
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice.bin
deleted file mode 100644
index 94802b30b..000000000
Binary files a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice.bin and /dev/null differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin
index 94802b30b..f51ec113c 100644
Binary files a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/slice/slice_nifti_output.bin differ
diff --git a/utilities/test_suite/rpp_test_suite_audio.h b/utilities/test_suite/rpp_test_suite_audio.h
index 4367df17c..8ff5815c0 100644
--- a/utilities/test_suite/rpp_test_suite_audio.h
+++ b/utilities/test_suite/rpp_test_suite_audio.h
@@ -41,6 +41,7 @@ std::map<int, std::string> audioAugmentationMap =
{2, "pre_emphasis_filter"},
{3, "down_mixing"},
{4, "spectrogram"},
+ {5, "slice"},
{6, "resample"}
};
@@ -231,7 +232,7 @@ void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dst
free(refOutput);
}
-void verify_non_silent_region_detection(float *detectedIndex, float *detectionLength, string testCase, int bs, vector<string> audioNames, string dst)
+void verify_non_silent_region_detection(int *detectedIndex, int *detectionLength, string testCase, int bs, vector<string> audioNames, string dst)
{
int fileMatch = 0;
for (int i = 0; i < bs; i++)
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index 3ec123d7f..ea3117041 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -106,6 +106,7 @@ std::map augmentationMap =
{87, "tensor_sum"},
{88, "tensor_min"},
{89, "tensor_max"},
+ {90, "slice"}
};
// Golden outputs for Tensor min Kernel
@@ -456,6 +457,43 @@ inline void set_generic_descriptor(RpptGenericDescPtr descriptorPtr3D, int noOfI
descriptorPtr3D->strides[4] = 1;
}
+// sets generic descriptor dimensions and strides of src/dst for slice functionality
+inline void set_generic_descriptor_slice(RpptDescPtr srcDescPtr, RpptGenericDescPtr descriptorPtr3D, int batchSize)
+{
+ descriptorPtr3D->offsetInBytes = 0;
+ descriptorPtr3D->dataType = srcDescPtr->dataType;
+ descriptorPtr3D->layout = srcDescPtr->layout;
+ if(srcDescPtr->c == 3)
+ {
+ descriptorPtr3D->numDims = 4;
+ descriptorPtr3D->dims[0] = batchSize;
+ if (srcDescPtr->layout == RpptLayout::NHWC)
+ {
+ descriptorPtr3D->dims[1] = srcDescPtr->h;
+ descriptorPtr3D->dims[2] = srcDescPtr->w;
+ descriptorPtr3D->dims[3] = srcDescPtr->c;
+ }
+ else
+ {
+ descriptorPtr3D->dims[1] = srcDescPtr->c;
+ descriptorPtr3D->dims[2] = srcDescPtr->h;
+ descriptorPtr3D->dims[3] = srcDescPtr->w;
+ }
+ descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1] * descriptorPtr3D->dims[2] * descriptorPtr3D->dims[3];
+ descriptorPtr3D->strides[1] = descriptorPtr3D->dims[2] * descriptorPtr3D->dims[3];
+ descriptorPtr3D->strides[2] = descriptorPtr3D->dims[3];
+ }
+ else
+ {
+ descriptorPtr3D->numDims = 3;
+ descriptorPtr3D->dims[0] = batchSize;
+ descriptorPtr3D->dims[1] = srcDescPtr->h;
+ descriptorPtr3D->dims[2] = srcDescPtr->w;
+ descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1] * descriptorPtr3D->dims[2];
+ descriptorPtr3D->strides[1] = descriptorPtr3D->dims[2];
+ }
+}
+
// sets descriptor dimensions and strides of src/dst
inline void set_descriptor_dims_and_strides(RpptDescPtr descPtr, int noOfImages, int maxHeight, int maxWidth, int numChannels, int offsetInBytes)
{
@@ -1295,4 +1333,60 @@ void inline init_ricap(int width, int height, int batchSize, Rpp32u *permutation
roiPtrInputCropRegion[1].xywhROI = {randrange(0, part0Width - 8), randrange(0, height - part0Height), width - part0Width, part0Height};
roiPtrInputCropRegion[2].xywhROI = {randrange(0, width - part0Width - 8), randrange(0, part0Height), part0Width, height - part0Height};
roiPtrInputCropRegion[3].xywhROI = {randrange(0, part0Width - 8), randrange(0, part0Height), width - part0Width, height - part0Height};
-}
\ No newline at end of file
+}
+
+// initialize the roi, anchor and shape values required for slice
+void init_slice(RpptGenericDescPtr descriptorPtr3D, RpptROIPtr roiPtrSrc, Rpp32u *roiTensor, Rpp32s *anchorTensor, Rpp32s *shapeTensor)
+{
+ if(descriptorPtr3D->numDims == 4)
+ {
+ if (descriptorPtr3D->layout == RpptLayout::NCHW)
+ {
+ for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+ {
+ int idx1 = i * 3;
+ int idx2 = i * 6;
+ roiTensor[idx2] = anchorTensor[idx1] = 0;
+ roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiPtrSrc[i].xywhROI.xy.y;
+ roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = roiPtrSrc[i].xywhROI.xy.x;
+ roiTensor[idx2 + 3] = descriptorPtr3D->dims[1];
+ roiTensor[idx2 + 4] = roiPtrSrc[i].xywhROI.roiHeight;
+ roiTensor[idx2 + 5] = roiPtrSrc[i].xywhROI.roiWidth;
+ shapeTensor[idx1] = roiTensor[idx2 + 3];
+ shapeTensor[idx1 + 1] = roiTensor[idx2 + 4] / 2;
+ shapeTensor[idx1 + 2] = roiTensor[idx2 + 5] / 2;
+ }
+ }
+ else if(descriptorPtr3D->layout == RpptLayout::NHWC)
+ {
+ for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+ {
+ int idx1 = i * 3;
+ int idx2 = i * 6;
+ roiTensor[idx2] = anchorTensor[idx1] = roiPtrSrc[i].xywhROI.xy.y;
+ roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiPtrSrc[i].xywhROI.xy.x;
+ roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = 0;
+ roiTensor[idx2 + 3] = roiPtrSrc[i].xywhROI.roiHeight;
+ roiTensor[idx2 + 4] = roiPtrSrc[i].xywhROI.roiWidth;
+ roiTensor[idx2 + 5] = descriptorPtr3D->dims[3];
+ shapeTensor[idx1] = roiTensor[idx2 + 3] / 2;
+ shapeTensor[idx1 + 1] = roiTensor[idx2 + 4] / 2;
+ shapeTensor[idx1 + 2] = roiTensor[idx2 + 5];
+ }
+ }
+ }
+ if(descriptorPtr3D->numDims == 3)
+ {
+ for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+ {
+ int idx1 = i * 2;
+ int idx2 = i * 4;
+ roiTensor[idx2] = anchorTensor[idx1] = roiPtrSrc[i].xywhROI.xy.y;
+ roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiPtrSrc[i].xywhROI.xy.x;
+ roiTensor[idx2 + 2] = roiPtrSrc[i].xywhROI.roiHeight;
+ roiTensor[idx2 + 3] = roiPtrSrc[i].xywhROI.roiWidth;
+ shapeTensor[idx1] = roiTensor[idx2 + 2] / 2;
+ shapeTensor[idx1 + 1] = roiTensor[idx2 + 3] / 2;
+ }
+ }
+}
diff --git a/utilities/test_suite/rpp_test_suite_voxel.h b/utilities/test_suite/rpp_test_suite_voxel.h
index c24fc29a8..77de34f3b 100644
--- a/utilities/test_suite/rpp_test_suite_voxel.h
+++ b/utilities/test_suite/rpp_test_suite_voxel.h
@@ -242,6 +242,51 @@ inline string set_function_type(int layoutType, int pln1OutTypeCase, int outputF
return funcType;
}
+// initialize the roi, anchor and shape values required for slice
+void init_slice_voxel(RpptGenericDescPtr descriptorPtr3D, RpptROI3D *roiGenericSrcPtr, Rpp32u *roiTensor, Rpp32s *anchorTensor, Rpp32s *shapeTensor)
+{
+ if (descriptorPtr3D->layout == RpptLayout::NCDHW)
+ {
+ for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+ {
+ int idx1 = i * 4;
+ int idx2 = i * 8;
+ roiTensor[idx2] = anchorTensor[idx1] = 0;
+ roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiGenericSrcPtr[i].xyzwhdROI.xyz.z;
+ roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = roiGenericSrcPtr[i].xyzwhdROI.xyz.y;
+ roiTensor[idx2 + 3] = anchorTensor[idx1 + 3] = roiGenericSrcPtr[i].xyzwhdROI.xyz.x;
+ roiTensor[idx2 + 4] = descriptorPtr3D->dims[1];
+ roiTensor[idx2 + 5] = roiGenericSrcPtr[i].xyzwhdROI.roiDepth;
+ roiTensor[idx2 + 6] = roiGenericSrcPtr[i].xyzwhdROI.roiHeight;
+ roiTensor[idx2 + 7] = roiGenericSrcPtr[i].xyzwhdROI.roiWidth;
+ shapeTensor[idx1] = roiTensor[idx2 + 4];
+ shapeTensor[idx1 + 1] = roiTensor[idx2 + 5] / 2;
+ shapeTensor[idx1 + 2] = roiTensor[idx2 + 6] / 2;
+ shapeTensor[idx1 + 3] = roiTensor[idx2 + 7] / 2;
+ }
+ }
+ else if(descriptorPtr3D->layout == RpptLayout::NDHWC)
+ {
+ for(int i = 0; i < descriptorPtr3D->dims[0]; i++)
+ {
+ int idx1 = i * 4;
+ int idx2 = i * 8;
+ roiTensor[idx2] = anchorTensor[idx1] = roiGenericSrcPtr[i].xyzwhdROI.xyz.z;
+ roiTensor[idx2 + 1] = anchorTensor[idx1 + 1] = roiGenericSrcPtr[i].xyzwhdROI.xyz.y;
+ roiTensor[idx2 + 2] = anchorTensor[idx1 + 2] = roiGenericSrcPtr[i].xyzwhdROI.xyz.x;
+ roiTensor[idx2 + 3] = anchorTensor[idx1 + 3] = 0;
+ roiTensor[idx2 + 4] = roiGenericSrcPtr[i].xyzwhdROI.roiDepth;
+ roiTensor[idx2 + 5] = roiGenericSrcPtr[i].xyzwhdROI.roiHeight;
+ roiTensor[idx2 + 6] = roiGenericSrcPtr[i].xyzwhdROI.roiWidth;
+ roiTensor[idx2 + 7] = descriptorPtr3D->dims[4];
+ shapeTensor[idx1] = roiTensor[idx2 + 4] / 2;
+ shapeTensor[idx1 + 1] = roiTensor[idx2 + 5] / 2;
+ shapeTensor[idx1 + 2] = roiTensor[idx2 + 6] / 2;
+ shapeTensor[idx1 + 3] = roiTensor[idx2 + 7];
+ }
+ }
+}
+
// reads nifti-1 header file
static int read_nifti_header_file(char* const header_file, nifti_1_header &niftiHeader)
{