diff --git a/include/rppdefs.h b/include/rppdefs.h
index 3256444f3..28876d7f5 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -129,7 +129,7 @@ typedef enum
RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH = -14,
/*! \brief Invalid datatype \ingroup group_rppdefs */
RPP_ERROR_INVALID_PARAMETER_DATATYPE = -15,
- /*! \brief Not enough memory \ingroup group_rppdefs */
+ /*! \brief Not enough memory to write outputs, as per dim-lengths and strides set in descriptor \ingroup group_rppdefs */
RPP_ERROR_NOT_ENOUGH_MEMORY = -16,
/*! \brief Out of bound source ROI \ingroup group_rppdefs */
RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17,
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index 7c2d3318b..441816ea3 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -193,10 +193,84 @@ RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc
RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Tensor mean operation on HOST backend for an NCHW/NHWC layout tensor
+ * \details The tensor mean is a reduction operation that finds the channel-wise means (R mean / G mean / B mean) and the overall mean for each image in a batch of RGB (3-channel) / greyscale (1-channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - tensorMeanArr depth - Rpp32f (the computed means are written out as Rpp32f values).
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] tensorMeanArr destination array in HOST memory
+ * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorMeanArr, Rpp32u tensorMeanArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
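+
+// Illustrative usage sketch (not part of the API): srcPtr, srcDescPtr, roiTensorPtrSrc and handle
+// are assumed to be set up by the caller exactly as for the other statistical operations above.
+//     Rpp32u tensorMeanArrLength = srcDescPtr->n * 4;                      // 3-channel case (srcDescPtr->n for 1-channel)
+//     std::vector<Rpp32f> tensorMeanArr(tensorMeanArrLength);
+//     rppt_tensor_mean_host(srcPtr, srcDescPtr, tensorMeanArr.data(), tensorMeanArrLength,
+//                           roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+//     // per-image layout of tensorMeanArr for 3-channel inputs: [MeanR, MeanG, MeanB, MeanImage]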
+
+#ifdef GPU_SUPPORT
+/*! \brief Tensor mean operation on HIP backend for an NCHW/NHWC layout tensor
+ * \details The tensor mean is a reduction operation that finds the channel-wise means (R mean / G mean / B mean) and the overall mean for each image in a batch of RGB (3-channel) / greyscale (1-channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - tensorMeanArr depth - Rpp32f (the computed means are written out as Rpp32f values).
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] tensorMeanArr destination array in HIP memory
+ * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorMeanArr, Rpp32u tensorMeanArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Tensor stddev operation on HOST backend for an NCHW/NHWC layout tensor
+ * \details The tensor stddev is a reduction operation that finds the channel-wise standard deviations (R stddev / G stddev / B stddev) and the overall standard deviation for each image in the batch, computed with respect to the mean values passed in meanTensor.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - tensorStddevArr depth - Rpp32f (the computed standard deviations are written out as Rpp32f values).
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] tensorStddevArr destination array in HOST memory
+ * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4)
+ * \param [in] meanTensor mean values for stddev calculation (1D tensor of length srcDescPtr->n for 1-channel inputs, or srcDescPtr->n * 4 in the per-image format (MeanR, MeanG, MeanB, MeanImage) for 3-channel inputs)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorStddevArr, Rpp32u tensorStddevArrLength, Rpp32f *meanTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
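+
+// Illustrative usage sketch (not part of the API): the means produced by rppt_tensor_mean_host can be
+// passed straight into this call; srcPtr, srcDescPtr, roiTensorPtrSrc and handle are assumed to exist.
+//     Rpp32u len = srcDescPtr->n * 4;                                      // 3-channel case (srcDescPtr->n for 1-channel)
+//     std::vector<Rpp32f> meanTensor(len), tensorStddevArr(len);
+//     rppt_tensor_mean_host(srcPtr, srcDescPtr, meanTensor.data(), len, roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+//     rppt_tensor_stddev_host(srcPtr, srcDescPtr, tensorStddevArr.data(), len, meanTensor.data(),
+//                             roiTensorPtrSrc, RpptRoiType::XYWH, handle);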
+
+#ifdef GPU_SUPPORT
+/*! \brief Tensor stddev operation on HIP backend for an NCHW/NHWC layout tensor
+ * \details The tensor stddev is a reduction operation that finds the channel-wise standard deviations (R stddev / G stddev / B stddev) and the overall standard deviation for each image in the batch, computed with respect to the mean values passed in meanTensor.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - tensorStddevArr depth - Rpp32f (the computed standard deviations are written out as Rpp32f values).
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] tensorStddevArr destination array in HIP memory
+ * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4)
+ * \param [in] meanTensor mean values for stddev calculation (1D tensor of length srcDescPtr->n for 1-channel inputs, or srcDescPtr->n * 4 in the per-image format (MeanR, MeanG, MeanB, MeanImage) for 3-channel inputs)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorStddevArr, Rpp32u tensorStddevArrLength, Rpp32f *meanTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! @}
*/
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
+#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
\ No newline at end of file
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index 9e599f06d..bb06713b9 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -6120,6 +6120,46 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2
pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4
}
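+
+// The three helpers below accumulate squared deviations from a pre-computed mean into 4-lane
+// double-precision accumulators: for every loaded value x, (x - mean) * (x - mean) is added to the
+// corresponding pVar accumulator. The caller horizontally sums the accumulator lanes, divides by the
+// pixel count and takes the square root to obtain the standard deviation (see kernel/tensor_stddev.hpp).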
+inline void compute_variance_8_host(__m256d *p1, __m256d *pMean, __m256d *pVar)
+{
+ __m256d pSub = _mm256_sub_pd(p1[0], pMean[0]);
+ pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]);
+ pSub = _mm256_sub_pd(p1[1], pMean[0]);
+ pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]);
+}
+
+inline void compute_variance_channel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB)
+{
+ __m256d pSub = _mm256_sub_pd(p1[0], pMeanR[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
+ pSub = _mm256_sub_pd(p1[1], pMeanR[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
+ pSub = _mm256_sub_pd(p1[2], pMeanG[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
+ pSub = _mm256_sub_pd(p1[3], pMeanG[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
+ pSub = _mm256_sub_pd(p1[4], pMeanB[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
+ pSub = _mm256_sub_pd(p1[5], pMeanB[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
+}
+
+inline void compute_variance_image_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB)
+{
+ __m256d pSub = _mm256_sub_pd(p1[0], pMean[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
+ pSub = _mm256_sub_pd(p1[1], pMean[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
+ pSub = _mm256_sub_pd(p1[2], pMean[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
+ pSub = _mm256_sub_pd(p1[3], pMean[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
+ pSub = _mm256_sub_pd(p1[4], pMean[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
+ pSub = _mm256_sub_pd(p1[5], pMean[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
+}
+
inline void compute_vignette_48_host(__m256 *p, __m256 &pMultiplier, __m256 &pILocComponent, __m256 &pJLocComponent)
{
__m256 pGaussianValue;
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index a5e83d1ff..babc6f55c 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -1280,6 +1280,35 @@ inline void rpp_store48_f32pln3_to_u8pkd3_avx(Rpp8u *dstPtr, __m256 *p)
_mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
}
+inline void rpp_load24_u8pln3_to_f64pln3_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256d *p)
+{
+ __m128i px[3];
+
+ px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R00|R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15] */
+ px[1] = _mm_loadu_si128((__m128i *)srcPtrG); /* load [G00|G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15] */
+ px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B00|B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15] */
+ p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask00To03)); /* Contains R00-03 */
+ p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask04To07)); /* Contains R04-07 */
+ p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask00To03)); /* Contains G00-03 */
+ p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask04To07)); /* Contains G04-07 */
+ p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask00To03)); /* Contains B00-03 */
+ p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask04To07)); /* Contains B04-07 */
+}
+
+inline void rpp_load24_u8pkd3_to_f64pln3_avx(Rpp8u *srcPtr, __m256d *p)
+{
+ __m128i px[2];
+
+ px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R00|G00|B00|R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05] - Need RGB 00-03 */
+ px[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R04|G04|B04|R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09] - Need RGB 04-07 */
+ p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskR)); /* Contains R00-03 */
+ p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskR)); /* Contains R04-07 */
+ p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskG)); /* Contains G00-03 */
+ p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskG)); /* Contains G04-07 */
+ p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskB)); /* Contains B00-03 */
+ p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskB)); /* Contains B04-07 */
+}
+
inline void rpp_load16_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p)
{
__m128i px;
@@ -1315,6 +1344,22 @@ inline void rpp_store16_f32_to_u8_avx(Rpp8u *dstPtr, __m256 *p)
_mm_storeu_si128((__m128i *)dstPtr, px[0]);
}
+inline void rpp_load8_u8_to_f64_avx(Rpp8u *srcPtr, __m256d *p)
+{
+ __m128i px;
+ px = _mm_loadu_si128((__m128i *)srcPtr);
+ p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask00To03)); /* Contains pixels 01-04 */
+ p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask04To07)); /* Contains pixels 05-08 */
+}
+
+inline void rpp_load8_i8_to_f64_avx(Rpp8s *srcPtr, __m256d *p)
+{
+ __m128i px;
+ px = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr));
+ p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask00To03)); /* Contains pixels 01-04 */
+ p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask04To07)); /* Contains pixels 05-08 */
+}
+
inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p)
{
__m128i px;
@@ -1688,6 +1733,35 @@ inline void rpp_store48_f32pln3_to_i8pkd3_avx(Rpp8s *dstPtr, __m256 *p)
_mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
}
+inline void rpp_load24_i8pln3_to_f64pln3_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256d *p)
+{
+ __m128i px[3];
+
+ px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrR)); /* add I8 conversion param to load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+ px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrG)); /* add I8 conversion param to load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+ px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrB)); /* add I8 conversion param to load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+ p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask00To03)); /* Contains R01-04 */
+ p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask04To07)); /* Contains R05-08 */
+ p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask00To03)); /* Contains G01-04 */
+ p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask04To07)); /* Contains G05-08 */
+ p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask00To03)); /* Contains B01-04 */
+ p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask04To07)); /* Contains B05-08 */
+}
+
+inline void rpp_load24_i8pkd3_to_f64pln3_avx(Rpp8s *srcPtr, __m256d *p)
+{
+ __m128i px[2];
+
+ px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr)); /* add I8 conversion param to load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */
+ px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + 12))); /* add I8 conversion param to load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */
+ p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskR)); /* Contains R01-04 */
+ p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskR)); /* Contains R05-08 */
+ p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskG)); /* Contains G01-04 */
+ p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskG)); /* Contains G05-08 */
+ p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskB)); /* Contains B01-04 */
+ p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskB)); /* Contains B05-08 */
+}
+
inline void rpp_load16_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px;
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index a5cd5972c..3f32dbc04 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -63,12 +63,12 @@ typedef union { float f1[24]; float2 f2[12]; float3 f3[8]; float4 f4[6];
// uint
typedef union { uint ui1[6]; uint2 ui2[3]; } d_uint6;
typedef union { uint ui1[8]; uint4 ui4[2]; } d_uint8;
-typedef union { uint4 ui4[6]; d_uint8 ui8[3]; } d_uint24;
+typedef union { uint ui1[24]; uint4 ui4[6]; d_uint8 ui8[3]; } d_uint24;
// int
typedef union { int i1[6]; int2 i2[3]; } d_int6;
typedef union { int i1[8]; int4 i4[2]; } d_int8;
-typedef union { int4 i4[6]; d_int8 i8[3]; } d_int24;
+typedef union { int i1[24]; int4 i4[6]; d_int8 i8[3]; } d_int24;
// half
typedef struct { half h1[3]; } d_half3_s;
@@ -599,6 +599,24 @@ __device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8_mirror(half *
srcPtr_f8->f4[1] = make_float4(src1_f2.y, src1_f2.x, src2_f2.y, src2_f2.x); // write 03-00
}
+// UINT loads without layout toggle (8 UINT pixels)
+
+__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(uint *srcPtr, d_float8 *srcPtr_f8)
+{
+ d_uint8 src_ui8 = *(d_uint8 *)srcPtr;
+ srcPtr_f8->f4[0] = make_float4((float)src_ui8.ui4[0].x, (float)src_ui8.ui4[0].y, (float)src_ui8.ui4[0].z, (float)src_ui8.ui4[0].w);
+ srcPtr_f8->f4[1] = make_float4((float)src_ui8.ui4[1].x, (float)src_ui8.ui4[1].y, (float)src_ui8.ui4[1].z, (float)src_ui8.ui4[1].w);
+}
+
+// INT loads without layout toggle (8 INT pixels)
+
+__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(int *srcPtr, d_float8 *srcPtr_f8)
+{
+ d_int8 src_i8 = *(d_int8 *)srcPtr;
+ srcPtr_f8->f4[0] = make_float4((float)src_i8.i4[0].x, (float)src_i8.i4[0].y, (float)src_i8.i4[0].z, (float)src_i8.i4[0].w);
+ srcPtr_f8->f4[1] = make_float4((float)src_i8.i4[1].x, (float)src_i8.i4[1].y, (float)src_i8.i4[1].z, (float)src_i8.i4[1].w);
+}
+
// U8 loads without layout toggle PLN3 to PLN3 (24 U8 pixels)
__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pln3(uchar *srcPtr, uint increment, d_float24 *srcPtr_f24)
@@ -864,6 +882,36 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3_m
srcPtr_f24->f4[5] = make_float4(__half2float(src_h24.h1[11]), __half2float(src_h24.h1[ 8]), __half2float(src_h24.h1[ 5]), __half2float(src_h24.h1[ 2])); // write B03-B00 (mirrored load)
}
+// UINT loads with layout toggle PKD3 to PLN3 (24 UINT pixels)
+
+__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(uint *srcPtr, d_float24 *srcPtr_f24)
+{
+ d_uint24 src_ui24;
+ *(d_uint24_s *)&src_ui24 = *(d_uint24_s *)srcPtr;
+
+ srcPtr_f24->f4[0] = make_float4((float)src_ui24.ui1[ 0], (float)src_ui24.ui1[ 3], (float)src_ui24.ui1[ 6], (float)src_ui24.ui1[ 9]); // write R00-R03
+ srcPtr_f24->f4[1] = make_float4((float)src_ui24.ui1[12], (float)src_ui24.ui1[15], (float)src_ui24.ui1[18], (float)src_ui24.ui1[21]); // write R04-R07
+ srcPtr_f24->f4[2] = make_float4((float)src_ui24.ui1[ 1], (float)src_ui24.ui1[ 4], (float)src_ui24.ui1[ 7], (float)src_ui24.ui1[10]); // write G00-G03
+ srcPtr_f24->f4[3] = make_float4((float)src_ui24.ui1[13], (float)src_ui24.ui1[16], (float)src_ui24.ui1[19], (float)src_ui24.ui1[22]); // write G04-G07
+ srcPtr_f24->f4[4] = make_float4((float)src_ui24.ui1[ 2], (float)src_ui24.ui1[ 5], (float)src_ui24.ui1[ 8], (float)src_ui24.ui1[11]); // write B00-B03
+ srcPtr_f24->f4[5] = make_float4((float)src_ui24.ui1[14], (float)src_ui24.ui1[17], (float)src_ui24.ui1[20], (float)src_ui24.ui1[23]); // write B04-B07
+}
+
+// INT loads with layout toggle PKD3 to PLN3 (24 INT pixels)
+
+__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(int *srcPtr, d_float24 *srcPtr_f24)
+{
+ d_int24 src_i24;
+ *(d_int24_s *)&src_i24 = *(d_int24_s *)srcPtr;
+
+ srcPtr_f24->f4[0] = make_float4((float)src_i24.i1[ 0], (float)src_i24.i1[ 3], (float)src_i24.i1[ 6], (float)src_i24.i1[ 9]); // write R00-R03
+ srcPtr_f24->f4[1] = make_float4((float)src_i24.i1[12], (float)src_i24.i1[15], (float)src_i24.i1[18], (float)src_i24.i1[21]); // write R04-R07
+ srcPtr_f24->f4[2] = make_float4((float)src_i24.i1[ 1], (float)src_i24.i1[ 4], (float)src_i24.i1[ 7], (float)src_i24.i1[10]); // write G00-G03
+ srcPtr_f24->f4[3] = make_float4((float)src_i24.i1[13], (float)src_i24.i1[16], (float)src_i24.i1[19], (float)src_i24.i1[22]); // write G04-G07
+ srcPtr_f24->f4[4] = make_float4((float)src_i24.i1[ 2], (float)src_i24.i1[ 5], (float)src_i24.i1[ 8], (float)src_i24.i1[11]); // write B00-B03
+ srcPtr_f24->f4[5] = make_float4((float)src_i24.i1[14], (float)src_i24.i1[17], (float)src_i24.i1[20], (float)src_i24.i1[23]); // write B04-B07
+}
+
// U8 loads with layout toggle PLN3 to PKD3 (24 U8 pixels)
__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(uchar *srcPtr, uint increment, d_float24 *srcPtr_f24)
diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp
index 7ddc238fa..0a7e8ff51 100644
--- a/src/modules/cpu/host_tensor_statistical_operations.hpp
+++ b/src/modules/cpu/host_tensor_statistical_operations.hpp
@@ -28,6 +28,8 @@ SOFTWARE.
#include "kernel/tensor_sum.hpp"
#include "kernel/tensor_min.hpp"
#include "kernel/tensor_max.hpp"
+#include "kernel/tensor_mean.hpp"
+#include "kernel/tensor_stddev.hpp"
#include "kernel/normalize.hpp"
#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/cpu/kernel/tensor_mean.hpp b/src/modules/cpu/kernel/tensor_mean.hpp
new file mode 100644
index 000000000..9536e258f
--- /dev/null
+++ b/src/modules/cpu/kernel/tensor_mean.hpp
@@ -0,0 +1,842 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+#include "reduction.hpp"
+
+RppStatus tensor_mean_u8_f32_host(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorMeanArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp8u *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
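+ // 3-channel outputs are written as 4 values per image - [MeanR, MeanG, MeanB, MeanImage];
+ // 1-channel outputs use a single value per image at tensorMeanArr[batchCount]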
+
+ // Tensor Mean without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = bufferLength & ~15;
+ Rpp32f mean = 0.0;
+ Rpp32u sum = 0;
+ Rpp32u sumAvx[8] = {0};
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pSum = avx_px0;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p1[2];
+ rpp_simd_load(rpp_load16_u8_to_u32_avx, srcPtrTemp, p1);
+ compute_sum_16_host(p1, &pSum);
+ srcPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ sum += static_cast<Rpp32u>(*srcPtrTemp++);
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_store_si256((__m256i *)sumAvx, pSum);
+ sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3] + sumAvx[4] + sumAvx[5] + sumAvx[6] + sumAvx[7]);
+#endif
+ mean = static_cast<Rpp32f>(sum) / totalPixelsPerChannel;
+ tensorMeanArr[batchCount] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64u sum;
+ Rpp32u sumR = 0, sumG = 0, sumB = 0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp32u sumAvxR[8] = {0};
+ Rpp32u sumAvxG[8] = {0};
+ Rpp32u sumAvxB[8] = {0};
+
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pSumR = avx_px0;
+ __m256i pSumG = avx_px0;
+ __m256i pSumB = avx_px0;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[6];
+ rpp_simd_load(rpp_load48_u8pln3_to_u32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_sum_48_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ sumR += static_cast<Rpp32u>(*srcPtrTempR++);
+ sumG += static_cast<Rpp32u>(*srcPtrTempG++);
+ sumB += static_cast<Rpp32u>(*srcPtrTempB++);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_store_si256((__m256i *)sumAvxR, pSumR);
+ _mm256_store_si256((__m256i *)sumAvxG, pSumG);
+ _mm256_store_si256((__m256i *)sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]);
+#endif
+ sum = static_cast<Rpp64u>(sumR) + static_cast<Rpp64u>(sumG) + static_cast<Rpp64u>(sumB);
+ mean = (static_cast<Rpp32f>(sum) / (totalPixelsPerChannel * 3));
+ meanR = (static_cast<Rpp32f>(sumR) / totalPixelsPerChannel);
+ meanG = (static_cast<Rpp32f>(sumG) / totalPixelsPerChannel);
+ meanB = (static_cast<Rpp32f>(sumB) / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64u sum;
+ Rpp32u sumR = 0, sumG = 0, sumB = 0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp32u sumAvxR[8] = {0};
+ Rpp32u sumAvxG[8] = {0};
+ Rpp32u sumAvxB[8] = {0};
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pSumR = avx_px0;
+ __m256i pSumG = avx_px0;
+ __m256i pSumB = avx_px0;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p[6];
+ rpp_simd_load(rpp_load48_u8pkd3_to_u32pln3_avx, srcPtrTemp, p);
+ compute_sum_48_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ sumR += static_cast<Rpp32u>(srcPtrTemp[0]);
+ sumG += static_cast<Rpp32u>(srcPtrTemp[1]);
+ sumB += static_cast<Rpp32u>(srcPtrTemp[2]);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_store_si256((__m256i *)sumAvxR, pSumR);
+ _mm256_store_si256((__m256i *)sumAvxG, pSumG);
+ _mm256_store_si256((__m256i *)sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]);
+#endif
+ sum = static_cast<Rpp64u>(sumR) + static_cast<Rpp64u>(sumG) + static_cast<Rpp64u>(sumB);
+ mean = (static_cast<Rpp32f>(sum) / (totalPixelsPerChannel * 3));
+ meanR = (static_cast<Rpp32f>(sumR) / totalPixelsPerChannel);
+ meanG = (static_cast<Rpp32f>(sumG) / totalPixelsPerChannel);
+ meanB = (static_cast<Rpp32f>(sumB) / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_mean_f32_f32_host(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorMeanArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp32f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
+
+ // Tensor Mean without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = bufferLength & ~(vectorIncrementPerChannel-1);
+ vectorIncrement = 8;
+ Rpp32f mean = 0.0;
+ Rpp64f sum = 0.0;
+ Rpp64f sumAvx[4] = {0.0};
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pSum = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256d p1[2];
+ rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp, p1);
+ compute_sum_8_host(p1, &pSum);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ sum += static_cast<Rpp64f>(*srcPtrTemp++);
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(sumAvx, pSum);
+ sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3]);
+#endif
+ mean = static_cast<Rpp32f>(sum / totalPixelsPerChannel);
+ tensorMeanArr[batchCount] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp64f sumAvxR[4] = {0.0};
+ Rpp64f sumAvxG[4] = {0.0};
+ Rpp64f sumAvxB[4] = {0.0};
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256d pSumR = _mm256_setzero_pd();
+ __m256d pSumG = _mm256_setzero_pd();
+ __m256d pSumB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_sum_24_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ sumR += static_cast<Rpp64f>(*srcPtrTempR++);
+ sumG += static_cast<Rpp64f>(*srcPtrTempG++);
+ sumB += static_cast<Rpp64f>(*srcPtrTempB++);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(sumAvxR, pSumR);
+ _mm256_storeu_pd(sumAvxG, pSumG);
+ _mm256_storeu_pd(sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]);
+#endif
+
+ sum = sumR + sumG + sumB;
+ mean = static_cast<Rpp32f>(sum / (totalPixelsPerChannel * 3));
+ meanR = static_cast<Rpp32f>(sumR / totalPixelsPerChannel);
+ meanG = static_cast<Rpp32f>(sumG / totalPixelsPerChannel);
+ meanB = static_cast<Rpp32f>(sumB / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp64f sumAvxR[4] = {0.0};
+ Rpp64f sumAvxG[4] = {0.0};
+ Rpp64f sumAvxB[4] = {0.0};
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pSumR = _mm256_setzero_pd();
+ __m256d pSumG = _mm256_setzero_pd();
+ __m256d pSumB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp, p);
+ compute_sum_24_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ sumR += static_cast<Rpp64f>(srcPtrTemp[0]);
+ sumG += static_cast<Rpp64f>(srcPtrTemp[1]);
+ sumB += static_cast<Rpp64f>(srcPtrTemp[2]);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(sumAvxR, pSumR);
+ _mm256_storeu_pd(sumAvxG, pSumG);
+ _mm256_storeu_pd(sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]);
+#endif
+ sum = sumR + sumG + sumB;
+ mean = static_cast<Rpp32f>(sum / (totalPixelsPerChannel * 3));
+ meanR = static_cast<Rpp32f>(sumR / totalPixelsPerChannel);
+ meanG = static_cast<Rpp32f>(sumG / totalPixelsPerChannel);
+ meanB = static_cast<Rpp32f>(sumB / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_mean_f16_f32_host(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorMeanArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp16f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
+
+ // Tensor Mean without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = bufferLength & ~(vectorIncrementPerChannel-1);
+ vectorIncrement = 8;
+ Rpp32f mean = 0.0;
+ Rpp64f sum = 0.0;
+ Rpp64f sumAvx[4] = {0.0};
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pSum = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[8];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+ __m256d p1[2];
+ rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp_ps, p1);
+ compute_sum_8_host(p1, &pSum);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ sum += static_cast<Rpp64f>(*srcPtrTemp++);
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(sumAvx, pSum);
+ sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3]);
+#endif
+ mean = static_cast<Rpp32f>(sum / totalPixelsPerChannel);
+ tensorMeanArr[batchCount] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp64f sumAvxR[4] = {0.0};
+ Rpp64f sumAvxG[4] = {0.0};
+ Rpp64f sumAvxB[4] = {0.0};
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256d pSumR = _mm256_setzero_pd();
+ __m256d pSumG = _mm256_setzero_pd();
+ __m256d pSumB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = static_cast<Rpp32f>(srcPtrTempR[cnt]);
+ srcPtrTempG_ps[cnt] = static_cast<Rpp32f>(srcPtrTempG[cnt]);
+ srcPtrTempB_ps[cnt] = static_cast<Rpp32f>(srcPtrTempB[cnt]);
+ }
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p);
+ compute_sum_24_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ sumR += static_cast<Rpp64f>(*srcPtrTempR++);
+ sumG += static_cast<Rpp64f>(*srcPtrTempG++);
+ sumB += static_cast<Rpp64f>(*srcPtrTempB++);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(sumAvxR, pSumR);
+ _mm256_storeu_pd(sumAvxG, pSumG);
+ _mm256_storeu_pd(sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]);
+#endif
+ sum = sumR + sumG + sumB;
+ mean = static_cast<Rpp32f>(sum / (totalPixelsPerChannel * 3));
+ meanR = static_cast<Rpp32f>(sumR / totalPixelsPerChannel);
+ meanG = static_cast<Rpp32f>(sumG / totalPixelsPerChannel);
+ meanB = static_cast<Rpp32f>(sumB / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp64f sumAvxR[4] = {0.0};
+ Rpp64f sumAvxG[4] = {0.0};
+ Rpp64f sumAvxB[4] = {0.0};
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pSumR = _mm256_setzero_pd();
+ __m256d pSumG = _mm256_setzero_pd();
+ __m256d pSumB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[24];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp_ps, p);
+ compute_sum_24_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ sumR += static_cast<Rpp64f>(srcPtrTemp[0]);
+ sumG += static_cast<Rpp64f>(srcPtrTemp[1]);
+ sumB += static_cast<Rpp64f>(srcPtrTemp[2]);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(sumAvxR, pSumR);
+ _mm256_storeu_pd(sumAvxG, pSumG);
+ _mm256_storeu_pd(sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]);
+#endif
+ sum = sumR + sumG + sumB;
+ mean = static_cast<Rpp32f>(sum / (totalPixelsPerChannel * 3));
+ meanR = static_cast<Rpp32f>(sumR / totalPixelsPerChannel);
+ meanG = static_cast<Rpp32f>(sumG / totalPixelsPerChannel);
+ meanB = static_cast<Rpp32f>(sumB / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_mean_i8_f32_host(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorMeanArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp8s *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
+
+ // Tensor Mean without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = bufferLength & ~15;
+ vectorIncrement = 16;
+ Rpp32f mean = 0.0;
+ Rpp32s sum = 0;
+ Rpp32s sumAvx[8] = {0};
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pSum = avx_px0;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1[2];
+ rpp_simd_load(rpp_load16_i8_to_i32_avx, srcPtrTemp, p1);
+ compute_sum_16_host(p1, &pSum);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ sum += static_cast<Rpp32s>(*srcPtrTemp++);
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_store_si256((__m256i *)sumAvx, pSum);
+ sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3] + sumAvx[4] + sumAvx[5] + sumAvx[6] + sumAvx[7]);
+#endif
+ mean = static_cast<Rpp32f>(sum) / totalPixelsPerChannel;
+ tensorMeanArr[batchCount] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64s sum;
+ Rpp32s sumR = 0, sumG = 0, sumB = 0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp32s sumAvxR[8] = {0};
+ Rpp32s sumAvxG[8] = {0};
+ Rpp32s sumAvxB[8] = {0};
+
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pSumR = avx_px0;
+ __m256i pSumG = avx_px0;
+ __m256i pSumB = avx_px0;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[6];
+ rpp_simd_load(rpp_load48_i8pln3_to_i32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_sum_48_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ sumR += static_cast<Rpp32s>(*srcPtrTempR++);
+ sumG += static_cast<Rpp32s>(*srcPtrTempG++);
+ sumB += static_cast<Rpp32s>(*srcPtrTempB++);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_store_si256((__m256i *)sumAvxR, pSumR);
+ _mm256_store_si256((__m256i *)sumAvxG, pSumG);
+ _mm256_store_si256((__m256i *)sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]);
+#endif
+
+ sum = static_cast<Rpp64s>(sumR) + static_cast<Rpp64s>(sumG) + static_cast<Rpp64s>(sumB);
+ mean = (static_cast<Rpp32f>(sum) / (totalPixelsPerChannel * 3));
+ meanR = (static_cast<Rpp32f>(sumR) / totalPixelsPerChannel);
+ meanG = (static_cast<Rpp32f>(sumG) / totalPixelsPerChannel);
+ meanB = (static_cast<Rpp32f>(sumB) / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+
+ // Tensor Mean without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64s sum;
+ Rpp32s sumR = 0, sumG = 0, sumB = 0;
+ Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0;
+ Rpp32s sumAvxR[8] = {0};
+ Rpp32s sumAvxG[8] = {0};
+ Rpp32s sumAvxB[8] = {0};
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pSumR = avx_px0;
+ __m256i pSumG = avx_px0;
+ __m256i pSumB = avx_px0;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p[6];
+ rpp_simd_load(rpp_load48_i8pkd3_to_i32pln3_avx, srcPtrTemp, p);
+ compute_sum_48_host(p, &pSumR, &pSumG, &pSumB);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ sumR += static_cast<Rpp32s>(srcPtrTemp[0]);
+ sumG += static_cast<Rpp32s>(srcPtrTemp[1]);
+ sumB += static_cast<Rpp32s>(srcPtrTemp[2]);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_store_si256((__m256i *)sumAvxR, pSumR);
+ _mm256_store_si256((__m256i *)sumAvxG, pSumG);
+ _mm256_store_si256((__m256i *)sumAvxB, pSumB);
+ sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]);
+ sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]);
+ sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]);
+#endif
+ sum = static_cast<Rpp64s>(sumR) + static_cast<Rpp64s>(sumG) + static_cast<Rpp64s>(sumB);
+ mean = (static_cast<Rpp32f>(sum) / (totalPixelsPerChannel * 3));
+ meanR = (static_cast<Rpp32f>(sumR) / totalPixelsPerChannel);
+ meanG = (static_cast<Rpp32f>(sumG) / totalPixelsPerChannel);
+ meanB = (static_cast<Rpp32f>(sumB) / totalPixelsPerChannel);
+ tensorMeanArr[idx] = meanR;
+ tensorMeanArr[idx + 1] = meanG;
+ tensorMeanArr[idx + 2] = meanB;
+ tensorMeanArr[idx + 3] = mean;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/tensor_stddev.hpp b/src/modules/cpu/kernel/tensor_stddev.hpp
new file mode 100644
index 000000000..2f64e93ab
--- /dev/null
+++ b/src/modules/cpu/kernel/tensor_stddev.hpp
@@ -0,0 +1,1092 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+#include "reduction.hpp"
+
+RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorStddevArr,
+ Rpp32f *meanTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp8u *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
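+ // stddev outputs follow the meanTensor layout - 4 values per image (R, G, B, overall image) for
+ // 3-channel inputs, and a single value per image for 1-channel inputs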
+
+ // Tensor Stddev without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = bufferLength & ~(vectorIncrementPerChannel-1);
+ Rpp64f var = 0.0;
+ Rpp32f stddev = 0.0;
+ Rpp64f varAvx[4] = {0.0};
+ Rpp32f mean = meanTensor[batchCount];
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMean = _mm256_set1_pd(mean);
+ __m256d pVar = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256d p1[2];
+ rpp_simd_load(rpp_load8_u8_to_f64_avx, srcPtrTemp, p1);
+ compute_variance_8_host(p1, &pMean, &pVar);
+
+ srcPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ var += (static_cast<Rpp64f>(*srcPtrTemp) - mean) * (static_cast<Rpp64f>(*srcPtrTemp) - mean);
+ srcPtrTemp++;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvx, pVar);
+ var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]);
+#endif
+ stddev = sqrt(var / totalPixelsPerChannel);
+ tensorStddevArr[batchCount] = static_cast<Rpp32f>(stddev);
+ }
+
+ // Tensor Stddev without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx];
+ Rpp32f meanG = meanTensor[idx + 1];
+ Rpp32f meanB = meanTensor[idx + 2];
+ Rpp32f meanImage = meanTensor[idx + 3];
+
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_u8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(*srcPtrTempR);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(*srcPtrTempG);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(*srcPtrTempB);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)));
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel));
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel));
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel));
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+
+ // Tensor Stddev without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx];
+ Rpp32f meanG = meanTensor[idx + 1];
+ Rpp32f meanB = meanTensor[idx + 2];
+ Rpp32f meanImage = meanTensor[idx + 3];
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_u8pkd3_to_f64pln3_avx, srcPtrTemp, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
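+ // scalar tail for packed RGB data: advance 3 interleaved values per pixel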
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(srcPtrTemp[0]);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(srcPtrTemp[1]);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(srcPtrTemp[2]);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)));
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel));
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel));
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel));
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorStddevArr,
+ Rpp32f *meanTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp32f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
+
+ // Tensor Stddev without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = bufferLength & ~(vectorIncrementPerChannel-1);
+ Rpp64f var = 0.0;
+ Rpp32f stddev = 0.0;
+ Rpp64f varAvx[4] = {0.0};
+ Rpp32f mean = meanTensor[batchCount];
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMean = _mm256_set1_pd(mean);
+ __m256d pVar = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256d p1[2];
+ rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp, p1);
+ compute_variance_8_host(p1, &pMean, &pVar);
+ srcPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ var += (static_cast<Rpp64f>(*srcPtrTemp) - mean) * (static_cast<Rpp64f>(*srcPtrTemp) - mean);
+ srcPtrTemp++;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvx, pVar);
+ var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]);
+#endif
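+ // F32 inputs are expected in the 0-1 range (see header docs), so stddev is scaled by 255 to report it on the 0-255 pixel scale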
+ stddev = sqrt(var / totalPixelsPerChannel) * 255;
+ tensorStddevArr[batchCount] = static_cast<Rpp32f>(stddev);
+ }
+
+ // Tensor Stddev without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx];
+ Rpp32f meanG = meanTensor[idx + 1];
+ Rpp32f meanB = meanTensor[idx + 2];
+ Rpp32f meanImage = meanTensor[idx + 3];
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(*srcPtrTempR);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(*srcPtrTempG);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(*srcPtrTempB);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255); // scale stddev of 0-1 range F32 inputs back to the 0-255 pixel scale
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel) * 255);
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel) * 255);
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel) * 255);
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+
+ // Tensor Stddev without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx];
+ Rpp32f meanG = meanTensor[idx + 1];
+ Rpp32f meanB = meanTensor[idx + 2];
+ Rpp32f meanImage = meanTensor[idx + 3];
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(srcPtrTemp[0]);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(srcPtrTemp[1]);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(srcPtrTemp[2]);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255);
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel) * 255);
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel) * 255);
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel) * 255);
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorStddevArr,
+ Rpp32f *meanTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp16f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
+
+ // Tensor Stddev without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = bufferLength & ~(vectorIncrementPerChannel-1);
+ Rpp64f var = 0.0;
+ Rpp32f stddev = 0.0;
+ Rpp64f varAvx[4] = {0.0};
+ Rpp32f mean = meanTensor[batchCount];
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMean = _mm256_set1_pd(mean);
+ __m256d pVar = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtrTemp_ps[8];
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+
+ __m256d p1[2];
+ rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp_ps, p1);
+ compute_variance_8_host(p1, &pMean, &pVar);
+
+ srcPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ var += (static_cast<Rpp64f>(*srcPtrTemp) - mean) * (static_cast<Rpp64f>(*srcPtrTemp) - mean);
+ srcPtrTemp++;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvx, pVar);
+ var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]);
+#endif
+ stddev = sqrt(var / totalPixelsPerChannel) * 255;
+ tensorStddevArr[batchCount] = static_cast<Rpp32f>(stddev);
+ }
+
+ // Tensor Stddev without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx];
+ Rpp32f meanG = meanTensor[idx + 1];
+ Rpp32f meanB = meanTensor[idx + 2];
+ Rpp32f meanImage = meanTensor[idx + 3];
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = static_cast<Rpp32f>(srcPtrTempR[cnt]);
+ srcPtrTempG_ps[cnt] = static_cast<Rpp32f>(srcPtrTempG[cnt]);
+ srcPtrTempB_ps[cnt] = static_cast<Rpp32f>(srcPtrTempB[cnt]);
+ }
+
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(*srcPtrTempR);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(*srcPtrTempG);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(*srcPtrTempB);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255); // scale stddev of 0-1 range F16 inputs back to the 0-255 pixel scale
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel) * 255);
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel) * 255);
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel) * 255);
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+
+ // Tensor Stddev without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx];
+ Rpp32f meanG = meanTensor[idx + 1];
+ Rpp32f meanB = meanTensor[idx + 2];
+ Rpp32f meanImage = meanTensor[idx + 3];
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[24];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp_ps, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(srcPtrTemp[0]);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(srcPtrTemp[1]);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(srcPtrTemp[2]);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255);
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel) * 255);
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel) * 255);
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel) * 255);
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorStddevArr,
+ Rpp32f *meanTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp8s *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight;
+ int idx = batchCount * 4;
+
+ // Tensor Stddev without fused output-layout toggle (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = bufferLength & ~(vectorIncrementPerChannel-1);
+ Rpp64f var = 0.0;
+ Rpp32f stddev = 0.0;
+ Rpp64f varAvx[4] = {0.0};
+ Rpp32f mean = meanTensor[batchCount] + 128;
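+ // offset mean by +128 to pair with the +128 added to each I8 sample below; a constant shift leaves the variance unchanged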
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMean = _mm256_set1_pd(mean);
+ __m256d pVar = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256d p1[2];
+ rpp_simd_load(rpp_load8_i8_to_f64_avx, srcPtrTemp, p1);
+ compute_variance_8_host(p1, &pMean, &pVar);
+
+ srcPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ var += (static_cast<Rpp64f>(*srcPtrTemp + 128) - mean) * (static_cast<Rpp64f>(*srcPtrTemp + 128) - mean);
+ srcPtrTemp++;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvx, pVar);
+ var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]);
+#endif
+ stddev = sqrt(var / totalPixelsPerChannel);
+ tensorStddevArr[batchCount] = static_cast<Rpp32f>(stddev);
+ }
+
+ // Tensor Stddev without fused output-layout toggle 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx] + 128;
+ Rpp32f meanG = meanTensor[idx + 1] + 128;
+ Rpp32f meanB = meanTensor[idx + 2] + 128;
+ Rpp32f meanImage = meanTensor[idx + 3] + 128;
+
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_i8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(*srcPtrTempR + 128);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(*srcPtrTempG + 128);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(*srcPtrTempB + 128);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)));
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel));
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel));
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel));
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+
+ // Tensor Stddev without fused output-layout toggle (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage;
+ Rpp32f stddevImage, stddevR, stddevG, stddevB;
+ Rpp64f varAvxR[4] = {0.0};
+ Rpp64f varAvxG[4] = {0.0};
+ Rpp64f varAvxB[4] = {0.0};
+ Rpp64f varAvxImageR[4] = {0.0};
+ Rpp64f varAvxImageG[4] = {0.0};
+ Rpp64f varAvxImageB[4] = {0.0};
+ varR = varG = varB = varImageR = varImageG = varImageB = 0.0;
+
+ Rpp32f meanR = meanTensor[idx] + 128;
+ Rpp32f meanG = meanTensor[idx + 1] + 128;
+ Rpp32f meanB = meanTensor[idx + 2] + 128;
+ Rpp32f meanImage = meanTensor[idx + 3] + 128;
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256d pMeanR = _mm256_set1_pd(meanR);
+ __m256d pMeanG = _mm256_set1_pd(meanG);
+ __m256d pMeanB = _mm256_set1_pd(meanB);
+ __m256d pMeanImage = _mm256_set1_pd(meanImage);
+ __m256d pVarR, pVarG, pVarB;
+ __m256d pVarImageR, pVarImageG, pVarImageB;
+ pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256d p[6];
+ rpp_simd_load(rpp_load24_i8pkd3_to_f64pln3_avx, srcPtrTemp, p);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ Rpp64f srcPtrR = static_cast<Rpp64f>(srcPtrTemp[0] + 128);
+ Rpp64f srcPtrG = static_cast<Rpp64f>(srcPtrTemp[1] + 128);
+ Rpp64f srcPtrB = static_cast<Rpp64f>(srcPtrTemp[2] + 128);
+ varR += (srcPtrR - meanR) * (srcPtrR - meanR);
+ varG += (srcPtrG - meanG) * (srcPtrG - meanG);
+ varB += (srcPtrB - meanB) * (srcPtrB - meanB);
+ varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage);
+ varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage);
+ varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ _mm256_storeu_pd(varAvxR, pVarR);
+ _mm256_storeu_pd(varAvxG, pVarG);
+ _mm256_storeu_pd(varAvxB, pVarB);
+ _mm256_storeu_pd(varAvxImageR, pVarImageR);
+ _mm256_storeu_pd(varAvxImageG, pVarImageG);
+ _mm256_storeu_pd(varAvxImageB, pVarImageB);
+
+ varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]);
+ varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]);
+ varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]);
+ varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]);
+ varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]);
+ varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]);
+#endif
+ varImage = varImageR + varImageG + varImageB;
+ stddevImage = static_cast<Rpp32f>(sqrt(varImage / (totalPixelsPerChannel * 3)));
+ stddevR = static_cast<Rpp32f>(sqrt(varR / totalPixelsPerChannel));
+ stddevG = static_cast<Rpp32f>(sqrt(varG / totalPixelsPerChannel));
+ stddevB = static_cast<Rpp32f>(sqrt(varB / totalPixelsPerChannel));
+ tensorStddevArr[idx] = stddevR;
+ tensorStddevArr[idx + 1] = stddevG;
+ tensorStddevArr[idx + 2] = stddevB;
+ tensorStddevArr[idx + 3] = stddevImage;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp
index 6923b9a3f..a0f50ee7e 100644
--- a/src/modules/hip/hip_tensor_statistical_operations.hpp
+++ b/src/modules/hip/hip_tensor_statistical_operations.hpp
@@ -27,6 +27,8 @@ SOFTWARE.
#include "kernel/tensor_sum.hpp"
#include "kernel/tensor_min.hpp"
#include "kernel/tensor_max.hpp"
+#include "kernel/tensor_mean.hpp"
+#include "kernel/tensor_stddev.hpp"
#include "kernel/normalize.hpp"
#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/tensor_mean.hpp b/src/modules/hip/kernel/tensor_mean.hpp
new file mode 100644
index 000000000..4b2c81155
--- /dev/null
+++ b/src/modules/hip/kernel/tensor_mean.hpp
@@ -0,0 +1,227 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+#include "reduction.hpp"
+
+// -------------------- Set 0 - Reduction Stage 2 --------------------
+template <typename T>
+__global__ void tensor_mean_grid_result_hip(T *srcPtr,
+ uint xBufferLength,
+ float *dstPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialSum_smem[1024]; // 8192 floats of src reduced to 1024 in a 1024 x 1 thread block
+ partialSum_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0 using all 1024 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ int xDiff = xBufferLength - (xBufferLength & ~7); // difference between bufferLength and alignedLength, where alignedLength = bufferLength & ~7
+ uint srcIdx = (id_z * xBufferLength) + id_x;
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+
+ if (id_x + 8 > xBufferLength)
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialSum_smem[hipThreadIdx_x] += (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 1024 x 1 threads and store in Shared
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 1024 floats on 1024 threads per block in x dimension
+ for (int threadMax = 512; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int totalElements = roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiHeight * roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiWidth;
+ dstPtr[hipBlockIdx_z] = partialSum_smem[0] / totalElements;
+ }
+}
+
+template <typename T>
+__global__ void tensor_mean_grid_3channel_result_hip(T *srcPtr,
+ uint xBufferLength,
+ float *dstPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialRSum_smem[1024]; // 8192 floats of src reduced to 1024 in a 1024 x 1 thread block
+ __shared__ float partialGSum_smem[1024];
+ __shared__ float partialBSum_smem[1024];
+ partialRSum_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0 using all 1024 x 1 threads
+ partialGSum_smem[hipThreadIdx_x] = 0.0f;
+ partialBSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if (id_x >= xBufferLength)
+ return;
+
+ int xDiff = xBufferLength - (xBufferLength & ~7); // difference between bufferLength and alignedLength, where alignedLength = bufferLength & ~7
+ uint srcIdx = ((id_z * xBufferLength) + id_x) * 3;
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+ if (id_x + 8 > xBufferLength) // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ {
+ for(int i = xDiff; i < 8; i++)
+ {
+ src_f24.f8[0].f1[i] = 0.0f;
+ src_f24.f8[1].f1[i] = 0.0f;
+ src_f24.f8[2].f1[i] = 0.0f;
+ }
+ }
+ src_f24.f8[0].f4[0] += src_f24.f8[0].f4[1]; // perform small work of vectorized float4 addition
+ src_f24.f8[1].f4[0] += src_f24.f8[1].f4[1];
+ src_f24.f8[2].f4[0] += src_f24.f8[2].f4[1];
+ partialRSum_smem[hipThreadIdx_x] = (src_f24.f8[0].f1[0] +
+ src_f24.f8[0].f1[1] +
+ src_f24.f8[0].f1[2] +
+ src_f24.f8[0].f1[3]); // perform small work of reducing R float4s to float using 1024 threads and store in Shared
+ partialGSum_smem[hipThreadIdx_x] = (src_f24.f8[1].f1[0] +
+ src_f24.f8[1].f1[1] +
+ src_f24.f8[1].f1[2] +
+ src_f24.f8[1].f1[3]); // perform small work of reducing G float4s to float using 1024 threads and store in Shared
+ partialBSum_smem[hipThreadIdx_x] = (src_f24.f8[2].f1[0] +
+ src_f24.f8[2].f1[1] +
+ src_f24.f8[2].f1[2] +
+ src_f24.f8[2].f1[3]); // perform small work of reducing B float4s to float using 1024 threads and store in Shared
+
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 1024 floats on 1024 threads per block in x dimension
+ for (int threadMax = 512; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRSum_smem[hipThreadIdx_x] += partialRSum_smem[hipThreadIdx_x + threadMax];
+ partialGSum_smem[hipThreadIdx_x] += partialGSum_smem[hipThreadIdx_x + threadMax];
+ partialBSum_smem[hipThreadIdx_x] += partialBSum_smem[hipThreadIdx_x + threadMax];
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int totalElements = roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiHeight * roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiWidth;
+ float sum = partialRSum_smem[0] + partialGSum_smem[0] + partialBSum_smem[0];
+ int idx = hipBlockIdx_z * 4;
+ dstPtr[idx] = partialRSum_smem[0] / totalElements;
+ dstPtr[idx + 1] = partialGSum_smem[0] / totalElements;
+ dstPtr[idx + 2] = partialBSum_smem[0] / totalElements;
+ dstPtr[idx + 3] = sum / (totalElements * 3);
+ }
+}
+
+// -------------------- Set 1 - Kernel Executors --------------------
+
+template <typename T, typename U>
+RppStatus hip_exec_tensor_mean(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *tensorMeanArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (srcDescPtr->w + 7) >> 3;
+ int globalThreads_y = srcDescPtr->h;
+ int globalThreads_z = srcDescPtr->n;
+ int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X);
+ int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y);
+ int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z);
+
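+ // two-stage reduction: the tensor_sum_* kernels write per-block partial sums into the scratch buffer, then tensor_mean_grid_*_result_hip reduces those partials and divides by the ROI element count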
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u tensorPartialSumArrLength = gridDim_x * gridDim_y * gridDim_z;
+ U *tensorPartialSumArr;
+ tensorPartialSumArr = reinterpret_cast<U *>(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+ hipMemsetAsync(tensorPartialSumArr, 0, tensorPartialSumArrLength * sizeof(U), handle.GetStream());
+ hipLaunchKernelGGL(tensor_sum_pln1_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ tensorPartialSumArr,
+ roiTensorPtrSrc);
+ hipLaunchKernelGGL(tensor_mean_grid_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ tensorPartialSumArr,
+ gridDim_x * gridDim_y,
+ tensorMeanArr,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u tensorPartialSumArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ U *tensorPartialSumArr;
+ tensorPartialSumArr = reinterpret_cast<U *>(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+ hipMemsetAsync(tensorPartialSumArr, 0, tensorPartialSumArrLength * sizeof(U), handle.GetStream());
+ hipLaunchKernelGGL(tensor_sum_pln3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ tensorPartialSumArr,
+ roiTensorPtrSrc);
+ hipLaunchKernelGGL(tensor_mean_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ tensorPartialSumArr,
+ gridDim_x * gridDim_y,
+ tensorMeanArr,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u tensorPartialSumArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ U *tensorPartialSumArr;
+ tensorPartialSumArr = reinterpret_cast<U *>(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+ hipMemsetAsync(tensorPartialSumArr, 0, tensorPartialSumArrLength * sizeof(U), handle.GetStream());
+ hipLaunchKernelGGL(tensor_sum_pkd3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ tensorPartialSumArr,
+ roiTensorPtrSrc);
+ hipLaunchKernelGGL(tensor_mean_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ tensorPartialSumArr,
+ gridDim_x * gridDim_y,
+ tensorMeanArr,
+ roiTensorPtrSrc);
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/kernel/tensor_stddev.hpp b/src/modules/hip/kernel/tensor_stddev.hpp
new file mode 100644
index 000000000..0d4633be4
--- /dev/null
+++ b/src/modules/hip/kernel/tensor_stddev.hpp
@@ -0,0 +1,495 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+#include "reduction.hpp"
+
+// ----------------------- Helper Functions --------------------------
+
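+// the otherwise-unused srcPtr argument selects the overload by source datatype; F32/F16 variants scale by 255 since those inputs are expected in the 0-1 range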
+__device__ __forceinline__ void stddev_hip_compute(uchar *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues); }
+__device__ __forceinline__ void stddev_hip_compute(float *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues) * 255; }
+__device__ __forceinline__ void stddev_hip_compute(signed char *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues); }
+__device__ __forceinline__ void stddev_hip_compute(half *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues) * 255; }
+
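+// computes (value - channel mean) squared for all 24 loaded values, channel by channel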
+__device__ __forceinline__ void mean_subtracted_square_3channel_hip_compute(d_float24 *src_f24, d_float24 *dst_f24,
+ float4 &meanR_f4, float4 &meanG_f4, float4 &meanB_f4)
+{
+ rpp_hip_math_subtract8_const(&src_f24->f8[0], &dst_f24->f8[0], meanR_f4);
+ rpp_hip_math_subtract8_const(&src_f24->f8[1], &dst_f24->f8[1], meanG_f4);
+ rpp_hip_math_subtract8_const(&src_f24->f8[2], &dst_f24->f8[2], meanB_f4);
+ rpp_hip_math_multiply8(&dst_f24->f8[0], &dst_f24->f8[0], &dst_f24->f8[0]);
+ rpp_hip_math_multiply8(&dst_f24->f8[1], &dst_f24->f8[1], &dst_f24->f8[1]);
+ rpp_hip_math_multiply8(&dst_f24->f8[2], &dst_f24->f8[2], &dst_f24->f8[2]);
+}
+
+// perform reduction on shared memory and store the result in output
+__device__ __forceinline__ void reduce_variance_3channel_hip(d_float24 *tempChannelSquared_f24, d_float24 *tempSquared_f24,
+ float *partialRVarianceRowPtr_smem, float *partialGVarianceRowPtr_smem, float *partialBVarianceRowPtr_smem,
+ float *partialTensorVarianceRowPtr_smem, float *dstPtr)
+{
+ // channel wise addition
+ tempChannelSquared_f24->f8[0].f4[0] += tempChannelSquared_f24->f8[0].f4[1]; // perform small work of vectorized float4 addition
+ tempChannelSquared_f24->f8[1].f4[0] += tempChannelSquared_f24->f8[1].f4[1];
+ tempChannelSquared_f24->f8[2].f4[0] += tempChannelSquared_f24->f8[2].f4[1];
+
+ partialRVarianceRowPtr_smem[hipThreadIdx_x] = (tempChannelSquared_f24->f8[0].f1[0] +
+ tempChannelSquared_f24->f8[0].f1[1] +
+ tempChannelSquared_f24->f8[0].f1[2] +
+ tempChannelSquared_f24->f8[0].f1[3]); // perform small work of reducing R float4s to float using 16 x 16 threads and store in _smem
+ partialGVarianceRowPtr_smem[hipThreadIdx_x] = (tempChannelSquared_f24->f8[1].f1[0] +
+ tempChannelSquared_f24->f8[1].f1[1] +
+ tempChannelSquared_f24->f8[1].f1[2] +
+ tempChannelSquared_f24->f8[1].f1[3]); // perform small work of reducing G float4s to float using 16 x 16 threads and store in _smem
+ partialBVarianceRowPtr_smem[hipThreadIdx_x] = (tempChannelSquared_f24->f8[2].f1[0] +
+ tempChannelSquared_f24->f8[2].f1[1] +
+ tempChannelSquared_f24->f8[2].f1[2] +
+ tempChannelSquared_f24->f8[2].f1[3]); // perform small work of reducing B float4s to float using 16 x 16 threads and store in _smem
+
+ // tensor wise addition
+ tempSquared_f24->f8[0].f4[0] += tempSquared_f24->f8[0].f4[1]; // perform small work of vectorized float4 addition
+ tempSquared_f24->f8[1].f4[0] += tempSquared_f24->f8[1].f4[1];
+ tempSquared_f24->f8[2].f4[0] += tempSquared_f24->f8[2].f4[1];
+ tempSquared_f24->f8[0].f4[0] += tempSquared_f24->f8[1].f4[0];
+ tempSquared_f24->f8[0].f4[0] += tempSquared_f24->f8[2].f4[0];
+
+ partialTensorVarianceRowPtr_smem[hipThreadIdx_x] = (tempSquared_f24->f8[0].f1[0] +
+ tempSquared_f24->f8[0].f1[1] +
+ tempSquared_f24->f8[0].f1[2] +
+ tempSquared_f24->f8[0].f1[3]); // perform small work of reducing B float4s to float using 16 x 16 threads and store in _smem
+ __syncthreads(); // syncthreads after _smem load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRVarianceRowPtr_smem[hipThreadIdx_x] += partialRVarianceRowPtr_smem[hipThreadIdx_x + threadMax];
+ partialGVarianceRowPtr_smem[hipThreadIdx_x] += partialGVarianceRowPtr_smem[hipThreadIdx_x + threadMax];
+ partialBVarianceRowPtr_smem[hipThreadIdx_x] += partialBVarianceRowPtr_smem[hipThreadIdx_x + threadMax];
+ partialTensorVarianceRowPtr_smem[hipThreadIdx_x] += partialTensorVarianceRowPtr_smem[hipThreadIdx_x + threadMax];
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRVarianceRowPtr_smem[0] += partialRVarianceRowPtr_smem[increment];
+ partialGVarianceRowPtr_smem[0] += partialGVarianceRowPtr_smem[increment];
+ partialBVarianceRowPtr_smem[0] += partialBVarianceRowPtr_smem[increment];
+ partialTensorVarianceRowPtr_smem[0] += partialTensorVarianceRowPtr_smem[increment];
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 4;
+ dstPtr[idx] = partialRVarianceRowPtr_smem[0];
+ dstPtr[idx + 1] = partialGVarianceRowPtr_smem[0];
+ dstPtr[idx + 2] = partialBVarianceRowPtr_smem[0];
+ dstPtr[idx + 3] = partialTensorVarianceRowPtr_smem[0];
+ }
+ }
+}
+
+// -------------------- Set 0 - Reduction Stage 2 --------------------
+
+template <typename T>
+__global__ void tensor_stddev_grid_result_hip(T *inputSrcPtr,
+ float *srcPtr,
+ uint xBufferLength,
+ float *dstPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialVariance_smem[1024]; // 8192 floats of src reduced to 1024 in a 1024 x 1 thread block
+ partialVariance_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 1024 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ int xDiff = xBufferLength - (xBufferLength & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7
+ uint srcIdx = (id_z * xBufferLength) + id_x;
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ if (id_x + 8 > xBufferLength)
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialVariance_smem[hipThreadIdx_x] += (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 1024 x 1 threads and store in _smem
+ __syncthreads(); // syncthreads after _smem load
+
+ // Reduction of 1024 floats on 1024 threads per block in x dimension
+ for (int threadMax = 512; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialVariance_smem[hipThreadIdx_x] += partialVariance_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int totalElements = roiTensorPtrSrc[id_z].xywhROI.roiHeight * roiTensorPtrSrc[id_z].xywhROI.roiWidth;
+ stddev_hip_compute(inputSrcPtr, &partialVariance_smem[0] , &dstPtr[id_z], totalElements);
+ }
+}
+
+template <typename T>
+__global__ void tensor_stddev_grid_3channel_result_hip(T *inputSrcPtr,
+ float *srcPtr,
+ uint xBufferLength,
+ float *dstPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = hipThreadIdx_x;
+ int id_z = hipBlockIdx_z;
+
+ /* Stores per-channel variances (computed from the channel means), used to derive the per-channel and whole-image stddev */
+ __shared__ float partialRVariance_smem[1024]; // stage-1 partial variances accumulated into 1024 shared-memory slots by a 1024 x 1 thread block
+ __shared__ float partialGVariance_smem[1024];
+ __shared__ float partialBVariance_smem[1024];
+ __shared__ float partialTensorVariance_smem[1024];
+ partialRVariance_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 1024 x 1 threads
+ partialGVariance_smem[hipThreadIdx_x] = 0.0f;
+ partialBVariance_smem[hipThreadIdx_x] = 0.0f;
+ partialTensorVariance_smem[hipThreadIdx_x] = 0.0f;
+
+ if (id_x >= xBufferLength)
+ return;
+
+ float4 accum_f4 = static_cast<float4>(0.0f);
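+ // stride across all stage-1 partials for this image, accumulating the R/G/B/whole-image variance sums as float4s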
+ while (id_x < xBufferLength)
+ {
+ uint srcIdx = ((id_z * xBufferLength) + id_x) * 4;
+ float4 temp_f4 = *(float4 *)(srcPtr + srcIdx);
+ accum_f4 += temp_f4;
+ id_x += hipBlockDim_x;
+ }
+
+ partialRVariance_smem[hipThreadIdx_x] = accum_f4.x;
+ partialGVariance_smem[hipThreadIdx_x] = accum_f4.y;
+ partialBVariance_smem[hipThreadIdx_x] = accum_f4.z;
+ partialTensorVariance_smem[hipThreadIdx_x] = accum_f4.w;
+ __syncthreads(); // syncthreads after _smem load
+
+ // Reduction of 1024 floats on 1024 threads per block in x dimension
+ for (int threadMax = 512; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRVariance_smem[hipThreadIdx_x] += partialRVariance_smem[hipThreadIdx_x + threadMax];
+ partialGVariance_smem[hipThreadIdx_x] += partialGVariance_smem[hipThreadIdx_x + threadMax];
+ partialBVariance_smem[hipThreadIdx_x] += partialBVariance_smem[hipThreadIdx_x + threadMax];
+ partialTensorVariance_smem[hipThreadIdx_x] += partialTensorVariance_smem[hipThreadIdx_x + threadMax];
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int totalElements = roiTensorPtrSrc[id_z].xywhROI.roiHeight * roiTensorPtrSrc[id_z].xywhROI.roiWidth;
+ uint dstIdx = id_z * 4;
+ stddev_hip_compute(inputSrcPtr, &partialRVariance_smem[0], &dstPtr[dstIdx], totalElements);
+ stddev_hip_compute(inputSrcPtr, &partialGVariance_smem[0], &dstPtr[dstIdx + 1], totalElements);
+ stddev_hip_compute(inputSrcPtr, &partialBVariance_smem[0], &dstPtr[dstIdx + 2], totalElements);
+ stddev_hip_compute(inputSrcPtr, &partialTensorVariance_smem[0], &dstPtr[dstIdx + 3], totalElements * 3);
+ }
+}
+
+// -------------------- Set 1 - Reduction Stage 1 --------------------
+
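+// Stage 1: each thread block accumulates the sum of squared deviations from the precomputed mean over its tile and writes per-block partial variance sums to the scratch buffer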
+template <typename T>
+__global__ void tensor_variance_pln1_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *tensorVarianceArr,
+ Rpp32f *mean,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ float4 mean_f4 = static_cast<float4>(mean[id_z]);
+
+ __shared__ float partialVariance_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ float *partialVarianceRowPtr_smem = &partialVariance_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in _smem
+ partialVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ int xDiff = roiTensorPtrSrc[id_z].xywhROI.roiWidth - (roiTensorPtrSrc[id_z].xywhROI.roiWidth & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ d_float8 src_f8, temp_f8, tempSquared_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_subtract8_const(&src_f8, &temp_f8, mean_f4); // subtract mean from each pixel
+ rpp_hip_math_multiply8(&temp_f8, &temp_f8, &tempSquared_f8); // square the temporary values
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ for(int i = xDiff; i < 8; i++)
+ tempSquared_f8.f1[i] = 0.0f;
+ tempSquared_f8.f4[0] += tempSquared_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialVarianceRowPtr_smem[hipThreadIdx_x] = (tempSquared_f8.f1[0] +
+ tempSquared_f8.f1[1] +
+ tempSquared_f8.f1[2] +
+ tempSquared_f8.f1[3]);
+ __syncthreads(); // syncthreads after _smem load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialVarianceRowPtr_smem[hipThreadIdx_x] += partialVarianceRowPtr_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialVarianceRowPtr_smem[0] += partialVarianceRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ tensorVarianceArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialVarianceRowPtr_smem[0];
+ }
+}
+
+template <typename T>
+__global__ void tensor_variance_pln3_hip(T *srcPtr,
+ uint3 srcStridesNCH,
+ float *tensorVarianceArr,
+ float4 *meanPtr_f4,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ /* Stores per-channel variances (computed from the per-channel means) used to derive the stddev of each channel */
+ __shared__ float partialRVariance_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialGVariance_smem[16][16];
+ __shared__ float partialBVariance_smem[16][16];
+ __shared__ float partialTensorVariance_smem[16][16];
+ float *partialRVarianceRowPtr_smem = &partialRVariance_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in _smem
+ float *partialGVarianceRowPtr_smem = &partialGVariance_smem[hipThreadIdx_y][0];
+ float *partialBVarianceRowPtr_smem = &partialBVariance_smem[hipThreadIdx_y][0];
+ float *partialTensorVarianceRowPtr_smem = &partialTensorVariance_smem[hipThreadIdx_y][0];
+ partialRVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 16 x 16 threads
+ partialGVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f;
+ partialBVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f;
+ partialTensorVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f;
+ float4 meanR_f4 = static_cast<float4>(meanPtr_f4[id_z].x);
+ float4 meanG_f4 = static_cast<float4>(meanPtr_f4[id_z].y);
+ float4 meanB_f4 = static_cast<float4>(meanPtr_f4[id_z].z);
+ float4 meanTensor_f4 = static_cast<float4>(meanPtr_f4[id_z].w);
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ int xDiff = roiTensorPtrSrc[id_z].xywhROI.roiWidth - (roiTensorPtrSrc[id_z].xywhROI.roiWidth & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ d_float24 src_f24, tempChannelSquared_f24;
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); // load 24 pixels to local memory
+ mean_subtracted_square_3channel_hip_compute(&src_f24, &tempChannelSquared_f24, meanR_f4, meanG_f4, meanB_f4);
+
+ d_float24 tempSquared_f24;
+ mean_subtracted_square_3channel_hip_compute(&src_f24, &tempSquared_f24, meanTensor_f4, meanTensor_f4, meanTensor_f4);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ {
+ for(int i = xDiff; i < 8; i++)
+ {
+ tempChannelSquared_f24.f8[0].f1[i] = 0.0f;
+ tempChannelSquared_f24.f8[1].f1[i] = 0.0f;
+ tempChannelSquared_f24.f8[2].f1[i] = 0.0f;
+ tempSquared_f24.f8[0].f1[i] = 0.0f;
+ tempSquared_f24.f8[1].f1[i] = 0.0f;
+ tempSquared_f24.f8[2].f1[i] = 0.0f;
+ }
+ }
+
+ reduce_variance_3channel_hip(&tempChannelSquared_f24, &tempSquared_f24,
+ partialRVarianceRowPtr_smem, partialGVarianceRowPtr_smem,
+ partialBVarianceRowPtr_smem, partialTensorVarianceRowPtr_smem, tensorVarianceArr);
+}
+
+template <typename T>
+__global__ void tensor_variance_pkd3_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *tensorVarianceArr,
+ float4 *meanPtr_f4,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRVariance_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialGVariance_smem[16][16];
+ __shared__ float partialBVariance_smem[16][16];
+ __shared__ float partialTensorVariance_smem[16][16];
+ float *partialRVarianceRowPtr_smem = &partialRVariance_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in _smem
+ float *partialGVarianceRowPtr_smem = &partialGVariance_smem[hipThreadIdx_y][0];
+ float *partialBVarianceRowPtr_smem = &partialBVariance_smem[hipThreadIdx_y][0];
+ float *partialTensorVarianceRowPtr_smem = &partialTensorVariance_smem[hipThreadIdx_y][0];
+ partialRVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 16 x 16 threads
+ partialGVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f;
+ partialBVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f;
+ partialTensorVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f;
+ float4 meanR_f4 = static_cast<float4>(meanPtr_f4[id_z].x);
+ float4 meanG_f4 = static_cast<float4>(meanPtr_f4[id_z].y);
+ float4 meanB_f4 = static_cast<float4>(meanPtr_f4[id_z].z);
+ float4 meanTensor_f4 = static_cast<float4>(meanPtr_f4[id_z].w);
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ int xDiff = roiTensorPtrSrc[id_z].xywhROI.roiWidth - (roiTensorPtrSrc[id_z].xywhROI.roiWidth & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+
+ d_float24 src_f24, tempChannelSquared_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+ mean_subtracted_square_3channel_hip_compute(&src_f24, &tempChannelSquared_f24, meanR_f4, meanG_f4, meanB_f4);
+
+ d_float24 tempSquared_f24;
+ mean_subtracted_square_3channel_hip_compute(&src_f24, &tempSquared_f24, meanTensor_f4, meanTensor_f4, meanTensor_f4);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ {
+ for(int i = xDiff; i < 8; i++)
+ {
+ tempChannelSquared_f24.f8[0].f1[i] = 0.0f;
+ tempChannelSquared_f24.f8[1].f1[i] = 0.0f;
+ tempChannelSquared_f24.f8[2].f1[i] = 0.0f;
+ tempSquared_f24.f8[0].f1[i] = 0.0f;
+ tempSquared_f24.f8[1].f1[i] = 0.0f;
+ tempSquared_f24.f8[2].f1[i] = 0.0f;
+ }
+ }
+
+ reduce_variance_3channel_hip(&tempChannelSquared_f24, &tempSquared_f24,
+ partialRVarianceRowPtr_smem, partialGVarianceRowPtr_smem,
+ partialBVarianceRowPtr_smem, partialTensorVarianceRowPtr_smem, tensorVarianceArr);
+}
+
+// -------------------- Set 2 - Kernel Executors --------------------
+
+template <typename T>
+RppStatus hip_exec_tensor_stddev(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *imageStddevArr,
+ Rpp32f *meanTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (srcDescPtr->w + 7) >> 3;
+ int globalThreads_y = srcDescPtr->h;
+ int globalThreads_z = srcDescPtr->n;
+ int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X);
+ int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y);
+ int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z);
+
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u tensorPartialVarArrLength = gridDim_x * gridDim_y * gridDim_z;
+ float *tensorPartialVarArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ hipMemsetAsync(tensorPartialVarArr, 0, tensorPartialVarArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_variance_pln1_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ tensorPartialVarArr,
+ meanTensor,
+ roiTensorPtrSrc);
+ hipLaunchKernelGGL(tensor_stddev_grid_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ tensorPartialVarArr,
+ gridDim_x * gridDim_y,
+ imageStddevArr,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u tensorPartialVarArrLength = gridDim_x * gridDim_y * gridDim_z * 4;
+ float *tensorPartialVarArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ hipMemsetAsync(tensorPartialVarArr, 0, tensorPartialVarArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_variance_pln3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ tensorPartialVarArr,
+ reinterpret_cast<float4 *>(meanTensor),
+ roiTensorPtrSrc);
+ hipLaunchKernelGGL(tensor_stddev_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ tensorPartialVarArr,
+ gridDim_x * gridDim_y,
+ imageStddevArr,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u tensorPartialVarArrLength = gridDim_x * gridDim_y * gridDim_z * 4;
+ float *tensorPartialVarArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ hipMemsetAsync(tensorPartialVarArr, 0, tensorPartialVarArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_variance_pkd3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ tensorPartialVarArr,
+ reinterpret_cast<float4 *>(meanTensor),
+ roiTensorPtrSrc);
+ hipLaunchKernelGGL(tensor_stddev_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ tensorPartialVarArr,
+ gridDim_x * gridDim_y,
+ imageStddevArr,
+ roiTensorPtrSrc);
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp
index ef69a49bb..8d528663c 100644
--- a/src/modules/rppt_tensor_statistical_operations.cpp
+++ b/src/modules/rppt_tensor_statistical_operations.cpp
@@ -336,6 +336,161 @@ RppStatus rppt_normalize_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** tensor_mean ********************/
+
+RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t tensorMeanArr,
+ Rpp32u tensorMeanArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if ((srcDescPtr->c == 1 && tensorMeanArrLength < srcDescPtr->n) || // Mean of single channel
+ (srcDescPtr->c == 3 && tensorMeanArrLength < srcDescPtr->n * 4)) // Mean of each channel, and total mean of all 3 channels / image
+ return RPP_ERROR_NOT_ENOUGH_MEMORY;
+ if (roiType == RpptRoiType::XYWH)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH)|| (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+ else if (roiType == RpptRoiType::LTRB)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ tensor_mean_u8_f32_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ tensor_mean_f16_f32_host(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ tensor_mean_f32_f32_host(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ tensor_mean_i8_f32_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** tensor_stddev ********************/
+
+RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t tensorStddevArr,
+ Rpp32u tensorStddevArrLength,
+ Rpp32f *meanTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if ((srcDescPtr->c == 1 && tensorStddevArrLength < srcDescPtr->n) || // Stddev of single channel
+ (srcDescPtr->c == 3 && tensorStddevArrLength < srcDescPtr->n * 4)) // Stddev of each channel, and total Stddev of all 3 channels / image
+ return RPP_ERROR_NOT_ENOUGH_MEMORY;
+ if (roiType == RpptRoiType::XYWH)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH) || (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+ else if (roiType == RpptRoiType::LTRB)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ tensor_stddev_u8_f32_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ tensor_stddev_f16_f32_host(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ tensor_stddev_f32_f32_host(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ tensor_stddev_i8_f32_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -629,4 +784,157 @@ RppStatus rppt_normalize_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** tensor_mean ********************/
+
+RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t tensorMeanArr,
+ Rpp32u tensorMeanArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcDescPtr->c == 1 && tensorMeanArrLength < srcDescPtr->n) || // Mean of single channel
+ (srcDescPtr->c == 3 && tensorMeanArrLength < srcDescPtr->n * 4)) // Mean of each channel, and total mean of all 3 channels / image
+ return RPP_ERROR_NOT_ENOUGH_MEMORY;
+ if (roiType == RpptRoiType::XYWH)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH) || (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+ else if (roiType == RpptRoiType::LTRB)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ hip_exec_tensor_mean(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ hip_exec_tensor_mean(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ hip_exec_tensor_mean(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ hip_exec_tensor_mean(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorMeanArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** tensor_stddev ********************/
+
+RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t tensorStddevArr,
+ Rpp32u tensorStddevArrLength,
+ Rpp32f *meanTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcDescPtr->c == 1 && tensorStddevArrLength < srcDescPtr->n) || // Stddev of single channel
+ (srcDescPtr->c == 3 && tensorStddevArrLength < srcDescPtr->n * 4)) // Stddev of each channel, and total Stddev of all 3 channels / image
+ return RPP_ERROR_NOT_ENOUGH_MEMORY;
+ if (roiType == RpptRoiType::XYWH)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH) || (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+ else if (roiType == RpptRoiType::LTRB)
+ {
+ for(int i = 0; i < srcDescPtr->n; i++)
+ {
+ if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM))
+ return RPP_ERROR_HIGH_SRC_DIMENSION;
+ }
+ }
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ hip_exec_tensor_stddev(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ hip_exec_tensor_stddev(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ hip_exec_tensor_stddev(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ hip_exec_tensor_stddev(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp32f*>(tensorStddevArr),
+ meanTensor,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index 62f777ae8..0d8b7fd7c 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -68,7 +68,7 @@ int main(int argc, char **argv)
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
bool randomOutputCase = (testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79);
- bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
+ bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91);
bool noiseTypeCase = (testCase == 8);
bool pln1OutTypeCase = (testCase == 86);
@@ -86,7 +86,7 @@ int main(int argc, char **argv)
printf("\ndst = %s", argv[3]);
printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]);
- printf("\ncase number (0:87) = %s", argv[6]);
+ printf("\ncase number (0:91) = %s", argv[6]);
printf("\nnumber of times to run = %s", argv[8]);
printf("\ntest type - (0 = unit tests / 1 = performance tests) = %s", argv[9]);
printf("\nlayout type - (0 = PKD3/ 1 = PLN3/ 2 = PLN1) = %s", argv[10]);
@@ -332,17 +332,19 @@ int main(int argc, char **argv)
// Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums)
void *reductionFuncResultArr;
+ Rpp32f *mean;
Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
if (reductionTypeCase)
{
int bitDepthByteSize = 0;
- if (dstDescPtr->dataType == RpptDataType::U8)
+ if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32) || testCase == 90 || testCase == 91)
+ bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f, for testCase 90, 91
+ else if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8))
bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
- else if (dstDescPtr->dataType == RpptDataType::I8)
- bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64s) : sizeof(Rpp8s);
- else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32))
- bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f
+
CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
+ if(testCase == 91)
+ CHECK_RETURN_STATUS(hipHostMalloc(&mean, reductionFuncResultArrLength * bitDepthByteSize));
}
// create generic descriptor and params in case of slice
@@ -350,7 +352,7 @@ int main(int argc, char **argv)
RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
Rpp32u *roiTensor = NULL;
- if(testCase == 90)
+ if(testCase == 92)
set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize);
// Allocate hip memory for src/dst
@@ -1244,7 +1246,6 @@ int main(int argc, char **argv)
reductionFuncResultArrLength = srcDescPtr->n;
startWallTime = omp_get_wtime();
-
if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
rppt_tensor_sum_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
else
@@ -1277,6 +1278,37 @@ int main(int argc, char **argv)
break;
}
case 90:
+ {
+ testCaseName = "tensor_mean";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_mean_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 91:
+ {
+ testCaseName = "tensor_stddev";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+ memcpy(mean, TensorMeanReferenceOutputs[inputChannels].data(), sizeof(Rpp32f) * reductionFuncResultArrLength);
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_stddev_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, mean, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 92:
{
testCaseName = "slice";
Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims
@@ -1299,8 +1331,10 @@ int main(int argc, char **argv)
break;
}
default:
+ {
missingFuncFlag = 1;
break;
+ }
}
CHECK_RETURN_STATUS(hipDeviceSynchronize());
@@ -1333,8 +1367,10 @@ int main(int argc, char **argv)
}
// print reduction functions output array based on different bit depths, and precision desired
- int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0;
- if (dstDescPtr->dataType == RpptDataType::U8)
+ int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16) || testCase == 90 || testCase == 91) ? 3 : 0;
+ if (dstDescPtr->dataType == RpptDataType::F32 || testCase == 90 || testCase == 91)
+ print_array(static_cast<Rpp32f *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else if (dstDescPtr->dataType == RpptDataType::U8)
{
if (testCase == 87)
print_array(static_cast<Rpp64u *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
@@ -1348,13 +1384,6 @@ int main(int argc, char **argv)
else
print_array(static_cast<Rpp8u *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if (dstDescPtr->dataType == RpptDataType::F32)
- {
- if (testCase == 87)
- print_array(static_cast<Rpp32f *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
- else
- print_array(static_cast<Rpp32f *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
- }
else if (dstDescPtr->dataType == RpptDataType::I8)
{
if (testCase == 87)
@@ -1372,6 +1401,8 @@ int main(int argc, char **argv)
{
if (testCase == 87)
compare_reduction_output(static_cast<Rpp64u *>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ else if (testCase == 90 || testCase == 91)
+ compare_reduction_output(static_cast<Rpp32f *>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
else
compare_reduction_output(static_cast<Rpp8u *>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
}
@@ -1395,7 +1426,7 @@ int main(int argc, char **argv)
// if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values
// for output display and comparision purposes
- if (testCase == 90)
+ if (testCase == 92)
{
if (dstDescPtr->layout == RpptLayout::NCHW)
{
@@ -1490,7 +1521,11 @@ int main(int argc, char **argv)
CHECK_RETURN_STATUS(hipHostFree(patchRoi));
}
if (reductionTypeCase)
+ {
CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr));
+ if(testCase == 91)
+ CHECK_RETURN_STATUS(hipHostFree(mean));
+ }
if(testCase == 32)
{
CHECK_RETURN_STATUS(hipHostFree(colorBuffer));
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index b5b492f39..40dfb21ea 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -39,7 +39,7 @@
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
caseMin = 0
-caseMax = 90
+caseMax = 92
# Get a list of log files based on a flag for preserving output
def get_log_file_list(preserveOutput):
@@ -69,7 +69,7 @@ def func_group_finder(case_number):
return "logical_operations"
elif case_number < 87:
return "data_exchange_operations"
- elif case_number < 88:
+ elif case_number < 92:
return "statistical_operations"
else:
return "miscellaneous"
@@ -299,7 +299,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.run(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
# Create folders based on testType and profilingOption
if testType == 1 and profilingOption == "YES":
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index 566777fd5..5a4634c97 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -68,7 +68,7 @@ int main(int argc, char **argv)
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
bool randomOutputCase = (testCase == 8 || testCase == 84);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79);
- bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
+ bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91);
bool noiseTypeCase = (testCase == 8);
bool pln1OutTypeCase = (testCase == 86);
@@ -86,7 +86,7 @@ int main(int argc, char **argv)
printf("\ndst = %s", argv[3]);
printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]);
- printf("\ncase number (0:87) = %s", argv[6]);
+ printf("\ncase number (0:91) = %s", argv[6]);
printf("\nnumber of times to run = %s", argv[8]);
printf("\ntest type - (0 = unit tests / 1 = performance tests) = %s", argv[9]);
printf("\nlayout type - (0 = PKD3/ 1 = PLN3/ 2 = PLN1) = %s", argv[10]);
@@ -329,21 +329,16 @@ int main(int argc, char **argv)
if (reductionTypeCase)
{
int bitDepthByteSize = 0;
- if (dstDescPtr->dataType == RpptDataType::U8)
+ if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32) || testCase == 90 || testCase == 91)
{
- bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
- reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize));
+ bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f, for testCase 90, 91
+ reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize));
}
- else if (dstDescPtr->dataType == RpptDataType::I8)
+ else if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8))
{
- bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64s) : sizeof(Rpp8s);
+ bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize));
}
- else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32))
- {
- bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f
- reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize));
- }
}
// create generic descriptor and params in case of slice
@@ -351,7 +346,7 @@ int main(int argc, char **argv)
RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
Rpp32u *roiTensor = NULL;
- if(testCase == 90)
+ if(testCase == 92)
set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize);
// create cropRoi and patchRoi in case of crop_and_patch
@@ -1257,7 +1252,6 @@ int main(int argc, char **argv)
startWallTime = omp_get_wtime();
startCpuTime = clock();
-
if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
rppt_tensor_sum_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
else
@@ -1274,7 +1268,6 @@ int main(int argc, char **argv)
startWallTime = omp_get_wtime();
startCpuTime = clock();
-
if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
else
@@ -1291,7 +1284,6 @@ int main(int argc, char **argv)
startWallTime = omp_get_wtime();
startCpuTime = clock();
-
if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
else
@@ -1300,6 +1292,39 @@ int main(int argc, char **argv)
break;
}
case 90:
+ {
+ testCaseName = "tensor_mean";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_mean_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 91:
+ {
+ testCaseName = "tensor_stddev";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+ Rpp32f *mean = TensorMeanReferenceOutputs[inputChannels].data();
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_stddev_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, mean, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 92:
{
testCaseName = "slice";
Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims
@@ -1324,8 +1349,10 @@ int main(int argc, char **argv)
break;
}
default:
+ {
missingFuncFlag = 1;
break;
+ }
}
endCpuTime = clock();
@@ -1362,8 +1389,10 @@ int main(int argc, char **argv)
}
// print reduction functions output array based on different bit depths, and precision desired
- int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0;
- if (dstDescPtr->dataType == RpptDataType::U8)
+ int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16) || testCase == 90 || testCase == 91) ? 3 : 0;
+ if (dstDescPtr->dataType == RpptDataType::F32 || testCase == 90 || testCase == 91)
+ print_array(static_cast<Rpp32f *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else if (dstDescPtr->dataType == RpptDataType::U8)
{
if (testCase == 87)
print_array(static_cast<Rpp64u *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
@@ -1377,13 +1406,6 @@ int main(int argc, char **argv)
else
print_array(static_cast<Rpp8u *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if (dstDescPtr->dataType == RpptDataType::F32)
- {
- if (testCase == 87)
- print_array(static_cast<Rpp32f *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
- else
- print_array(static_cast<Rpp32f *>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
- }
else if (dstDescPtr->dataType == RpptDataType::I8)
{
if (testCase == 87)
@@ -1401,6 +1423,8 @@ int main(int argc, char **argv)
{
if (testCase == 87)
compare_reduction_output(static_cast<Rpp64u *>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ else if (testCase == 90 || testCase == 91)
+ compare_reduction_output(static_cast<Rpp32f *>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
else
compare_reduction_output(static_cast<Rpp8u *>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
}
@@ -1422,7 +1446,7 @@ int main(int argc, char **argv)
// if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values
// for output display and comparision purposes
- if (testCase == 90)
+ if (testCase == 92)
{
if (dstDescPtr->layout == RpptLayout::NCHW)
{
diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py
index 445c139f5..159eed640 100644
--- a/utilities/test_suite/HOST/runTests.py
+++ b/utilities/test_suite/HOST/runTests.py
@@ -39,7 +39,7 @@
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
caseMin = 0
-caseMax = 90
+caseMax = 92
# Get a list of log files based on a flag for preserving output
def get_log_file_list(preserveOutput):
@@ -275,7 +275,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.run(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
print("\n\n\n\n\n")
print("##########################################################################################")
diff --git a/utilities/test_suite/README.md b/utilities/test_suite/README.md
index 375247665..05cb857e1 100644
--- a/utilities/test_suite/README.md
+++ b/utilities/test_suite/README.md
@@ -94,8 +94,8 @@ The image test suite can be executed under 2 backend scenarios - (HOST/HIP):
The image test suite accepts the following command line arguments:
- input_path1: The path to the input folder 1. Default is $cwd/../TEST_IMAGES/three_images_mixed_src1
- input_path2: The path to the input folder 2. Default is $cwd/../TEST_IMAGES/three_images_mixed_src2
-- case_start: The starting case number for the test range (0-89). Default is 0
-- case_end: The ending case number for the test range (0-89). Default is 89
+- case_start: The starting case number for the test range (0-91). Default is 0
+- case_end: The ending case number for the test range (0-91). Default is 91
- test_type: The type of test to run (0 = Unit tests, 1 = Performance tests). Default is 0
- case_list: A list of specific case numbers to run. Must be used in conjunction with --test_type
- profiling: Run the tests with a profiler (YES/NO). Default is NO. This option is only available with HIP backend
@@ -121,7 +121,11 @@ python runTests.py --input_path1 --input_path2 --cas
- QA mode (Unit tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3.
Note: QA mode is not supported for case 84 due to run-to-run variation of outputs.
``` python
-python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 1 --batch_size 3
+python runTests.py --case_start 0 --case_end 91 --test_type 0 --qa_mode 1 --batch_size 3
+```
- QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement
``` python
@@ -135,13 +139,13 @@ python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8
Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1
RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1
``` python
-python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 0
+python runTests.py --case_start 0 --case_end 91 --test_type 0 --qa_mode 0
```
- Performance test mode - Performance tests that execute the desired functionality and variant 100 times by default, and report max/min/avg RPP execution wall time, or optionally, AMD rocprof kernel profiler max/min/avg time for HIP backend variants.
Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1
RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1
``` python
-python runTests.py --case_start 0 --case_end 89 --test_type 1
+python runTests.py --case_start 0 --case_end 91 --test_type 1
```
To run the unit tests / performance tests for specific case numbers. please case use case_list parameter. Example as below
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index fe9f501d7..517bebd4d 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -108,30 +108,46 @@ std::map augmentationMap =
{87, "tensor_sum"},
{88, "tensor_min"},
{89, "tensor_max"},
- {90, "slice"}
+ {90, "tensor_mean"},
+ {91, "tensor_stddev"},
+ {92, "slice"}
};
// Golden outputs for Tensor min Kernel
-std::map> TensorMinReferenceOutputs =
+std::map> TensorMinReferenceOutputs =
{
{1, {1, 1, 7}},
{3, {0, 0, 0, 0, 2, 0, 0, 0, 7, 9, 0, 0}}
};
// Golden outputs for Tensor max Kernel
-std::map> TensorMaxReferenceOutputs =
+std::map> TensorMaxReferenceOutputs =
{
{1, {239, 245, 255}},
{3, {255, 240, 236, 255, 255, 242, 241, 255, 253, 255, 255, 255}}
};
// Golden outputs for Tensor sum Kernel
-std::map> TensorSumReferenceOutputs =
+std::map> TensorSumReferenceOutputs =
{
{1, {334225, 813471, 2631125}},
{3, {348380, 340992, 262616, 951988, 1056552, 749506, 507441, 2313499, 2170646, 2732368, 3320699, 8223713}}
};
+// Golden outputs for Tensor mean Kernel
+std::map<int, std::vector<Rpp32f>> TensorMeanReferenceOutputs =
+{
+ {1, {133.690, 81.347, 116.939}},
+ {3, {139.352, 136.397, 105.046, 126.932, 105.655, 74.951, 50.744, 77.117, 96.473, 121.439, 147.587, 121.833}}
+};
+
+// Golden outputs for Tensor stddev Kernel
+std::map