diff --git a/include/rppdefs.h b/include/rppdefs.h index 3256444f3..28876d7f5 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -129,7 +129,7 @@ typedef enum RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH = -14, /*! \brief Invalid datatype \ingroup group_rppdefs */ RPP_ERROR_INVALID_PARAMETER_DATATYPE = -15, - /*! \brief Not enough memory \ingroup group_rppdefs */ + /*! \brief Not enough memory to write outputs, as per dim-lengths and strides set in descriptor \ingroup group_rppdefs */ RPP_ERROR_NOT_ENOUGH_MEMORY = -16, /*! \brief Out of bound source ROI \ingroup group_rppdefs */ RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17, diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 7c2d3318b..441816ea3 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -193,10 +193,84 @@ RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Tensor mean operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor mean is a reduction operation that finds the channel-wise (R mean / G mean / B mean) and total mean for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
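+ * A minimal usage sketch (hypothetical buffer names; assumes srcPtr already holds a batch of 3-channel U8 images described by srcDescPtr, roiTensorPtrSrc holds one ROI per image, and handle is a HOST handle created beforehand): + * \code + * Rpp32u meanArrLength = srcDescPtr->n * 4; // per 3-channel image: meanR, meanG, meanB, overall mean + * std::vector<Rpp32f> meanArr(meanArrLength); // Rpp32f outputs, irrespective of srcPtr depth + * RppStatus status = rppt_tensor_mean_host(srcPtr, srcDescPtr, meanArr.data(), meanArrLength, roiTensorPtrSrc, RpptRoiType::XYWH, handle); + * \endcode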
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - tensorMeanArr depth - Rpp32f, irrespective of srcPtr depth (means are always returned as 32-bit floats). + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] tensorMeanArr destination array in HOST memory (stores 1 mean per image for 1-channel inputs, or meanR / meanG / meanB / overall image mean per image for 3-channel inputs) + * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorMeanArr, Rpp32u tensorMeanArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor mean operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor mean is a reduction operation that finds the channel-wise (R mean / G mean / B mean) and total mean for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - tensorMeanArr depth - Rpp32f, irrespective of srcPtr depth (means are always returned as 32-bit floats). + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] tensorMeanArr destination array in HIP memory (stores 1 mean per image for 1-channel inputs, or meanR / meanG / meanB / overall image mean per image for 3-channel inputs) + * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorMeanArr, Rpp32u tensorMeanArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Tensor stddev operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor stddev is a reduction operation that finds the channel-wise (R stddev / G stddev / B stddev) and total standard deviation for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout, with respect to the mean values passed in meanTensor.
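+ * For each image the reduction is a population standard deviation over the ROI, i.e. sqrt(sum((x - mean)^2) / N), taken against the matching entries of meanTensor. A minimal usage sketch (hypothetical buffer names; assumes meanArr was filled for the same batch by rppt_tensor_mean_host, and handle is a HOST handle): + * \code + * Rpp32u stddevArrLength = srcDescPtr->n * 4; // per 3-channel image: stddevR, stddevG, stddevB, overall stddev + * std::vector<Rpp32f> stddevArr(stddevArrLength); + * RppStatus status = rppt_tensor_stddev_host(srcPtr, srcDescPtr, stddevArr.data(), stddevArrLength, meanArr.data(), roiTensorPtrSrc, RpptRoiType::XYWH, handle); + * \endcode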
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - tensorStddevArr depth - Rpp32f, irrespective of srcPtr depth (standard deviations are always returned as 32-bit floats). + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] tensorStddevArr destination array in HOST memory (stores 1 stddev per image for 1-channel inputs, or stddevR / stddevG / stddevB / overall image stddev per image for 3-channel inputs) + * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) + * \param [in] meanTensor mean values for stddev calculation (1D tensor of size srcDescPtr->n for 1-channel inputs, or srcDescPtr->n * 4 in the format (MeanR, MeanG, MeanB, MeanImage) per image for 3-channel inputs) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorStddevArr, Rpp32u tensorStddevArrLength, Rpp32f *meanTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor stddev operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor stddev is a reduction operation that finds the channel-wise (R stddev / G stddev / B stddev) and total standard deviation for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout, with respect to the mean values passed in meanTensor.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - tensorStddevArr depth - Rpp32f, irrespective of srcPtr depth (standard deviations are always returned as 32-bit floats). + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] tensorStddevArr destination array in HIP memory (stores 1 stddev per image for 1-channel inputs, or stddevR / stddevG / stddevB / overall image stddev per image for 3-channel inputs) + * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) + * \param [in] meanTensor mean values for stddev calculation (1D tensor of size srcDescPtr->n for 1-channel inputs, or srcDescPtr->n * 4 in the format (MeanR, MeanG, MeanB, MeanImage) per image for 3-channel inputs) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorStddevArr, Rpp32u tensorStddevArrLength, Rpp32f *meanTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + + /*! @} */ #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H +#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H \ No newline at end of file diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 9e599f06d..bb06713b9 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -6120,6 +6120,46 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2 pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4 } +inline void compute_variance_8_host(__m256d *p1, __m256d *pMean, __m256d *pVar) +{ + __m256d pSub = _mm256_sub_pd(p1[0], pMean[0]); + pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]); + pSub = _mm256_sub_pd(p1[1], pMean[0]); + pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]); +} + +inline void compute_variance_channel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) +{ + __m256d pSub = _mm256_sub_pd(p1[0], pMeanR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); + pSub = _mm256_sub_pd(p1[1], pMeanR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); + pSub = _mm256_sub_pd(p1[2], pMeanG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); + pSub = _mm256_sub_pd(p1[3], pMeanG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); + pSub = _mm256_sub_pd(p1[4], pMeanB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); + pSub = _mm256_sub_pd(p1[5], pMeanB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); +} + +inline void compute_variance_image_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) +{ + __m256d pSub = 
_mm256_sub_pd(p1[0], pMean[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); + pSub = _mm256_sub_pd(p1[1], pMean[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); + pSub = _mm256_sub_pd(p1[2], pMean[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); + pSub = _mm256_sub_pd(pMean[0], p1[3]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); + pSub = _mm256_sub_pd(p1[4], pMean[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); + pSub = _mm256_sub_pd(p1[5], pMean[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); +} + inline void compute_vignette_48_host(__m256 *p, __m256 &pMultiplier, __m256 &pILocComponent, __m256 &pJLocComponent) { __m256 pGaussianValue; diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index a5e83d1ff..babc6f55c 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -1280,6 +1280,35 @@ inline void rpp_store48_f32pln3_to_u8pkd3_avx(Rpp8u *dstPtr, __m256 *p) _mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ } +inline void rpp_load24_u8pln3_to_f64pln3_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256d *p) +{ + __m128i px[3]; + + px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R00|R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm_loadu_si128((__m128i *)srcPtrG); /* load [G00|G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B00|B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask00To03)); /* Contains R00-03 */ + p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask04To07)); /* Contains R04-07 */ + p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask00To03)); /* Contains G00-03 */ + p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask04To07)); /* Contains G04-07 */ + p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask00To03)); /* Contains B00-03 */ + p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask04To07)); /* Contains B04-07 */ +} + +inline void rpp_load24_u8pkd3_to_f64pln3_avx(Rpp8u *srcPtr, __m256d *p) +{ + __m128i px[2]; + + px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R00|G00|B00|R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05] - Need RGB 00-03 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R04|G04|B04|R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09] - Need RGB 04-07 */ + p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskR)); /* Contains R00-03 */ + p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskR)); /* Contains R04-07 */ + p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskG)); /* Contains G00-03 */ + p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskG)); /* Contains G04-07 */ + p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskB)); /* Contains B00-03 */ + p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskB)); /* Contains B04-07 */ +} + inline void rpp_load16_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p) { __m128i px; @@ -1315,6 +1344,22 @@ inline void rpp_store16_f32_to_u8_avx(Rpp8u *dstPtr, __m256 *p) _mm_storeu_si128((__m128i *)dstPtr, px[0]); } +inline void rpp_load8_u8_to_f64_avx(Rpp8u *srcPtr, __m256d *p) +{ + __m128i px; + px = _mm_loadu_si128((__m128i *)srcPtr); + p[0] = 
_mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask00To03)); /* Contains pixels 01-04 */ + p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask04To07)); /* Contains pixels 05-08 */ +} + +inline void rpp_load8_i8_to_f64_avx(Rpp8s *srcPtr, __m256d *p) +{ + __m128i px; + px = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr)); + p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask00To03)); /* Contains pixels 01-04 */ + p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask04To07)); /* Contains pixels 05-08 */ +} + inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p) { __m128i px; @@ -1688,6 +1733,35 @@ inline void rpp_store48_f32pln3_to_i8pkd3_avx(Rpp8s *dstPtr, __m256 *p) _mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ } +inline void rpp_load24_i8pln3_to_f64pln3_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256d *p) +{ + __m128i px[3]; + + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrR)); /* add I8 conversion param to load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrG)); /* add I8 conversion param to load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrB)); /* add I8 conversion param to load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask00To03)); /* Contains R01-04 */ + p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMask04To07)); /* Contains R05-08 */ + p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask00To03)); /* Contains G01-04 */ + p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMask04To07)); /* Contains G05-08 */ + p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask00To03)); /* Contains B01-04 */ + p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[2], xmm_pxMask04To07)); /* Contains B05-08 */ +} + +inline void rpp_load24_i8pkd3_to_f64pln3_avx(Rpp8s *srcPtr, __m256d *p) +{ + __m128i px[2]; + + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr)); /* add I8 conversion param to load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + 12))); /* add I8 conversion param to load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ + p[0] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskR)); /* Contains R01-04 */ + p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskR)); /* Contains R05-08 */ + p[2] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskG)); /* Contains G01-04 */ + p[3] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskG)); /* Contains G05-08 */ + p[4] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[0], xmm_pxMaskB)); /* Contains B01-04 */ + p[5] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px[1], xmm_pxMaskB)); /* Contains B05-08 */ +} + inline void rpp_load16_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p) { __m128i px; diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index a5cd5972c..3f32dbc04 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -63,12 +63,12 @@ typedef union { float f1[24]; float2 f2[12]; float3 f3[8]; float4 f4[6]; // uint typedef union { uint ui1[6]; uint2 ui2[3]; } 
d_uint6; typedef union { uint ui1[8]; uint4 ui4[2]; } d_uint8; -typedef union { uint4 ui4[6]; d_uint8 ui8[3]; } d_uint24; +typedef union { uint ui1[24]; uint4 ui4[6]; d_uint8 ui8[3]; } d_uint24; // int typedef union { int i1[6]; int2 i2[3]; } d_int6; typedef union { int i1[8]; int4 i4[2]; } d_int8; -typedef union { int4 i4[6]; d_int8 i8[3]; } d_int24; +typedef union { int i1[24]; int4 i4[6]; d_int8 i8[3]; } d_int24; // half typedef struct { half h1[3]; } d_half3_s; @@ -599,6 +599,24 @@ __device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8_mirror(half * srcPtr_f8->f4[1] = make_float4(src1_f2.y, src1_f2.x, src2_f2.y, src2_f2.x); // write 03-00 } +// UINT loads without layout toggle (8 UINT pixels) + +__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(uint *srcPtr, d_float8 *srcPtr_f8) +{ + d_uint8 src_ui8 = *(d_uint8 *)srcPtr; + srcPtr_f8->f4[0] = make_float4((float)src_ui8.ui4[0].x, (float)src_ui8.ui4[0].y, (float)src_ui8.ui4[0].z, (float)src_ui8.ui4[0].w); + srcPtr_f8->f4[1] = make_float4((float)src_ui8.ui4[1].x, (float)src_ui8.ui4[1].y, (float)src_ui8.ui4[1].z, (float)src_ui8.ui4[1].w); +} + +// INT loads without layout toggle (8 INT pixels) + +__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(int *srcPtr, d_float8 *srcPtr_f8) +{ + d_int8 src_i8 = *(d_int8 *)srcPtr; + srcPtr_f8->f4[0] = make_float4((float)src_i8.i4[0].x, (float)src_i8.i4[0].y, (float)src_i8.i4[0].z, (float)src_i8.i4[0].w); + srcPtr_f8->f4[1] = make_float4((float)src_i8.i4[1].x, (float)src_i8.i4[1].y, (float)src_i8.i4[1].z, (float)src_i8.i4[1].w); +} + // U8 loads without layout toggle PLN3 to PLN3 (24 U8 pixels) __device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pln3(uchar *srcPtr, uint increment, d_float24 *srcPtr_f24) @@ -864,6 +882,36 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3_m srcPtr_f24->f4[5] = make_float4(__half2float(src_h24.h1[11]), __half2float(src_h24.h1[ 8]), __half2float(src_h24.h1[ 5]), __half2float(src_h24.h1[ 2])); // write B03-B00 (mirrored load) } +// UINT loads with layout toggle PLN3 to PKD3 (24 UINT pixels) + +__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(uint *srcPtr, d_float24 *srcPtr_f24) +{ + d_uint24 src_ui24; + *(d_uint24_s *)&src_ui24 = *(d_uint24_s *)srcPtr; + + srcPtr_f24->f4[0] = make_float4((float)src_ui24.ui1[ 0], (float)src_ui24.ui1[ 3], (float)src_ui24.ui1[ 6], (float)src_ui24.ui1[ 9]); // write R00-R03 + srcPtr_f24->f4[1] = make_float4((float)src_ui24.ui1[12], (float)src_ui24.ui1[15], (float)src_ui24.ui1[18], (float)src_ui24.ui1[21]); // write R04-R07 + srcPtr_f24->f4[2] = make_float4((float)src_ui24.ui1[ 1], (float)src_ui24.ui1[ 4], (float)src_ui24.ui1[ 7], (float)src_ui24.ui1[10]); // write G00-G03 + srcPtr_f24->f4[3] = make_float4((float)src_ui24.ui1[13], (float)src_ui24.ui1[16], (float)src_ui24.ui1[19], (float)src_ui24.ui1[22]); // write G04-G07 + srcPtr_f24->f4[4] = make_float4((float)src_ui24.ui1[ 2], (float)src_ui24.ui1[ 5], (float)src_ui24.ui1[ 8], (float)src_ui24.ui1[11]); // write B00-B03 + srcPtr_f24->f4[5] = make_float4((float)src_ui24.ui1[14], (float)src_ui24.ui1[17], (float)src_ui24.ui1[20], (float)src_ui24.ui1[23]); // write B04-B07 +} + +// INT loads with layout toggle PLN3 to PKD3 (24 INT pixels) + +__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(int *srcPtr, d_float24 *srcPtr_f24) +{ + d_int24 src_i24; + *(d_int24_s *)&src_i24 = *(d_int24_s *)srcPtr; + + srcPtr_f24->f4[0] = 
make_float4((float)src_i24.i1[ 0], (float)src_i24.i1[ 3], (float)src_i24.i1[ 6], (float)src_i24.i1[ 9]); // write R00-R03 + srcPtr_f24->f4[1] = make_float4((float)src_i24.i1[12], (float)src_i24.i1[15], (float)src_i24.i1[18], (float)src_i24.i1[21]); // write R04-R07 + srcPtr_f24->f4[2] = make_float4((float)src_i24.i1[ 1], (float)src_i24.i1[ 4], (float)src_i24.i1[ 7], (float)src_i24.i1[10]); // write G00-G03 + srcPtr_f24->f4[3] = make_float4((float)src_i24.i1[13], (float)src_i24.i1[16], (float)src_i24.i1[19], (float)src_i24.i1[22]); // write G04-G07 + srcPtr_f24->f4[4] = make_float4((float)src_i24.i1[ 2], (float)src_i24.i1[ 5], (float)src_i24.i1[ 8], (float)src_i24.i1[11]); // write B00-B03 + srcPtr_f24->f4[5] = make_float4((float)src_i24.i1[14], (float)src_i24.i1[17], (float)src_i24.i1[20], (float)src_i24.i1[23]); // write B04-B07 +} + // U8 loads with layout toggle PLN3 to PKD3 (24 U8 pixels) __device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(uchar *srcPtr, uint increment, d_float24 *srcPtr_f24) diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index 7ddc238fa..0a7e8ff51 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp @@ -28,6 +28,8 @@ SOFTWARE. #include "kernel/tensor_sum.hpp" #include "kernel/tensor_min.hpp" #include "kernel/tensor_max.hpp" +#include "kernel/tensor_mean.hpp" +#include "kernel/tensor_stddev.hpp" #include "kernel/normalize.hpp" #endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/cpu/kernel/tensor_mean.hpp b/src/modules/cpu/kernel/tensor_mean.hpp new file mode 100644 index 000000000..9536e258f --- /dev/null +++ b/src/modules/cpu/kernel/tensor_mean.hpp @@ -0,0 +1,842 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" +#include "reduction.hpp" + +RppStatus tensor_mean_u8_f32_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorMeanArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp8u *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Mean without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = bufferLength & ~15; + Rpp32f mean = 0.0; + Rpp32u sum = 0; + Rpp32u sumAvx[8] = {0}; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pSum = avx_px0; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[2]; + rpp_simd_load(rpp_load16_u8_to_u32_avx, srcPtrTemp, p1); + compute_sum_16_host(p1, &pSum); + srcPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + sum += static_cast(*srcPtrTemp++); + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_store_si256((__m256i *)sumAvx, pSum); + sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3] + sumAvx[4] + sumAvx[5] + sumAvx[6] + sumAvx[7]); +#endif + mean = static_cast(sum) / totalPixelsPerChannel; + tensorMeanArr[batchCount] = mean; + } + + // Tensor Mean without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64u sum; + Rpp32u sumR = 0, sumG = 0, sumB = 0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp32u sumAvxR[8] = {0}; + Rpp32u sumAvxG[8] = {0}; + Rpp32u sumAvxB[8] = {0}; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pSumR = avx_px0; + __m256i pSumG = avx_px0; + __m256i pSumB = avx_px0; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[6]; + rpp_simd_load(rpp_load48_u8pln3_to_u32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_sum_48_host(p, &pSumR, &pSumG, &pSumB); + 
srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + sumR += static_cast(*srcPtrTempR++); + sumG += static_cast(*srcPtrTempG++); + sumB += static_cast(*srcPtrTempB++); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_store_si256((__m256i *)sumAvxR, pSumR); + _mm256_store_si256((__m256i *)sumAvxG, pSumG); + _mm256_store_si256((__m256i *)sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]); +#endif + sum = static_cast(sumR) + static_cast(sumG) + static_cast(sumB); + mean = (static_cast(sum) / (totalPixelsPerChannel * 3)); + meanR = (static_cast(sumR) / totalPixelsPerChannel); + meanG = (static_cast(sumG) / totalPixelsPerChannel); + meanB = (static_cast(sumB) / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + + // Tensor Mean without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64u sum; + Rpp32u sumR = 0, sumG = 0, sumB = 0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp32u sumAvxR[8] = {0}; + Rpp32u sumAvxG[8] = {0}; + Rpp32u sumAvxB[8] = {0}; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pSumR = avx_px0; + __m256i pSumG = avx_px0; + __m256i pSumB = avx_px0; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p[6]; + rpp_simd_load(rpp_load48_u8pkd3_to_u32pln3_avx, srcPtrTemp, p); + compute_sum_48_host(p, &pSumR, &pSumG, &pSumB); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + sumR += static_cast(srcPtrTemp[0]); + sumG += static_cast(srcPtrTemp[1]); + sumB += static_cast(srcPtrTemp[2]); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_store_si256((__m256i *)sumAvxR, pSumR); + _mm256_store_si256((__m256i *)sumAvxG, pSumG); + _mm256_store_si256((__m256i *)sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]); +#endif + sum = static_cast(sumR) + static_cast(sumG) + static_cast(sumB); + mean = (static_cast(sum) / (totalPixelsPerChannel * 3)); + meanR = (static_cast(sumR) / totalPixelsPerChannel); + meanG = (static_cast(sumG) / totalPixelsPerChannel); + meanB = (static_cast(sumB) / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + } + + return RPP_SUCCESS; +} + +RppStatus 
tensor_mean_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorMeanArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Mean without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = bufferLength & ~(vectorIncrementPerChannel-1); + vectorIncrement = 8; + Rpp32f mean = 0.0; + Rpp64f sum = 0.0; + Rpp64f sumAvx[4] = {0.0}; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pSum = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256d p1[2]; + rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp, p1); + compute_sum_8_host(p1, &pSum); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + sum += static_cast(*srcPtrTemp++); + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(sumAvx, pSum); + sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3]); +#endif + mean = static_cast(sum / totalPixelsPerChannel); + tensorMeanArr[batchCount] = mean; + } + + // Tensor Mean without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp64f sumAvxR[4] = {0.0}; + Rpp64f sumAvxG[4] = {0.0}; + Rpp64f sumAvxB[4] = {0.0}; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256d pSumR = _mm256_setzero_pd(); + __m256d pSumG = _mm256_setzero_pd(); + __m256d pSumB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_sum_24_host(p, &pSumR, &pSumG, &pSumB); + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += 
vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + sumR += static_cast(*srcPtrTempR++); + sumG += static_cast(*srcPtrTempG++); + sumB += static_cast(*srcPtrTempB++); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(sumAvxR, pSumR); + _mm256_storeu_pd(sumAvxG, pSumG); + _mm256_storeu_pd(sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]); +#endif + + sum = sumR + sumG + sumB; + mean = static_cast(sum / (totalPixelsPerChannel * 3)); + meanR = static_cast(sumR / totalPixelsPerChannel); + meanG = static_cast(sumG / totalPixelsPerChannel); + meanB = static_cast(sumB / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + + // Tensor Mean without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp64f sumAvxR[4] = {0.0}; + Rpp64f sumAvxG[4] = {0.0}; + Rpp64f sumAvxB[4] = {0.0}; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pSumR = _mm256_setzero_pd(); + __m256d pSumG = _mm256_setzero_pd(); + __m256d pSumB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp, p); + compute_sum_24_host(p, &pSumR, &pSumG, &pSumB); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + sumR += static_cast(srcPtrTemp[0]); + sumG += static_cast(srcPtrTemp[1]); + sumB += static_cast(srcPtrTemp[2]); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(sumAvxR, pSumR); + _mm256_storeu_pd(sumAvxG, pSumG); + _mm256_storeu_pd(sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]); +#endif + sum = sumR + sumG + sumB; + mean = static_cast(sum / (totalPixelsPerChannel * 3)); + meanR = static_cast(sumR / totalPixelsPerChannel); + meanG = static_cast(sumG / totalPixelsPerChannel); + meanB = static_cast(sumB / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + } + + return RPP_SUCCESS; +} + +RppStatus tensor_mean_f16_f32_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorMeanArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + 
compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp16f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Mean without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = bufferLength & ~(vectorIncrementPerChannel-1); + vectorIncrement = 8; + Rpp32f mean = 0.0; + Rpp64f sum = 0.0; + Rpp64f sumAvx[4] = {0.0}; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pSum = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + __m256d p1[2]; + rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp_ps, p1); + compute_sum_8_host(p1, &pSum); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + sum += static_cast(*srcPtrTemp++); + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(sumAvx, pSum); + sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3]); +#endif + mean = static_cast(sum / totalPixelsPerChannel); + tensorMeanArr[batchCount] = mean; + } + + // Tensor Mean without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp64f sumAvxR[4] = {0.0}; + Rpp64f sumAvxG[4] = {0.0}; + Rpp64f sumAvxB[4] = {0.0}; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256d pSumR = _mm256_setzero_pd(); + __m256d pSumG = _mm256_setzero_pd(); + __m256d pSumB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = static_cast(srcPtrTempR[cnt]); + srcPtrTempG_ps[cnt] = static_cast(srcPtrTempG[cnt]); + srcPtrTempB_ps[cnt] = static_cast(srcPtrTempB[cnt]); + } + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_sum_24_host(p, &pSumR, &pSumG, &pSumB); + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + sumR += 
static_cast(*srcPtrTempR++); + sumG += static_cast(*srcPtrTempG++); + sumB += static_cast(*srcPtrTempB++); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(sumAvxR, pSumR); + _mm256_storeu_pd(sumAvxG, pSumG); + _mm256_storeu_pd(sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]); +#endif + sum = sumR + sumG + sumB; + mean = static_cast(sum / (totalPixelsPerChannel * 3)); + meanR = static_cast(sumR / totalPixelsPerChannel); + meanG = static_cast(sumG / totalPixelsPerChannel); + meanB = static_cast(sumB / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + + // Tensor Mean without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64f sum, sumR = 0.0, sumG = 0.0, sumB = 0.0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp64f sumAvxR[4] = {0.0}; + Rpp64f sumAvxG[4] = {0.0}; + Rpp64f sumAvxB[4] = {0.0}; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pSumR = _mm256_setzero_pd(); + __m256d pSumG = _mm256_setzero_pd(); + __m256d pSumB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp_ps, p); + compute_sum_24_host(p, &pSumR, &pSumG, &pSumB); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + sumR += static_cast(srcPtrTemp[0]); + sumG += static_cast(srcPtrTemp[1]); + sumB += static_cast(srcPtrTemp[2]); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(sumAvxR, pSumR); + _mm256_storeu_pd(sumAvxG, pSumG); + _mm256_storeu_pd(sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3]); +#endif + sum = sumR + sumG + sumB; + mean = static_cast(sum / (totalPixelsPerChannel * 3)); + meanR = static_cast(sumR / totalPixelsPerChannel); + meanG = static_cast(sumG / totalPixelsPerChannel); + meanB = static_cast(sumB / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + } + + return RPP_SUCCESS; +} + +RppStatus tensor_mean_i8_f32_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorMeanArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + 
compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp8s *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Mean without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = bufferLength & ~15; + vectorIncrement = 16; + Rpp32f mean = 0.0; + Rpp32s sum = 0; + Rpp32s sumAvx[8] = {0}; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pSum = avx_px0; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1[2]; + rpp_simd_load(rpp_load16_i8_to_i32_avx, srcPtrTemp, p1); + compute_sum_16_host(p1, &pSum); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + sum += static_cast(*srcPtrTemp++); + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_store_si256((__m256i *)sumAvx, pSum); + sum += (sumAvx[0] + sumAvx[1] + sumAvx[2] + sumAvx[3] + sumAvx[4] + sumAvx[5] + sumAvx[6] + sumAvx[7]); +#endif + mean = static_cast(sum) / totalPixelsPerChannel; + tensorMeanArr[batchCount] = mean; + } + + // Tensor Mean without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64s sum; + Rpp32s sumR = 0, sumG = 0, sumB = 0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp32s sumAvxR[8] = {0}; + Rpp32s sumAvxG[8] = {0}; + Rpp32s sumAvxB[8] = {0}; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pSumR = avx_px0; + __m256i pSumG = avx_px0; + __m256i pSumB = avx_px0; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[6]; + rpp_simd_load(rpp_load48_i8pln3_to_i32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_sum_48_host(p, &pSumR, &pSumG, &pSumB); + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + sumR += static_cast(*srcPtrTempR++); + sumG += static_cast(*srcPtrTempG++); + sumB += static_cast(*srcPtrTempB++); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_store_si256((__m256i *)sumAvxR, pSumR); + _mm256_store_si256((__m256i *)sumAvxG, pSumG); + _mm256_store_si256((__m256i *)sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + 
sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]); +#endif + + sum = static_cast<Rpp64s>(sumR) + static_cast<Rpp64s>(sumG) + static_cast<Rpp64s>(sumB); + mean = (static_cast<Rpp32f>(sum) / (totalPixelsPerChannel * 3)); + meanR = (static_cast<Rpp32f>(sumR) / totalPixelsPerChannel); + meanG = (static_cast<Rpp32f>(sumG) / totalPixelsPerChannel); + meanB = (static_cast<Rpp32f>(sumB) / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + + // Tensor Mean without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64s sum; + Rpp32s sumR = 0, sumG = 0, sumB = 0; + Rpp32f mean, meanR = 0.0, meanG = 0.0, meanB = 0.0; + Rpp32s sumAvxR[8] = {0}; + Rpp32s sumAvxG[8] = {0}; + Rpp32s sumAvxB[8] = {0}; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pSumR = avx_px0; + __m256i pSumG = avx_px0; + __m256i pSumB = avx_px0; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p[6]; + rpp_simd_load(rpp_load48_i8pkd3_to_i32pln3_avx, srcPtrTemp, p); + compute_sum_48_host(p, &pSumR, &pSumG, &pSumB); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + sumR += static_cast<Rpp32s>(srcPtrTemp[0]); + sumG += static_cast<Rpp32s>(srcPtrTemp[1]); + sumB += static_cast<Rpp32s>(srcPtrTemp[2]); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_store_si256((__m256i *)sumAvxR, pSumR); + _mm256_store_si256((__m256i *)sumAvxG, pSumG); + _mm256_store_si256((__m256i *)sumAvxB, pSumB); + sumR += (sumAvxR[0] + sumAvxR[1] + sumAvxR[2] + sumAvxR[3] + sumAvxR[4] + sumAvxR[5] + sumAvxR[6] + sumAvxR[7]); + sumG += (sumAvxG[0] + sumAvxG[1] + sumAvxG[2] + sumAvxG[3] + sumAvxG[4] + sumAvxG[5] + sumAvxG[6] + sumAvxG[7]); + sumB += (sumAvxB[0] + sumAvxB[1] + sumAvxB[2] + sumAvxB[3] + sumAvxB[4] + sumAvxB[5] + sumAvxB[6] + sumAvxB[7]); +#endif + sum = static_cast<Rpp64s>(sumR) + static_cast<Rpp64s>(sumG) + static_cast<Rpp64s>(sumB); + mean = (static_cast<Rpp32f>(sum) / (totalPixelsPerChannel * 3)); + meanR = (static_cast<Rpp32f>(sumR) / totalPixelsPerChannel); + meanG = (static_cast<Rpp32f>(sumG) / totalPixelsPerChannel); + meanB = (static_cast<Rpp32f>(sumB) / totalPixelsPerChannel); + tensorMeanArr[idx] = meanR; + tensorMeanArr[idx + 1] = meanG; + tensorMeanArr[idx + 2] = meanB; + tensorMeanArr[idx + 3] = mean; + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/tensor_stddev.hpp b/src/modules/cpu/kernel/tensor_stddev.hpp new file mode 100644 index 000000000..2f64e93ab --- /dev/null +++ b/src/modules/cpu/kernel/tensor_stddev.hpp @@ -0,0 +1,1092 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" +#include "reduction.hpp" + +RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorStddevArr, + Rpp32f *meanTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp8u *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Stddev without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = bufferLength & ~(vectorIncrementPerChannel-1); + Rpp64f var = 0.0; + Rpp32f stddev = 0.0; + Rpp64f varAvx[4] = {0.0}; + Rpp32f mean = meanTensor[batchCount]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMean = _mm256_set1_pd(mean); + __m256d pVar = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256d p1[2]; + rpp_simd_load(rpp_load8_u8_to_f64_avx, srcPtrTemp, p1); + compute_variance_8_host(p1, &pMean, &pVar); + + srcPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + var += (static_cast(*srcPtrTemp) - mean) * (static_cast(*srcPtrTemp) - mean); + srcPtrTemp++; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvx, pVar); + var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]); +#endif + stddev = 
sqrt(var / totalPixelsPerChannel); + tensorStddevArr[batchCount] = static_cast(stddev); + } + + // Tensor Stddev without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 0.0; + + Rpp32f meanR = meanTensor[idx]; + Rpp32f meanG = meanTensor[idx + 1]; + Rpp32f meanB = meanTensor[idx + 2]; + Rpp32f meanImage = meanTensor[idx + 3]; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_u8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp64f srcPtrR = static_cast(*srcPtrTempR); + Rpp64f srcPtrG = static_cast(*srcPtrTempG); + Rpp64f srcPtrB = static_cast(*srcPtrTempB); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif 
+ varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3))); + stddevR = static_cast(sqrt(varR / totalPixelsPerChannel)); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel)); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel)); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + + // Tensor Stddev without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 0.0; + + Rpp32f meanR = meanTensor[idx]; + Rpp32f meanG = meanTensor[idx + 1]; + Rpp32f meanB = meanTensor[idx + 2]; + Rpp32f meanImage = meanTensor[idx + 3]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_u8pkd3_to_f64pln3_avx, srcPtrTemp, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + Rpp64f srcPtrR = static_cast(srcPtrTemp[0]); + Rpp64f srcPtrG = static_cast(srcPtrTemp[1]); + Rpp64f srcPtrB = static_cast(srcPtrTemp[2]); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif + varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3))); + stddevR = 
static_cast(sqrt(varR / totalPixelsPerChannel)); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel)); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel)); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + } + + return RPP_SUCCESS; +} + +RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorStddevArr, + Rpp32f *meanTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Stddev without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = bufferLength & ~(vectorIncrementPerChannel-1); + Rpp64f var = 0.0; + Rpp32f stddev = 0.0; + Rpp64f varAvx[4] = {0.0}; + Rpp32f mean = meanTensor[batchCount]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMean = _mm256_set1_pd(mean); + __m256d pVar = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256d p1[2]; + rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp, p1); + compute_variance_8_host(p1, &pMean, &pVar); + srcPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + var += (static_cast(*srcPtrTemp) - mean) * (static_cast(*srcPtrTemp) - mean); + srcPtrTemp++; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvx, pVar); + var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]); +#endif + stddev = sqrt(var / totalPixelsPerChannel) * 255; + tensorStddevArr[batchCount] = static_cast(stddev); + } + + // Tensor Stddev without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 0.0; + + Rpp32f meanR = meanTensor[idx]; + Rpp32f meanG = meanTensor[idx + 1]; + Rpp32f meanB = meanTensor[idx + 2]; + Rpp32f meanImage = meanTensor[idx + 3]; + + Rpp32f 
*srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp64f srcPtrR = static_cast(*srcPtrTempR); + Rpp64f srcPtrG = static_cast(*srcPtrTempG); + Rpp64f srcPtrB = static_cast(*srcPtrTempB); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif + varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255); // multiply by 255 to normalize variation + stddevR = static_cast(sqrt(varR / totalPixelsPerChannel) * 255); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel) * 255); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel) * 255); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + + // Tensor Stddev without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + 
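+            // Reviewer note: the "* 255" scaling applied to the stddev results in this F32 path (see the
+            // "multiply by 255 to normalize variation" comment above) assumes the float inputs are normalized
+            // pixel values, so the reported stddevs stay comparable to the 0-255 range of the U8 path.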
Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 0.0; + + Rpp32f meanR = meanTensor[idx]; + Rpp32f meanG = meanTensor[idx + 1]; + Rpp32f meanB = meanTensor[idx + 2]; + Rpp32f meanImage = meanTensor[idx + 3]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + Rpp64f srcPtrR = static_cast(srcPtrTemp[0]); + Rpp64f srcPtrG = static_cast(srcPtrTemp[1]); + Rpp64f srcPtrB = static_cast(srcPtrTemp[2]); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif + varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255); + stddevR = static_cast(sqrt(varR / totalPixelsPerChannel) * 255); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel) * 255); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel) * 255); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + } + + return RPP_SUCCESS; +} + +RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorStddevArr, + Rpp32f *meanTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = 
handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp16f *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Stddev without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = bufferLength & ~(vectorIncrementPerChannel-1); + Rpp64f var = 0.0; + Rpp32f stddev = 0.0; + Rpp64f varAvx[4] = {0.0}; + Rpp32f mean = meanTensor[batchCount]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMean = _mm256_set1_pd(mean); + __m256d pVar = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256d p1[2]; + rpp_simd_load(rpp_load8_f32_to_f64_avx, srcPtrTemp_ps, p1); + compute_variance_8_host(p1, &pMean, &pVar); + + srcPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + var += (static_cast(*srcPtrTemp) - mean) * (static_cast(*srcPtrTemp) - mean); + srcPtrTemp++; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvx, pVar); + var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]); +#endif + stddev = sqrt(var / totalPixelsPerChannel) * 255; + tensorStddevArr[batchCount] = static_cast(stddev); + } + + // Tensor Stddev without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 0.0; + + Rpp32f meanR = meanTensor[idx]; + Rpp32f meanG = meanTensor[idx + 1]; + Rpp32f meanB = meanTensor[idx + 2]; + Rpp32f meanImage = meanTensor[idx + 3]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd(); +#endif + for(int i = 
0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = static_cast(srcPtrTempR[cnt]); + srcPtrTempG_ps[cnt] = static_cast(srcPtrTempG[cnt]); + srcPtrTempB_ps[cnt] = static_cast(srcPtrTempB[cnt]); + } + + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp64f srcPtrR = static_cast(*srcPtrTempR); + Rpp64f srcPtrG = static_cast(*srcPtrTempG); + Rpp64f srcPtrB = static_cast(*srcPtrTempB); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif + varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255); // multiply by 255 to normalize variation + stddevR = static_cast(sqrt(varR / totalPixelsPerChannel) * 255); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel) * 255); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel) * 255); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + + // Tensor Stddev without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 
0.0; + + Rpp32f meanR = meanTensor[idx]; + Rpp32f meanG = meanTensor[idx + 1]; + Rpp32f meanB = meanTensor[idx + 2]; + Rpp32f meanImage = meanTensor[idx + 3]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256d p[6]; + rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp_ps, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + Rpp64f srcPtrR = static_cast(srcPtrTemp[0]); + Rpp64f srcPtrG = static_cast(srcPtrTemp[1]); + Rpp64f srcPtrB = static_cast(srcPtrTemp[2]); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif + varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3)) * 255); + stddevR = static_cast(sqrt(varR / totalPixelsPerChannel) * 255); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel) * 255); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel) * 255); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + } + + return RPP_SUCCESS; +} + +RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorStddevArr, + Rpp32f *meanTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int 
batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp8s *srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32f totalPixelsPerChannel = roi.xywhROI.roiWidth * roi.xywhROI.roiHeight; + int idx = batchCount * 4; + + // Tensor Stddev without fused output-layout toggle (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = bufferLength & ~(vectorIncrementPerChannel-1); + Rpp64f var = 0.0; + Rpp32f stddev = 0.0; + Rpp64f varAvx[4] = {0.0}; + Rpp32f mean = meanTensor[batchCount] + 128; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMean = _mm256_set1_pd(mean); + __m256d pVar = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256d p1[2]; + rpp_simd_load(rpp_load8_i8_to_f64_avx, srcPtrTemp, p1); + compute_variance_8_host(p1, &pMean, &pVar); + + srcPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + var += (static_cast(*srcPtrTemp + 128) - mean) * (static_cast(*srcPtrTemp + 128) - mean); + srcPtrTemp++; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvx, pVar); + var += (varAvx[0] + varAvx[1] + varAvx[2] + varAvx[3]); +#endif + stddev = sqrt(var / totalPixelsPerChannel); + tensorStddevArr[batchCount] = static_cast(stddev); + } + + // Tensor Stddev without fused output-layout toggle 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 0.0; + + Rpp32f meanR = meanTensor[idx] + 128; + Rpp32f meanG = meanTensor[idx + 1] + 128; + Rpp32f meanB = meanTensor[idx + 2] + 128; + Rpp32f meanImage = meanTensor[idx + 3] + 128; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = _mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if 
__AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_i8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp64f srcPtrR = static_cast(*srcPtrTempR + 128); + Rpp64f srcPtrG = static_cast(*srcPtrTempG + 128); + Rpp64f srcPtrB = static_cast(*srcPtrTempB + 128); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif + varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3))); + stddevR = static_cast(sqrt(varR / totalPixelsPerChannel)); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel)); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel)); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + + // Tensor Stddev without fused output-layout toggle (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp64f varR, varG, varB, varImageR, varImageG, varImageB, varImage; + Rpp32f stddevImage, stddevR, stddevG, stddevB; + Rpp64f varAvxR[4] = {0.0}; + Rpp64f varAvxG[4] = {0.0}; + Rpp64f varAvxB[4] = {0.0}; + Rpp64f varAvxImageR[4] = {0.0}; + Rpp64f varAvxImageG[4] = {0.0}; + Rpp64f varAvxImageB[4] = {0.0}; + varR = varG = varB = varImageR = varImageG = varImageB = 0.0; + + Rpp32f meanR = meanTensor[idx] + 128; + Rpp32f meanG = meanTensor[idx + 1] + 128; + Rpp32f meanB = meanTensor[idx + 2] + 128; + Rpp32f meanImage = meanTensor[idx + 3] + 128; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256d pMeanR = _mm256_set1_pd(meanR); + __m256d pMeanG = _mm256_set1_pd(meanG); + __m256d pMeanB = _mm256_set1_pd(meanB); + __m256d pMeanImage = _mm256_set1_pd(meanImage); + __m256d pVarR, pVarG, pVarB; + __m256d pVarImageR, pVarImageG, pVarImageB; + pVarR = pVarG = pVarB = pVarImageR = pVarImageG = pVarImageB = 
_mm256_setzero_pd(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256d p[6]; + rpp_simd_load(rpp_load24_i8pkd3_to_f64pln3_avx, srcPtrTemp, p); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + Rpp64f srcPtrR = static_cast(srcPtrTemp[0] + 128); + Rpp64f srcPtrG = static_cast(srcPtrTemp[1] + 128); + Rpp64f srcPtrB = static_cast(srcPtrTemp[2] + 128); + varR += (srcPtrR - meanR) * (srcPtrR - meanR); + varG += (srcPtrG - meanG) * (srcPtrG - meanG); + varB += (srcPtrB - meanB) * (srcPtrB - meanB); + varImageR += (srcPtrR - meanImage) * (srcPtrR - meanImage); + varImageG += (srcPtrG - meanImage) * (srcPtrG - meanImage); + varImageB += (srcPtrB - meanImage) * (srcPtrB - meanImage); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + _mm256_storeu_pd(varAvxR, pVarR); + _mm256_storeu_pd(varAvxG, pVarG); + _mm256_storeu_pd(varAvxB, pVarB); + _mm256_storeu_pd(varAvxImageR, pVarImageR); + _mm256_storeu_pd(varAvxImageG, pVarImageG); + _mm256_storeu_pd(varAvxImageB, pVarImageB); + + varR += (varAvxR[0] + varAvxR[1] + varAvxR[2] + varAvxR[3]); + varG += (varAvxG[0] + varAvxG[1] + varAvxG[2] + varAvxG[3]); + varB += (varAvxB[0] + varAvxB[1] + varAvxB[2] + varAvxB[3]); + varImageR += (varAvxImageR[0] + varAvxImageR[1] + varAvxImageR[2] + varAvxImageR[3]); + varImageG += (varAvxImageG[0] + varAvxImageG[1] + varAvxImageG[2] + varAvxImageG[3]); + varImageB += (varAvxImageB[0] + varAvxImageB[1] + varAvxImageB[2] + varAvxImageB[3]); +#endif + varImage = varImageR + varImageG + varImageB; + stddevImage = static_cast(sqrt(varImage / (totalPixelsPerChannel * 3))); + stddevR = static_cast(sqrt(varR / totalPixelsPerChannel)); + stddevG = static_cast(sqrt(varG / totalPixelsPerChannel)); + stddevB = static_cast(sqrt(varB / totalPixelsPerChannel)); + tensorStddevArr[idx] = stddevR; + tensorStddevArr[idx + 1] = stddevG; + tensorStddevArr[idx + 2] = stddevB; + tensorStddevArr[idx + 3] = stddevImage; + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index 6923b9a3f..a0f50ee7e 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -27,6 +27,8 @@ SOFTWARE. 
#include "kernel/tensor_sum.hpp" #include "kernel/tensor_min.hpp" #include "kernel/tensor_max.hpp" +#include "kernel/tensor_mean.hpp" +#include "kernel/tensor_stddev.hpp" #include "kernel/normalize.hpp" #endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/tensor_mean.hpp b/src/modules/hip/kernel/tensor_mean.hpp new file mode 100644 index 000000000..4b2c81155 --- /dev/null +++ b/src/modules/hip/kernel/tensor_mean.hpp @@ -0,0 +1,227 @@ +#include +#include "rpp_hip_common.hpp" +#include "reduction.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- +template +__global__ void tensor_mean_grid_result_hip(T *srcPtr, + uint xBufferLength, + float *dstPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialSum_smem[1024]; // 8192 floats of src reduced to 1024 in a 1024 x 1 thread block + partialSum_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0 using all 1024 x 1 threads + + if (id_x >= xBufferLength) + return; + + int xDiff = xBufferLength - (xBufferLength & ~7); // difference between bufferLength and alignedLength, where alignedLength = bufferLength & ~7 + uint srcIdx = (id_z * xBufferLength) + id_x; + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + + if (id_x + 8 > xBufferLength) + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSum_smem[hipThreadIdx_x] += (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 1024 x 1 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 1024 floats on 1024 threads per block in x dimension + for (int threadMax = 512; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int totalElements = roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiHeight * roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiWidth; + dstPtr[hipBlockIdx_z] = partialSum_smem[0] / totalElements; + } +} + +template +__global__ void tensor_mean_grid_3channel_result_hip(T *srcPtr, + uint xBufferLength, + float *dstPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRSum_smem[1024]; // 8192 floats of src reduced to 1024 in a 1024 x 1 thread block + __shared__ float partialGSum_smem[1024]; + __shared__ float partialBSum_smem[1024]; + partialRSum_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0 using all 1024 x 1 threads + partialGSum_smem[hipThreadIdx_x] = 0.0f; + partialBSum_smem[hipThreadIdx_x] = 0.0f; + + if (id_x >= xBufferLength) + return; + + int xDiff = xBufferLength - (xBufferLength & ~7); // difference between bufferLength and alignedLength, where alignedLength = bufferLength & ~7 + uint srcIdx = ((id_z * xBufferLength) + id_x) * 3; + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + if (id_x + 8 > xBufferLength) // local memory reset of invalid values (from the vectorized global load) to 0.0f + { + for(int i = xDiff; i < 8; i++) + { + src_f24.f8[0].f1[i] 
= 0.0f; + src_f24.f8[1].f1[i] = 0.0f; + src_f24.f8[2].f1[i] = 0.0f; + } + } + src_f24.f8[0].f4[0] += src_f24.f8[0].f4[1]; // perform small work of vectorized float4 addition + src_f24.f8[1].f4[0] += src_f24.f8[1].f4[1]; + src_f24.f8[2].f4[0] += src_f24.f8[2].f4[1]; + partialRSum_smem[hipThreadIdx_x] = (src_f24.f8[0].f1[0] + + src_f24.f8[0].f1[1] + + src_f24.f8[0].f1[2] + + src_f24.f8[0].f1[3]); // perform small work of reducing R float4s to float using 1024 threads and store in Shared + partialGSum_smem[hipThreadIdx_x] = (src_f24.f8[1].f1[0] + + src_f24.f8[1].f1[1] + + src_f24.f8[1].f1[2] + + src_f24.f8[1].f1[3]); // perform small work of reducing G float4s to float using 1024 threads and store in Shared + partialBSum_smem[hipThreadIdx_x] = (src_f24.f8[2].f1[0] + + src_f24.f8[2].f1[1] + + src_f24.f8[2].f1[2] + + src_f24.f8[2].f1[3]); // perform small work of reducing B float4s to float using 1024 threads and store in Shared + + __syncthreads(); // syncthreads after Shared load + + // Reduction of 1024 floats on 1024 threads per block in x dimension + for (int threadMax = 512; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRSum_smem[hipThreadIdx_x] += partialRSum_smem[hipThreadIdx_x + threadMax]; + partialGSum_smem[hipThreadIdx_x] += partialGSum_smem[hipThreadIdx_x + threadMax]; + partialBSum_smem[hipThreadIdx_x] += partialBSum_smem[hipThreadIdx_x + threadMax]; + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int totalElements = roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiHeight * roiTensorPtrSrc[hipBlockIdx_z].xywhROI.roiWidth; + float sum = partialRSum_smem[0] + partialGSum_smem[0] + partialBSum_smem[0]; + int idx = hipBlockIdx_z * 4; + dstPtr[idx] = partialRSum_smem[0] / totalElements; + dstPtr[idx + 1] = partialGSum_smem[0] / totalElements; + dstPtr[idx + 2] = partialBSum_smem[0] / totalElements; + dstPtr[idx + 3] = sum / (totalElements * 3); + } +} + +// -------------------- Set 1 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_mean(T *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *tensorMeanArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = srcDescPtr->n; + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u tensorPartialSumArrLength = gridDim_x * gridDim_y * gridDim_z; + U *tensorPartialSumArr; + tensorPartialSumArr = reinterpret_cast(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + hipMemsetAsync(tensorPartialSumArr, 0, tensorPartialSumArrLength * sizeof(U), handle.GetStream()); + hipLaunchKernelGGL(tensor_sum_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + tensorPartialSumArr, + roiTensorPtrSrc); + hipLaunchKernelGGL(tensor_mean_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + tensorPartialSumArr, + gridDim_x * gridDim_y, + tensorMeanArr, + roiTensorPtrSrc); + } + else if 
((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u tensorPartialSumArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + U *tensorPartialSumArr; + tensorPartialSumArr = reinterpret_cast(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + hipMemsetAsync(tensorPartialSumArr, 0, tensorPartialSumArrLength * sizeof(U), handle.GetStream()); + hipLaunchKernelGGL(tensor_sum_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + tensorPartialSumArr, + roiTensorPtrSrc); + hipLaunchKernelGGL(tensor_mean_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + tensorPartialSumArr, + gridDim_x * gridDim_y, + tensorMeanArr, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u tensorPartialSumArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + U *tensorPartialSumArr; + tensorPartialSumArr = reinterpret_cast(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + hipMemsetAsync(tensorPartialSumArr, 0, tensorPartialSumArrLength * sizeof(U), handle.GetStream()); + hipLaunchKernelGGL(tensor_sum_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + tensorPartialSumArr, + roiTensorPtrSrc); + hipLaunchKernelGGL(tensor_mean_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + tensorPartialSumArr, + gridDim_x * gridDim_y, + tensorMeanArr, + roiTensorPtrSrc); + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/tensor_stddev.hpp b/src/modules/hip/kernel/tensor_stddev.hpp new file mode 100644 index 000000000..0d4633be4 --- /dev/null +++ b/src/modules/hip/kernel/tensor_stddev.hpp @@ -0,0 +1,495 @@ +#include +#include "rpp_hip_common.hpp" +#include "reduction.hpp" + +// ----------------------- Helper Functions -------------------------- + +__device__ __forceinline__ void stddev_hip_compute(uchar *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues); } +__device__ __forceinline__ void stddev_hip_compute(float *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues) * 255; } +__device__ __forceinline__ void stddev_hip_compute(signed char *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues); } +__device__ __forceinline__ void stddev_hip_compute(half *srcPtr, float *src, float *dst, int numValues) { *dst = sqrt(*src / numValues) * 255; } + +__device__ __forceinline__ void mean_subtracted_square_3channel_hip_compute(d_float24 *src_f24, d_float24 *dst_f24, + float4 &meanR_f4, float4 &meanG_f4, float4 &meanB_f4) +{ + rpp_hip_math_subtract8_const(&src_f24->f8[0], &dst_f24->f8[0], meanR_f4); + rpp_hip_math_subtract8_const(&src_f24->f8[1], &dst_f24->f8[1], meanG_f4); + rpp_hip_math_subtract8_const(&src_f24->f8[2], &dst_f24->f8[2], meanB_f4); + rpp_hip_math_multiply8(&dst_f24->f8[0], &dst_f24->f8[0], &dst_f24->f8[0]); + rpp_hip_math_multiply8(&dst_f24->f8[1], &dst_f24->f8[1], &dst_f24->f8[1]); + rpp_hip_math_multiply8(&dst_f24->f8[2], &dst_f24->f8[2], &dst_f24->f8[2]); +} + +// perform reduction on shared memory and store the result in output +__device__ __forceinline__ void 
reduce_variance_3channel_hip(d_float24 *tempChannelSquared_f24, d_float24 *tempSquared_f24, + float *partialRVarianceRowPtr_smem, float *partialGVarianceRowPtr_smem, float *partialBVarianceRowPtr_smem, + float *partialTensorVarianceRowPtr_smem, float *dstPtr) +{ + // channel wise addition + tempChannelSquared_f24->f8[0].f4[0] += tempChannelSquared_f24->f8[0].f4[1]; // perform small work of vectorized float4 addition + tempChannelSquared_f24->f8[1].f4[0] += tempChannelSquared_f24->f8[1].f4[1]; + tempChannelSquared_f24->f8[2].f4[0] += tempChannelSquared_f24->f8[2].f4[1]; + + partialRVarianceRowPtr_smem[hipThreadIdx_x] = (tempChannelSquared_f24->f8[0].f1[0] + + tempChannelSquared_f24->f8[0].f1[1] + + tempChannelSquared_f24->f8[0].f1[2] + + tempChannelSquared_f24->f8[0].f1[3]); // perform small work of reducing R float4s to float using 16 x 16 threads and store in _smem + partialGVarianceRowPtr_smem[hipThreadIdx_x] = (tempChannelSquared_f24->f8[1].f1[0] + + tempChannelSquared_f24->f8[1].f1[1] + + tempChannelSquared_f24->f8[1].f1[2] + + tempChannelSquared_f24->f8[1].f1[3]); // perform small work of reducing G float4s to float using 16 x 16 threads and store in _smem + partialBVarianceRowPtr_smem[hipThreadIdx_x] = (tempChannelSquared_f24->f8[2].f1[0] + + tempChannelSquared_f24->f8[2].f1[1] + + tempChannelSquared_f24->f8[2].f1[2] + + tempChannelSquared_f24->f8[2].f1[3]); // perform small work of reducing B float4s to float using 16 x 16 threads and store in _smem + + // tensor wise addition + tempSquared_f24->f8[0].f4[0] += tempSquared_f24->f8[0].f4[1]; // perform small work of vectorized float4 addition + tempSquared_f24->f8[1].f4[0] += tempSquared_f24->f8[1].f4[1]; + tempSquared_f24->f8[2].f4[0] += tempSquared_f24->f8[2].f4[1]; + tempSquared_f24->f8[0].f4[0] += tempSquared_f24->f8[1].f4[0]; + tempSquared_f24->f8[0].f4[0] += tempSquared_f24->f8[2].f4[0]; + + partialTensorVarianceRowPtr_smem[hipThreadIdx_x] = (tempSquared_f24->f8[0].f1[0] + + tempSquared_f24->f8[0].f1[1] + + tempSquared_f24->f8[0].f1[2] + + tempSquared_f24->f8[0].f1[3]); // perform small work of reducing B float4s to float using 16 x 16 threads and store in _smem + __syncthreads(); // syncthreads after _smem load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRVarianceRowPtr_smem[hipThreadIdx_x] += partialRVarianceRowPtr_smem[hipThreadIdx_x + threadMax]; + partialGVarianceRowPtr_smem[hipThreadIdx_x] += partialGVarianceRowPtr_smem[hipThreadIdx_x + threadMax]; + partialBVarianceRowPtr_smem[hipThreadIdx_x] += partialBVarianceRowPtr_smem[hipThreadIdx_x + threadMax]; + partialTensorVarianceRowPtr_smem[hipThreadIdx_x] += partialTensorVarianceRowPtr_smem[hipThreadIdx_x + threadMax]; + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRVarianceRowPtr_smem[0] += partialRVarianceRowPtr_smem[increment]; + partialGVarianceRowPtr_smem[0] += partialGVarianceRowPtr_smem[increment]; + partialBVarianceRowPtr_smem[0] += partialBVarianceRowPtr_smem[increment]; + partialTensorVarianceRowPtr_smem[0] += partialTensorVarianceRowPtr_smem[increment]; + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + 
hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 4; + dstPtr[idx] = partialRVarianceRowPtr_smem[0]; + dstPtr[idx + 1] = partialGVarianceRowPtr_smem[0]; + dstPtr[idx + 2] = partialBVarianceRowPtr_smem[0]; + dstPtr[idx + 3] = partialTensorVarianceRowPtr_smem[0]; + } + } +} + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_stddev_grid_result_hip(T *inputSrcPtr, + float *srcPtr, + uint xBufferLength, + float *dstPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialVariance_smem[1024]; // 8192 floats of src reduced to 1024 in a 1024 x 1 thread block + partialVariance_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 1024 x 1 threads + + if (id_x >= xBufferLength) + return; + + int xDiff = xBufferLength - (xBufferLength & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7 + uint srcIdx = (id_z * xBufferLength) + id_x; + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > xBufferLength) + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialVariance_smem[hipThreadIdx_x] += (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 1024 x 1 threads and store in _smem + __syncthreads(); // syncthreads after _smem load + + // Reduction of 1024 floats on 1024 threads per block in x dimension + for (int threadMax = 512; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialVariance_smem[hipThreadIdx_x] += partialVariance_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int totalElements = roiTensorPtrSrc[id_z].xywhROI.roiHeight * roiTensorPtrSrc[id_z].xywhROI.roiWidth; + stddev_hip_compute(inputSrcPtr, &partialVariance_smem[0] , &dstPtr[id_z], totalElements); + } +} + +template +__global__ void tensor_stddev_grid_3channel_result_hip(T *inputSrcPtr, + float *srcPtr, + uint xBufferLength, + float *dstPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = hipThreadIdx_x; + int id_z = hipBlockIdx_z; + + /* Stores individual channel Variations computed from channel Means to compute Stddev of individual channels*/ + __shared__ float partialRVariance_smem[1024]; // 8192 floats of src reduced to 1024 in a 1024 x 1 thread block + __shared__ float partialGVariance_smem[1024]; + __shared__ float partialBVariance_smem[1024]; + __shared__ float partialTensorVariance_smem[1024]; + partialRVariance_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 1024 x 1 threads + partialGVariance_smem[hipThreadIdx_x] = 0.0f; + partialBVariance_smem[hipThreadIdx_x] = 0.0f; + partialTensorVariance_smem[hipThreadIdx_x] = 0.0f; + + if (id_x >= xBufferLength) + return; + + float4 accum_f4 = static_cast(0.0f); + while (id_x < xBufferLength) + { + uint srcIdx = ((id_z * xBufferLength) + id_x) * 4; + float4 temp_f4 = *(float4 *)(srcPtr + srcIdx); + accum_f4 += temp_f4; + id_x += hipBlockDim_x; + } + + partialRVariance_smem[hipThreadIdx_x] = accum_f4.x; + partialGVariance_smem[hipThreadIdx_x] = accum_f4.y; + partialBVariance_smem[hipThreadIdx_x] = accum_f4.z; + partialTensorVariance_smem[hipThreadIdx_x] = accum_f4.w; + 
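+    // Reviewer note: unlike the single-channel stage-2 kernel above, each thread here strides through the
+    // per-block partials in steps of hipBlockDim_x, accumulating the packed (R, G, B, whole-image) variance
+    // sums in accum_f4 before the shared-memory tree reduction below collapses them to one set per image.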
__syncthreads(); // syncthreads after _smem load + + // Reduction of 1024 floats on 1024 threads per block in x dimension + for (int threadMax = 512; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRVariance_smem[hipThreadIdx_x] += partialRVariance_smem[hipThreadIdx_x + threadMax]; + partialGVariance_smem[hipThreadIdx_x] += partialGVariance_smem[hipThreadIdx_x + threadMax]; + partialBVariance_smem[hipThreadIdx_x] += partialBVariance_smem[hipThreadIdx_x + threadMax]; + partialTensorVariance_smem[hipThreadIdx_x] += partialTensorVariance_smem[hipThreadIdx_x + threadMax]; + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int totalElements = roiTensorPtrSrc[id_z].xywhROI.roiHeight * roiTensorPtrSrc[id_z].xywhROI.roiWidth; + uint dstIdx = id_z * 4; + stddev_hip_compute(inputSrcPtr, &partialRVariance_smem[0], &dstPtr[dstIdx], totalElements); + stddev_hip_compute(inputSrcPtr, &partialGVariance_smem[0], &dstPtr[dstIdx + 1], totalElements); + stddev_hip_compute(inputSrcPtr, &partialBVariance_smem[0], &dstPtr[dstIdx + 2], totalElements); + stddev_hip_compute(inputSrcPtr, &partialTensorVariance_smem[0], &dstPtr[dstIdx + 3], totalElements * 3); + } +} + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_variance_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *tensorVarianceArr, + Rpp32f *mean, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + float4 mean_f4 = static_cast(mean[id_z]); + + __shared__ float partialVariance_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialVarianceRowPtr_smem = &partialVariance_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in _smem + partialVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + int xDiff = roiTensorPtrSrc[id_z].xywhROI.roiWidth - (roiTensorPtrSrc[id_z].xywhROI.roiWidth & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7 + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float8 src_f8, temp_f8, tempSquared_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &temp_f8, mean_f4); // subtract mean from each pixel + rpp_hip_math_multiply8(&temp_f8, &temp_f8, &tempSquared_f8); // square the temporary values + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + for(int i = xDiff; i < 8; i++) + tempSquared_f8.f1[i] = 0.0f; + tempSquared_f8.f4[0] += tempSquared_f8.f4[1]; // perform small work of vectorized float4 addition + partialVarianceRowPtr_smem[hipThreadIdx_x] = (tempSquared_f8.f1[0] + + tempSquared_f8.f1[1] + + tempSquared_f8.f1[2] + + tempSquared_f8.f1[3]); + __syncthreads(); // syncthreads after _smem load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialVarianceRowPtr_smem[hipThreadIdx_x] += partialVarianceRowPtr_smem[hipThreadIdx_x 
+ threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialVarianceRowPtr_smem[0] += partialVarianceRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + tensorVarianceArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialVarianceRowPtr_smem[0]; + } +} + +template +__global__ void tensor_variance_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *tensorVarianceArr, + float4 *meanPtr_f4, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + /* Stores individual channel Variations computed from channel Means to compute Stddev of individual channels*/ + __shared__ float partialRVariance_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGVariance_smem[16][16]; + __shared__ float partialBVariance_smem[16][16]; + __shared__ float partialTensorVariance_smem[16][16]; + float *partialRVarianceRowPtr_smem = &partialRVariance_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in _smem + float *partialGVarianceRowPtr_smem = &partialGVariance_smem[hipThreadIdx_y][0]; + float *partialBVarianceRowPtr_smem = &partialBVariance_smem[hipThreadIdx_y][0]; + float *partialTensorVarianceRowPtr_smem = &partialTensorVariance_smem[hipThreadIdx_y][0]; + partialRVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 16 x 16 threads + partialGVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; + partialBVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; + partialTensorVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; + float4 meanR_f4 = static_cast(meanPtr_f4[id_z].x); + float4 meanG_f4 = static_cast(meanPtr_f4[id_z].y); + float4 meanB_f4 = static_cast(meanPtr_f4[id_z].z); + float4 meanTensor_f4 = static_cast(meanPtr_f4[id_z].w); + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + int xDiff = roiTensorPtrSrc[id_z].xywhROI.roiWidth - (roiTensorPtrSrc[id_z].xywhROI.roiWidth & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7 + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float24 src_f24, tempChannelSquared_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); // load 24 pixels to local memory + mean_subtracted_square_3channel_hip_compute(&src_f24, &tempChannelSquared_f24, meanR_f4, meanG_f4, meanB_f4); + + d_float24 tempSquared_f24; + mean_subtracted_square_3channel_hip_compute(&src_f24, &tempSquared_f24, meanTensor_f4, meanTensor_f4, meanTensor_f4); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) // local memory reset of invalid values (from the vectorized global load) to 0.0f + { + for(int i = xDiff; i < 8; i++) + { + tempChannelSquared_f24.f8[0].f1[i] = 0.0f; + tempChannelSquared_f24.f8[1].f1[i] = 0.0f; + tempChannelSquared_f24.f8[2].f1[i] = 0.0f; + tempSquared_f24.f8[0].f1[i] = 0.0f; + tempSquared_f24.f8[1].f1[i] = 0.0f; + tempSquared_f24.f8[2].f1[i] = 0.0f; + } + } + + 
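+    // Reviewer note: the helper call below reduces the per-channel and whole-image squared differences across
+    // the 16 x 16 block and writes one float4 partial (R, G, B, whole-image) per block into tensorVarianceArr,
+    // which the stage-2 kernel above then turns into the final per-image stddevs.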
reduce_variance_3channel_hip(&tempChannelSquared_f24, &tempSquared_f24, + partialRVarianceRowPtr_smem, partialGVarianceRowPtr_smem, + partialBVarianceRowPtr_smem, partialTensorVarianceRowPtr_smem, tensorVarianceArr); +} + +template +__global__ void tensor_variance_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *tensorVarianceArr, + float4 *meanPtr_f4, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRVariance_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGVariance_smem[16][16]; + __shared__ float partialBVariance_smem[16][16]; + __shared__ float partialTensorVariance_smem[16][16]; + float *partialRVarianceRowPtr_smem = &partialRVariance_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in _smem + float *partialGVarianceRowPtr_smem = &partialGVariance_smem[hipThreadIdx_y][0]; + float *partialBVarianceRowPtr_smem = &partialBVariance_smem[hipThreadIdx_y][0]; + float *partialTensorVarianceRowPtr_smem = &partialTensorVariance_smem[hipThreadIdx_y][0]; + partialRVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of _smem to 0 using all 16 x 16 threads + partialGVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; + partialBVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; + partialTensorVarianceRowPtr_smem[hipThreadIdx_x] = 0.0f; + float4 meanR_f4 = static_cast(meanPtr_f4[id_z].x); + float4 meanG_f4 = static_cast(meanPtr_f4[id_z].y); + float4 meanB_f4 = static_cast(meanPtr_f4[id_z].z); + float4 meanTensor_f4 = static_cast(meanPtr_f4[id_z].w); + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + int xDiff = roiTensorPtrSrc[id_z].xywhROI.roiWidth - (roiTensorPtrSrc[id_z].xywhROI.roiWidth & ~7); // difference between roiWidth and alignedLength, where alignedLength = roiWidth & ~7 + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + d_float24 src_f24, tempChannelSquared_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + mean_subtracted_square_3channel_hip_compute(&src_f24, &tempChannelSquared_f24, meanR_f4, meanG_f4, meanB_f4); + + d_float24 tempSquared_f24; + mean_subtracted_square_3channel_hip_compute(&src_f24, &tempSquared_f24, meanTensor_f4, meanTensor_f4, meanTensor_f4); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) // local memory reset of invalid values (from the vectorized global load) to 0.0f + { + for(int i = xDiff; i < 8; i++) + { + tempChannelSquared_f24.f8[0].f1[i] = 0.0f; + tempChannelSquared_f24.f8[1].f1[i] = 0.0f; + tempChannelSquared_f24.f8[2].f1[i] = 0.0f; + tempSquared_f24.f8[0].f1[i] = 0.0f; + tempSquared_f24.f8[1].f1[i] = 0.0f; + tempSquared_f24.f8[2].f1[i] = 0.0f; + } + } + + reduce_variance_3channel_hip(&tempChannelSquared_f24, &tempSquared_f24, + partialRVarianceRowPtr_smem, partialGVarianceRowPtr_smem, + partialBVarianceRowPtr_smem, partialTensorVarianceRowPtr_smem, tensorVarianceArr); +} + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_stddev(T *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *imageStddevArr, + Rpp32f *meanTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + 
rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = srcDescPtr->n; + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u tensorPartialVarArrLength = gridDim_x * gridDim_y * gridDim_z; + float *tensorPartialVarArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + hipMemsetAsync(tensorPartialVarArr, 0, tensorPartialVarArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_variance_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + tensorPartialVarArr, + meanTensor, + roiTensorPtrSrc); + hipLaunchKernelGGL(tensor_stddev_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + srcPtr, + tensorPartialVarArr, + gridDim_x * gridDim_y, + imageStddevArr, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u tensorPartialVarArrLength = gridDim_x * gridDim_y * gridDim_z * 4; + float *tensorPartialVarArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + hipMemsetAsync(tensorPartialVarArr, 0, tensorPartialVarArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_variance_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + tensorPartialVarArr, + reinterpret_cast(meanTensor), + roiTensorPtrSrc); + hipLaunchKernelGGL(tensor_stddev_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + srcPtr, + tensorPartialVarArr, + gridDim_x * gridDim_y, + imageStddevArr, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u tensorPartialVarArrLength = gridDim_x * gridDim_y * gridDim_z * 4; + float *tensorPartialVarArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + hipMemsetAsync(tensorPartialVarArr, 0, tensorPartialVarArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_variance_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + tensorPartialVarArr, + reinterpret_cast(meanTensor), + roiTensorPtrSrc); + hipLaunchKernelGGL(tensor_stddev_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + srcPtr, + tensorPartialVarArr, + gridDim_x * gridDim_y, + imageStddevArr, + roiTensorPtrSrc); + } + + return RPP_SUCCESS; +} diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index ef69a49bb..8d528663c 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -336,6 +336,161 @@ RppStatus 
rppt_normalize_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** tensor_mean ********************/ + +RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t tensorMeanArr, + Rpp32u tensorMeanArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if ((srcDescPtr->c == 1 && tensorMeanArrLength < srcDescPtr->n) || // Stddev of single channel + (srcDescPtr->c == 3 && tensorMeanArrLength < srcDescPtr->n * 4)) // Stddev of each channel, and total Stddev of all 3 channels / image + return RPP_ERROR_NOT_ENOUGH_MEMORY; + if (roiType == RpptRoiType::XYWH) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH)|| (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + else if (roiType == RpptRoiType::LTRB) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_mean_u8_f32_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_mean_f16_f32_host(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_mean_f32_f32_host(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_mean_i8_f32_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** tensor_stddev ********************/ + +RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t tensorStddevArr, + Rpp32u tensorStddevArrLength, + Rpp32f *meanTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if ((srcDescPtr->c == 1 && tensorStddevArrLength < srcDescPtr->n) || // Stddev of single channel + (srcDescPtr->c == 3 && tensorStddevArrLength < srcDescPtr->n * 4)) // Stddev of each channel, and total Stddev of all 3 channels / image + return RPP_ERROR_NOT_ENOUGH_MEMORY; + if (roiType == RpptRoiType::XYWH) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH) || (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + else if (roiType == RpptRoiType::LTRB) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + RppLayoutParams layoutParams = 
get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_stddev_u8_f32_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_stddev_f16_f32_host(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_stddev_f32_f32_host(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_stddev_i8_f32_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -629,4 +784,157 @@ RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** tensor_mean ********************/ + +RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t tensorMeanArr, + Rpp32u tensorMeanArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcDescPtr->c == 1 && tensorMeanArrLength < srcDescPtr->n) || // Stddev of single channel + (srcDescPtr->c == 3 && tensorMeanArrLength < srcDescPtr->n * 4)) // Stddev of each channel, and total Stddev of all 3 channels / image + return RPP_ERROR_NOT_ENOUGH_MEMORY; + if (roiType == RpptRoiType::XYWH) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH) || (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + else if (roiType == RpptRoiType::LTRB) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_mean(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_mean(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_mean(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if 
(srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_mean(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorMeanArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** tensor_stddev ********************/ + +RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t tensorStddevArr, + Rpp32u tensorStddevArrLength, + Rpp32f *meanTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcDescPtr->c == 1 && tensorStddevArrLength < srcDescPtr->n) || // Stddev of single channel + (srcDescPtr->c == 3 && tensorStddevArrLength < srcDescPtr->n * 4)) // Stddev of each channel, and total Stddev of all 3 channels / image + return RPP_ERROR_NOT_ENOUGH_MEMORY; + if (roiType == RpptRoiType::XYWH) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].xywhROI.roiWidth > REDUCTION_MAX_WIDTH) || (roiTensorPtrSrc[i].xywhROI.roiHeight > REDUCTION_MAX_HEIGHT)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + else if (roiType == RpptRoiType::LTRB) + { + for(int i = 0; i < srcDescPtr->n; i++) + { + if ((roiTensorPtrSrc[i].ltrbROI.rb.x - roiTensorPtrSrc[i].ltrbROI.lt.x > REDUCTION_MAX_XDIM) || (roiTensorPtrSrc[i].ltrbROI.rb.y - roiTensorPtrSrc[i].ltrbROI.lt.y > REDUCTION_MAX_YDIM)) + return RPP_ERROR_HIGH_SRC_DIMENSION; + } + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_stddev(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_stddev(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_stddev(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_stddev(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(tensorStddevArr), + meanTensor, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 62f777ae8..0d8b7fd7c 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -68,7 +68,7 @@ int main(int argc, char **argv) bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); bool randomOutputCase = (testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); - bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); bool noiseTypeCase = (testCase == 8); bool 
pln1OutTypeCase = (testCase == 86); @@ -86,7 +86,7 @@ int main(int argc, char **argv) printf("\ndst = %s", argv[3]); printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); - printf("\ncase number (0:87) = %s", argv[6]); + printf("\ncase number (0:91) = %s", argv[6]); printf("\nnumber of times to run = %s", argv[8]); printf("\ntest type - (0 = unit tests / 1 = performance tests) = %s", argv[9]); printf("\nlayout type - (0 = PKD3/ 1 = PLN3/ 2 = PLN1) = %s", argv[10]); @@ -332,17 +332,19 @@ int main(int argc, char **argv) // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) void *reductionFuncResultArr; + Rpp32f *mean; Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; if (reductionTypeCase) { int bitDepthByteSize = 0; - if (dstDescPtr->dataType == RpptDataType::U8) + if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32) || testCase == 90 || testCase == 91) + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f, for testCase 90, 91 + else if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u); - else if (dstDescPtr->dataType == RpptDataType::I8) - bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64s) : sizeof(Rpp8s); - else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) - bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); + if(testCase == 91) + CHECK_RETURN_STATUS(hipHostMalloc(&mean, reductionFuncResultArrLength * bitDepthByteSize)); } // create generic descriptor and params in case of slice @@ -350,7 +352,7 @@ int main(int argc, char **argv) RpptGenericDescPtr descriptorPtr3D = &descriptor3D; Rpp32s *anchorTensor = NULL, *shapeTensor = NULL; Rpp32u *roiTensor = NULL; - if(testCase == 90) + if(testCase == 92) set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize); // Allocate hip memory for src/dst @@ -1244,7 +1246,6 @@ int main(int argc, char **argv) reductionFuncResultArrLength = srcDescPtr->n; startWallTime = omp_get_wtime(); - if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) rppt_tensor_sum_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); else @@ -1277,6 +1278,37 @@ int main(int argc, char **argv) break; } case 90: + { + testCaseName = "tensor_mean"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_mean_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 91: + { + testCaseName = "tensor_stddev"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + memcpy(mean, TensorMeanReferenceOutputs[inputChannels].data(), sizeof(Rpp32f) * reductionFuncResultArrLength); + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || 
inputBitDepth == 5) + rppt_tensor_stddev_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, mean, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 92: { testCaseName = "slice"; Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims @@ -1299,8 +1331,10 @@ int main(int argc, char **argv) break; } default: + { missingFuncFlag = 1; break; + } } CHECK_RETURN_STATUS(hipDeviceSynchronize()); @@ -1333,8 +1367,10 @@ int main(int argc, char **argv) } // print reduction functions output array based on different bit depths, and precision desired - int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0; - if (dstDescPtr->dataType == RpptDataType::U8) + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16) || testCase == 90 || testCase == 91) ? 3 : 0; + if (dstDescPtr->dataType == RpptDataType::F32 || testCase == 90 || testCase == 91) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else if (dstDescPtr->dataType == RpptDataType::U8) { if (testCase == 87) print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); @@ -1348,13 +1384,6 @@ int main(int argc, char **argv) else print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if (dstDescPtr->dataType == RpptDataType::F32) - { - if (testCase == 87) - print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); - else - print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); - } else if (dstDescPtr->dataType == RpptDataType::I8) { if (testCase == 87) @@ -1372,6 +1401,8 @@ int main(int argc, char **argv) { if (testCase == 87) compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else if (testCase == 90 || testCase == 91) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); else compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); } @@ -1395,7 +1426,7 @@ int main(int argc, char **argv) // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values // for output display and comparision purposes - if (testCase == 90) + if (testCase == 92) { if (dstDescPtr->layout == RpptLayout::NCHW) { @@ -1490,7 +1521,11 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(patchRoi)); } if (reductionTypeCase) + { CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr)); + if(testCase == 91) + CHECK_RETURN_STATUS(hipHostFree(mean)); + } if(testCase == 32) { CHECK_RETURN_STATUS(hipHostFree(colorBuffer)); diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py index b5b492f39..40dfb21ea 100644 --- a/utilities/test_suite/HIP/runTests.py +++ b/utilities/test_suite/HIP/runTests.py @@ -39,7 +39,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 90 +caseMax = 92 # Get a list of log files based on a flag for preserving output def get_log_file_list(preserveOutput): @@ -69,7 +69,7 @@ def func_group_finder(case_number): return "logical_operations" elif case_number < 87: return "data_exchange_operations" - elif case_number < 88: + elif case_number < 92: return "statistical_operations" else: return 
"miscellaneous" @@ -299,7 +299,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.run(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] # Create folders based on testType and profilingOption if testType == 1 and profilingOption == "YES": diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp index 566777fd5..5a4634c97 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -68,7 +68,7 @@ int main(int argc, char **argv) bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); bool randomOutputCase = (testCase == 8 || testCase == 84); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); - bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); @@ -86,7 +86,7 @@ int main(int argc, char **argv) printf("\ndst = %s", argv[3]); printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); - printf("\ncase number (0:87) = %s", argv[6]); + printf("\ncase number (0:91) = %s", argv[6]); printf("\nnumber of times to run = %s", argv[8]); printf("\ntest type - (0 = unit tests / 1 = performance tests) = %s", argv[9]); printf("\nlayout type - (0 = PKD3/ 1 = PLN3/ 2 = PLN1) = %s", argv[10]); @@ -329,21 +329,16 @@ int main(int argc, char **argv) if (reductionTypeCase) { int bitDepthByteSize = 0; - if (dstDescPtr->dataType == RpptDataType::U8) + if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32) || testCase == 90 || testCase == 91) { - bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u); - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f, for testCase 90, 91 + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); } - else if (dstDescPtr->dataType == RpptDataType::I8) + else if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) { - bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64s) : sizeof(Rpp8s); + bitDepthByteSize = (testCase == 87) ? 
sizeof(Rpp64u) : sizeof(Rpp8u); reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); } - else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) - { - bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); - } } // create generic descriptor and params in case of slice @@ -351,7 +346,7 @@ int main(int argc, char **argv) RpptGenericDescPtr descriptorPtr3D = &descriptor3D; Rpp32s *anchorTensor = NULL, *shapeTensor = NULL; Rpp32u *roiTensor = NULL; - if(testCase == 90) + if(testCase == 92) set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize); // create cropRoi and patchRoi in case of crop_and_patch @@ -1257,7 +1252,6 @@ int main(int argc, char **argv) startWallTime = omp_get_wtime(); startCpuTime = clock(); - if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) rppt_tensor_sum_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); else @@ -1274,7 +1268,6 @@ int main(int argc, char **argv) startWallTime = omp_get_wtime(); startCpuTime = clock(); - if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); else @@ -1291,7 +1284,6 @@ int main(int argc, char **argv) startWallTime = omp_get_wtime(); startCpuTime = clock(); - if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); else @@ -1300,6 +1292,39 @@ int main(int argc, char **argv) break; } case 90: + { + testCaseName = "tensor_mean"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_mean_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 91: + { + testCaseName = "tensor_stddev"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + Rpp32f *mean = TensorMeanReferenceOutputs[inputChannels].data(); + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_stddev_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, mean, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 92: { testCaseName = "slice"; Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims @@ -1324,8 +1349,10 @@ int main(int argc, char **argv) break; } default: + { missingFuncFlag = 1; break; + } } endCpuTime = clock(); @@ -1362,8 +1389,10 @@ int main(int argc, char **argv) } // print reduction functions output array based on different bit depths, and precision desired - int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 
3 : 0; - if (dstDescPtr->dataType == RpptDataType::U8) + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16) || testCase == 90 || testCase == 91) ? 3 : 0; + if (dstDescPtr->dataType == RpptDataType::F32 || testCase == 90 || testCase == 91) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else if (dstDescPtr->dataType == RpptDataType::U8) { if (testCase == 87) print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); @@ -1377,13 +1406,6 @@ int main(int argc, char **argv) else print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if (dstDescPtr->dataType == RpptDataType::F32) - { - if (testCase == 87) - print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); - else - print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); - } else if (dstDescPtr->dataType == RpptDataType::I8) { if (testCase == 87) @@ -1401,6 +1423,8 @@ int main(int argc, char **argv) { if (testCase == 87) compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else if (testCase == 90 || testCase == 91) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); else compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); } @@ -1422,7 +1446,7 @@ int main(int argc, char **argv) // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values // for output display and comparision purposes - if (testCase == 90) + if (testCase == 92) { if (dstDescPtr->layout == RpptLayout::NCHW) { diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py index 445c139f5..159eed640 100644 --- a/utilities/test_suite/HOST/runTests.py +++ b/utilities/test_suite/HOST/runTests.py @@ -39,7 +39,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 90 +caseMax = 92 # Get a list of log files based on a flag for preserving output def get_log_file_list(preserveOutput): @@ -275,7 +275,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.run(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '32', '33', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] print("\n\n\n\n\n") print("##########################################################################################") diff --git a/utilities/test_suite/README.md b/utilities/test_suite/README.md index 375247665..05cb857e1 100644 --- a/utilities/test_suite/README.md +++ b/utilities/test_suite/README.md @@ -94,8 +94,8 @@ The image test suite can be executed under 2 backend scenarios - (HOST/HIP): The image test suite accepts the following command line arguments: - input_path1: The path to the input folder 1. Default is $cwd/../TEST_IMAGES/three_images_mixed_src1 - input_path2: The path to the input folder 2. 
Default is $cwd/../TEST_IMAGES/three_images_mixed_src2 -- case_start: The starting case number for the test range (0-89). Default is 0 -- case_end: The ending case number for the test range (0-89). Default is 89 +- case_start: The starting case number for the test range (0-91). Default is 0 +- case_end: The ending case number for the test range (0-91). Default is 91 - test_type: The type of test to run (0 = Unit tests, 1 = Performance tests). Default is 0 - case_list: A list of specific case numbers to run. Must be used in conjunction with --test_type - profiling: Run the tests with a profiler (YES/NO). Default is NO. This option is only available with HIP backend @@ -121,7 +121,11 @@ python runTests.py --input_path1 --input_path2 --cas - QA mode (Unit tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3. Note: QA mode is not supported for case 84 due to run-to-run variation of outputs. ``` python -python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 1 --batch_size 3 +python runTests.py --case_start 0 --case_end 91 --test_type 0 --qa_mode 1 --batch_size 3 +``` +- QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement +``` python +python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8 --num_runs 100 ``` - QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement ``` python @@ -135,13 +139,13 @@ python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8 Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 0 +python runTests.py --case_start 0 --case_end 91 --test_type 0 --qa_mode 0 ``` - Performance test mode - Performance tests that execute the desired functionality and variant 100 times by default, and report max/min/avg RPP execution wall time, or optionally, AMD rocprof kernel profiler max/min/avg time for HIP backend variants. Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 89 --test_type 1 +python runTests.py --case_start 0 --case_end 91 --test_type 1 ``` To run the unit tests / performance tests for specific case numbers. please case use case_list parameter. 
Example as below diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h index fe9f501d7..517bebd4d 100644 --- a/utilities/test_suite/rpp_test_suite_common.h +++ b/utilities/test_suite/rpp_test_suite_common.h @@ -108,30 +108,46 @@ std::map augmentationMap = {87, "tensor_sum"}, {88, "tensor_min"}, {89, "tensor_max"}, - {90, "slice"} + {90, "tensor_mean"}, + {91, "tensor_stddev"}, + {92, "slice"} }; // Golden outputs for Tensor min Kernel -std::map> TensorMinReferenceOutputs = +std::map> TensorMinReferenceOutputs = { {1, {1, 1, 7}}, {3, {0, 0, 0, 0, 2, 0, 0, 0, 7, 9, 0, 0}} }; // Golden outputs for Tensor max Kernel -std::map> TensorMaxReferenceOutputs = +std::map> TensorMaxReferenceOutputs = { {1, {239, 245, 255}}, {3, {255, 240, 236, 255, 255, 242, 241, 255, 253, 255, 255, 255}} }; // Golden outputs for Tensor sum Kernel -std::map> TensorSumReferenceOutputs = +std::map> TensorSumReferenceOutputs = { {1, {334225, 813471, 2631125}}, {3, {348380, 340992, 262616, 951988, 1056552, 749506, 507441, 2313499, 2170646, 2732368, 3320699, 8223713}} }; +// Golden outputs for Tensor mean Kernel +std::map> TensorMeanReferenceOutputs = +{ + {1, {133.690, 81.347, 116.939}}, + {3, {139.352, 136.397, 105.046, 126.932, 105.655, 74.951, 50.744, 77.117, 96.473, 121.439, 147.587, 121.833}} +}; + +// Golden outputs for Tensor stddev Kernel +std::map> TensorStddevReferenceOutputs = +{ + {1, {49.583, 54.623, 47.649}}, + {3, {57.416, 47.901, 53.235, 55.220, 68.471, 55.735, 46.668, 61.880, 47.462, 49.039, 67.269, 59.130}} +}; + template inline T validate_pixel_range(T pixel) { @@ -1184,19 +1200,18 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src int matched_values = 0; T *refOutput; - refOutput = (T *)calloc(srcDescPtr->n * 4, sizeof(T)); int numChannels = (srcDescPtr->c == 1) ? 1 : 3; int numOutputs = (srcDescPtr->c == 1) ? srcDescPtr->n : srcDescPtr->n * 4; - std::vector ref; if(testCase == 88) - ref = TensorMinReferenceOutputs[numChannels]; + refOutput = reinterpret_cast(TensorMinReferenceOutputs[numChannels].data()); else if(testCase == 89) - ref = TensorMaxReferenceOutputs[numChannels]; + refOutput = reinterpret_cast(TensorMaxReferenceOutputs[numChannels].data()); else if(testCase == 87) - ref = TensorSumReferenceOutputs[numChannels]; - - for (int i = 0; i < numOutputs; i++) - refOutput[i] = (T)ref[i]; + refOutput = reinterpret_cast(TensorSumReferenceOutputs[numChannels].data()); + else if(testCase == 90) + refOutput = reinterpret_cast(TensorMeanReferenceOutputs[numChannels].data()); + else if(testCase == 91) + refOutput = reinterpret_cast(TensorStddevReferenceOutputs[numChannels].data()); if(srcDescPtr->c == 1) { @@ -1222,7 +1237,6 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src fileMatch++; } } - free(refOutput); std::cout << std::endl << "Results for " << func << " :" << std::endl; std::string status = func + ": ";
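// The block-wide reductions above (both the 16 x 16 stage-1 kernels and the 1024-thread
// grid-result kernels) use the same shared-memory tree pattern: each halving step folds
// the upper half of the buffer onto the lower half until element 0 holds the block total.
// A stripped-down standalone version of that pattern is sketched below; the kernel name
// and the fixed 1024-thread block size are assumptions chosen to mirror the grid-result
// kernels, not RPP code.
#include <hip/hip_runtime.h>

__global__ void block_sum_1024_hip(const float *in, float *blockSums, int n)
{
    __shared__ float partialSum_smem[1024];
    int tid = hipThreadIdx_x;
    int idx = hipBlockIdx_x * hipBlockDim_x + tid;
    partialSum_smem[tid] = (idx < n) ? in[idx] : 0.0f;              // out-of-range lanes contribute 0
    __syncthreads();                                                // syncthreads after _smem load

    for (int threadMax = 512; threadMax >= 1; threadMax /= 2)       // 1024 -> 512 -> ... -> 1 active lanes
    {
        if (tid < threadMax)
            partialSum_smem[tid] += partialSum_smem[tid + threadMax];
        __syncthreads();
    }

    if (tid == 0)
        blockSums[hipBlockIdx_x] = partialSum_smem[0];              // one partial sum per block
}

// Possible launch, one 1024-thread block per chunk of 1024 inputs:
// hipLaunchKernelGGL(block_sum_1024_hip, dim3((n + 1023) / 1024), dim3(1024), 0, stream, d_in, d_blockSums, n);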
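// What the new tensor_mean / tensor_stddev reductions compute, as a minimal host-side
// reference sketch for one 3-channel planar (NCHW) image: per-channel values plus a
// combined value over all channels, emitted in the same R / G / B / total order as the
// 4-per-image output layout used above. This is not the RPP implementation - the plain
// loops, the assumption of a tightly packed row stride (hStride == roiWidth), and the
// omission of stddev_hip_compute's datatype-dependent scaling are simplifications.
#include <array>
#include <cmath>
#include <cstdint>

static std::array<float, 4> referenceMeanPln3(const uint8_t *img, int roiWidth, int roiHeight, int cStride)
{
    double sum[3] = {0.0, 0.0, 0.0};
    for (int c = 0; c < 3; c++)
        for (int y = 0; y < roiHeight; y++)
            for (int x = 0; x < roiWidth; x++)
                sum[c] += img[c * cStride + y * roiWidth + x];
    double n = static_cast<double>(roiWidth) * roiHeight;                 // totalElements per channel
    return {static_cast<float>(sum[0] / n),
            static_cast<float>(sum[1] / n),
            static_cast<float>(sum[2] / n),
            static_cast<float>((sum[0] + sum[1] + sum[2]) / (3.0 * n))};  // total mean over 3 * totalElements
}

static std::array<float, 4> referenceStddevPln3(const uint8_t *img, int roiWidth, int roiHeight, int cStride,
                                                const std::array<float, 4> &mean)
{
    double var[4] = {0.0, 0.0, 0.0, 0.0};
    for (int c = 0; c < 3; c++)
        for (int y = 0; y < roiHeight; y++)
            for (int x = 0; x < roiWidth; x++)
            {
                double px = img[c * cStride + y * roiWidth + x];
                double dC = px - mean[c];   // deviation from the per-channel mean (R/G/B stddev)
                double dT = px - mean[3];   // deviation from the combined mean (total stddev)
                var[c] += dC * dC;
                var[3] += dT * dT;
            }
    double n = static_cast<double>(roiWidth) * roiHeight;
    return {static_cast<float>(std::sqrt(var[0] / n)),
            static_cast<float>(std::sqrt(var[1] / n)),
            static_cast<float>(std::sqrt(var[2] / n)),
            static_cast<float>(std::sqrt(var[3] / (3.0 * n)))};           // matches the totalElements * 3 count above
}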
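// Hedged usage sketch for the two new HOST entry points: the means produced by
// rppt_tensor_mean_host (case 90) feed straight into rppt_tensor_stddev_host (case 91),
// mirroring what the test suite does with its golden mean values. The descriptor, input
// buffer, ROI array and handle are assumed to be created elsewhere, and the helper name
// is hypothetical. The output-length rule shown is the one enforced above: n for
// single-channel inputs, n * 4 (R, G, B, total per image) for 3-channel inputs.
#include "rppt_tensor_statistical_operations.h"
#include <vector>

inline RppStatus runMeanThenStddev(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                                   RpptROIPtr roiTensorPtrSrc, rppHandle_t handle)
{
    Rpp32u outLength = (srcDescPtr->c == 1) ? srcDescPtr->n : srcDescPtr->n * 4;
    std::vector<Rpp32f> meanArr(outLength), stddevArr(outLength);

    RppStatus status = rppt_tensor_mean_host(srcPtr, srcDescPtr, meanArr.data(), outLength,
                                             roiTensorPtrSrc, RpptRoiType::XYWH, handle);
    if (status != RPP_SUCCESS)
        return status;

    // For 3-channel inputs, image i's results land at indices i*4 .. i*4 + 3 in R, G, B, total order.
    return rppt_tensor_stddev_host(srcPtr, srcDescPtr, stddevArr.data(), outLength,
                                   meanArr.data(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}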