Skip to content

Commit

Permalink
Merge pull request #46 from sampath1117/sr/opt_brightness_avx
Browse files Browse the repository at this point in the history
AVX optimizations for Brightness kernel - HOST
  • Loading branch information
r-abishek committed Feb 17, 2022
2 parents 93a171b + 29f7dd2 commit 6613f95
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 201 deletions.
100 changes: 100 additions & 0 deletions src/include/cpu/rpp_cpu_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2068,6 +2068,106 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP

// Compute Functions for RPP Tensor API

inline RppStatus compute_brightness_48_host(__m256 *p, __m256 *pBrightnessParams)
{
p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm256_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[2] = _mm256_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[3] = _mm256_fmadd_ps(p[3], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[4] = _mm256_fmadd_ps(p[4], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[5] = _mm256_fmadd_ps(p[5], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_48_host(__m128 *p, __m128 *pBrightnessParams)
{
p[0] = _mm_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[2] = _mm_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[3] = _mm_fmadd_ps(p[3], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[4] = _mm_fmadd_ps(p[4], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[5] = _mm_fmadd_ps(p[5], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[6] = _mm_fmadd_ps(p[6], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[7] = _mm_fmadd_ps(p[7], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[8] = _mm_fmadd_ps(p[8], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[9] = _mm_fmadd_ps(p[9], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[10] = _mm_fmadd_ps(p[10], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[11] = _mm_fmadd_ps(p[11], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_24_host(__m256 *p, __m256 *pBrightnessParams)
{
p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm256_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[2] = _mm256_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_24_host(__m128 *p, __m128 *pBrightnessParams)
{
p[0] = _mm_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[2] = _mm_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[3] = _mm_fmadd_ps(p[3], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[4] = _mm_fmadd_ps(p[4], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[5] = _mm_fmadd_ps(p[5], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_16_host(__m256 *p, __m256 *pBrightnessParams)
{
p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm256_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_16_host(__m128 *p, __m128 *pBrightnessParams)
{
p[0] = _mm_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[2] = _mm_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[3] = _mm_fmadd_ps(p[3], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_12_host(__m128 *p, __m128 *pBrightnessParams)
{
p[0] = _mm_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[2] = _mm_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_8_host(__m256 *p, __m256 *pBrightnessParams)
{
p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_8_host(__m128 *p, __m128 *pBrightnessParams)
{
p[0] = _mm_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
p[1] = _mm_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_brightness_4_host(__m128 *p, __m128 *pBrightnessParams)
{
p[0] = _mm_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment

return RPP_SUCCESS;
}

inline RppStatus compute_gridmask_masks_16_host(__m128 *pCol, __m128 *pGridRowRatio, __m128 pCosRatio, __m128 pSinRatio, __m128 pGridRatio, __m128 *pMask)
{
__m128 pCalc[2];
Expand Down
84 changes: 84 additions & 0 deletions src/include/cpu/rpp_cpu_simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,39 @@ inline RppStatus rpp_store48_f32pln3_to_u8pkd3_avx(Rpp8u *dstPtr, __m256 *p)
return RPP_SUCCESS;
}

inline RppStatus rpp_load16_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p)
{
__m128i px[2];
__m128i pxZero = _mm_setzero_si128();

px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */
px[1] = _mm_unpackhi_epi8(px[0], pxZero); /* pixels 8-15 */
px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* pixels 0-7 */
p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(px[0], pxZero),_mm_unpackhi_epi16(px[0], pxZero)));/* pixels 0-7 */
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(px[1], pxZero),_mm_unpackhi_epi16(px[1], pxZero)));/* pixels 8-25 */

return RPP_SUCCESS;
}

inline RppStatus rpp_store16_f32_to_u8_avx(Rpp8u *dstPtr, __m256 *p)
{
__m256i pxCvt[2];
__m128i px[4];

pxCvt[0] = _mm256_cvtps_epi32(p[0]); /* pixels 0-7 */
pxCvt[1] = _mm256_cvtps_epi32(p[1]); /* pixels 8-15 */
px[0] = _mm256_extracti128_si256(pxCvt[0],0); /* pixels 0-3 */
px[1] = _mm256_extracti128_si256(pxCvt[0],1); /* pixels 4-7 */
px[2] = _mm256_extracti128_si256(pxCvt[1],0); /* pixels 8-11 */
px[3] = _mm256_extracti128_si256(pxCvt[1],1); /* pixels 12-15 */
px[0] = _mm_packus_epi32(px[0], px[1]); /* pixels 0-7 */
px[1] = _mm_packus_epi32(px[2], px[3]); /* pixels 8-15 */
px[2] = _mm_packus_epi16(px[0], px[1]); /* pixels 8-15 */
_mm_storeu_si128((__m128i *)dstPtr, px[2]); /* store pixels 0-15 */

return RPP_SUCCESS;
}

inline RppStatus rpp_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p)
{
__m128 p128[8];
Expand Down Expand Up @@ -843,6 +876,20 @@ inline RppStatus rpp_store24_f32pln3_to_f32pkd3_avx(Rpp32f *dstPtr, __m256 *p)
return RPP_SUCCESS;
}

inline RppStatus rpp_load8_f32_to_f32_avx(Rpp32f *srcPtr, __m256 *p)
{
p[0] = _mm256_loadu_ps(srcPtr);

return RPP_SUCCESS;
}

inline RppStatus rpp_store8_f32_to_f32_avx(Rpp32f *dstPtr, __m256 *p)
{
_mm256_storeu_ps(dstPtr, p[0]);

return RPP_SUCCESS;
}

inline RppStatus rpp_load48_i8pkd3_to_f32pln3_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px[4];
Expand Down Expand Up @@ -957,6 +1004,43 @@ inline RppStatus rpp_store48_f32pln3_to_i8pkd3_avx(Rpp8s *dstPtr, __m256 *p)
return RPP_SUCCESS;
}

inline RppStatus rpp_load16_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px[2];
__m128i pxZero = _mm_setzero_si128();
__m128i pxConvertI8 = _mm_set1_epi8((char)128);

px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */
px[0] = _mm_add_epi8(px[0], pxConvertI8); /* convert to u8 for px0 compute */
px[1] = _mm_unpackhi_epi8(px[0], pxZero); /* pixels 8-15 */
px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* pixels 0-7 */
p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(px[0], pxZero),_mm_unpackhi_epi16(px[0], pxZero))); /* pixels 0-7 */
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(px[1], pxZero),_mm_unpackhi_epi16(px[1], pxZero))); /* pixels 8-15 */

return RPP_SUCCESS;
}

inline RppStatus rpp_store16_f32_to_i8_avx(Rpp8s *dstPtr, __m256 *p)
{
__m256i pxCvt[2];
__m128i px[4];
__m128i pxConvertI8 = _mm_set1_epi8((char)128);

pxCvt[0] = _mm256_cvtps_epi32(p[0]); /* pixels 0-7 */
pxCvt[1] = _mm256_cvtps_epi32(p[1]); /* pixels 8-15 */
px[0] = _mm256_extracti128_si256(pxCvt[0],0); /* pixels 0-3 */
px[1] = _mm256_extracti128_si256(pxCvt[0],1); /* pixels 4-7 */
px[2] = _mm256_extracti128_si256(pxCvt[1],0); /* pixels 8-11 */
px[3] = _mm256_extracti128_si256(pxCvt[1],1); /* pixels 12-15 */
px[0] = _mm_packus_epi32(px[0], px[1]); /* pixels 0-7 */
px[1] = _mm_packus_epi32(px[2], px[3]); /* pixels 8-15 */
px[2] = _mm_packus_epi16(px[0], px[1]); /* pixels 0-15 */
px[2] = _mm_sub_epi8(px[2], pxConvertI8); /* convert back to i8 for px0 store */
_mm_storeu_si128((__m128i *)dstPtr, px[2]); /* store pixels 0-15 */

return RPP_SUCCESS;
}

inline RppStatus rpp_normalize48_avx(__m256 *p)
{
p[0] = _mm256_mul_ps(p[0], avx_p1op255);
Expand Down
Loading

0 comments on commit 6613f95

Please sign in to comment.