Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RPP Tensor Water Augmentation on HOST and HIP #181

Merged
merged 41 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
52b83c6
added water HOST and HIP codes
sampath1117 May 4, 2023
8dafad2
added water case in test suite
sampath1117 May 4, 2023
890437a
added golden outputs for water
sampath1117 May 4, 2023
643281c
added omp thread changes for water augmentation
sampath1117 May 22, 2023
336188d
experimental changes
sampath1117 May 22, 2023
87ab19f
fixed output issue with AVX2 instructions
sampath1117 May 23, 2023
94c8340
added AVX2 support for PKD3 load function
sampath1117 May 23, 2023
56190d8
nwc commit - added avx2 changes for u8 layout toggle variants but nee…
sampath1117 May 29, 2023
3b18a58
Add Avx2 implementation for F32 and U8 toggle variants
HazarathKumarM Jun 7, 2023
754e353
Add AVX2 support for u8 pkd3-pln3 and i8 pkd3-pln3 for water augmenta…
HazarathKumarM Jun 19, 2023
c4f69a9
change F32 load and store logic
HazarathKumarM Jul 3, 2023
e9e74a2
optimized the store function for F32 PLN3-PKD3
sampath1117 Jul 11, 2023
6c8fa57
Merge branch 'master' into water_avx_exp
sampath1117 Jul 12, 2023
fb1fdb4
reverted back irrelevant changes
sampath1117 Jul 12, 2023
4ffe9f6
minor change
sampath1117 Jul 12, 2023
6e0756a
optimized load and store functions for water U8 and F32 variants in host
sampath1117 Jul 12, 2023
0cf2626
merge with master
sampath1117 Jul 13, 2023
81553d3
removed golden outputs for water
sampath1117 Jul 13, 2023
a5567e6
minor changes
sampath1117 Jul 13, 2023
89380a5
renamed few functions and removed unused functions
sampath1117 Jul 13, 2023
27b318b
fixed bug in i8 load function
sampath1117 Jul 13, 2023
d3943b5
changed cast to c++ style
sampath1117 Jul 14, 2023
31d1624
added golden outputs for water
sampath1117 May 4, 2023
2729b1e
updated golden outputs with latest changes
sampath1117 Jul 14, 2023
8b763ad
modified the u8, i8 pkd3-pln3 function and added comments for the vec…
sampath1117 Jul 14, 2023
b418e24
fixed minor bug in I8 variants
sampath1117 Jul 14, 2023
9da6be1
Merge pull request #126 from sampath1117/sr/opt_water_merge
r-abishek Jul 18, 2023
3b14636
made to changes to resolve codacy warnings
sampath1117 Jul 18, 2023
13062d2
changed cast to c++ style in hip kernel
sampath1117 Jul 18, 2023
cb0a19d
Merge pull request #145 from sampath1117/sr/opt_water_merge
r-abishek Jul 18, 2023
837e121
changed generic nn F32 loads using gather and setr instructions
sampath1117 Aug 9, 2023
12ba56b
added comments for latest changes
sampath1117 Aug 11, 2023
f030c75
minor change
sampath1117 Aug 11, 2023
0ff02be
Merge branch 'GPUOpen-ProfessionalCompute-Libraries:master' into ar/o…
r-abishek Aug 24, 2023
00e3084
Merge pull request #161 from sampath1117/sr/opt_water_latest
r-abishek Aug 25, 2023
a60b2b4
Merge branch 'develop' into ar/opt_water
r-abishek Sep 11, 2023
b742c4b
Merge branch 'develop' into ar/opt_water
r-abishek Sep 18, 2023
7c674be
Merge branch 'GPUOpen-ProfessionalCompute-Libraries:master' into ar/o…
r-abishek Sep 19, 2023
ba7ebae
added definition for storing 32 and 64 bits from a 128bit register
sampath1117 Oct 17, 2023
c6f9c4a
Merge pull request #180 from sampath1117/sr/water_build_fix
r-abishek Oct 17, 2023
e3bb7c1
Merge branch 'master' into ar/opt_water
r-abishek Oct 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions include/rppt_tensor_effects_augmentations.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,58 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDes
RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *stdDevTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Water augmentation on HOST backend for a NCHW/NHWC layout tensor
* \details The water augmentation adds a water effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.<br>
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* - dstPtr depth ranges - Will be same depth as srcPtr.
* \image html img150x150.jpg Sample Input
* \image html effects_augmentations_water_img150x150.jpg Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] amplitudeXTensor amplitudeX values for water effect (1D tensor in HOST memory, of size batchSize)
* \param [in] amplitudeYTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize)
* \param [in] frequencyXTensor frequencyX values for water effect (1D tensor in HOST memory, of size batchSize)
* \param [in] frequencyYTensor frequencyY values for water effect (1D tensor in HOST memory, of size batchSize)
* \param [in] phaseXTensor phaseX values for water effect (1D tensor in HOST memory, of size batchSize)
* \param [in] phaseYTensor phaseY values for water effect (1D tensor in HOST memory, of size batchSize)
* \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_water_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *amplitudeXTensor, Rpp32f *amplitudeYTensor, Rpp32f *frequencyXTensor, Rpp32f *frequencyYTensor, Rpp32f *phaseXTensor, Rpp32f *phaseYTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief Water augmentation on HIP backend for a NCHW/NHWC layout tensor
* \details The water augmentation adds a water effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.<br>
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* - dstPtr depth ranges - Will be same depth as srcPtr.
* \image html img150x150.jpg Sample Input
* \image html effects_augmentations_water_img150x150.jpg Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] amplitudeXTensor amplitudeX values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param [in] amplitudeYTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param [in] frequencyXTensor frequencyX values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param [in] frequencyYTensor frequencyY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param [in] phaseXTensor phaseX values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param [in] phaseYTensor phaseY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_water_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *amplitudeXTensor, Rpp32f *amplitudeYTensor, Rpp32f *frequencyXTensor, Rpp32f *frequencyYTensor, Rpp32f *phaseXTensor, Rpp32f *phaseYTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! @}
*/

Expand Down
14 changes: 14 additions & 0 deletions src/include/cpu/rpp_cpu_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5044,6 +5044,20 @@ inline void compute_generic_nn_srclocs_and_validate_sse(__m128 pSrcY, __m128 pSr
_mm_storeu_si128((__m128i*) srcLoc, pxSrcLoc);
}

// 8-lane AVX helper for generic nearest-neighbor sampling.
//   pSrcY, pSrcX   - fractional source row / column coordinates, one per lane
//   pRoiLTRB       - four bound vectors: [0] and [1] are the lower limits tested against X and Y,
//                    [2] and [3] are the upper limits tested against X and Y
//   pSrcStrideH    - per-row stride multiplied into the rounded Y coordinate
//   srcLoc         - receives 8 integer source offsets (rounded Y * stride + X term)
//   invalidLoad    - receives 8 flags: 0 for lanes inside all four bounds, non-zero otherwise
//   hasRGBChannels - when true the rounded X coordinate is scaled by avx_p3 before being added
//                    (avx_p3 is defined elsewhere; presumably a vector of 3.0f for packed RGB - confirm)
inline void compute_generic_nn_srclocs_and_validate_avx(__m256 pSrcY, __m256 pSrcX, __m256 *pRoiLTRB, __m256 pSrcStrideH, Rpp32s *srcLoc, Rpp32s *invalidLoad, bool hasRGBChannels = false)
{
    // Snap both coordinate vectors to the nearest integer position
    __m256 pNearestY = _mm256_round_ps(pSrcY, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
    __m256 pNearestX = _mm256_round_ps(pSrcX, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // One comparison mask per ROI edge; a lane is out of bounds if any of the four masks is set
    __m256 pBelowMinX = _mm256_cmp_ps(pNearestX, pRoiLTRB[0], _CMP_LT_OQ);
    __m256 pBelowMinY = _mm256_cmp_ps(pNearestY, pRoiLTRB[1], _CMP_LT_OQ);
    __m256 pAboveMaxX = _mm256_cmp_ps(pNearestX, pRoiLTRB[2], _CMP_GT_OQ);
    __m256 pAboveMaxY = _mm256_cmp_ps(pNearestY, pRoiLTRB[3], _CMP_GT_OQ);
    __m256 pOutOfRoi = _mm256_or_ps(_mm256_or_ps(pBelowMinX, pBelowMinY), _mm256_or_ps(pAboveMaxX, pAboveMaxY));

    // The mask goes through a float->int conversion (not a bit cast) before being stored:
    // cleared lanes become 0, set lanes become a non-zero value
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(invalidLoad), _mm256_cvtps_epi32(pOutOfRoi));

    // Column term of the offset, optionally scaled for the multi-channel case
    __m256 pColumnTerm = hasRGBChannels ? _mm256_mul_ps(pNearestX, avx_p3) : pNearestX;

    // Linear offset = rounded Y * row stride + column term, converted to 32-bit integers
    __m256i pxLocation = _mm256_cvtps_epi32(_mm256_fmadd_ps(pNearestY, pSrcStrideH, pColumnTerm));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(srcLoc), pxLocation);
}

template <typename T>
inline void compute_generic_nn_interpolation_pkd3_to_pln3(Rpp32f srcY, Rpp32f srcX, RpptROI *roiLTRB, T *dstPtrTempR, T *dstPtrTempG, T *dstPtrTempB, T *srcPtrChannel, RpptDescPtr srcDescPtr)
{
Expand Down
Loading