diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index 0a247f886..bc111e7f6 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -25,66 +25,144 @@ SOFTWARE. #ifndef RPPT_TENSOR_ARITHMETIC_OPERATIONS_H #define RPPT_TENSOR_ARITHMETIC_OPERATIONS_H -/*! - * \file - * \brief RPPT Tensor Arithmetic operation Functions. - * - * \defgroup group_tensor_arithmetic Operations: AMD RPP Tensor Arithmetic Operations - * \brief Tensor Color Augmentations. - */ - #include "rpp.h" #include "rppdefs.h" #ifdef __cplusplus extern "C" { #endif -/*! \brief Fmadd augmentation HOST +/*! + * \file + * \brief RPPT Tensor Operations - Arithmetic Operations. + * \defgroup group_rppt_tensor_arithmetic_operations RPPT Tensor Operations - Arithmetic Operations. + * \brief RPPT Tensor Operations - Arithmetic Operations. + */ + +/*! \addtogroup group_rppt_tensor_arithmetic_operations + * @{ + */ + +/*! \brief Fused multiply add scalar augmentation on HOST backend * \details This function performs the fmadd operation on a batch of 4D tensors. * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 datatype. - * \param [in] srcPtr source tensor memory + * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor memory + * \param[out] dstPtr destination tensor in HOST memory * \param[in] dstGenericDescPtr destination tensor descriptor * \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) - * \param [in] rppHandle Host-handle - * \return RppStatus enum. - * \returns RPP_SUCCESS \ref RppStatus on successful completion. - * Else return RPP_ERROR - * \ingroup group_tensor_arithmetic + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); - -/*! \brief Fmadd augmentation GPU +#ifdef GPU_SUPPORT +/*! \brief Fused multiply add scalar augmentation on HIP backend * \details This function performs the fmadd operation on a batch of 4D tensors. * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 datatype.
- * \param [in] srcPtr source tensor memory + * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor memory + * \param[out] dstPtr destination tensor in HIP memory * \param[in] dstGenericDescPtr destination tensor descriptor * \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) - * \param [in] rppHandle Hip-handle - * \return RppStatus enum. - * \returns RPP_SUCCESS \ref RppStatus on successful completion. - * Else return RPP_ERROR - * \ingroup group_tensor_arithmetic + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. */ +RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Add scalar augmentation on HOST backend + * \details This function performs the addition operation on a batch of 4D tensors. + * It adds a corresponding element from the 'addTensor' to the source tensor and stores the result in the destination tensor. + * Support added for f32 -> f32 datatype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT -RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +/*! \brief Add scalar augmentation on HIP backend + * \details This function performs the addition operation on a batch of 4D tensors. + * It adds a corresponding element from the 'addTensor' to the source tensor and stores the result in the destination tensor. + * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Subtract scalar augmentation on HOST backend + * \details This function performs the subtraction operation on a batch of 4D tensors. + * It takes a corresponding element from the 'subtractTensor', subtracts it from the source tensor, and stores the result in the destination tensor. + * Support added for f32 -> f32 datatype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Subtract scalar augmentation on HIP backend + * \details This function performs the subtraction operation on a batch of 4D tensors. + * It takes a corresponding element from the 'subtractTensor', subtracts it from the source tensor, and stores the result in the destination tensor. + * Support added for f32 -> f32 datatype. + * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion.
+ */ +RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! @} + */ + #ifdef __cplusplus } #endif diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 181b1c565..b61af9dde 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -24,6 +24,7 @@ SOFTWARE. #ifndef RPPT_TENSOR_STATISTICAL_OPERATIONS_H #define RPPT_TENSOR_STATISTICAL_OPERATIONS_H + #include "rpp.h" #include "rppdefs.h" #ifdef __cplusplus diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 67c34de70..6fd625b57 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -2431,6 +2431,18 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP // Compute Functions for RPP Tensor API +inline void compute_subtract_16_host(__m256 *p, __m256 *pSubtractParam) +{ + p[0] = _mm256_sub_ps(p[0], pSubtractParam[0]); // subtract adjustment + p[1] = _mm256_sub_ps(p[1], pSubtractParam[0]); // subtract adjustment +} + +inline void compute_add_16_host(__m256 *p, __m256 *pAddParam) +{ + p[0] = _mm256_add_ps(p[0], pAddParam[0]); // add adjustment + p[1] = _mm256_add_ps(p[1], pAddParam[0]); // add adjustment +} + inline void compute_rmn_24_host(__m256 *p, __m256 *pRMNParams) { p[0] = _mm256_mul_ps(_mm256_sub_ps(p[0], pRMNParams[0]), pRMNParams[1]); diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp index 96553489d..e043aab83 100644 --- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp +++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp @@ -26,5 +26,7 @@ SOFTWARE. #define HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP #include "kernel/fused_multiply_add_scalar.hpp" +#include "kernel/add_scalar.hpp" +#include "kernel/subtract_scalar.hpp" #endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/add_scalar.hpp b/src/modules/cpu/kernel/add_scalar.hpp new file mode 100644 index 000000000..d0179d4e1 --- /dev/null +++ b/src/modules/cpu/kernel/add_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus add_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f addParam = addTensor[batchCount]; + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; + + Rpp32u vectorIncrement = 16; + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + __m256 pAddParam = _mm256_set1_ps(addParam); + + // Add without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_add_16_host(p, &pAddParam); // add adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ + addParam; + } + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // Add without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * 
srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_add_16_host(p, &pAddParam); // add adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ + addParam; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/subtract_scalar.hpp b/src/modules/cpu/kernel/subtract_scalar.hpp new file mode 100644 index 000000000..a40e6219f --- /dev/null +++ b/src/modules/cpu/kernel/subtract_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus subtract_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f subtractParam = subtractTensor[batchCount]; + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; + + Rpp32u vectorIncrement = 16; + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + __m256 pSubtractParam = _mm256_set1_ps(subtractParam); + + // Subtract without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ - subtractParam; + } + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // Subtract without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = 
srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ - subtractParam; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp index 55fbb7832..6123ab2fb 100644 --- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp +++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp @@ -26,5 +26,7 @@ SOFTWARE. #define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP #include "kernel/fused_multiply_add_scalar.hpp" +#include "kernel/add_scalar.hpp" +#include "kernel/subtract_scalar.hpp" #endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/add_scalar.hpp b/src/modules/hip/kernel/add_scalar.hpp new file mode 100644 index 000000000..709337c9d --- /dev/null +++ b/src/modules/hip/kernel/add_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void add_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float addParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_add8_const(&val_f8, &val_f8, static_cast(addParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void add_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float addParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - 
inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_add24_const(&val_f24, &val_f24, static_cast(addParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus hip_exec_add_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *addTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(add_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + addTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(add_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + addTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/subtract_scalar.hpp b/src/modules/hip/kernel/subtract_scalar.hpp new file mode 100644 index 000000000..7ee128709 
--- /dev/null +++ b/src/modules/hip/kernel/subtract_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void subtract_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float subtractParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_subtract8_const(&val_f8, &val_f8, static_cast(subtractParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void subtract_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float subtractParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_subtract24_const(&val_f24, &val_f24, static_cast(subtractParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus hip_exec_subtract_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *subtractTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(subtract_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * 
srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + subtractTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(subtract_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + subtractTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp index daf0479ee..e82aa3239 100644 --- a/src/modules/rppt_tensor_arithmetic_operations.cpp +++ b/src/modules/rppt_tensor_arithmetic_operations.cpp @@ -73,6 +73,84 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** add_scalar ********************/ + +RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + add_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + 
dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + addTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** subtract_scalar ********************/ + +RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + subtract_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + subtractTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -113,4 +191,68 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** add_scalar ********************/ + +RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_add_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + 
dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + addTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** subtract_scalar ********************/ + +RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_subtract_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + subtractTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 04831ddf4..60b9eb719 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -323,18 +323,6 @@ int main(int argc, char **argv) double wallTime; string testCaseName; - if(testCase == 82 && imagesMixed) - { - std::cerr<<"\n RICAP only works with same dimension images"; - exit(0); - } - - if(testCase == 82 && batchSize < 2) - { - std::cerr<<"\n RICAP only works with BatchSize > 1"; - exit(0); - } - // Initialize buffers for any reductionType functions void *reductionFuncResultArr; Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp index d4155c8e7..4331b6a24 100644 --- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp @@ -55,12 +55,6 @@ int main(int argc, char * argv[]) fprintf(stdout, "\nUsage: %s
\n", argv[0]); exit(1); } - if ((testCase < 0) || (testCase > 4)) - { - fprintf(stdout, "\nUsage: %s
\n", argv[0]); - exit(1); - } - if(batchSize > MAX_BATCH_SIZE) { std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; @@ -272,6 +266,38 @@ int main(int argc, char * argv[]) break; } + case 2: + { + testCaseName = "add_scalar"; + Rpp32f addTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + addTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_add_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 3: + { + testCaseName = "subtract_scalar"; + Rpp32f subtractTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + subtractTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_subtract_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: { missingFuncFlag = 1; diff --git a/utilities/test_suite/HIP/runTests_voxel.py b/utilities/test_suite/HIP/runTests_voxel.py index 55c5dc3a5..82d9cbea9 100644 --- a/utilities/test_suite/HIP/runTests_voxel.py +++ b/utilities/test_suite/HIP/runTests_voxel.py @@ -38,6 +38,8 @@ qaInputFile = scriptPath + "/../TEST_QA_IMAGES_VOXEL" outFolderPath = os.getcwd() buildFolderPath = os.getcwd() +caseMin = 0 +caseMax = 3 # Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents def validate_and_remove_contents(path): @@ -95,7 +97,7 @@ def case_file_check(CASE_FILE_PATH, TYPE, TENSOR_TYPE_LIST, new_file): print("Unable to open case results") return False - # Generate a directory name based on certain parameters +# Generate a directory name based on certain parameters def directory_name_generator(qaMode, affinity, layoutType, case, path): if qaMode == 0: functionality_group = func_group_finder(int(case)) @@ -256,8 +258,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header") parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:1)") - parser.add_argument("--case_end", type = int, default = 1, help = "Testing range ending case # - (0:1)") + parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? 
- (YES/NO)', required = False) @@ -273,7 +275,7 @@ def rpp_test_suite_parser_and_validator(): validate_path(qaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 1) or (args.case_end < 0 or args.case_end > 1)): + if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)): print("Starting case# and Ending case# must be in the 0:1 range. Aborting!") exit(0) elif args.case_end < args.case_start: @@ -285,7 +287,7 @@ def rpp_test_suite_parser_and_validator(): elif args.qa_mode < 0 or args.qa_mode > 1: print("QA mode must be in the 0 / 1. Aborting!") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 1: + elif args.case_list is not None and args.case_start > caseMin and args.case_end < caseMax: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -306,7 +308,7 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 1: + if int(case) < caseMin or int(case) > caseMax: print("The case# must be in the 0:1 range!") exit(0) @@ -468,7 +470,7 @@ def rpp_test_suite_parser_and_validator(): print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv") # print the results of qa tests -supportedCaseList = ['0', '1'] +supportedCaseList = ['0', '1', '2', '3'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp index 1e416ed52..84bdbca34 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -321,18 +321,6 @@ int main(int argc, char **argv) double cpuTime, wallTime; string testCaseName; - if(testCase == 82 && imagesMixed) - { - std::cerr<<"\n RICAP only works with same dimension images"; - exit(0); - } - - if(testCase == 82 && batchSize < 2) - { - std::cerr<<"\n RICAP only works with BatchSize > 1"; - exit(0); - } - // Initialize buffers for any reductionType functions void *reductionFuncResultArr; Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; diff --git a/utilities/test_suite/HOST/Tensor_voxel_host.cpp b/utilities/test_suite/HOST/Tensor_voxel_host.cpp index 841c28311..01673d0f6 100644 --- a/utilities/test_suite/HOST/Tensor_voxel_host.cpp +++ b/utilities/test_suite/HOST/Tensor_voxel_host.cpp @@ -55,12 +55,6 @@ int main(int argc, char * argv[]) fprintf(stdout, "\nUsage: %s
\n", argv[0]); exit(1); } - if ((testCase < 0) || (testCase > 1)) - { - fprintf(stdout, "\nUsage: %s
\n", argv[0]); - exit(1); - } - if(batchSize > MAX_BATCH_SIZE) { std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; @@ -257,6 +251,38 @@ int main(int argc, char * argv[]) break; } + case 2: + { + testCaseName = "add_scalar"; + Rpp32f addTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + addTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if(inputBitDepth == 2) + rppt_add_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 3: + { + testCaseName = "subtract_scalar"; + Rpp32f subtractTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + subtractTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_subtract_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: { missingFuncFlag = 1; diff --git a/utilities/test_suite/HOST/runTests_voxel.py b/utilities/test_suite/HOST/runTests_voxel.py index aab51f52e..e21dc0f0e 100644 --- a/utilities/test_suite/HOST/runTests_voxel.py +++ b/utilities/test_suite/HOST/runTests_voxel.py @@ -38,6 +38,8 @@ qaInputFile = scriptPath + "/../TEST_QA_IMAGES_VOXEL" outFolderPath = os.getcwd() buildFolderPath = os.getcwd() +caseMin = 0 +caseMax = 3 # Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents def validate_and_remove_contents(path): @@ -115,7 +117,8 @@ def func_group_finder(case_number): return "geometric_augmentations" else: return "miscellaneous" - # Generate a directory name based on certain parameters + +# Generate a directory name based on certain parameters def directory_name_generator(qaMode, affinity, layoutType, case, path): if qaMode == 0: functionality_group = func_group_finder(int(case)) @@ -182,8 +185,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header") parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:1)") - parser.add_argument("--case_end", type = int, default = 1, help = "Testing range ending case # - (0:1)") + parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? 
Output images from tests will be compared with golden outputs - (0 / 1)", required = False) @@ -198,7 +201,7 @@ def rpp_test_suite_parser_and_validator(): validate_path(qaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 1) or (args.case_end < 0 or args.case_end > 1)): + if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)): print("Starting case# and Ending case# must be in the 0:1 range. Aborting!") exit(0) elif args.case_end < args.case_start: @@ -210,7 +213,7 @@ def rpp_test_suite_parser_and_validator(): elif args.qa_mode < 0 or args.qa_mode > 1: print("QA mode must be in the 0 / 1. Aborting!") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 1: + elif args.case_list is not None and args.case_start > caseMin and args.case_end < caseMax: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -228,7 +231,7 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 1: + if int(case) < caseMin or int(case) > caseMax: print("The case# must be in the 0:1 range!") exit(0) @@ -318,7 +321,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize) # print the results of qa tests -supportedCaseList = ['0', '1'] +supportedCaseList = ['0', '1', '2', '3'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin new file mode 100644 index 000000000..628d3785b Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin new file mode 100644 index 000000000..9b9328536 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/rpp_test_suite_voxel.h b/utilities/test_suite/rpp_test_suite_voxel.h index effac9c96..af9d67d14 100644 --- a/utilities/test_suite/rpp_test_suite_voxel.h +++ b/utilities/test_suite/rpp_test_suite_voxel.h @@ -61,6 +61,8 @@ std::map augmentationMap = { {0, "fused_multiply_add_scalar"}, {1, "slice"}, + {2, "add_scalar"}, + {3, "subtract_scalar"} }; void replicate_last_file_to_fill_batch(const string& lastFilePath, vector& filePathVector, vector& fileNamesVector, const string& lastFileName, int noOfFiles, int batchCount)