diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h
index 0a247f886..bc111e7f6 100644
--- a/include/rppt_tensor_arithmetic_operations.h
+++ b/include/rppt_tensor_arithmetic_operations.h
@@ -25,66 +25,144 @@ SOFTWARE.
#ifndef RPPT_TENSOR_ARITHMETIC_OPERATIONS_H
#define RPPT_TENSOR_ARITHMETIC_OPERATIONS_H
-/*!
- * \file
- * \brief RPPT Tensor Arithmetic operation Functions.
- *
- * \defgroup group_tensor_arithmetic Operations: AMD RPP Tensor Arithmetic Operations
- * \brief Tensor Color Augmentations.
- */
-
#include "rpp.h"
#include "rppdefs.h"
#ifdef __cplusplus
extern "C" {
#endif
-/*! \brief Fmadd augmentation HOST
+/*!
+ * \file
+ * \brief RPPT Tensor Operations - Arithmetic Operations.
+ * \defgroup group_rppt_tensor_arithmetic_operations RPPT Tensor Operations - Arithmetic Operations.
+ * \brief RPPT Tensor Operations - Arithmetic Operations.
+ */
+
+/*! \addtogroup group_rppt_tensor_arithmetic_operations
+ * @{
+ */
+
+/*! \brief Fused multiply add scalar augmentation on HOST backend
* \details This function performs the fmadd operation on a batch of 4D tensors.
* It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
* adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \param [in] srcPtr source tensor memory
+ * \param [in] srcPtr source tensor in HOST memory
* \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor memory
+ * \param[out] dstPtr destination tensor in HOST memory
* \param[in] dstGenericDescPtr destination tensor descriptor
* \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
* \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
- * \param [in] rppHandle Host-handle
- * \return RppStatus enum.
- * \returns RPP_SUCCESS \ref RppStatus on successful completion.
- * Else return RPP_ERROR
- * \ingroup group_tensor_arithmetic
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
-
-/*! \brief Fmadd augmentation GPU
+#ifdef GPU_SUPPORT
+/*! \brief Fused multiply add scalar augmentation on HIP backend
* \details This function performs the fmadd operation on a batch of 4D tensors.
* It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
* adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \param [in] srcPtr source tensor memory
+ * \param [in] srcPtr source tensor in HIP memory
* \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor memory
+ * \param[out] dstPtr destination tensor in HIP memory
* \param[in] dstGenericDescPtr destination tensor descriptor
* \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
* \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
- * \param [in] rppHandle Hip-handle
- * \return RppStatus enum.
- * \returns RPP_SUCCESS \ref RppStatus on successful completion.
- * Else return RPP_ERROR
- * \ingroup group_tensor_arithmetic
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
*/
+RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Add scalar augmentation on HOST backend
+ * \details This function performs the addition operation on a batch of 4D tensors.
+ * It adds a corresponding element from the 'addTensor' to the source tensor, and stores the result in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HOST memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
-RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+/*! \brief Add scalar augmentation on HIP backend
+ * \details This function performs the addition operation on a batch of 4D tensors.
+ * It adds a corresponding element from the 'addTensor' to the source tensor, and stores the result in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HIP memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Subtract scalar augmentation on HOST backend
+ * \details This function performs the subtraction operation on a batch of 4D tensors.
+ * It subtracts a corresponding element in the 'subtractTensor' from the source tensor, and stores the result in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HOST memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Subtract scalar augmentation on HIP backend
+ * \details This function performs the subtraction operation on a batch of 4D tensors.
+ * It subtracts a corresponding element in the 'subtractTensor' from the source tensor, and stores the result in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HIP memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! @}
+ */
+
#ifdef __cplusplus
}
#endif
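
For context, a minimal host-side caller of the new API could look like the sketch below: one single-channel 4x8x8 NCDHW f32 volume, adding 40 to every voxel. The descriptor and ROI fields follow rppdefs.h; the rppCreateWithBatchSize argument list is an assumption and may differ across RPP versions.

```cpp
#include <vector>
#include "rpp.h"

int main()
{
    RpptGenericDesc desc;                      // descriptor for a 1x1x4x8x8 f32 NCDHW batch
    desc.dataType = RpptDataType::F32;
    desc.layout = RpptLayout::NCDHW;
    desc.offsetInBytes = 0;
    desc.numDims = 5;
    desc.dims[0] = 1; desc.dims[1] = 1; desc.dims[2] = 4; desc.dims[3] = 8; desc.dims[4] = 8; // N C D H W
    desc.strides[4] = 1;
    for (int i = 3; i >= 0; i--)               // row-major strides from the dims
        desc.strides[i] = desc.strides[i + 1] * desc.dims[i + 1];

    std::vector<Rpp32f> src(desc.dims[0] * desc.strides[0], 1.0f), dst(src.size(), 0.0f);
    Rpp32f addTensor[1] = {40.0f};             // one scalar per image in the batch

    RpptROI3D roi;                             // full-volume ROI in XYZWHD form
    roi.xyzwhdROI.xyz.x = roi.xyzwhdROI.xyz.y = roi.xyzwhdROI.xyz.z = 0;
    roi.xyzwhdROI.roiWidth = 8; roi.xyzwhdROI.roiHeight = 8; roi.xyzwhdROI.roiDepth = 4;

    rppHandle_t handle;
    rppCreateWithBatchSize(&handle, 1);        // signature may vary between RPP releases
    rppt_add_scalar_host(src.data(), &desc, dst.data(), &desc, addTensor, &roi, RpptRoi3DType::XYZWHD, handle);
    rppDestroyHost(handle);                    // dst now holds src + 40 everywhere
    return 0;
}
```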
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index 181b1c565..b61af9dde 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -24,6 +24,7 @@ SOFTWARE.
#ifndef RPPT_TENSOR_STATISTICAL_OPERATIONS_H
#define RPPT_TENSOR_STATISTICAL_OPERATIONS_H
+
#include "rpp.h"
#include "rppdefs.h"
#ifdef __cplusplus
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index 67c34de70..6fd625b57 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -2431,6 +2431,18 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP
// Compute Functions for RPP Tensor API
+inline void compute_subtract_16_host(__m256 *p, __m256 *pSubtractParam)
+{
+ p[0] = _mm256_sub_ps(p[0], pSubtractParam[0]); // subtract adjustment
+ p[1] = _mm256_sub_ps(p[1], pSubtractParam[0]); // subtract adjustment
+}
+
+inline void compute_add_16_host(__m256 *p, __m256 *pAddParam)
+{
+ p[0] = _mm256_add_ps(p[0], pAddParam[0]); // add adjustment
+ p[1] = _mm256_add_ps(p[1], pAddParam[0]); // add adjustment
+}
+
inline void compute_rmn_24_host(__m256 *p, __m256 *pRMNParams)
{
p[0] = _mm256_mul_ps(_mm256_sub_ps(p[0], pRMNParams[0]), pRMNParams[1]);
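
The two new helpers are the whole SIMD story for these kernels: 16 packed floats live in two __m256 registers and one broadcast parameter is applied to both halves. A standalone sketch of the same pattern (the main() harness is illustrative, not from the patch):

```cpp
#include <immintrin.h>
#include <cstdio>

int main()
{
    float buf[16];
    for (int i = 0; i < 16; i++) buf[i] = (float)i;

    __m256 p[2] = {_mm256_loadu_ps(buf), _mm256_loadu_ps(buf + 8)}; // 16 floats, two registers
    __m256 pAddParam = _mm256_set1_ps(40.0f);                       // broadcast scalar param

    p[0] = _mm256_add_ps(p[0], pAddParam); // the same two adds compute_add_16_host performs
    p[1] = _mm256_add_ps(p[1], pAddParam);

    _mm256_storeu_ps(buf, p[0]);
    _mm256_storeu_ps(buf + 8, p[1]);
    printf("%.1f %.1f\n", buf[0], buf[15]); // 40.0 55.0
    return 0;
}
```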
diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
index 96553489d..e043aab83 100644
--- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp
+++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
@@ -26,5 +26,7 @@ SOFTWARE.
#define HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
#include "kernel/fused_multiply_add_scalar.hpp"
+#include "kernel/add_scalar.hpp"
+#include "kernel/subtract_scalar.hpp"
#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/add_scalar.hpp b/src/modules/cpu/kernel/add_scalar.hpp
new file mode 100644
index 000000000..d0179d4e1
--- /dev/null
+++ b/src/modules/cpu/kernel/add_scalar.hpp
@@ -0,0 +1,152 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus add_scalar_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *addTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI3D roiDefault;
+ if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
+ else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ RpptROI3D roi;
+ RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
+ compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ Rpp32f addParam = addTensor[batchCount];
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u vectorIncrement = 16;
+ Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+ __m256 pAddParam = _mm256_set1_ps(addParam);
+
+ // Add without fused output-layout toggle (NCDHW -> NCDHW)
+ if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrDepth, *dstPtrDepth;
+ srcPtrDepth = srcPtrChannel;
+ dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_add_16_host(p, &pAddParam); // add adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ + addParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[3];
+ dstPtrRow += dstGenericDescPtr->strides[3];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[2];
+ dstPtrDepth += dstGenericDescPtr->strides[2];
+ }
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += srcGenericDescPtr->strides[1];
+ }
+ }
+ // Add without fused output-layout toggle (NDHWC -> NDHWC)
+ else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+ Rpp32f *srcPtrDepth = srcPtrChannel;
+ Rpp32f *dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_add_16_host(p, &pAddParam); // add adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ + addParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[1];
+ dstPtrDepth += dstGenericDescPtr->strides[1];
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
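
The row loops above split each row at alignedLength, the largest multiple of the 16-float vector width; everything past it falls through to the scalar tail loop. A quick sketch of that split, with a made-up bufferLength:

```cpp
#include <cstdio>

int main()
{
    const unsigned vectorIncrement = 16;        // floats per SIMD iteration
    unsigned bufferLength = 37;                 // hypothetical roiWidth * bufferMultiplier
    unsigned alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
    printf("SIMD: %u elements, scalar tail: %u elements\n",
           alignedLength, bufferLength - alignedLength); // SIMD: 32, tail: 5
    return 0;
}
```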
diff --git a/src/modules/cpu/kernel/subtract_scalar.hpp b/src/modules/cpu/kernel/subtract_scalar.hpp
new file mode 100644
index 000000000..a40e6219f
--- /dev/null
+++ b/src/modules/cpu/kernel/subtract_scalar.hpp
@@ -0,0 +1,152 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus subtract_scalar_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *subtractTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI3D roiDefault;
+ if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
+ else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ RpptROI3D roi;
+ RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
+ compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ Rpp32f subtractParam = subtractTensor[batchCount];
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u vectorIncrement = 16;
+ Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+ __m256 pSubtractParam = _mm256_set1_ps(subtractParam);
+
+ // Subtract without fused output-layout toggle (NCDHW -> NCDHW)
+ if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrDepth, *dstPtrDepth;
+ srcPtrDepth = srcPtrChannel;
+ dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ - subtractParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[3];
+ dstPtrRow += dstGenericDescPtr->strides[3];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[2];
+ dstPtrDepth += dstGenericDescPtr->strides[2];
+ }
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += srcGenericDescPtr->strides[1];
+ }
+ }
+ // Subtract without fused output-layout toggle (NDHWC -> NDHWC)
+ else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+ Rpp32f *srcPtrDepth = srcPtrChannel;
+ Rpp32f *dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ - subtractParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[1];
+ dstPtrDepth += dstGenericDescPtr->strides[1];
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
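
The roiDefault initialization in both host kernels reads W/H/D from different dims[] slots depending on layout. A small sketch with an arbitrary tensor (N=1, C=2, D=4, H=8, W=16) showing the two index maps describe the same volume:

```cpp
#include <cstdio>

int main()
{
    unsigned dimsNCDHW[5] = {1, 2, 4, 8, 16}; // N C D H W
    unsigned dimsNDHWC[5] = {1, 4, 8, 16, 2}; // N D H W C
    // Both print "W=16 H=8 D=4": same volume, different dims[] slots
    printf("NCDHW roiDefault W=%u H=%u D=%u\n", dimsNCDHW[4], dimsNCDHW[3], dimsNCDHW[2]);
    printf("NDHWC roiDefault W=%u H=%u D=%u\n", dimsNDHWC[3], dimsNDHWC[2], dimsNDHWC[1]);
    return 0;
}
```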
diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
index 55fbb7832..6123ab2fb 100644
--- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp
+++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
@@ -26,5 +26,7 @@ SOFTWARE.
#define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
#include "kernel/fused_multiply_add_scalar.hpp"
+#include "kernel/add_scalar.hpp"
+#include "kernel/subtract_scalar.hpp"
#endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/add_scalar.hpp b/src/modules/hip/kernel/add_scalar.hpp
new file mode 100644
index 000000000..709337c9d
--- /dev/null
+++ b/src/modules/hip/kernel/add_scalar.hpp
@@ -0,0 +1,114 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+
+__global__ void add_scalar_ncdhw_hip_tensor(float *srcPtr,
+ uint3 srcStridesCDH,
+ float *dstPtr,
+ uint3 dstStridesCDH,
+ int channels,
+ float addParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x);
+ uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
+
+ d_float8 val_f8;
+ for(int c = 0; c < channels; c++)
+ {
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8);
+        rpp_hip_math_add8_const(&val_f8, &val_f8, static_cast<float4>(addParam));
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8);
+ srcIdx += srcStridesCDH.x;
+ dstIdx += dstStridesCDH.x;
+ }
+}
+
+__global__ void add_scalar_ndhwc_hip_tensor(float *srcPtr,
+ uint2 srcStridesDH,
+ float *dstPtr,
+ uint2 dstStridesDH,
+ float addParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+
+ d_float24 val_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24);
+    rpp_hip_math_add24_const(&val_f24, &val_f24, static_cast<float4>(addParam));
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
+}
+
+RppStatus hip_exec_add_scalar_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ RpptROI3DPtr roiGenericPtrSrc,
+ Rpp32f *addTensor,
+ rpp::Handle& handle)
+{
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(add_scalar_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ addTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(add_scalar_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ addTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
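
The NCDHW launch above vectorizes W eight-wide, so globalThreads_x is ceil(W / 8), and each axis is then rounded up to whole blocks. A sketch of the arithmetic; the LOCAL_THREADS_* values here are assumed, not taken from the patch:

```cpp
#include <cstdio>

int main()
{
    const int LOCAL_THREADS_X = 16, LOCAL_THREADS_Y = 16, LOCAL_THREADS_Z = 1; // assumed block shape
    int W = 150, H = 100, D = 60;                 // hypothetical ROI-sized volume

    int globalThreads_x = (W + 7) >> 3;           // 19 threads, each covering 8 floats of W
    int gridX = (globalThreads_x + LOCAL_THREADS_X - 1) / LOCAL_THREADS_X; // 2
    int gridY = (H + LOCAL_THREADS_Y - 1) / LOCAL_THREADS_Y;               // 7
    int gridZ = (D + LOCAL_THREADS_Z - 1) / LOCAL_THREADS_Z;               // 60
    printf("grid = (%d, %d, %d)\n", gridX, gridY, gridZ);
    return 0;
}
```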
diff --git a/src/modules/hip/kernel/subtract_scalar.hpp b/src/modules/hip/kernel/subtract_scalar.hpp
new file mode 100644
index 000000000..7ee128709
--- /dev/null
+++ b/src/modules/hip/kernel/subtract_scalar.hpp
@@ -0,0 +1,114 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+
+__global__ void subtract_scalar_ncdhw_hip_tensor(float *srcPtr,
+ uint3 srcStridesCDH,
+ float *dstPtr,
+ uint3 dstStridesCDH,
+ int channels,
+ float subtractParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x);
+ uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
+
+ d_float8 val_f8;
+ for(int c = 0; c < channels; c++)
+ {
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8);
+        rpp_hip_math_subtract8_const(&val_f8, &val_f8, static_cast<float4>(subtractParam));
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8);
+ srcIdx += srcStridesCDH.x;
+ dstIdx += dstStridesCDH.x;
+ }
+}
+
+__global__ void subtract_scalar_ndhwc_hip_tensor(float *srcPtr,
+ uint2 srcStridesDH,
+ float *dstPtr,
+ uint2 dstStridesDH,
+ float subtractParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+
+ d_float24 val_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24);
+    rpp_hip_math_subtract24_const(&val_f24, &val_f24, static_cast<float4>(subtractParam));
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
+}
+
+RppStatus hip_exec_subtract_scalar_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ RpptROI3DPtr roiGenericPtrSrc,
+ Rpp32f *subtractTensor,
+ rpp::Handle& handle)
+{
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(subtract_scalar_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ subtractTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(subtract_scalar_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ subtractTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
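
In the NDHWC kernels, id_x counts voxels rather than floats, so the source and destination indices multiply by 3 to step over packed RGB channels, and each thread's 8 voxels become one d_float24. The index arithmetic for a toy 16-voxel-wide row (all values hypothetical):

```cpp
#include <cstdio>

int main()
{
    int rowFloats = 16 * 3;                      // 16 packed-RGB voxels per row
    int stridesDH_y = rowFloats;                 // H stride, in floats
    int stridesDH_x = 4 * rowFloats;             // D stride, in floats (4 rows per slice)
    int id_x = 5, id_y = 2, id_z = 1;            // voxel coordinates within the ROI
    int idx = id_z * stridesDH_x + id_y * stridesDH_y + id_x * 3;
    printf("first float of voxel = %d\n", idx);  // 192 + 96 + 15 = 303
    return 0;
}
```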
diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp
index daf0479ee..e82aa3239 100644
--- a/src/modules/rppt_tensor_arithmetic_operations.cpp
+++ b/src/modules/rppt_tensor_arithmetic_operations.cpp
@@ -73,6 +73,84 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** add_scalar ********************/
+
+RppStatus rppt_add_scalar_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *addTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams;
+ if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
+
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        add_scalar_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                       srcGenericDescPtr,
+                                       reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ addTensor,
+ roiGenericPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** subtract_scalar ********************/
+
+RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *subtractTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams;
+ if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
+
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        subtract_scalar_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                            srcGenericDescPtr,
+                                            reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ subtractTensor,
+ roiGenericPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -113,4 +191,68 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** add_scalar ********************/
+
+RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *addTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+    hip_exec_add_scalar_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                               srcGenericDescPtr,
+                               reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiGenericPtrSrc,
+ addTensor,
+ rpp::deref(rppHandle));
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** subtract_scalar ********************/
+
+RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *subtractTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+    hip_exec_subtract_scalar_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                    srcGenericDescPtr,
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiGenericPtrSrc,
+ subtractTensor,
+ rpp::deref(rppHandle));
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
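
Every dispatch above applies offsetInBytes as a byte offset first (hence the static_cast to Rpp8u*) and only then reinterprets the result as Rpp32f*. A self-contained sketch of that pointer arithmetic, with stand-in typedefs for the rppdefs.h types:

```cpp
#include <cstdio>

typedef unsigned char Rpp8u; // stand-ins for the rppdefs.h typedefs
typedef float Rpp32f;
typedef void *RppPtr_t;

int main()
{
    float storage[6] = {0.0f, 0.0f, 1.5f, 2.5f, 3.5f, 4.5f};
    RppPtr_t srcPtr = storage;
    unsigned offsetInBytes = 2 * sizeof(Rpp32f); // descriptor says data starts 8 bytes in

    Rpp32f *src = reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + offsetInBytes);
    printf("%.1f\n", src[0]); // 1.5
    return 0;
}
```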
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index 04831ddf4..60b9eb719 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -323,18 +323,6 @@ int main(int argc, char **argv)
double wallTime;
string testCaseName;
- if(testCase == 82 && imagesMixed)
- {
- std::cerr<<"\n RICAP only works with same dimension images";
- exit(0);
- }
-
- if(testCase == 82 && batchSize < 2)
- {
- std::cerr<<"\n RICAP only works with BatchSize > 1";
- exit(0);
- }
-
// Initialize buffers for any reductionType functions
void *reductionFuncResultArr;
Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
index d4155c8e7..4331b6a24 100644
--- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
@@ -55,12 +55,6 @@ int main(int argc, char * argv[])
fprintf(stdout, "\nUsage: %s \n", argv[0]);
exit(1);
}
- if ((testCase < 0) || (testCase > 4))
- {
- fprintf(stdout, "\nUsage: %s \n", argv[0]);
- exit(1);
- }
-
if(batchSize > MAX_BATCH_SIZE)
{
std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!";
@@ -272,6 +266,38 @@ int main(int argc, char * argv[])
break;
}
+ case 2:
+ {
+ testCaseName = "add_scalar";
+ Rpp32f addTensor[batchSize];
+
+ for (int i = 0; i < batchSize; i++)
+ addTensor[i] = 40;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 2)
+ rppt_add_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 3:
+ {
+ testCaseName = "subtract_scalar";
+ Rpp32f subtractTensor[batchSize];
+
+ for (int i = 0; i < batchSize; i++)
+ subtractTensor[i] = 40;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 2)
+ rppt_subtract_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
{
missingFuncFlag = 1;
diff --git a/utilities/test_suite/HIP/runTests_voxel.py b/utilities/test_suite/HIP/runTests_voxel.py
index 55c5dc3a5..82d9cbea9 100644
--- a/utilities/test_suite/HIP/runTests_voxel.py
+++ b/utilities/test_suite/HIP/runTests_voxel.py
@@ -38,6 +38,8 @@
qaInputFile = scriptPath + "/../TEST_QA_IMAGES_VOXEL"
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
+caseMin = 0
+caseMax = 3
# Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents
def validate_and_remove_contents(path):
@@ -95,7 +97,7 @@ def case_file_check(CASE_FILE_PATH, TYPE, TENSOR_TYPE_LIST, new_file):
print("Unable to open case results")
return False
- # Generate a directory name based on certain parameters
+# Generate a directory name based on certain parameters
def directory_name_generator(qaMode, affinity, layoutType, case, path):
if qaMode == 0:
functionality_group = func_group_finder(int(case))
@@ -256,8 +258,8 @@ def rpp_test_suite_parser_and_validator():
parser = argparse.ArgumentParser()
parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header")
parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file")
- parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:1)")
- parser.add_argument("--case_end", type = int, default = 1, help = "Testing range ending case # - (0:1)")
+ parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
+ parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)")
parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False)
parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False)
@@ -273,7 +275,7 @@ def rpp_test_suite_parser_and_validator():
validate_path(qaInputFile)
# validate the parameters passed by user
- if ((args.case_start < 0 or args.case_start > 1) or (args.case_end < 0 or args.case_end > 1)):
+ if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)):
print("Starting case# and Ending case# must be in the 0:1 range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
@@ -285,7 +287,7 @@ def rpp_test_suite_parser_and_validator():
elif args.qa_mode < 0 or args.qa_mode > 1:
print("QA mode must be in the 0 / 1. Aborting!")
exit(0)
- elif args.case_list is not None and args.case_start > 0 and args.case_end < 1:
+ elif args.case_list is not None and args.case_start > caseMin and args.case_end < caseMax:
print("Invalid input! Please provide only 1 option between case_list, case_start and case_end")
exit(0)
elif args.num_runs <= 0:
@@ -306,7 +308,7 @@ def rpp_test_suite_parser_and_validator():
args.case_list = [str(x) for x in args.case_list]
else:
for case in args.case_list:
- if int(case) < 0 or int(case) > 1:
+ if int(case) < caseMin or int(case) > caseMax:
print("The case# must be in the 0:1 range!")
exit(0)
@@ -468,7 +470,7 @@ def rpp_test_suite_parser_and_validator():
print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv")
# print the results of qa tests
-supportedCaseList = ['0', '1']
+supportedCaseList = ['0', '1', '2', '3']
nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index 1e416ed52..84bdbca34 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -321,18 +321,6 @@ int main(int argc, char **argv)
double cpuTime, wallTime;
string testCaseName;
- if(testCase == 82 && imagesMixed)
- {
- std::cerr<<"\n RICAP only works with same dimension images";
- exit(0);
- }
-
- if(testCase == 82 && batchSize < 2)
- {
- std::cerr<<"\n RICAP only works with BatchSize > 1";
- exit(0);
- }
-
// Initialize buffers for any reductionType functions
void *reductionFuncResultArr;
Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
diff --git a/utilities/test_suite/HOST/Tensor_voxel_host.cpp b/utilities/test_suite/HOST/Tensor_voxel_host.cpp
index 841c28311..01673d0f6 100644
--- a/utilities/test_suite/HOST/Tensor_voxel_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_voxel_host.cpp
@@ -55,12 +55,6 @@ int main(int argc, char * argv[])
fprintf(stdout, "\nUsage: %s \n", argv[0]);
exit(1);
}
- if ((testCase < 0) || (testCase > 1))
- {
- fprintf(stdout, "\nUsage: %s \n", argv[0]);
- exit(1);
- }
-
if(batchSize > MAX_BATCH_SIZE)
{
std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!";
@@ -257,6 +251,38 @@ int main(int argc, char * argv[])
break;
}
+ case 2:
+ {
+ testCaseName = "add_scalar";
+ Rpp32f addTensor[batchSize];
+
+ for (int i = 0; i < batchSize; i++)
+ addTensor[i] = 40;
+
+ startWallTime = omp_get_wtime();
+ if(inputBitDepth == 2)
+ rppt_add_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 3:
+ {
+ testCaseName = "subtract_scalar";
+ Rpp32f subtractTensor[batchSize];
+
+ for (int i = 0; i < batchSize; i++)
+ subtractTensor[i] = 40;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 2)
+ rppt_subtract_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
{
missingFuncFlag = 1;
diff --git a/utilities/test_suite/HOST/runTests_voxel.py b/utilities/test_suite/HOST/runTests_voxel.py
index aab51f52e..e21dc0f0e 100644
--- a/utilities/test_suite/HOST/runTests_voxel.py
+++ b/utilities/test_suite/HOST/runTests_voxel.py
@@ -38,6 +38,8 @@
qaInputFile = scriptPath + "/../TEST_QA_IMAGES_VOXEL"
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
+caseMin = 0
+caseMax = 3
# Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents
def validate_and_remove_contents(path):
@@ -115,7 +117,8 @@ def func_group_finder(case_number):
return "geometric_augmentations"
else:
return "miscellaneous"
- # Generate a directory name based on certain parameters
+
+# Generate a directory name based on certain parameters
def directory_name_generator(qaMode, affinity, layoutType, case, path):
if qaMode == 0:
functionality_group = func_group_finder(int(case))
@@ -182,8 +185,8 @@ def rpp_test_suite_parser_and_validator():
parser = argparse.ArgumentParser()
parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header")
parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file")
- parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:1)")
- parser.add_argument("--case_end", type = int, default = 1, help = "Testing range ending case # - (0:1)")
+ parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
+ parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)")
parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False)
parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False)
@@ -198,7 +201,7 @@ def rpp_test_suite_parser_and_validator():
validate_path(qaInputFile)
# validate the parameters passed by user
- if ((args.case_start < 0 or args.case_start > 1) or (args.case_end < 0 or args.case_end > 1)):
+ if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)):
print("Starting case# and Ending case# must be in the 0:1 range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
@@ -210,7 +213,7 @@ def rpp_test_suite_parser_and_validator():
elif args.qa_mode < 0 or args.qa_mode > 1:
print("QA mode must be in the 0 / 1. Aborting!")
exit(0)
- elif args.case_list is not None and args.case_start > 0 and args.case_end < 1:
+ elif args.case_list is not None and args.case_start > caseMin and args.case_end < caseMax:
print("Invalid input! Please provide only 1 option between case_list, case_start and case_end")
exit(0)
elif args.num_runs <= 0:
@@ -228,7 +231,7 @@ def rpp_test_suite_parser_and_validator():
args.case_list = [str(x) for x in args.case_list]
else:
for case in args.case_list:
- if int(case) < 0 or int(case) > 1:
+ if int(case) < caseMin or int(case) > caseMax:
print("The case# must be in the 0:1 range!")
exit(0)
@@ -318,7 +321,7 @@ def rpp_test_suite_parser_and_validator():
run_performance_test(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize)
# print the results of qa tests
-supportedCaseList = ['0', '1']
+supportedCaseList = ['0', '1', '2', '3']
nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin
new file mode 100644
index 000000000..628d3785b
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin
new file mode 100644
index 000000000..9b9328536
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin differ
diff --git a/utilities/test_suite/rpp_test_suite_voxel.h b/utilities/test_suite/rpp_test_suite_voxel.h
index effac9c96..af9d67d14 100644
--- a/utilities/test_suite/rpp_test_suite_voxel.h
+++ b/utilities/test_suite/rpp_test_suite_voxel.h
@@ -61,6 +61,8 @@ std::map<int, string> augmentationMap =
{
{0, "fused_multiply_add_scalar"},
{1, "slice"},
+ {2, "add_scalar"},
+ {3, "subtract_scalar"}
};
void replicate_last_file_to_fill_batch(const string& lastFilePath, vector<string>& filePathVector, vector<string>& fileNamesVector, const string& lastFileName, int noOfFiles, int batchCount)