RPP Voxel 3D Tensor Add/Subtract scalar on HOST and HIP #272

Merged Feb 9, 2024 (27 commits)

Changes from 26 commits

Commits (27)
6e536f4
added HOST support for voxel add kernel
sampath1117 Jul 21, 2023
806b0a7
added HIP support for voxel add kernel
sampath1117 Jul 21, 2023
cb9f53f
added test suite support for add scalar
sampath1117 Dec 11, 2023
bc75a4e
added Doxygen support and modified hip kernel function names as per n…
sampath1117 Dec 11, 2023
e775ca0
added HOST support for voxel subtract kernel
sampath1117 Jul 21, 2023
d2b51a6
added HIP support for voxel subtract kernel
sampath1117 Jul 21, 2023
379d661
added test suite support
sampath1117 Dec 12, 2023
bee69e8
updated the golden outputs for subtract with correct values
sampath1117 Dec 13, 2023
3548286
removed unnecessary validation checks
sampath1117 Dec 13, 2023
6a63230
Merge pull request #209 from sampath1117/sr/voxel_add_and_subtract_pr
r-abishek Dec 13, 2023
27be445
Remove double spaces
r-abishek Dec 13, 2023
d6d576f
Fix header
r-abishek Dec 13, 2023
f976f43
Fix all retval docs
r-abishek Dec 13, 2023
6bc28dc
Fix docs to add memory type
r-abishek Dec 14, 2023
e45d6ee
Fix comment
r-abishek Dec 14, 2023
4c61514
Add divider comment
r-abishek Dec 14, 2023
23c45bf
Use post-increment efficiently
r-abishek Dec 14, 2023
bbb7315
Merge branch 'master' into ar/voxel_add_subtract
sampath1117 Dec 28, 2023
4b577ce
Merge pull request #214 from sampath1117/sr/add_subtract_merge_pr
r-abishek Jan 3, 2024
a7f17f7
RPP Test Suite Upgrade 4 - CSV to BIN conversions for file size reduc…
r-abishek Jan 26, 2024
16ded05
Merge branch 'develop' into sr/add_subtract_pr_changes
sampath1117 Jan 29, 2024
4779b23
converted add and subtract scalar golden outputs to bin files
sampath1117 Jan 29, 2024
1427969
changed copyright from 2023 to 2024
sampath1117 Jan 29, 2024
eca4faf
Merge pull request #225 from sampath1117/sr/add_subtract_pr_changes
r-abishek Jan 29, 2024
de2b555
Update add_scalar.hpp license
r-abishek Jan 31, 2024
44786aa
Update subtract_scalar.hpp license
r-abishek Jan 31, 2024
3635701
Merge branch 'master' of https://github.com/GPUOpen-ProfessionalCompu…
r-abishek Feb 1, 2024
130 changes: 104 additions & 26 deletions include/rppt_tensor_arithmetic_operations.h
@@ -23,66 +23,144 @@ THE SOFTWARE.
#ifndef RPPT_TENSOR_ARITHMETIC_OPERATIONS_H
#define RPPT_TENSOR_ARITHMETIC_OPERATIONS_H

#include "rpp.h"
#include "rppdefs.h"
#ifdef __cplusplus
extern "C" {
#endif

/*!
 * \file
 * \brief RPPT Tensor Operations - Arithmetic Operations.
 * \defgroup group_tensor_arithmetic_operations RPPT Tensor Operations - Arithmetic Operations.
 * \brief RPPT Tensor Operations - Arithmetic Operations.
 */

/*! \addtogroup group_rppt_tensor_arithmetic_operations
 * @{
 */

/*! \brief Fused multiply add scalar augmentation on HOST backend
 * \details This function performs the fmadd operation on a batch of 4D tensors.
 * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
 * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
 * \param [in] srcPtr source tensor in HOST memory
 * \param [in] srcGenericDescPtr source tensor descriptor
 * \param [out] dstPtr destination tensor in HOST memory
 * \param [in] dstGenericDescPtr destination tensor descriptor
 * \param [in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values)
 * \param [in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values)
 * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
 * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
 * \return A <tt> \ref RppStatus</tt> enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
 */
RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief Fused multiply add scalar augmentation on HIP backend
 * \details This function performs the fmadd operation on a batch of 4D tensors.
 * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
 * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
 * \param [in] srcPtr source tensor in HIP memory
 * \param [in] srcGenericDescPtr source tensor descriptor
 * \param [out] dstPtr destination tensor in HIP memory
 * \param [in] dstGenericDescPtr destination tensor descriptor
 * \param [in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values)
 * \param [in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values)
 * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
 * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
 * \return A <tt> \ref RppStatus</tt> enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
 */
RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Add scalar augmentation on HOST backend
 * \details This function performs the addition operation on a batch of 4D tensors.
 * It adds a corresponding element from the 'addTensor' to the source tensor, and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
 * \param [in] srcPtr source tensor in HOST memory
 * \param [in] srcGenericDescPtr source tensor descriptor
 * \param [out] dstPtr destination tensor in HOST memory
 * \param [in] dstGenericDescPtr destination tensor descriptor
 * \param [in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values)
 * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
 * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
 * \return A <tt> \ref RppStatus</tt> enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
 */
RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief Add scalar augmentation on HIP backend
 * \details This function performs the addition operation on a batch of 4D tensors.
 * It adds a corresponding element from the 'addTensor' to the source tensor, and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
 * \param [in] srcPtr source tensor in HIP memory
 * \param [in] srcGenericDescPtr source tensor descriptor
 * \param [out] dstPtr destination tensor in HIP memory
 * \param [in] dstGenericDescPtr destination tensor descriptor
 * \param [in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values)
 * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
 * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
 * \return A <tt> \ref RppStatus</tt> enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
 */
RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Subtract scalar augmentation on HOST backend
 * \details This function performs the subtraction operation on a batch of 4D tensors.
 * It takes a corresponding element from 'subtractTensor' and subtracts it from the source tensor. The result is stored in the destination tensor.
 * Support added for f32 -> f32 datatype.
 * \param [in] srcPtr source tensor in HOST memory
 * \param [in] srcGenericDescPtr source tensor descriptor
 * \param [out] dstPtr destination tensor in HOST memory
 * \param [in] dstGenericDescPtr destination tensor descriptor
 * \param [in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values)
 * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
 * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
 * \return A <tt> \ref RppStatus</tt> enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
 */
RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief Subtract scalar augmentation on HIP backend
 * \details This function performs the subtraction operation on a batch of 4D tensors.
 * It takes a corresponding element from 'subtractTensor' and subtracts it from the source tensor. The result is stored in the destination tensor.
 * Support added for f32 -> f32 datatype.
 * \param [in] srcPtr source tensor in HIP memory
 * \param [in] srcGenericDescPtr source tensor descriptor
 * \param [out] dstPtr destination tensor in HIP memory
 * \param [in] dstGenericDescPtr destination tensor descriptor
 * \param [in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values)
 * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
 * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
 * \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
 * \return A <tt> \ref RppStatus</tt> enumeration.
 * \retval RPP_SUCCESS Successful completion.
 * \retval RPP_ERROR* Unsuccessful completion.
 */
RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! @}
 */

#ifdef __cplusplus
}
#endif
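For reviewers trying the new entry points, here is a minimal HOST-side usage sketch (illustrative only, not part of this PR). It assumes the RpptGenericDesc and RpptROI3D field names used in the kernels below, plus the rppCreateWithBatchSize()/rppDestroyHost() helpers referenced in the Doxygen above; exact handle-creation signatures can vary between RPP versions. rppt_subtract_scalar_host is called the same way, with a subtractTensor argument in place of addTensor.

// add_scalar_example.cpp -- hypothetical usage sketch for rppt_add_scalar_host
#include "rpp.h"
#include "rppdefs.h"
#include <vector>

int main()
{
    const Rpp32u n = 1, c = 1, d = 4, h = 4, w = 4;    // single-image NCDHW batch

    // One descriptor shared by source and destination (same shape, F32, NCDHW)
    RpptGenericDesc desc;
    desc.dataType = RpptDataType::F32;
    desc.layout = RpptLayout::NCDHW;
    desc.numDims = 5;
    desc.offsetInBytes = 0;
    desc.dims[0] = n; desc.dims[1] = c; desc.dims[2] = d; desc.dims[3] = h; desc.dims[4] = w;
    desc.strides[4] = 1;
    desc.strides[3] = w;
    desc.strides[2] = w * h;
    desc.strides[1] = w * h * d;
    desc.strides[0] = w * h * d * c;

    std::vector<Rpp32f> src(n * desc.strides[0], 1.0f), dst(src.size());
    Rpp32f addValues[n] = {10.0f};    // one scalar per image in the batch

    // Full-extent ROI for the one image, in XYZWHD form
    RpptROI3D roi;
    roi.xyzwhdROI.xyz.x = 0; roi.xyzwhdROI.xyz.y = 0; roi.xyzwhdROI.xyz.z = 0;
    roi.xyzwhdROI.roiWidth = w; roi.xyzwhdROI.roiHeight = h; roi.xyzwhdROI.roiDepth = d;

    rppHandle_t handle;
    rppCreateWithBatchSize(&handle, n);    // signature may differ across RPP versions
    rppt_add_scalar_host(src.data(), &desc, dst.data(), &desc, addValues,
                         &roi, RpptRoi3DType::XYZWHD, handle);
    rppDestroyHost(handle);
    return 0;    // every element of dst is now 11.0f
}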
1 change: 1 addition & 0 deletions include/rppt_tensor_statistical_operations.h
@@ -22,6 +22,7 @@ THE SOFTWARE.

#ifndef RPPT_TENSOR_STATISTICAL_OPERATIONS_H
#define RPPT_TENSOR_STATISTICAL_OPERATIONS_H

#include "rpp.h"
#include "rppdefs.h"
#ifdef __cplusplus
12 changes: 12 additions & 0 deletions src/include/cpu/rpp_cpu_common.hpp
@@ -2429,6 +2429,18 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP

// Compute Functions for RPP Tensor API

inline void compute_subtract_16_host(__m256 *p, __m256 *pSubtractParam)
{
    p[0] = _mm256_sub_ps(p[0], pSubtractParam[0]);    // subtract adjustment
    p[1] = _mm256_sub_ps(p[1], pSubtractParam[0]);    // subtract adjustment
}

inline void compute_add_16_host(__m256 *p, __m256 *pAddParam)
{
    p[0] = _mm256_add_ps(p[0], pAddParam[0]);    // add adjustment
    p[1] = _mm256_add_ps(p[1], pAddParam[0]);    // add adjustment
}

inline void compute_rmn_24_host(__m256 *p, __m256 *pRMNParams)
{
    p[0] = _mm256_mul_ps(_mm256_sub_ps(p[0], pRMNParams[0]), pRMNParams[1]);
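The two new helpers each update a pair of __m256 registers, i.e. 16 packed floats per call, which is why the kernels below walk the buffer in strides of 16. A standalone sketch of the same arithmetic (plain AVX, independent of the RPP load/store wrappers):

#include <immintrin.h>
#include <cstdio>

int main()
{
    float buf[16];
    for (int i = 0; i < 16; i++) buf[i] = (float)i;

    __m256 p[2];
    __m256 pAddParam = _mm256_set1_ps(10.0f);    // broadcast the per-image scalar
    p[0] = _mm256_loadu_ps(buf);                 // lanes 0..7
    p[1] = _mm256_loadu_ps(buf + 8);             // lanes 8..15

    // same arithmetic as compute_add_16_host(p, &pAddParam)
    p[0] = _mm256_add_ps(p[0], pAddParam);
    p[1] = _mm256_add_ps(p[1], pAddParam);

    _mm256_storeu_ps(buf, p[0]);
    _mm256_storeu_ps(buf + 8, p[1]);
    for (int i = 0; i < 16; i++) printf("%.1f ", buf[i]);    // 10.0 11.0 ... 25.0
    return 0;
}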
2 changes: 2 additions & 0 deletions src/modules/cpu/host_tensor_arithmetic_operations.hpp
@@ -24,5 +24,7 @@ THE SOFTWARE.
#define HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP

#include "kernel/fused_multiply_add_scalar.hpp"
#include "kernel/add_scalar.hpp"
#include "kernel/subtract_scalar.hpp"

#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
152 changes: 152 additions & 0 deletions src/modules/cpu/kernel/add_scalar.hpp
@@ -0,0 +1,152 @@
/*
MIT License

Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "rppdefs.h"
#include "rpp_cpu_simd.hpp"
#include "rpp_cpu_common.hpp"

RppStatus add_scalar_f32_f32_host_tensor(Rpp32f *srcPtr,
                                         RpptGenericDescPtr srcGenericDescPtr,
                                         Rpp32f *dstPtr,
                                         RpptGenericDescPtr dstGenericDescPtr,
                                         Rpp32f *addTensor,
                                         RpptROI3DPtr roiGenericPtrSrc,
                                         RpptRoi3DType roiType,
                                         RppLayoutParams layoutParams,
                                         rpp::Handle& handle)
{
    RpptROI3D roiDefault;
    if(srcGenericDescPtr->layout == RpptLayout::NCDHW)
        roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
    else if(srcGenericDescPtr->layout == RpptLayout::NDHWC)
        roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
    Rpp32u numThreads = handle.GetNumThreads();

    omp_set_dynamic(0);
#pragma omp parallel for num_threads(numThreads)
    for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
    {
        RpptROI3D roi;
        RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
        compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);

        Rpp32f *srcPtrImage, *dstPtrImage;
        srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
        dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];

        Rpp32f addParam = addTensor[batchCount];
        Rpp32f *srcPtrChannel, *dstPtrChannel;
        dstPtrChannel = dstPtrImage;

        Rpp32u vectorIncrement = 16;
        Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
        Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
        __m256 pAddParam = _mm256_set1_ps(addParam);

        // Add without fused output-layout toggle (NCDHW -> NCDHW)
        if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
        {
            srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);

            for(int c = 0; c < layoutParams.channelParam; c++)
            {
                Rpp32f *srcPtrDepth, *dstPtrDepth;
                srcPtrDepth = srcPtrChannel;
                dstPtrDepth = dstPtrChannel;
                for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
                {
                    Rpp32f *srcPtrRow, *dstPtrRow;
                    srcPtrRow = srcPtrDepth;
                    dstPtrRow = dstPtrDepth;
                    for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
                    {
                        Rpp32f *srcPtrTemp, *dstPtrTemp;
                        srcPtrTemp = srcPtrRow;
                        dstPtrTemp = dstPtrRow;
                        int vectorLoopCount = 0;
                        for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
                        {
                            __m256 p[2];
                            rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p);    // simd loads
                            compute_add_16_host(p, &pAddParam);    // add adjustment
                            rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p);    // simd stores
                            srcPtrTemp += vectorIncrement;
                            dstPtrTemp += vectorIncrement;
                        }
                        for (; vectorLoopCount < bufferLength; vectorLoopCount++)
                        {
                            *dstPtrTemp++ = *srcPtrTemp++ + addParam;
                        }
                        srcPtrRow += srcGenericDescPtr->strides[3];
                        dstPtrRow += dstGenericDescPtr->strides[3];
                    }
                    srcPtrDepth += srcGenericDescPtr->strides[2];
                    dstPtrDepth += dstGenericDescPtr->strides[2];
                }
                srcPtrChannel += srcGenericDescPtr->strides[1];
                dstPtrChannel += dstGenericDescPtr->strides[1];
            }
        }
        // Add without fused output-layout toggle (NDHWC -> NDHWC)
        else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
        {
            srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
            Rpp32f *srcPtrDepth = srcPtrChannel;
            Rpp32f *dstPtrDepth = dstPtrChannel;
            for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
            {
                Rpp32f *srcPtrRow, *dstPtrRow;
                srcPtrRow = srcPtrDepth;
                dstPtrRow = dstPtrDepth;
                for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
                {
                    Rpp32f *srcPtrTemp, *dstPtrTemp;
                    srcPtrTemp = srcPtrRow;
                    dstPtrTemp = dstPtrRow;

                    int vectorLoopCount = 0;
                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
                    {
                        __m256 p[2];
                        rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p);    // simd loads
                        compute_add_16_host(p, &pAddParam);    // add adjustment
                        rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p);    // simd stores
                        srcPtrTemp += vectorIncrement;
                        dstPtrTemp += vectorIncrement;
                    }
                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
                    {
                        *dstPtrTemp++ = *srcPtrTemp++ + addParam;
                    }
                    srcPtrRow += srcGenericDescPtr->strides[2];
                    dstPtrRow += dstGenericDescPtr->strides[2];
                }
                srcPtrDepth += srcGenericDescPtr->strides[1];
                dstPtrDepth += dstGenericDescPtr->strides[1];
            }
        }
    }

    return RPP_SUCCESS;
}
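As a cross-check for the SIMD path above, a scalar reference is handy: the kernel must produce identical output element-for-element, including in the sub-16-element remainder loop. A minimal sketch, assuming a full-extent ROI over one contiguous image buffer (hypothetical validation helper, not part of the PR):

#include <cstddef>

// Scalar reference for dst = src + addParam over one contiguous image buffer.
static void add_scalar_reference(const float *src, float *dst, size_t numElements, float addParam)
{
    for (size_t i = 0; i < numElements; i++)
        dst[i] = src[i] + addParam;
}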