ROCm · kiritigowda · May 8, 2024 · Aug 3, 2023 · Aug 3, 2023 · Aug 5, 2023
diff --git a/include/rppdefs.h b/include/rppdefs.h
@@ -369,10 +369,13 @@ typedef enum
  */
 typedef enum
 {
-    NCHW,
-    NHWC,
-    NCDHW,
-    NDHWC
+    NCHW,   // BatchSize-Channels-Height-Width
+    NHWC,   // BatchSize-Height-Width-Channels
+    NCDHW,  // BatchSize-Channels-Depth-Height-Width
+    NDHWC,  // BatchSize-Depth-Height-Width-Channels
+    NHW,    // BatchSize-Height-Width (no channel dimension)
+    NFT,    // BatchSize-Frequency-Time -> frequency-major layout used by Spectrogram / MelFilterBank
+    NTF     // BatchSize-Time-Frequency -> time-major layout used by Spectrogram / MelFilterBank
 } RpptLayout;
 
 /*! \brief RPPT Tensor 2D ROI type enum
@@ -434,6 +437,15 @@ typedef enum
     TF,      //Time Major
 } RpptSpectrogramLayout;
 
+/*! \brief RPPT Mel Scale Formula enum
+ * \ingroup group_rppdefs
+ */
+typedef enum
+{
+    SLANEY = 0,  // Follows Slaney's MATLAB Auditory Modelling Work behavior: linear below 1000 Hz, logarithmic above
+    HTK,         // Follows O'Shaughnessy's book formula, consistent with Hidden Markov Toolkit (HTK), m = 2595 * log10(1 + (f/700))
+} RpptMelScaleFormula;
+
 /*! \brief RPPT Tensor 2D ROI LTRB struct
  * \ingroup group_rppdefs
  */

diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
@@ -132,6 +132,26 @@ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
  */
 RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, bool centerWindows, bool reflectPadding, Rpp32f *windowFunction, Rpp32s nfft, Rpp32s power, Rpp32s windowLength, Rpp32s windowStep, RpptSpectrogramLayout layout, rppHandle_t rppHandle);
 
+/*! \brief Mel filter bank augmentation on HOST backend
+ * \details Mel filter bank augmentation for audio data: maps the frequency bins of each spectrogram in the batch onto numFilter mel-scale filters
+ * \param[in] srcPtr source tensor in HOST memory
+ * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT; other layouts are rejected with RPP_ERROR_INVALID_SRC_LAYOUT)
+ * \param[out] dstPtr destination tensor in HOST memory
+ * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT; other layouts are rejected with RPP_ERROR_INVALID_DST_LAYOUT)
+ * \param[in] srcDimsTensor per-sample source dimensions stored as pairs (number of frequency bins, number of frames) (1D tensor in HOST memory, of size batchSize * 2)
+ * \param[in] maxFreq maximum frequency in Hz, if not provided maxFreq = sampleRate / 2
+ * \param[in] minFreq minimum frequency in Hz
+ * \param[in] melFormula formula used to convert frequencies from hertz to mel and from mel to hertz (SLANEY / HTK)
+ * \param[in] numFilter number of mel filters
+ * \param[in] sampleRate sampling rate of the audio
+ * \param[in] normalize boolean variable that determines whether the filter weights are normalized or not
+ * \param[in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
+ * \return A <tt> \ref RppStatus</tt> enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, Rpp32f maxFreq, Rpp32f minFreq, RpptMelScaleFormula melFormula, Rpp32s numFilter, Rpp32f sampleRate, bool normalize, rppHandle_t rppHandle);
+
 /*! \brief Resample augmentation on HOST backend
 * \details Resample augmentation for audio data
 * \param[in] srcPtr source tensor in HOST memory

diff --git a/src/modules/cpu/host_tensor_audio_augmentations.hpp b/src/modules/cpu/host_tensor_audio_augmentations.hpp
@@ -30,6 +30,7 @@ SOFTWARE.
 #include "kernel/pre_emphasis_filter.hpp"
 #include "kernel/down_mixing.hpp"
 #include "kernel/spectrogram.hpp"
+#include "kernel/mel_filter_bank.hpp"
 #include "kernel/resample.hpp"
 
 #endif // HOST_TENSOR_AUDIO_AUGMENTATIONS_HPP
diff --git a/src/modules/cpu/kernel/mel_filter_bank.hpp b/src/modules/cpu/kernel/mel_filter_bank.hpp
@@ -0,0 +1,252 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+struct BaseMelScale
+{
+    // Abstract Hz <-> mel conversion interface; each mel formula provides its own implementation
+    virtual Rpp32f hz_to_mel(Rpp32f hz) = 0;     // frequency in Hz -> mel value
+    virtual Rpp32f mel_to_hz(Rpp32f mel) = 0;    // mel value -> frequency in Hz
+    virtual ~BaseMelScale() = default;           // virtual so an object can be destroyed through a BaseMelScale pointer
+};
+
+struct HtkMelScale : public BaseMelScale
+{
+    // HTK formula: mel = 1127 * ln(1 + hz / 700), the natural-log form of 2595 * log10(1 + hz / 700)
+    Rpp32f hz_to_mel(Rpp32f hz) override { return 1127.0f * std::log(1.0f + (hz / 700.0f)); }
+    // Inverse of hz_to_mel: hz = 700 * (exp(mel / 1127) - 1)
+    Rpp32f mel_to_hz(Rpp32f mel) override { return 700.0f * (std::exp(mel / 1127.0f) - 1.0f); }
+};
+
+struct SlaneyMelScale : public BaseMelScale
+{
+    // Slaney mel scale: linear (fsp Hz per mel) below minLogHz, logarithmic from minLogHz upwards.
+    // The const members below are the scale parameters; derived values are computed once per object.
+    const Rpp32f freqLow = 0;
+    const Rpp32f fsp = 200.0 / 3.0;
+    const Rpp32f minLogHz = 1000.0;
+    const Rpp32f minLogMel = (minLogHz - freqLow) / fsp;
+    const Rpp32f stepLog = 0.068751777;  // Equivalent to std::log(6.4) / 27.0;
+
+    // Reciprocals are precomputed so that hz_to_mel multiplies instead of dividing
+    const Rpp32f invMinLogHz = 1.0f / 1000.0;
+    const Rpp32f invStepLog = 1.0f / stepLog;
+    const Rpp32f invFsp = 1.0f / fsp;
+
+    // Converts a frequency in Hz to its mel value
+    Rpp32f hz_to_mel(Rpp32f hz) override
+    {
+        // Logarithmic region at and above minLogHz
+        if (hz >= minLogHz)
+            return minLogMel + std::log(hz * invMinLogHz) * invStepLog;
+        // Linear region below minLogHz
+        return (hz - freqLow) * invFsp;
+    }
+
+    // Converts a mel value back to a frequency in Hz (inverse of hz_to_mel)
+    Rpp32f mel_to_hz(Rpp32f mel) override
+    {
+        // Logarithmic region at and above minLogMel
+        if (mel >= minLogMel)
+            return minLogHz * std::exp(stepLog * (mel - minLogMel));
+        // Linear region below minLogMel
+        return freqLow + mel * fsp;
+    }
+};
+
+// Applies a mel filter bank to every spectrogram of the batch (HOST backend, F32).
+// Source rows are frequency bins and each row holds the frames of that bin (row pitch = strides.hStride).
+// Every FFT bin between minFreq and maxFreq is split linearly between the two neighbouring mel filters and
+// accumulated into the numFilter destination rows of the sample. Per-sample work arrays come from the handle's
+// host scratch buffer. Always returns RPP_SUCCESS.
+RppStatus mel_filter_bank_host_tensor(Rpp32f *srcPtr,                   // source spectrograms, one strides.nStride-sized slot per sample
+                                      RpptDescPtr srcDescPtr,           // source descriptor: n, strides.nStride and strides.hStride are read
+                                      Rpp32f *dstPtr,                   // destination buffer receiving numFilter rows per sample
+                                      RpptDescPtr dstDescPtr,           // destination descriptor: strides.nStride and strides.hStride are read
+                                      Rpp32s *srcDimsTensor,            // per sample: [2 * i] = number of frequency bins, [2 * i + 1] = number of frames
+                                      Rpp32f maxFreqVal,                // upper frequency bound in Hz; <= 0 or above sampleRate / 2 selects sampleRate / 2
+                                      Rpp32f minFreqVal,                // lower frequency bound in Hz
+                                      RpptMelScaleFormula melFormula,   // HTK or SLANEY Hz <-> mel conversion
+                                      Rpp32s numFilter,                 // number of mel filters (= destination rows written per sample)
+                                      Rpp32f sampleRate,                // sampling rate of the audio
+                                      bool normalize,                   // when true, filter i is scaled by 2 / (f2 - f0) of its band edges
+                                      rpp::Handle& handle)              // supplies the thread count and the host scratch buffer
+{
+    // Select the Hz <-> mel conversion: HTK when requested, Slaney for SLANEY and for any unrecognised value.
+    // Both converters are stack objects, so no heap allocation has to be released on exit.
+    HtkMelScale htkScale;
+    SlaneyMelScale slaneyScale;
+    BaseMelScale *melScalePtr = &slaneyScale;
+    if (melFormula == RpptMelScaleFormula::HTK)
+        melScalePtr = &htkScale;
+    Rpp32u numThreads = handle.GetNumThreads();
+    Rpp32u batchSize = srcDescPtr->n;
+    Rpp32f *scratchMem = handle.GetInitHandle()->mem.mcpu.scratchBufferHost;
+
+    Rpp32f maxFreq = (maxFreqVal > 0.0f && maxFreqVal <= sampleRate / 2) ? maxFreqVal : sampleRate / 2;   // honour the caller's bound, else use sampleRate / 2
+    Rpp32f minFreq = minFreqVal;
+
+    // Convert lower, higher frequencies to mel scale and find melStep (numFilter filters need numFilter + 1 intervals)
+    Rpp64f melLow = melScalePtr->hz_to_mel(minFreq);
+    Rpp64f melHigh = melScalePtr->hz_to_mel(maxFreq);
+    Rpp64f melStep = (melHigh - melLow) / (numFilter + 1);
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for(int batchCount = 0; batchCount < batchSize; batchCount++)
+    {
+        Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride;
+        Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        // Extract nfft, number of Frames, numBins (srcDimsTensor stores numBins = nfft / 2 + 1 followed by numFrames)
+        Rpp32s nfft = (srcDimsTensor[batchCount * 2] - 1) * 2;
+        Rpp32s numBins = nfft / 2 + 1;
+        Rpp32s numFrames = srcDimsTensor[batchCount * 2 + 1];
+
+        // Find hzStep, the frequency spacing between consecutive FFT bins
+        Rpp64f hzStep = static_cast<Rpp64f>(sampleRate) / nfft;
+        Rpp64f invHzStep = 1.0 / hzStep;
+
+        // Find fftBinStart and fftBinEnd (half-open bin range covered by [minFreq, maxFreq), clamped to numBins)
+        Rpp32s fftBinStart = std::ceil(minFreq * invHzStep);
+        Rpp32s fftBinEnd = std::ceil(maxFreq * invHzStep);
+        fftBinEnd = std::min(fftBinEnd, numBins);
+
+        // Carve normFactors, weightsDown and intervals out of the scratch buffer (NOTE(review): offsets use this sample's numBins - confirm all samples share it)
+        Rpp32f *normFactors = scratchMem + (batchCount * numFilter);
+        std::fill(normFactors, normFactors + numFilter, 1.f);           // normFactors contain numFilter values of type float
+        Rpp32f *weightsDown = scratchMem + (batchSize * numFilter) + (batchCount * numBins);
+        std::fill(weightsDown, weightsDown + numBins, 0.0f);            // weightsDown contain numBins values of type float, all cleared
+        Rpp32s *intervals = reinterpret_cast<Rpp32s *>(weightsDown + (batchSize * numBins));
+        std::fill(intervals, intervals + numBins, -1);                  // intervals contain numBins values of type integer, -1 = bin not assigned
+
+        Rpp32s fftBin = fftBinStart;
+        Rpp64f mel0 = melLow, mel1 = melLow + melStep;
+        Rpp64f fIter = fftBin * hzStep;
+        for (int interval = 0; interval < numFilter + 1; interval++, mel0 = mel1, mel1 += melStep)
+        {
+            Rpp64f f0 = melScalePtr->mel_to_hz(mel0);
+            Rpp64f f1 = melScalePtr->mel_to_hz(interval == numFilter ? melHigh : mel1);   // last interval ends exactly at melHigh
+            Rpp64f slope = 1. / (f1 - f0);
+
+            if (normalize && interval < numFilter)
+            {
+                Rpp64f f2 = melScalePtr->mel_to_hz(mel1 + melStep);
+                normFactors[interval] = 2.0 / (f2 - f0);
+            }
+
+            for (; fftBin < fftBinEnd && fIter < f1; fftBin++, fIter = fftBin * hzStep)
+            {
+                weightsDown[fftBin] = (f1 - fIter) * slope;   // falls linearly from 1 at f0 to 0 at f1
+                intervals[fftBin] = interval;
+            }
+        }
+
+        Rpp32u maxFrames = std::min(static_cast<Rpp32u>(numFrames + 8), dstDescPtr->strides.hStride);
+        Rpp32u maxAlignedLength = maxFrames & ~7;
+        Rpp32u vectorIncrement = 8;
+
+        // Set ROI values in dst buffer to 0.0 because the accumulation below adds into dst
+        for(int i = 0; i < numFilter; i++)
+        {
+            Rpp32f *dstPtrRow = dstPtrTemp + i * dstDescPtr->strides.hStride;
+            Rpp32u vectorLoopCount = 0;
+            for(; vectorLoopCount < maxAlignedLength; vectorLoopCount += 8)
+            {
+                _mm256_storeu_ps(dstPtrRow, avx_p0);
+                dstPtrRow += 8;
+            }
+            for(; vectorLoopCount < maxFrames; vectorLoopCount++)
+                *dstPtrRow++ = 0.0f;
+        }
+
+        Rpp32u alignedLength = numFrames & ~7;
+        __m256 pSrc, pDst;
+        Rpp32f *srcRowPtr = srcPtrTemp + fftBinStart * srcDescPtr->strides.hStride;
+        for (int64_t bin = fftBinStart; bin < fftBinEnd; bin++)
+        {
+            auto filterUp = intervals[bin];              // filter whose rising slope covers this bin
+            auto weightUp = 1.0f - weightsDown[bin];
+            auto filterDown = filterUp - 1;              // filter whose falling slope covers this bin
+            auto weightDown = weightsDown[bin];
+
+            if (filterDown >= 0)
+            {
+                Rpp32f *dstRowPtrTemp = dstPtrTemp + filterDown * dstDescPtr->strides.hStride;
+                Rpp32f *srcRowPtrTemp = srcRowPtr;
+
+                if (normalize)
+                    weightDown *= normFactors[filterDown];
+                __m256 pWeightDown = _mm256_set1_ps(weightDown);
+
+                int vectorLoopCount = 0;
+                for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    pSrc = _mm256_loadu_ps(srcRowPtrTemp);
+                    pSrc = _mm256_mul_ps(pSrc, pWeightDown);
+                    pDst = _mm256_loadu_ps(dstRowPtrTemp);
+                    pDst = _mm256_add_ps(pDst, pSrc);
+                    _mm256_storeu_ps(dstRowPtrTemp, pDst);
+                    dstRowPtrTemp += vectorIncrement;
+                    srcRowPtrTemp += vectorIncrement;
+                }
+
+                for (; vectorLoopCount < numFrames; vectorLoopCount++)
+                    (*dstRowPtrTemp++) += weightDown * (*srcRowPtrTemp++);
+            }
+
+            if (filterUp >= 0 && filterUp < numFilter)
+            {
+                Rpp32f *dstRowPtrTemp = dstPtrTemp + filterUp *  dstDescPtr->strides.hStride;
+                Rpp32f *srcRowPtrTemp = srcRowPtr;
+
+                if (normalize)
+                    weightUp *= normFactors[filterUp];
+                __m256 pWeightUp = _mm256_set1_ps(weightUp);
+
+                int vectorLoopCount = 0;
+                for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    pSrc = _mm256_loadu_ps(srcRowPtrTemp);
+                    pSrc = _mm256_mul_ps(pSrc, pWeightUp);
+                    pDst = _mm256_loadu_ps(dstRowPtrTemp);
+                    pDst = _mm256_add_ps(pDst, pSrc);
+                    _mm256_storeu_ps(dstRowPtrTemp, pDst);
+                    dstRowPtrTemp += vectorIncrement;
+                    srcRowPtrTemp += vectorIncrement;
+                }
+
+                for (; vectorLoopCount < numFrames; vectorLoopCount++)
+                    (*dstRowPtrTemp++) += weightUp * (*srcRowPtrTemp++);
+            }
+
+            srcRowPtr += srcDescPtr->strides.hStride;
+        }
+    }
+
+    return RPP_SUCCESS;
+}
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -197,6 +197,46 @@ RppStatus rppt_spectrogram_host(RppPtr_t srcPtr,
     }
 }
 
+/******************** mel_filter_bank ********************/
+
+// Validates layouts and data types, then dispatches to the F32 HOST kernel.
+// Returns RPP_ERROR_INVALID_SRC_LAYOUT / RPP_ERROR_INVALID_DST_LAYOUT for non-NFT descriptors,
+// RPP_ERROR_NOT_IMPLEMENTED for non-F32 data, otherwise the status reported by the kernel.
+RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr,
+                                    RpptDescPtr srcDescPtr,
+                                    RppPtr_t dstPtr,
+                                    RpptDescPtr dstDescPtr,
+                                    Rpp32s* srcDimsTensor,
+                                    Rpp32f maxFreq,
+                                    Rpp32f minFreq,
+                                    RpptMelScaleFormula melFormula,
+                                    Rpp32s numFilter,
+                                    Rpp32f sampleRate,
+                                    bool normalize,
+                                    rppHandle_t rppHandle)
+{
+    if (srcDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_SRC_LAYOUT;   // only the NFT layout is accepted for the source
+    if (dstDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_DST_LAYOUT;   // only the NFT layout is accepted for the destination
+
+    // Only the F32 -> F32 variant is implemented
+    if ((srcDescPtr->dataType != RpptDataType::F32) || (dstDescPtr->dataType != RpptDataType::F32))
+        return RPP_ERROR_NOT_IMPLEMENTED;
+
+    // Forward the kernel status instead of unconditionally reporting success
+    return mel_filter_bank_host_tensor(static_cast<Rpp32f*>(srcPtr),
+                                       srcDescPtr,
+                                       static_cast<Rpp32f*>(dstPtr),
+                                       dstDescPtr,
+                                       srcDimsTensor,
+                                       maxFreq,
+                                       minFreq,
+                                       melFormula,
+                                       numFilter,
+                                       sampleRate,
+                                       normalize,
+                                       rpp::deref(rppHandle));
+}
+
 /******************** resample ********************/
 
 RppStatus rppt_resample_host(RppPtr_t srcPtr,