ROCm · rrawther · May 28, 2024 · Feb 16, 2024 · Feb 18, 2024 · Mar 13, 2024
@@ -11,6 +11,7 @@ Documentation for MIVisionX is available at
 
 * Support for advanced GPUs
 * Support for PreEmphasis Filter augmentation in openVX extensions
+* Support for Spectrogram augmentation in openVX extensions
 
 ### Optimizations
 

@@ -157,6 +157,7 @@ list(APPEND SOURCES
         source/tensor/Saturation.cpp
         source/tensor/SequenceRearrange.cpp
         source/tensor/Snow.cpp
+        source/tensor/Spectrogram.cpp
         source/tensor/Vignette.cpp
         source/tensor/WarpAffine.cpp
         source/tensor/SequenceRearrange.cpp

@@ -156,6 +156,7 @@ vx_status Snow_Register(vx_context);
 vx_status Vignette_Register(vx_context);
 vx_status WarpAffine_Register(vx_context);
 vx_status SequenceRearrange_Register(vx_context);
+vx_status Spectrogram_Register(vx_context);
 
 // kernel names
 #define VX_KERNEL_RPP_NOPBATCHPD_NAME                           "org.rpp.NopbatchPD"
@@ -274,12 +275,13 @@ vx_status SequenceRearrange_Register(vx_context);
 #define VX_KERNEL_RPP_PIXELATE_NAME                             "org.rpp.Pixelate"
 #define VX_KERNEL_RPP_VIGNETTE_NAME                             "org.rpp.Vignette"
 #define VX_KERNEL_RPP_WARPAFFINE_NAME                           "org.rpp.WarpAffine"
-#define VX_KERNEL_RPP_BRIGHTNESS_NAME                            "org.rpp.Brightness"
-#define VX_KERNEL_RPP_COPY_NAME                                  "org.rpp.Copy"
-#define VX_KERNEL_RPP_CROPMIRRORNORMALIZE_NAME                   "org.rpp.CropMirrorNormalize"
-#define VX_KERNEL_RPP_NOP_NAME                                   "org.rpp.Nop"
-#define VX_KERNEL_RPP_RESIZE_NAME                                "org.rpp.Resize"
-#define VX_KERNEL_RPP_SEQUENCEREARRANGE_NAME                     "org.rpp.SequenceRearrange"
-#define VX_KERNEL_RPP_PREEMPHASISFILTER_NAME                     "org.rpp.PreemphasisFilter"
+#define VX_KERNEL_RPP_BRIGHTNESS_NAME                           "org.rpp.Brightness"
+#define VX_KERNEL_RPP_COPY_NAME                                 "org.rpp.Copy"
+#define VX_KERNEL_RPP_CROPMIRRORNORMALIZE_NAME                  "org.rpp.CropMirrorNormalize"
+#define VX_KERNEL_RPP_NOP_NAME                                  "org.rpp.Nop"
+#define VX_KERNEL_RPP_RESIZE_NAME                               "org.rpp.Resize"
+#define VX_KERNEL_RPP_SEQUENCEREARRANGE_NAME                    "org.rpp.SequenceRearrange"
+#define VX_KERNEL_RPP_PREEMPHASISFILTER_NAME                    "org.rpp.PreemphasisFilter"
+#define VX_KERNEL_RPP_SPECTROGRAM_NAME                          "org.rpp.Spectrogram"
 
 #endif //_AMDVX_EXT__PUBLISH_KERNELS_H_
@@ -68,16 +68,20 @@ enum vxTensorLayout {
     VX_NHWC = 0,
     VX_NCHW = 1,
     VX_NFHWC = 2,
-    VX_NFCHW = 3
+    VX_NFCHW = 3,
+    VX_NHW = 4,     // Audio/2D layout
+    VX_NFT = 5,     // Frequency major, Used for Spectrogram/MelFilterBank
+    VX_NTF = 6      // Time major, Used for Spectrogram/MelFilterBank
 };
 
 //! Brief The utility functions
 vx_node createNode(vx_graph graph, vx_enum kernelEnum, vx_reference params[], vx_uint32 num);
 vx_status createRPPHandle(vx_node node, vxRppHandle ** pHandle, Rpp32u batchSize, Rpp32u deviceType);
 vx_status releaseRPPHandle(vx_node node, vxRppHandle * handle, Rpp32u deviceType);
 void fillDescriptionPtrfromDims(RpptDescPtr &descPtr, vxTensorLayout layout, size_t *tensorDims);
-void fillAudioDescriptionPtrFromDims(RpptDescPtr &descPtr, size_t *tensorDims);
+void fillAudioDescriptionPtrFromDims(RpptDescPtr &descPtr, size_t *tensorDims, vxTensorLayout layout = vxTensorLayout::VX_NHW);
 RpptDataType getRpptDataType(vx_enum dataType);
+RpptLayout getRpptLayout(vxTensorLayout layout);
 
 class Kernellist
 {

@@ -148,7 +148,8 @@ extern "C"
         VX_KERNEL_RPP_SNOW = VX_KERNEL_BASE(VX_ID_AMD, VX_LIBRARY_RPP) + 0x71,
         VX_KERNEL_RPP_VIGNETTE = VX_KERNEL_BASE(VX_ID_AMD, VX_LIBRARY_RPP) + 0x72,
         VX_KERNEL_RPP_WARPAFFINE = VX_KERNEL_BASE(VX_ID_AMD, VX_LIBRARY_RPP) + 0x73,
-        VX_KERNEL_RPP_PREEMPHASISFILTER = VX_KERNEL_BASE(VX_ID_AMD, VX_LIBRARY_RPP) + 0x74
+        VX_KERNEL_RPP_PREEMPHASISFILTER = VX_KERNEL_BASE(VX_ID_AMD, VX_LIBRARY_RPP) + 0x74,
+        VX_KERNEL_RPP_SPECTROGRAM = VX_KERNEL_BASE(VX_ID_AMD, VX_LIBRARY_RPP) + 0x75
     };
 
 #ifdef __cplusplus

@@ -1876,6 +1876,26 @@ extern "C"
 	 * \return A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
 	 */
 	SHARED_PUBLIC vx_node VX_API_CALL vxExtRppPreemphasisFilter(vx_graph graph, vx_tensor pSrc, vx_tensor pSrcRoi, vx_tensor pDst, vx_array pPreemphCoeff, vx_scalar borderType);
+
+	/*! \brief [Graph] Produces a spectrogram from a 1D signal.
+	* \ingroup group_amd_rpp
+	* \param [in] graph The handle to the graph.
+	* \param [in] pSrc The input tensor in <tt>\ref VX_TYPE_FLOAT32</tt> format data.
+	* \param [in] pSrcRoi The input tensor of batch size in <tt>unsigned int</tt> containing the roi values for the input in xywh/ltrb format.
+	* \param [out] pDst The output tensor (begin) in <tt>\ref VX_TYPE_FLOAT32</tt> format data.
+	* \param [in] pDstRoi The input tensor of batch size in <tt>unsigned int</tt> containing the roi values for the output tensor in xywh/ltrb format.
+	* \param [in] windowFunction The input array in <tt>\ref VX_TYPE_FLOAT32</tt> format containing the samples of the window function that will be multiplied to each extracted window when calculating the STFT.
+	* \param [in] centerWindow The input scalar in <tt>\ref VX_TYPE_BOOL</tt> format indicates whether extracted windows should be padded so that the window function is centered at multiples of window_step.
+	* \param [in] reflectPadding The input scalar in <tt>\ref VX_TYPE_BOOL</tt> format indicates the padding policy when sampling outside the bounds of the signal.
+	* \param [in] spectrogramLayout The input scalar in <tt>\ref VX_TYPE_INT32</tt> format containing the Output spectrogram layout.
+	* \param [in] power The input scalar in <tt>\ref VX_TYPE_INT32</tt> format containing the exponent of the magnitude of the spectrum.
+	* \param [in] nfft The input scalar in <tt>\ref VX_TYPE_INT32</tt> format containing the size of the FFT.
+	* \param [in] windowLength The input scalar in <tt>\ref VX_TYPE_INT32</tt> format containing Window size in number of samples.
+	* \param [in] windowStep The input scalar in <tt>\ref VX_TYPE_INT32</tt> format containing the step between the STFT windows in number of samples.
+	* \return A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
+	*/
+	SHARED_PUBLIC vx_node VX_API_CALL vxExtRppSpectrogram(vx_graph graph, vx_tensor pSrc, vx_tensor pSrcRoi, vx_tensor pDst, vx_tensor pDstRoi, vx_array windowFunction, vx_scalar centerWindow, vx_scalar reflectPadding, vx_scalar spectrogramLayout, vx_scalar power, vx_scalar nfft, vx_scalar windowLength, vx_scalar windowStep);
+
 #ifdef __cplusplus
 }
 #endif

@@ -161,6 +161,7 @@ vx_status get_kernels_to_publish()
     STATUS_ERROR_CHECK(ADD_KERNEL(Snow_Register));
     STATUS_ERROR_CHECK(ADD_KERNEL(Vignette_Register));
     STATUS_ERROR_CHECK(ADD_KERNEL(WarpAffine_Register));
+    STATUS_ERROR_CHECK(ADD_KERNEL(Spectrogram_Register));
 
     return status;
 }

@@ -2558,6 +2558,32 @@ VX_API_ENTRY vx_node VX_API_CALL vxExtRppPreemphasisFilter(vx_graph graph, vx_te
     return node;
 }
 
+VX_API_ENTRY vx_node VX_API_CALL vxExtRppSpectrogram(vx_graph graph, vx_tensor pSrc, vx_tensor pSrcRoi, vx_tensor pDst, vx_tensor pDstRoi, vx_array windowFunction, vx_scalar centerWindows, vx_scalar reflectPadding, vx_scalar spectrogramLayout,
+                                                     vx_scalar power, vx_scalar nfft, vx_scalar windowLength, vx_scalar windowStep) {  // Creates a VX_KERNEL_RPP_SPECTROGRAM node in `graph`; returns NULL when the graph's context status is not VX_SUCCESS.
+    vx_node node = NULL;  // stays NULL if the context check below fails
+    vx_context context = vxGetContext((vx_reference)graph);
+    if (vxGetStatus((vx_reference)context) == VX_SUCCESS) {
+        vx_uint32 devtype = getGraphAffinity(graph);  // graph affinity is forwarded to the kernel as the trailing deviceType parameter
+        vx_scalar deviceType = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_UINT32, &devtype);  // NOTE(review): this scalar is not released in this function - confirm who owns it
+        vx_reference params[] = {  // the 12 caller-supplied references in signature order, followed by deviceType
+            (vx_reference)pSrc,
+            (vx_reference)pSrcRoi,
+            (vx_reference)pDst,
+            (vx_reference)pDstRoi,
+            (vx_reference)windowFunction,
+            (vx_reference)centerWindows,
+            (vx_reference)reflectPadding,
+            (vx_reference)spectrogramLayout,
+            (vx_reference)power,
+            (vx_reference)nfft,
+            (vx_reference)windowLength,
+            (vx_reference)windowStep,
+            (vx_reference)deviceType};
+        node = createNode(graph, VX_KERNEL_RPP_SPECTROGRAM, params, 13);  // 13 == number of entries in params[]
+    }
+    return node;
+}
+
 RpptDataType getRpptDataType(vx_enum vxDataType) {
     switch(vxDataType) {
         case vx_type_e::VX_TYPE_FLOAT32:
@@ -2571,6 +2597,34 @@ RpptDataType getRpptDataType(vx_enum vxDataType) {
     }
 }
 
+RpptLayout getRpptLayout(vxTensorLayout layout) {  // Translates a vxTensorLayout value into an RpptLayout; throws std::runtime_error for values it cannot map.
+    switch(layout) {
+        case vxTensorLayout::VX_NHWC:
+            return RpptLayout::NHWC;
+        case vxTensorLayout::VX_NCHW:
+            return RpptLayout::NCHW;
+        case vxTensorLayout::VX_NFHWC:  // yields the same RpptLayout as VX_NHWC
+            return RpptLayout::NHWC;
+        case vxTensorLayout::VX_NFCHW:  // yields the same RpptLayout as VX_NCHW
+            return RpptLayout::NCHW;
+#if RPP_AUDIO  // the NHW/NFT/NTF layouts are only mapped when RPP_AUDIO is enabled
+        case vxTensorLayout::VX_NHW:
+            return RpptLayout::NHW;
+        case vxTensorLayout::VX_NFT:
+            return RpptLayout::NFT;
+        case vxTensorLayout::VX_NTF:
+            return RpptLayout::NTF;
+#else  // without RPP_AUDIO the same three layouts are rejected with an explicit error
+        case vxTensorLayout::VX_NHW:
+        case vxTensorLayout::VX_NFT:
+        case vxTensorLayout::VX_NTF:
+            throw std::runtime_error("RPP_AUDIO flag disabled, Audio layouts are not supported");
+#endif
+        default:  // any value not listed above
+            throw std::runtime_error("Invalid layout");
+    }
+}
+
 void fillDescriptionPtrfromDims(RpptDescPtr &descPtr, vxTensorLayout layout, size_t *tensorDims) {
     switch(layout) {
         case vxTensorLayout::VX_NHWC: {
@@ -2627,16 +2681,17 @@ void fillDescriptionPtrfromDims(RpptDescPtr &descPtr, vxTensorLayout layout, siz
     }
 }
 
-void fillAudioDescriptionPtrFromDims(RpptDescPtr &descPtr, size_t *tensorDims) {
-    descPtr->n = tensorDims[0];
-    descPtr->h = tensorDims[2];
-    descPtr->w = tensorDims[1];
+void fillAudioDescriptionPtrFromDims(RpptDescPtr &descPtr, size_t *maxTensorDims, vxTensorLayout layout) {  // Fills descPtr's n/h/w/c, strides, numDims and layout from maxTensorDims[0..2] and `layout`.
+    descPtr->n = maxTensorDims[0];
+    descPtr->h = maxTensorDims[1];  // h is now read from index 1 (the removed code read index 2)
+    descPtr->w = maxTensorDims[2];  // w is now read from index 2 (the removed code read index 1)
     descPtr->c = 1;
     descPtr->strides.nStride = descPtr->c * descPtr->w * descPtr->h;
     descPtr->strides.hStride = descPtr->c * descPtr->w;
     descPtr->strides.wStride = descPtr->c;
     descPtr->strides.cStride = 1;
     descPtr->numDims = 4;
+    descPtr->layout = getRpptLayout(layout);  // getRpptLayout throws std::runtime_error for a layout it cannot map
 }
 
 // utility functions

@@ -98,8 +98,12 @@ static vx_status VX_CALLBACK processPreemphasisFilter(vx_node node, const vx_ref
 #endif
     }
     if (data->deviceType == AGO_TARGET_AFFINITY_CPU) {
+#if RPP_AUDIO
         rpp_status = rppt_pre_emphasis_filter_host((float *)data->pSrc, data->pSrcDesc, (float *)data->pDst, data->pDstDesc, (Rpp32s *)data->pSampleSize, data->pPreemphCoeff, RpptAudioBorderType(data->borderType), data->handle->rppHandle);
         return_status = (rpp_status == RPP_SUCCESS) ? VX_SUCCESS : VX_FAILURE;
+#else
+        return_status = VX_ERROR_NOT_SUPPORTED;
+#endif
     }
     return return_status;
 }