This example shows how to set up a simple audio decoder pipeline.

We load and decode audio data using rocAL.

The input data for this example is a speech dataset stored as WAV files.

We will start by importing rocAL and the other necessary packages, as shown in the cells below.

In [None]:
# Install the third-party packages imported below: opencv-python provides cv2, matplotlib provides pyplot.
!pip install opencv-python
!pip install matplotlib

In [None]:
import random
import numpy as np
from amd.rocal.plugin.pytorch import ROCALAudioIterator
import torch
np.set_printoptions(threshold=1000, edgeitems=10000)
from amd.rocal.pipeline import Pipeline
import amd.rocal.fn as fn
import amd.rocal.types as types
import math
import sys
import cv2
import matplotlib.pyplot as plt
import os

In [None]:
def draw_patches(img, idx, device):
    """Plot a decoded audio tensor as a one-dimensional waveform.

    Args:
        img: Torch tensor holding the decoded audio. It is flattened before
            plotting, so any shape is accepted.
        idx: Label associated with the audio. Accepted for call
            compatibility; it does not influence the plot.
        device: Accepted for call compatibility; it is not used. The tensor
            is always copied to host memory before conversion.
    """
    # Copy to host memory first: Tensor.numpy() only works on CPU tensors,
    # so converting directly would fail for data living on an accelerator.
    audio_data = img.detach().cpu().numpy().flatten()
    plt.plot(audio_data)
    plt.show()
    plt.close()

In [None]:
# Note: Set the ROCAL_DATA_PATH environment variable before running the notebook.

# Read the dataset root from the environment.
rocal_data_path = os.environ.get('ROCAL_DATA_PATH')

if rocal_data_path is None:
    # Stop with a clear message here; continuing would make os.path.join
    # below fail with an unrelated TypeError on None.
    raise RuntimeError("The environment variable ROCAL_DATA_PATH is not set.")

print(f"ROCAL_DATA_PATH is set to: {rocal_data_path}")

# The audio samples are looked up under <ROCAL_DATA_PATH>/rocal_data/audio.
rocal_audio_data_path = os.path.join(rocal_data_path, "rocal_data", "audio")


The rocAL pipeline is configured with batch_size, num_threads, and the CPU/GPU backend.

In [None]:
# Use the file list defined in the MIVisionX-data repo.
file_list = f"{rocal_audio_data_path}/wav_file_list.txt"
# Flag passed to the pipeline as rocal_cpu; True here, i.e. the CPU backend is requested.
rocal_cpu = True
# Pipeline configured with a batch size of 1 and 8 threads.
audio_pipeline = Pipeline(batch_size=1, num_threads=8, rocal_cpu=rocal_cpu)

Define the audio decoder pipeline with `downmix` set to `False`.

In [None]:
# Define the processing graph inside the pipeline's context.
# NOTE(review): presumably operators created in this block are attached to audio_pipeline - confirm.
with audio_pipeline:
    # File reader over rocal_audio_data_path, driven by file_list.
    # The returned labels are not used further in this cell.
    audio, labels = fn.readers.file(file_root=rocal_audio_data_path, file_list=file_list)
    # Audio decoder fed by the reader output, using the same root directory and file list.
    decoded_audio = fn.decoders.audio(
        audio,
        file_root=rocal_audio_data_path,
        file_list_path=file_list,
        # Downmixing is disabled; presumably the original channels are kept - confirm.
        downmix=False,
        # Single-shard setup: shard 0 out of 1.
        shard_id=0,
        num_shards=1,
        stick_to_shard=True)
    # Register the decoded audio as the pipeline output.
    audio_pipeline.set_outputs(decoded_audio)


In [None]:
# Build the pipeline defined above before handing it to the iterator.
audio_pipeline.build()
# Wrap the built pipeline in rocAL's PyTorch audio iterator.
audioIterator = ROCALAudioIterator(audio_pipeline)

The output from the iterator contains the audio data as Torch tensors, the corresponding label, and the region of interest.

In [None]:
# Each item produced by the iterator carries the decoded audio at index 0,
# the labels at index 1 and the regions of interest at index 2.
for batch in audioIterator:
    for decoded in batch[0]:
        # Pair every audio tensor with its label and region of interest.
        samples = zip(decoded, batch[1], batch[2])
        for audio_tensor, label, roi in samples:
            print("Audio shape: ", audio_tensor.shape)
            print("Label: ", label)
            print("Roi: ", roi)
            draw_patches(audio_tensor, label, "cpu")
# Reset the iterator once all batches have been consumed.
audioIterator.reset()