<a href="https://colab.research.google.com/github/TirendazAcademy/Audio-Data-with-HuggingFace/blob/main/1-Working-with-Audio-Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load and explore an audio dataset

In [None]:
!pip install -q datasets[audio]

In [None]:
# Show which version of 🤗 Datasets is installed.
import datasets
datasets.__version__

In [None]:
from datasets import load_dataset

# Load the train split of the "en-AU" configuration of "PolyAI/minds14",
# then display the resulting dataset object.
minds = load_dataset("PolyAI/minds14", name="en-AU", split="train")
minds

In [None]:
# Display the dataset summary.
minds

In [None]:
# Inspect the `lang_id` feature: the feature itself, its number of classes and
# its class names, printed one item per line.
print(
    minds.features["lang_id"],
    minds.features["lang_id"].num_classes,
    minds.features["lang_id"].names,
    sep="\n",
)

In [None]:
# Display the definition of the `intent_class` feature.
minds.features["intent_class"]

In [None]:
# Count the entries in the `intent_class` column.
len(minds["intent_class"])

In [None]:
# Keep the first example around for the cells below and display it.
example = minds[0]
example

In [None]:
# Bind the `int2str` method of the `intent_class` feature to a short name
# and display the bound method.
id2label = minds.features["intent_class"].int2str
id2label

In [None]:
# Bind `int2str` as `id2label` and apply it to the `intent_class` value of
# `example` to display the corresponding label.
id2label = minds.features["intent_class"].int2str
id2label(example["intent_class"])

In [None]:
# Drop the columns that are not needed for the rest of the walkthrough.
# Only columns that are still present are requested, so running this cell a
# second time (after the columns are already gone) is a no-op instead of an
# error about missing columns.
columns_to_remove = [
    name
    for name in ("lang_id", "english_transcription")
    if name in minds.column_names
]
if columns_to_remove:
    minds = minds.remove_columns(columns_to_remove)
minds

In [None]:
!pip install -q gradio

In [None]:
import gradio as gr

def generate_audio():
    """Shuffle `minds` and package its first example for playback.

    Returns:
        A 2-tuple: the audio as a ``(sampling_rate, array)`` pair, followed by
        the result of ``id2label`` applied to the example's ``intent_class``.
    """
    picked = minds.shuffle()[0]
    clip = picked["audio"]
    playable = (clip["sampling_rate"], clip["array"])
    return playable, id2label(picked["intent_class"])


# Build a small Gradio app that stacks four audio players in one column.
with gr.Blocks() as demo:
    with gr.Column():
        # Each iteration draws one example via generate_audio() and adds a
        # player for it, labelled with the example's intent.
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

# Start the app with debug mode switched on.
demo.launch(debug=True)

In [None]:
# Shuffle the dataset, keep its first example for the plot below, and display it.
example = minds.shuffle()[0]
example

In [None]:
import librosa
import matplotlib.pyplot as plt
import librosa.display

# Pull the samples and their sampling rate out of the selected example.
array = example["audio"]["array"]
sampling_rate = example["audio"]["sampling_rate"]

# Set the figure width to 12, then draw the waveform.
plt.figure().set_figwidth(12)
librosa.display.waveshow(array, sr=sampling_rate)

# Preprocessing an audio dataset

## Resampling the audio data

In [None]:
# Display the first example before resampling.
minds[0]

In [None]:
# Display only the `audio` entry of the first example.
minds[0]["audio"]

In [None]:
# Get the audio entry of the first example
audio_sample = minds[0]["audio"]

# Print the sampling rate stored with that audio
print(audio_sample["sampling_rate"])

In [None]:
from datasets import Audio

# Re-declare the `audio` column as an Audio feature with a 16 kHz sampling rate.
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
# Display the first example again, now that the `audio` column has been cast.
minds[0]

## Filtering the dataset

In [None]:
# Upper bound (exclusive) on the clip duration kept by the filter.
MAX_DURATION_IN_SECONDS = 20.0


def is_audio_length_in_range(input_length):
    """Tell whether a clip is short enough to keep.

    Args:
        input_length: Clip duration, compared against MAX_DURATION_IN_SECONDS.

    Returns:
        The result of checking that `input_length` is strictly below the limit.
    """
    return MAX_DURATION_IN_SECONDS > input_length

In [None]:
# use librosa to get example's duration from the audio file
new_column = [librosa.get_duration(path=x) for x in minds["path"]]

# Add the durations as a temporary helper column, use 🤗 Datasets' `filter`
# method to apply the filtering function, then drop the helper column again.
# `minds` is rebound only once the whole chain has succeeded, so a failure
# part-way through never leaves a stray "duration" column on `minds` and the
# cell can simply be re-run.
minds = (
    minds.add_column("duration", new_column)
    .filter(is_audio_length_in_range, input_columns=["duration"])
    .remove_columns(["duration"])
)
minds

## Preprocessing audio data

In [None]:
from transformers import WhisperFeatureExtractor

# Load the feature extractor published with the "openai/whisper-small" checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [None]:
def prepare_dataset(example):
    """Run the module-level `feature_extractor` on one example's audio.

    Args:
        example: A mapping whose "audio" entry holds "array" and
            "sampling_rate".

    Returns:
        Whatever `feature_extractor` returns for the audio array, called with
        the example's own sampling rate and ``padding=True``.
    """
    clip = example["audio"]
    extractor_kwargs = {"sampling_rate": clip["sampling_rate"], "padding": True}
    return feature_extractor(clip["array"], **extractor_kwargs)

In [None]:
# Run prepare_dataset over the dataset with `map`, then display the result.
minds = minds.map(prepare_dataset)
minds

In [None]:
import numpy as np

# Take the first example and read its `input_features`
# (presumably added by the earlier `map(prepare_dataset)` step).
example = minds[0]
input_features = example["input_features"]

# Render the first entry of `input_features` with a time x-axis and a mel
# y-axis, using the extractor's own sampling rate and hop length.
plt.figure().set_figwidth(12)
librosa.display.specshow(
    np.asarray(input_features[0]),
    x_axis="time",
    y_axis="mel",
    sr=feature_extractor.sampling_rate,
    hop_length=feature_extractor.hop_length,
)
plt.colorbar()

In [None]:
from transformers import AutoProcessor

# Load the processor published with the "openai/whisper-small" checkpoint.
processor = AutoProcessor.from_pretrained("openai/whisper-small")

# Streaming audio data

In [None]:
# Open the "xs" configuration of "speechcolab/gigaspeech" with streaming enabled.
gigaspeech = load_dataset("speechcolab/gigaspeech", "xs", streaming=True)

In [None]:
# Fetch the first example yielded by the train split.
next(iter(gigaspeech["train"]))

In [None]:
# Limit the train split to its first two examples and collect them into a list.
gigaspeech_head = gigaspeech["train"].take(2)
list(gigaspeech_head)