In [None]:
!pip install datasets
import datasets

In [None]:
from datasets import load_dataset_builder

# The librispeech_asr repository (https://hf.co/datasets/librispeech_asr) ships
# its own loading code. Passing `trust_remote_code=True` states explicitly that
# we have decided to trust that code and allow it to run.
ds_builder = load_dataset_builder(
    "librispeech_asr",
    trust_remote_code=True,
)
ds_builder.info.splits

In [None]:
# Show the features (column names and their types) declared in the builder's metadata.
ds_builder.info.features

{'file': Value(dtype='string', id=None),
 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'text': Value(dtype='string', id=None),
 'speaker_id': Value(dtype='int64', id=None),
 'chapter_id': Value(dtype='int64', id=None),
 'id': Value(dtype='string', id=None)}

In [None]:
from datasets import load_dataset

# Open the "train.clean.100" split in streaming mode and take its first example.
ds = load_dataset(
    "librispeech_asr",
    split="train.clean.100",
    trust_remote_code=True,
    streaming=True,
)
sample = next(iter(ds))
sample

{'file': '374-180298-0000.flac',
 'audio': {'path': '374-180298-0000.flac',
  'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
         -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
  'sampling_rate': 16000},
 'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED',
 'speaker_id': 374,
 'chapter_id': 180298,
 'id': '374-180298-0000'}

In [None]:
# Read the sampling rate of the example fetched above.
sampling_rate = sample["audio"]["sampling_rate"]

# Keep only the first 5 seconds of the waveform, i.e. 5 * sampling_rate samples.
array = sample["audio"]["array"][: 5 * sampling_rate]
print(f"Number of samples: {len(array)}. Values: {array}")


Number of samples: 80000. Values: [ 0.0007019   0.00073242  0.00073242 ... -0.02697754 -0.02227783
 -0.0300293 ]

In [None]:
import librosa.display

# Plot the waveform of the 5-second clip; the trailing `;` hides the repr of
# the returned object in the notebook output.
librosa.display.waveshow(array, sr=sampling_rate);

![alt text](waveform.png)

In [None]:
import numpy as np
from matplotlib import pyplot as plt


def plot_sine(freq):
    """Plot one second of a sine wave on the current matplotlib axes.

    The wave is sampled 1000 times over the interval [0, 1), and the plot is
    given a title mentioning the frequency and an x-axis label.

    Args:
        freq: Frequency of the sine wave, in cycles over the plotted
            one-second span.
    """
    sr = 1000  # samples per second
    ts = 1.0 / sr  # sampling interval
    t = np.arange(0, 1, ts)  # time vector
    amplitude = np.sin(2 * np.pi * freq * t)

    plt.plot(t, amplitude)
    plt.title(f"Sine wave with frequency {freq}")
    plt.xlabel("Time")


fig = plt.figure()

# One panel per frequency, laid out on a 2x2 grid (positions 1 to 4).
for position, frequency in enumerate((1, 2, 5, 30), start=1):
    plt.subplot(2, 2, position)
    plot_sine(frequency)

fig.tight_layout()
plt.show()


![alt text](sine.png)


In [None]:
# Discrete Fourier transform of the clip: one complex value per frequency bin.
X = np.fft.fft(array)
N = len(X)  # number of bins
n = np.arange(N)  # bin indices 0..N-1
T = N / sampling_rate  # duration of the clip in seconds
freq = n / T  # frequency (Hz) associated with each bin
# Only the magnitude of the first 8000 bins is drawn. For the 5-second clip
# the bins are 1/5 Hz apart, so this covers roughly 0-1600 Hz.
plt.stem(freq[:8000], np.abs(X[:8000]), "b", markerfmt=" ", basefmt="-b")
plt.xlabel("Frequency (Hz)")
plt.ylabel("Amplitude in Frequency Domain")
plt.show()

![alt text](audio_fft.png)

In [None]:
# Magnitude of the short-time Fourier transform of the clip.
D = np.abs(librosa.stft(array))
# Convert the amplitudes to decibels, using the maximum value as the reference.
S_db = librosa.amplitude_to_db(D, ref=np.max)

# Draw the spectrogram with time on the x-axis and frequency (Hz) on the y-axis.
librosa.display.specshow(S_db, sr=sampling_rate, x_axis="time", y_axis="hz")
plt.colorbar(format="%+2.0f dB");

![alt text](spectogram.png)

In [None]:
# Mel spectrogram of the clip.
S = librosa.feature.melspectrogram(y=array, sr=sampling_rate)
# Convert to decibels (power scale), using the maximum value as the reference.
S_dB = librosa.power_to_db(S, ref=np.max)

# Same kind of plot as the regular spectrogram, but with a mel-scaled y-axis.
librosa.display.specshow(S_dB, sr=sampling_rate, x_axis="time", y_axis="mel")
plt.colorbar(format="%+2.0f dB");

![alt text](mel_spectogram.png)

In [None]:
## Diffusion-based Audio Generation

In [None]:
!pip install diffusers

In [None]:
import torch
from diffusers import AudioDiffusionPipeline

# Use the GPU when one is available and fall back to the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The pipeline is moved to `device` exactly once. Hard-coding "cuda" here
# would raise on machines without a GPU and defeat the fallback above.
pipe = AudioDiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-ddim-256"
).to(device)

# Run the pipeline with its default arguments.
output = pipe()

In [None]:
import torch
from diffusers import StableDiffusionPipeline

# Half precision is only used on the GPU. When `device` falls back to "cpu"
# the weights are loaded in float32, because float16 inference is not
# reliably supported on CPU.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

pipe = StableDiffusionPipeline.from_pretrained(
    "riffusion/riffusion-model-v1", torch_dtype=torch_dtype
)
pipe = pipe.to(device)
prompt = "slow piano piece, classical"
negative_prompt = "drums"
# Generate a single 512x512 image and keep it for the image-to-image step.
spec_img = pipe(
    prompt, negative_prompt=negative_prompt, height=512, width=512
).images[0]

In [None]:
from diffusers import StableDiffusionImg2ImgPipeline

# Half precision is only used on the GPU. When `device` falls back to "cpu"
# the weights are loaded in float32, because float16 inference is not
# reliably supported on CPU.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "riffusion/riffusion-model-v1", torch_dtype=torch_dtype
)
pipe = pipe.to(device)

prompt = "guitar, acoustic, calmed"
# A fixed seed makes the generation reproducible.
generator = torch.Generator(device=device).manual_seed(1024)
# Start from the previously generated image and steer it towards the new prompt.
image = pipe(
    prompt=prompt,
    image=spec_img,
    strength=0.7,
    guidance_scale=8,
    generator=generator,
).images[0]

## Dance Diffusion

In [None]:
from diffusers import DanceDiffusionPipeline

# Half precision is only used on the GPU. When `device` falls back to "cpu"
# the weights are loaded in float32, because float16 inference is not
# reliably supported on CPU.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

pipe = DanceDiffusionPipeline.from_pretrained(
    "harmonai/maestro-150k", torch_dtype=torch_dtype
)
pipe = pipe.to(device)
# Generate 5 seconds of audio with 50 inference steps and keep the first result.
audio = pipe(audio_length_in_s=5, num_inference_steps=50).audios[0]

## Speech to Text With Transformer-based Architectures

In [None]:
from transformers import pipeline

# Speech-recognition pipeline built around the "openai/whisper-tiny"
# checkpoint; `max_new_tokens=100` caps the number of generated tokens.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    max_new_tokens=100,
)
# Transcribe the 5-second clip prepared earlier.
pipe(array)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
{'text': ' Chapter 16 I might have told you of the beginning'}

### Encoder-only models

In [None]:
In the first part of this chapter, we used spectrograms to capture the amplitude and frequency characteristics of the input data in a concise 2D visual representation. In this case, we are using CNNs instead of spectrograms to better manage the vast amounts of input data we need to process. Both are dimensionality reduction techniques, and the choice depends on factors like the task to solve or the architecture we choose. Transformers, thanks to the attention mechanism, are great at handling data sequences, so staying close to a sequential temporal representation seems to make sense.

Let’s recap the whole flow to perform ASR with encoder-based models:

Raw audio data (1D array) representing the amplitudes is received.

Data is normalized to zero mean and unit variance to standardize across different amplitudes.

A small convolutional neural network turns the audio into a latent representation. This reduces the length of the input sequence.

The representations are then passed to an encoder model, which outputs embeddings for each representation.

Each embedding is finally passed through a classifier, which predicts the corresponding character for each one.

The output of such a model would be something as follows:

CHAAAAAPTTERRRSSIXTEEEEENIMMMIIGHT...

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# The processor bundles the pre-processing (raw audio -> model inputs) and the
# post-processing (predicted ids -> text) that are used below.
wav2vec2_processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h"
)
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h"
).to(device)

# Run the forward pass. The clip's sampling rate is passed explicitly to the
# processor; no resampling is done here (the clip is already sampled at 16 kHz).
inputs = wav2vec2_processor(
    array, sampling_rate=sampling_rate, return_tensors="pt"
)
with torch.no_grad():
    outputs = wav2vec2_model(**inputs.to(device))

# Transcribe: take the highest-scoring id at each position of the logits, then
# let the processor decode the ids into text.
predicted_ids = torch.argmax(outputs.logits, dim=-1)
transcription = wav2vec2_processor.batch_decode(predicted_ids)
print(transcription)

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
['CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGI']

Whisper output has a very specific format, so looking at the figure above is essential to understand its generation. It doesn’t just output the transcription but also different special characters that provide significant flexibility for Whisper to perform multiple tasks in multiple languages. This format is not just specific to the output texts but also to the data used during training. Some of the most important tokens are:

The speech begins with a start of transcript token.

If the language is not English, there is a language tag token (e.g., hi for Hindi).

With the language tag, one can perform language identification, transcription, or translate to English.

If there’s a no speech token, Whisper is used for voice activity detection.

### Encoder-Decoder based models

One can present the ASR problem as a sequence-to-sequence problem rather than a classification problem. This is what Whisper, the open-source model we introduced at the beginning of this section, does.

Whisper, unlike Wav2Vec2, operates with spectrograms. As is commonly done in sequence-to-sequence models, we begin by padding and/or truncating a batch of audio samples to ensure all samples in the batch have the same input length. Padding is achieved by adding 0s at the end. The padded audios are then converted into a sequence of log-mel spectrograms by sliding a window, just like we saw in the first part of the chapter.

It doesn’t just output the transcription but also different special characters that provide significant flexibility for Whisper to perform multiple tasks in multiple languages. This format is not just specific to the output texts but also to the data used during training. Some of the most important tokens are:

The speech begins with a start of transcript token.

If the language is not English, there is a language tag token (e.g., hi for Hindi).

With the language tag, one can perform language identification, transcription, or translate to English.

If there’s a no speech token, Whisper is used for voice activity detection.

In [None]:
from transformers import WhisperTokenizer

# Tokenizer configured for Spanish transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="Spanish",
    task="transcribe",
)

input_str = "Hola, ¿cómo estás?"
labels = tokenizer(input_str).input_ids

# Decode the same ids twice: first keeping the special tokens, then dropping them.
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=skip) for skip in (False, True)
)

print(f"Input:                         {input_str}")
print(f"Formatted input w/ special:    {decoded_with_special}")
print(f"Formatted input w/out special: {decoded_str}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Input:                         Hola, ¿cómo estás?
Formatted input w/ special:    <|startoftranscript|><|es|><|transcribe|><|notimestamps|>Hola, ¿cómo estás?<|endoftext|>
Formatted input w/out special: Hola, ¿cómo estás?

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load the processor and the model from the same "openai/whisper-base" checkpoint.
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
whisper_model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-base"
).to(device)

# Turn the raw clip into model inputs (PyTorch tensors).
inputs = whisper_processor(
    array, sampling_rate=sampling_rate, return_tensors="pt"
)
# Generate the output token ids with gradient tracking disabled.
with torch.no_grad():
    generated_ids = whisper_model.generate(**inputs.to(device))

# Decode the first (and only) sequence, keeping the special tokens so that the
# full Whisper output format is visible in the printed text.
transcription = whisper_processor.batch_decode(
    generated_ids, skip_special_tokens=False
)[0]
print(transcription)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Chapter 16. I might have told you of the beginning<|endoftext|>

In [None]:
from genaibook.core import generate_long_audio
from transformers import pipeline

# NOTE(review): `generate_long_audio` is defined in genaibook; it presumably
# returns a longer waveform than the 5-second clip used so far — confirm.
long_audio = generate_long_audio()
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-small", device=device
)
# Transcribe in 5-second chunks, 8 chunks per batch, and return timestamps
# alongside the text.
pipe(
    long_audio,
    generate_kwargs={"task": "transcribe"},
    chunk_length_s=5,
    batch_size=8,
    return_timestamps=True,
)

{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I too agree to whatever Marguerite wished, margarite to be unable to live apart from me. It was the day after the evening when she came to see me that I sent her Manon the Scott. From that time, seeing that I could not change my mistress's life. I changed my own. I wished above all not to leave myself time to think over the position I had accepted, for, in spite of myself, it was a great distress to me. Thus my life, generally so calm, assumed all at once an appearance of noise and disorder. Never believe, however disinterested, the love of a kept woman may be, that it will cost one nothing. Nothing is so expensive as their caprices, flowers, boxes at the theater, suppers, days in the country, which one can never refuse to one's mistress. As I have told you, I had little money.",
 'chunks': [{'timestamp': (0.0, 25.13),
   'text': ' Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I too agree to whatever Marguerite wished, margarite to be unable to live apart from me. It was the day after the evening when she came to see me that I sent her Manon the Scott.'},
  {'timestamp': (25.13, 29.07),
   'text': " From that time, seeing that I could not change my mistress's life."},
  {'timestamp': (29.07, 66.63),
   'text': " I changed my own. I wished above all not to leave myself time to think over the position I had accepted, for, in spite of myself, it was a great distress to me. Thus my life, generally so calm, assumed all at once an appearance of noise and disorder. Never believe, however disinterested, the love of a kept woman may be, that it will cost one nothing. Nothing is so expensive as their caprices, flowers, boxes at the theater, suppers, days in the country, which one can never refuse to one's mistress."},
  {'timestamp': (66.63, 68.95),
   'text': ' As I have told you, I had little money.'}]}

## Evaluation

In [None]:
from genaibook.core import measure_latency_and_memory_use

wav2vec2_pipe = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=device,
)
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)

# Benchmark both pipelines on the same clip, 100 loops each, in inference mode.
with torch.inference_mode():
    for asr_pipe, model_name in (
        (wav2vec2_pipe, "Wav2Vec2"),
        (whisper_pipe, "Whisper"),
    ):
        measure_latency_and_memory_use(
            asr_pipe, array, model_name, device=device, nb_loops=100
        )

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Wav2Vec2 execution time: 0.033196728515625 seconds
Wav2Vec2 max memory footprint: 2.384644096 GB
Whisper execution time: 0.126587021484375 seconds
Whisper max memory footprint: 2.363074048 GB

In [None]:
from evaluate import load

wer_metric = load("wer")

# Toy example: compared with the reference, the prediction drops "how",
# misspells "llama" and adds a trailing "up".
# NOTE(review): that is 3 word errors over 5 reference words, so this should
# print 0.6 — confirm by running.
label = "how can the llama jump"
pred = "can the lama jump up"
wer = wer_metric.compute(references=[label], predictions=[pred])

print(wer)

In [None]:
# This code example is optimized for explainability.
# The inference could be done in batches for speedup, for example.
from datasets import Audio
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Shared text normalizer: applied to the reference sentences in `normalise` and
# to the model predictions in `evaluate_model`, so both sides are compared in
# the same normalized form.
normalizer = BasicTextNormalizer()


def normalise(batch):
    """Store the normalized ``sentence`` under ``norm_text`` and return the example.

    Args:
        batch: Mapping with a ``"sentence"`` entry; it is modified in place.

    Returns:
        The same mapping, with the extra ``"norm_text"`` entry.
    """
    sentence = batch["sentence"]
    batch["norm_text"] = normalizer(sentence)
    return batch


def evaluate_model(pipe, lang="en", samples_to_evaluate=200, whisper=False):
    """Transcribe Common Voice test samples and collect normalized texts.

    Args:
        pipe: Callable speech-recognition pipeline; it receives the audio array
            and must return a mapping with a ``"text"`` entry.
        lang: Common Voice configuration name, also used to build the Whisper
            language token.
        samples_to_evaluate: Number of streamed test samples to process.
        whisper: When true, the pipeline is called with Whisper-specific
            generation arguments and ``return_timestamps=True``.

    Returns:
        A ``(predictions, references)`` tuple of lists of normalized strings.
    """
    samples = (
        load_dataset(
            "mozilla-foundation/common_voice_13_0",
            lang,
            split="test",
            streaming=True,
            trust_remote_code=True,
        )
        .cast_column("audio", Audio(sampling_rate=16000))
        .map(normalise)
        .take(samples_to_evaluate)
    )

    def _transcribe(waveform):
        # Non-Whisper pipelines are called with the audio only.
        if not whisper:
            return pipe(waveform)
        return pipe(
            waveform,
            return_timestamps=True,
            generate_kwargs={
                "task": "transcribe",
                "language": f"<|{lang}|>",
                "max_new_tokens": 100,
            },
        )

    predictions, references = [], []
    for example in samples:
        result = _transcribe(example["audio"]["array"])
        predictions.append(normalizer(result["text"]))
        references.append(example["norm_text"])
    return predictions, references

In [None]:
# One [model name, pipeline, language] entry per model/language combination,
# ordered by model first and language second.
eval_suite = [
    [model_name, asr_pipe, lang]
    for model_name, asr_pipe in (
        ("Wav2Vec2", wav2vec2_pipe),
        ("Whisper", whisper_pipe),
    )
    for lang in ("en", "fr")
]

In [None]:
import os

# Do not paste a real token into the notebook: export it as the HF_TOKEN
# environment variable instead. The placeholder is only used when the variable
# is not set.
HF_ACCESS_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN")

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token defined above.
# NOTE(review): presumably required because the Common Voice dataset used for
# the evaluation needs an authenticated account — confirm.
login(token=HF_ACCESS_TOKEN)

In [None]:
cer_metric = load("cer")

# Evaluate every (model, language) entry and report word and character error rates.
for model_name, asr_pipe, lang in eval_suite:
    is_whisper = model_name == "Whisper"
    predictions, references = evaluate_model(
        asr_pipe, lang=lang, whisper=is_whisper
    )

    scores = dict(references=references, predictions=predictions)
    wer = wer_metric.compute(**scores)
    cer = cer_metric.compute(**scores)

    print(f"{model_name} metrics for lang: {lang}. WER: {wer} , CER: {cer}")

Reading metadata...: 16372it [00:00, 44589.69it/s]
Wav2Vec2 metrics for lang: en. WER: 0.44012772751463547 , CER: 0.22138524750538055
Reading metadata...: 16114it [00:01, 14657.73it/s]
Wav2Vec2 metrics for lang: fr. WER: 1.0099113197704748 , CER: 0.5745033112582781
Reading metadata...: 16372it [00:00, 38628.98it/s]
Whisper metrics for lang: en. WER: 0.2687599787120809 , CER: 0.14674232048522795
Reading metadata...: 16114it [00:00, 36785.75it/s]
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper metrics for lang: fr. WER: 0.5211267605633803 , CER: 0.2573583517292127