<a href="https://colab.research.google.com/github/TirendazAcademy/Audio-Data-with-HuggingFace/blob/main/6-Intro-to-ASR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets

# Working with Audio Data

In [None]:
from datasets import load_dataset_builder

# Create a builder for the "openslr/librispeech_asr" dataset; the next cells
# read its `info` attribute (splits and features).
# trust_remote_code=True opts in to executing code shipped with the dataset repo.
ds_builder = load_dataset_builder("openslr/librispeech_asr", trust_remote_code=True)

In [None]:
# Show the split metadata recorded in the builder's dataset info.
ds_builder.info.splits

In [None]:
# Show the feature schema recorded in the builder's dataset info.
ds_builder.info.features

In [None]:
from datasets import load_dataset
# Load the "train.clean.360" split with streaming=True; the next cell pulls a
# single example from it with next(iter(ds)).
ds = load_dataset(
    "openslr/librispeech_asr",
    split="train.clean.360",
    streaming=True, trust_remote_code=True
)

In [None]:
# Take the first example produced by iterating the streamed dataset and display it.
sample = next(iter(ds))
sample

In [None]:
# Pull the waveform values out of the example's "audio" entry and display them.
array = sample["audio"]["array"]
array

In [None]:
# Read the sampling rate stored next to the waveform; it is used below to cut
# the clip, for playback, and when building model inputs.
sampling_rate = sample["audio"]["sampling_rate"]
sampling_rate

In [None]:
# Let's get the first 5 seconds: `sampling_rate` samples per second * 5 seconds.
# Note that `array` is overwritten here, so every later cell works on the
# truncated clip rather than on the full utterance.
array = array[: sampling_rate * 5]
print(f"Number of samples: {len(array)}. Values: {array}")

In [None]:
import IPython.display as ipd
# Render an inline audio player for the clip at its own sampling rate.
ipd.Audio(data=array, rate=sampling_rate)

In [None]:
import librosa.display
# Plot the clip's waveform; the trailing semicolon keeps the call's return
# value from being echoed as cell output.
librosa.display.waveshow(array, sr=sampling_rate);

In [None]:
import numpy as np
from matplotlib import pyplot as plt

def plot_sine(freq):
    """Draw one second of a sine wave with frequency `freq` on the current axes.

    The wave is evaluated at 1000 evenly spaced time points covering [0, 1),
    and the plot receives a title naming the frequency and an x-axis label.
    """
    samples_per_second = 1000
    step = 1.0 / samples_per_second  # spacing between consecutive time points
    time_axis = np.arange(0, 1, step)
    wave = np.sin(2 * np.pi * freq * time_axis)

    plt.plot(time_axis, wave)
    plt.title(f"Sine wave with frequency {freq}")
    plt.xlabel("Time")

fig = plt.figure()

# Fill a 2x2 grid, one panel per frequency, in reading order.
for panel_index, frequency in enumerate((1, 2, 5, 30), start=1):
    plt.subplot(2, 2, panel_index)
    plot_sine(frequency)

fig.tight_layout()
plt.show()

In [None]:
from transformers import pipeline

# Build a speech-recognition pipeline on the whisper-tiny checkpoint, capping
# generation at 100 new tokens, then transcribe the 5-second clip.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    max_new_tokens=100,
)
pipe(array)

In [None]:
# Reference transcript stored with the example, for comparison with the model
# output above. NOTE(review): `array` was cut to 5 seconds earlier, while this
# text presumably covers the whole utterance — expect it to be longer.
sample["text"]

# Encoder-Based Techniques

In [None]:
import torch

# Pick the compute device: CUDA when a GPU is visible to torch, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# The Wav2Vec2Processor has the pre- and post-processing incorporated
# (it is used below both to build the model inputs and to decode predictions).
wav2vec2_processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h"
)
# Load the CTC model from the same checkpoint and move it to the selected device.
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h"
).to(device)

In [None]:
# Turn the clip into model inputs as PyTorch tensors.
# NOTE(review): nothing in this cell resamples `array`; `sampling_rate` is only
# handed to the processor. Confirm the clip is already at the 16 kHz rate the
# checkpoint expects, or resample it before this call.
inputs = wav2vec2_processor(
    array, sampling_rate=sampling_rate, return_tensors="pt"
)

In [None]:
# Forward pass under torch.inference_mode(); the inputs are moved to the
# model's device before being unpacked into the call.
with torch.inference_mode():
    outputs = wav2vec2_model(**inputs.to(device))

In [None]:
# Transcribe: pick the highest-scoring id along the last axis of the logits,
# then let the processor turn the id sequences back into text.
predicted_ids = torch.argmax(outputs.logits, dim=-1)
transcription = wav2vec2_processor.batch_decode(predicted_ids)
print(transcription)

# Encoder-Decoder Techniques

In [None]:
from transformers import WhisperTokenizer

# Load the whisper-small tokenizer, configured for Spanish transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small", language="Spanish", task="transcribe"
)

In [None]:
# Tokenize a Spanish sentence and display the resulting token ids.
input_str = "Hola, ¿cómo estás?"
labels = tokenizer(input_str).input_ids
labels

In [None]:
# Decode the ids back to text, keeping any special tokens in the output.
decoded_with_special = tokenizer.decode(
    labels, skip_special_tokens=False
)
decoded_with_special

In [None]:
# Decode the same ids again, this time dropping special tokens.
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
decoded_str

In [None]:
# Print the original string next to both decoded variants for comparison.
print(f"Input:                         {input_str}")
print(f"Formatted input w/ special:    {decoded_with_special}")
print(f"Formatted input w/out special: {decoded_str}")

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load the processor and the generation model from the same whisper-small
# checkpoint; the model is moved to the selected device.
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small"
).to(device)

In [None]:
# Convert the clip into Whisper model inputs (PyTorch tensors).
# Note: this rebinds `inputs`, replacing the Wav2Vec2 inputs built earlier.
inputs = whisper_processor(
    array, sampling_rate=sampling_rate, return_tensors="pt"
)

In [None]:
# Inspect what the Whisper processor produced.
inputs

In [None]:
# Generate token ids under torch.inference_mode(), after moving the inputs to
# the model's device.
with torch.inference_mode():
    generated_ids = whisper_model.generate(**inputs.to(device))

In [None]:
# Decode the generated ids to text. skip_special_tokens=False leaves special
# tokens in the printed string; [0] selects the first decoded sequence.
transcription = whisper_processor.batch_decode(
    generated_ids, skip_special_tokens=False
)[0]
print(transcription)

# From Model to Pipeline

In [None]:
!pip install genaibook

In [None]:
from genaibook.core import generate_long_audio

In [None]:
from genaibook.core import generate_long_audio

# Build the long audio clip with the genaibook helper; it is played back and
# transcribed with chunking in the cells below.
long_audio = generate_long_audio()

In [None]:
# Display the generated long audio data.
long_audio

In [None]:
import IPython.display as ipd
# Inline player for the long clip. NOTE(review): rate=16000 is hard-coded —
# assumes generate_long_audio() returns 16 kHz audio; confirm.
ipd.Audio(data=long_audio, rate=16000)

In [None]:
from transformers import pipeline
# Rebind `pipe` to a whisper-small speech-recognition pipeline placed on
# `device` (it previously held the whisper-tiny pipeline).
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-small", device=device
)

In [None]:
# Transcribe the long clip in 5-second chunks with a batch size of 8, and
# request timestamps along with the text.
pipe(
    long_audio,
    generate_kwargs={"task": "transcribe"},
    chunk_length_s=5,
    batch_size=8,
    return_timestamps=True,
)

# Evaluation

In [None]:
from genaibook.core import measure_latency_and_memory_use

# Build one speech-recognition pipeline per model family, both on `device`.
# These two pipelines are reused by the evaluation suite further down.
wav2vec2_pipe = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=device,
)
whisper_pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

# Benchmark both pipelines on the same 5-second clip with nb_loops=100.
# NOTE(review): `device` is passed positionally in the first call and by keyword
# in the second; presumably equivalent, but worth making consistent.
with torch.inference_mode():
    measure_latency_and_memory_use(
        wav2vec2_pipe, array, "Wav2Vec2", device, nb_loops=100
    )
    measure_latency_and_memory_use(
        whisper_pipe, array, "Whisper", device=device, nb_loops=100
    )

## Word Error Rate (WER)

In [None]:
from evaluate import load

# Load the word error rate metric.
wer_metric = load("wer")

# Toy example: relative to the reference, the prediction drops "how",
# has "lama" instead of "llama", and adds a trailing "up".
label = "how can the llama jump"
pred = "can the lama jump up"
wer = wer_metric.compute(references=[label], predictions=[pred])

# NOTE(review): with WER = (substitutions + deletions + insertions) / reference
# words, that is 3 errors over 5 words, so 0.6 is the expected value — confirm
# against the printed output.
print(wer)

## Normalizing

In [None]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Instantiate the basic text normalizer and show its effect on a sample
# sentence. `normalizer` is also used by normalize() and evaluate_model() below.
normalizer = BasicTextNormalizer()
print(normalizer("I'm having a great day!"))

In [None]:
from datasets import Audio

def normalize(batch):
    """Attach a normalized copy of the example's sentence.

    Reads batch["sentence"], runs it through the module-level `normalizer`,
    stores the result under "norm_text", and returns the same (mutated) mapping.
    """
    raw_sentence = batch["sentence"]
    batch["norm_text"] = normalizer(raw_sentence)
    return batch

def prepare_dataset(language="en", sample_count=200):
    """Stream the Common Voice 13 test split for `language` and buffer a sample.

    The "audio" column is cast to a 16000 Hz `Audio` feature, only the first
    `sample_count` examples are kept, and `normalize` is applied to each one
    while they are collected.

    Returns:
        A list of examples, each carrying the extra "norm_text" field.
    """
    stream = load_dataset(
        "mozilla-foundation/common_voice_13_0",
        language,
        split="test",
        streaming=True,
        trust_remote_code=True,
    )
    resampled = stream.cast_column("audio", Audio(sampling_rate=16000))
    subset = resampled.take(sample_count)
    # Materialise the stream into a list so the same examples can be iterated
    # more than once (each language is evaluated with several models).
    return list(subset.map(normalize))

def evaluate_model(pipe, dataset, lang="en", use_whisper=False):
    """Transcribe every example in `dataset` and collect normalized text pairs.

    Args:
        pipe: callable speech-recognition pipeline returning a mapping with a
            "text" entry.
        dataset: iterable of examples exposing ["audio"]["array"] and
            ["norm_text"]; it is traversed exactly once.
        lang: language code, forwarded to the pipeline as "<|lang|>" when
            `use_whisper` is true.
        use_whisper: when true, call the pipeline with timestamps enabled and
            explicit task/language/max_new_tokens generation arguments.

    Returns:
        (predictions, references): two lists of equal length, where each
        prediction has been passed through the module-level `normalizer`.
    """

    def transcribe(audio_array):
        # Plain call for models that need no generation arguments.
        if not use_whisper:
            return pipe(audio_array)
        return pipe(
            audio_array,
            return_timestamps=True,
            generate_kwargs={
                "task": "transcribe",
                "language": f"<|{lang}|>",
                "max_new_tokens": 100,
            },
        )

    predictions = []
    references = []
    for example in dataset:
        result = transcribe(example["audio"]["array"])
        predictions.append(normalizer(result["text"]))
        references.append(example["norm_text"])
    return predictions, references

In [None]:
# Every model is scored on both languages; each entry is
# [model name, pipeline, language code], grouped by model.
eval_suite = [
    [name, asr_pipeline, language]
    for name, asr_pipeline in (
        ("Wav2Vec2", wav2vec2_pipe),
        ("Whisper", whisper_pipe),
    )
    for language in ("en", "fr")
]

In [None]:
# Load the "cer" metric; it is reported next to WER in the evaluation loop.
cer_metric = load("cer")

In [None]:
# Pre-process the English and French datasets once, keyed by language code,
# so every model is evaluated on the same buffered examples.
processed_datasets = {code: prepare_dataset(code) for code in ("en", "fr")}

In [None]:
# Score every (model, language) entry of `eval_suite` on its pre-processed dataset.
for config in eval_suite:
    # Bind the pipeline object to `asr_pipe`, not `pipeline`: the latter would
    # overwrite the `pipeline` factory imported from transformers, so any cell
    # that builds a pipeline would fail if re-run after this loop.
    model_name, asr_pipe, lang = config

    dataset = processed_datasets[lang]

    # Only the Whisper entries take the language/task generation arguments.
    predictions, references = evaluate_model(
        asr_pipe, dataset, lang, use_whisper=(model_name == "Whisper")
    )

    # Compute evaluation metrics
    wer = wer_metric.compute(references=references, predictions=predictions)
    cer = cer_metric.compute(references=references, predictions=predictions)

    print(f"{model_name} metrics for lang: {lang}. WER: {wer}, CER: {cer}")