In [1]:
!pip install datasets transformers torchaudio evaluate jiwer soundfile



In [2]:
!pip install datasets



In [3]:
!pip install torchaudio



In [4]:
!pip install git+https://github.com/pytorch/audio.git@main#egg=torchcodec&subdirectory=torchcodec

In [5]:
pip install torchcodec

Note: you may need to restart the kernel to use updated packages.


In [6]:
from datasets import load_dataset, Audio

# Step 0: Force download of the actual audio file
ds = load_dataset("librispeech_asr", "clean", split="test[:1]")
_ = ds[0]["audio"]  # this downloads the FLAC file

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from datasets import load_dataset

# Step 1: Download with decode=True to get the actual file
ds = load_dataset("librispeech_asr", "clean", split="test[:1]")
audio_info = ds[0]["audio"]
print(audio_info)

<datasets.features._torchcodec.AudioDecoder object at 0x15a8bc5f0>


In [8]:
import os
print(os.path.exists(audio_info["path"]))

TypeError: 'torchcodec.decoders.AudioDecoder' object is not subscriptable

In [9]:
from datasets import load_dataset, Audio
import os

# Step 1: Load the dataset with decode=False to access raw path
ds = load_dataset("librispeech_asr", "clean", split="test[:1]")
ds = ds.cast_column("audio", Audio(decode=False))

# Step 2: Extract path
sample = ds[0]
audio_path = sample["audio"]["path"]

# Step 3: Verify file exists
print("Audio path:", audio_path)
print("Exists?", os.path.exists(audio_path))

Audio path: 6930-75918-0000.flac
Exists? False


In [10]:
from datasets import load_dataset

# Force redownload the actual .flac file
ds = load_dataset("librispeech_asr", "clean", split="test[:1]", download_mode="force_redownload")


Downloading data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [05:34<00:00,  6.97s/files]
Generating test split: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2620/2620 [00:00<00:00, 8716.78 examples/s]
Generating train.100 split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28539/28539 [00:03<00:00, 7638.10 examples/s]
Generating train.360 split: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104014/104014 [00:13<00:00, 7582.49 examples/s]
Generating validation split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████

In [11]:
ds = load_dataset(
    "librispeech_asr", "clean",
    split="test[:1]",
    download_mode="force_redownload"  # one more push if cache was half-done
)

Downloading data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [05:29<00:00,  6.87s/files]
Generating test split: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2620/2620 [00:00<00:00, 12786.10 examples/s]
Generating train.100 split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28539/28539 [00:03<00:00, 8879.74 examples/s]
Generating train.360 split: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104014/104014 [00:12<00:00, 8404.06 examples/s]
Generating validation split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████

In [12]:
from datasets import load_dataset, Audio

# tiny slice to keep it fast
ds = load_dataset("librispeech_asr", "clean", split="test[:1]")

# ask datasets to decode the audio for you
ds = ds.cast_column("audio", Audio(sampling_rate=16000))  # decode=True by default

ex = ds[0]["audio"]
waveform = ex["array"]      # numpy float32, shape [T]
sample_rate = ex["sampling_rate"]

print(waveform.shape, sample_rate)

(56080,) 16000


In [13]:
from datasets import load_dataset, Audio
import torch

# Tiny slice to keep it fast
ds = load_dataset("librispeech_asr", "clean", split="test[:1]")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))  # decode=True by default

ex = ds[0]
waveform = torch.from_numpy(ex["audio"]["array"]).unsqueeze(0)  # [1, T] torch float32
sample_rate = ex["audio"]["sampling_rate"]
print(waveform.shape, sample_rate)   # e.g., torch.Size([1, 56080]) 16000

torch.Size([1, 56080]) 16000


In [14]:
!pip -q install transformers accelerate evaluate jiwer sentencepiece --upgrade

In [15]:
import torch, time, re, numpy as np
from datasets import load_dataset, Audio

# Rebuild the exact tiny example you just used (decoded to 16 kHz)
ds = load_dataset("librispeech_asr", "clean", split="test[:1]")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))
ex = ds[0]
wave_np = ex["audio"]["array"].astype("float32")      # [T]
wave_pt = torch.from_numpy(wave_np).unsqueeze(0)      # [1, T]
sr = ex["audio"]["sampling_rate"]
ref_text = ex["text"]

print(wave_pt.shape, sr)
print("REF:", ref_text)

torch.Size([1, 56080]) 16000
REF: CONCORD RETURNED TO ITS PLACE AMIDST THE TENTS


In [16]:
def normalize(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"[^a-z0-9' ]+", " ", s)  # keep letters, digits, apostrophes
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [17]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id_whisper = "openai/whisper-medium.en"   # or: openai/whisper-large-v3

processor_w = WhisperProcessor.from_pretrained(model_id_whisper)
model_w = WhisperForConditionalGeneration.from_pretrained(model_id_whisper).to(device)
model_w.eval()

# Force English transcription (no translation)
forced_ids = processor_w.get_decoder_prompt_ids(language="en", task="transcribe")

# Prepare inputs
inputs = processor_w(wave_np, sampling_rate=sr, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Inference
t0 = time.time()
with torch.no_grad():
    predicted_ids = model_w.generate(
        **inputs,
        forced_decoder_ids=forced_ids,
        # for accuracy you can let it search more (slower):
        num_beams=5, length_penalty=1.0, no_repeat_ngram_size=3
    )
pred_whisper = processor_w.batch_decode(predicted_ids, skip_special_tokens=True)[0]
t1 = time.time()

dur = len(wave_np) / sr
rtf_w = (t1 - t0) / dur
print("WHISPER:", pred_whisper)
print(f"Whisper time: {t1 - t0:.3f}s   audio: {dur:.3f}s   RTF: {rtf_w:.3f}")

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


WHISPER:  Concord returned to its place amidst the tents.
Whisper time: 11.752s   audio: 3.505s   RTF: 3.353


In [18]:
import time, torch, numpy as np
from datasets import load_dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from jiwer import wer, cer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip

# ---------- config ----------
PREFER_ACCURACY = True     # your preference
MAX_ITEMS = 50             # evaluate on first N test samples (raise later)
SR = 16000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# best → … → lighter
CANDIDATE_MODELS = [
    "openai/whisper-large-v3",
    "openai/whisper-large-v2",
    "openai/whisper-medium.en"
] if PREFER_ACCURACY else ["openai/whisper-medium.en"]

def load_whisper():
    last_err = None
    for name in CANDIDATE_MODELS:
        try:
            print(f"Loading {name} on {DEVICE} …")
            proc = WhisperProcessor.from_pretrained(name)
            model = WhisperForConditionalGeneration.from_pretrained(
                name,
                torch_dtype=torch.float16 if DEVICE=="cuda" else None
            ).to(DEVICE)
            model.eval()
            return name, proc, model
        except RuntimeError as e:
            last_err = e
            print(f"OOM or load error with {name}: {e}")
            torch.cuda.empty_cache() if DEVICE=="cuda" else None
    raise last_err

model_id, processor, model_w = load_whisper()

# generation settings emphasising accuracy (beam search, low temp)
GEN_KW = dict(
    language="en", task="transcribe",
    num_beams=8, length_penalty=1.0,
    temperature=0.0, no_repeat_ngram_size=3,
    repetition_penalty=1.05
)

# ---------- data ----------
ds = load_dataset("librispeech_asr", "clean", split="test[:{}]".format(MAX_ITEMS))
ds = ds.cast_column("audio", Audio(sampling_rate=SR))

# text normalization for fair WER/CER
norm = Compose([ToLowerCase(), RemovePunctuation(), RemoveMultipleSpaces(), Strip()])

# ---------- eval loop ----------
preds, refs = [], []
audio_seconds_total = 0.0
t0 = time.time()

for ex in ds:
    wav = ex["audio"]["array"]          # float32 numpy [T]
    sr  = ex["audio"]["sampling_rate"]
    if sr != SR:
        # (should already be 16 kHz from cast_column, but just in case)
        # torchaudio resample path if you really need it:
        # wav = torchaudio.functional.resample(torch.tensor(wav), sr, SR).numpy()
        pass

    audio_seconds_total += len(wav)/SR

    inputs = processor(
        wav, sampling_rate=SR, return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        pred_ids = model_w.generate(**inputs, **GEN_KW)

    text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    ref  = ex["text"]

    preds.append(norm(text))
    refs.append(norm(ref))

wall = time.time() - t0
WER = wer(refs, preds)
CER = cer(refs, preds)
RTF = wall / audio_seconds_total

print(f"\nWhisper model: {model_id}")
print(f"Items: {len(refs)}  audio: {audio_seconds_total:.2f}s  wall: {wall:.2f}s")
print(f"WER: {WER:.4f}   CER: {CER:.4f}   RTF: {RTF:.3f}")

Loading openai/whisper-large-v3 on cpu …

Whisper model: openai/whisper-large-v3
Items: 50  audio: 328.27s  wall: 899.24s
WER: 0.0295   CER: 0.0061   RTF: 2.739


In [19]:
import time, torch
from datasets import load_dataset, Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import wer, cer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SR = 16000
MAX_ITEMS = 50

# High‑accuracy English model (CTC):
wav2vec_id = "facebook/wav2vec2-large-960h-lv60-self"

proc2 = Wav2Vec2Processor.from_pretrained(wav2vec_id)
model2 = Wav2Vec2ForCTC.from_pretrained(wav2vec_id).to(DEVICE)
model2.eval()

ds2 = load_dataset("librispeech_asr", "clean", split="test[:{}]".format(MAX_ITEMS))
ds2 = ds2.cast_column("audio", Audio(sampling_rate=SR))

norm = Compose([ToLowerCase(), RemovePunctuation(), RemoveMultipleSpaces(), Strip()])

preds, refs = [], []
audio_seconds_total = 0.0
t0 = time.time()

with torch.no_grad():
    for ex in ds2:
        wav = ex["audio"]["array"]
        audio_seconds_total += len(wav)/SR
        inputs = proc2(wav, sampling_rate=SR, return_tensors="pt", padding="longest").to(DEVICE)
        logits = model2(**inputs).logits
        ids = torch.argmax(logits, dim=-1)
        text = proc2.batch_decode(ids)[0]
        preds.append(norm(text))
        refs.append(norm(ex["text"]))

wall = time.time() - t0
WER = wer(refs, preds)
CER = cer(refs, preds)
RTF = wall / audio_seconds_total

print(f"\nWav2Vec2 model: {wav2vec_id}")
print(f"Items: {len(refs)}  audio: {audio_seconds_total:.2f}s  wall: {wall:.2f}s")
print(f"WER: {WER:.4f}   CER: {CER:.4f}   RTF: {RTF:.3f}")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Wav2Vec2 model: facebook/wav2vec2-large-960h-lv60-self
Items: 50  audio: 328.27s  wall: 18.23s
WER: 0.0164   CER: 0.0061   RTF: 0.056


In [21]:
# --- Accuracy-first Whisper evaluation on LibriSpeech test-clean ---
import time, torch, numpy as np
from datasets import load_dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from jiwer import wer, cer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "openai/whisper-large-v3"

# 1) Data
ds = load_dataset("librispeech_asr", "clean", split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))  # ensures 16 kHz
print(f"{len(ds)} utterances")

# 2) Model
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)
model.eval()

# 3) Text normalization for scoring (roughly Whisper style)
norm = Compose([ToLowerCase(), RemovePunctuation(), RemoveMultipleSpaces(), Strip()])

# 4) Generation config tuned for accuracy (beam search)
GEN = dict(
    task="transcribe",
    language="en",
    temperature=0.0,        # greedy within beam
    beam_size=5,            # modest beam, good accuracy/speed balance
    best_of=5,              # consider several hypotheses before beam
    length_penalty=1.0,
    no_repeat_ngram_size=3,
)

preds, refs = [], []
audio_sec_total = 0.0
t0 = time.time()

with torch.no_grad():
    for ex in ds:  # full set; slice if needed e.g. ds.select(range(1000))
        wav = ex["audio"]["array"]             # float32 [T]
        sr  = ex["audio"]["sampling_rate"]
        audio_sec_total += len(wav) / sr

        inputs = processor(
            wav, sampling_rate=sr, return_tensors="pt"
        ).to(DEVICE)

        gen_ids = model.generate(**inputs, **GEN)
        text = processor.batch_decode(gen_ids, skip_special_tokens=True)[0]

        preds.append(norm(text))
        refs.append(norm(ex["text"]))

t1 = time.time()
WER = wer(refs, preds)
CER_v = cer(refs, preds)
RTF = (t1 - t0) / audio_sec_total

print(f"Whisper large-v3  | items: {len(refs)}  audio: {audio_sec_total:.2f}s  wall: {t1-t0:.2f}s")
print(f"WER: {WER:.4f}  CER: {CER_v:.4f}  RTF: {RTF:.3f}")

2620 utterances


ValueError: The following `model_kwargs` are not used by the model: ['beam_size', 'best_of'] (note: typos in the generate arguments will also show up in this list)

In [22]:
# logits = model2(**inputs).logits
# ids = torch.argmax(logits, dim=-1)
# text = proc2.batch_decode(ids)[0]

In [23]:
# --- Accuracy-first Wav2Vec2-CTC with KenLM decoding ---
!pip -q install pyctcdecode==0.5.0 kenlm

import time, torch, numpy as np
from datasets import load_dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from jiwer import wer, cer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
W2V_ID = "facebook/wav2vec2-large-960h-lv60"  # or "jonatasgrosman/wav2vec2-large-robust-ft-libri-960h"

# 1) Data
ds = load_dataset("librispeech_asr", "clean", split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

# 2) Model
proc = Wav2Vec2Processor.from_pretrained(W2V_ID)
model = Wav2Vec2ForCTC.from_pretrained(W2V_ID).to(DEVICE)
model.eval()

# 3) KenLM (download a 4-gram ARPA from HF; any good English 4-gram works)
arpa_path = hf_hub_download("kensho-ai/kenlm", filename="4gram.arpa")  # swap if you prefer a different LM
# Build decoder with the model’s CTC vocabulary
vocab_list = list(proc.tokenizer.get_vocab().keys())
decoder = build_ctcdecoder(vocab_list, arpa_path)

# 4) Scoring normalization
norm = Compose([ToLowerCase(), RemovePunctuation(), RemoveMultipleSpaces(), Strip()])

preds, refs = [], []
audio_sec_total = 0.0
t0 = time.time()

with torch.no_grad():
    for ex in ds:  # full set recommended
        wav = ex["audio"]["array"]
        sr  = ex["audio"]["sampling_rate"]
        audio_sec_total += len(wav) / sr

        inputs = proc(wav, sampling_rate=sr, return_tensors="pt", padding="longest").to(DEVICE)
        #logits = model(**inputs).logits[0].cpu().numpy()
        # LM decode (beam width ~100 is common; increase for accuracy)
        #text = decoder.decode(logits, beam_width=100)

        preds.append(norm(text))
        refs.append(norm(ex["text"]))

t1 = time.time()
WER = wer(refs, preds)
CER_v = cer(refs, preds)
RTF = (t1 - t0) / audio_sec_total

print(f"Wav2Vec2-CTC+LM | items: {len(refs)}  audio: {audio_sec_total:.2f}s  wall: {t1-t0:.2f}s")
print(f"WER: {WER:.4f}  CER: {CER_v:.4f}  RTF: {RTF:.3f}")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60 and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-68b177f0-460c6a1755eb128842f8c180;f544f797-4566-4885-9018-bd593a1bc4a3)

Repository Not Found for url: https://huggingface.co/kensho-ai/kenlm/resolve/main/4gram.arpa.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
Invalid username or password.

In [24]:
pip install pyctcdecode==0.5.* kenlm

zsh:1: no matches found: pyctcdecode==0.5.*
Note: you may need to restart the kernel to use updated packages.


In [25]:
# in a notebook cell
!mkdir -p lm
!curl -L -o lm/4-gram.arpa.gz https://www.openslr.org/resources/11/4-gram.arpa.gz
!gunzip -f lm/4-gram.arpa.gz    # produces lm/4-gram.arpa
LM_ARPA = "lm/4-gram.arpa"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1292M  100 1292M    0     0  41.5M      0  0:00:31  0:00:31 --:--:-- 43.3M


In [26]:
import os, torch, numpy as np
from datasets import load_dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "facebook/wav2vec2-large-960h-lv60-self"   # English CTC model (vocab ~32)

# 1) Load proper processor + model (CTC)
proc_ctc = Wav2Vec2Processor.from_pretrained(MODEL_ID)
asr_ctc  = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE).eval()

# 2) Get the *matching* CTC labels for the decoder
vocab_dict = proc_ctc.tokenizer.get_vocab()           # dict: token -> id
id2token   = {i: tok for tok, i in vocab_dict.items()}# reorder by id
labels     = [id2token[i] for i in range(len(id2token))]
print("CTC vocab size:", len(labels))                 # should match asr_ctc.config.vocab_size (≈32)

# 3) Tiny LibriSpeech slice, do NOT auto-decode so we keep raw arrays
ds = load_dataset("librispeech_asr", "clean", split="test[:1%]")
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))  # ensure 16k

# 4) Build pyctcdecode decoder (no LM first — this alone already improves over argmax)
decoder = build_ctcdecoder(labels)                    # kenlm optional; add arpa later

# 5) Run one (or loop) and check shapes
ex    = ds[0]
wav   = np.asarray(ex["audio"]["array"], dtype=np.float32)
inp   = proc_ctc(wav, sampling_rate=16_000, return_tensors="pt", padding="longest").to(DEVICE)

with torch.no_grad():
    logits = asr_ctc(**inp).logits[0].cpu().numpy()  # (T, V)

print("logits shape:", logits.shape, "  expected V:", len(labels))
assert logits.shape[1] == len(labels), "Vocab size mismatch — decoder built with wrong labels."

# 6) Beam search decoding (no LM)
text_beam = decoder.decode(logits)
print("CTC beam text:", text_beam)

# Optional: compute WER against ref if you have your `norm()` handy
ref_text = ex["text"]
print("REF:", ref_text)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CTC vocab size: 32


Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?


logits shape: (175, 32)   expected V: 32
CTC beam text: CONCORD RETURNED TO ITS PLACE AMIDST THE TENTS
REF: CONCORD RETURNED TO ITS PLACE AMIDST THE TENTS


In [27]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
W_NAME  = "openai/whisper-large-v3"

proc_w = WhisperProcessor.from_pretrained(W_NAME)
whisp  = WhisperForConditionalGeneration.from_pretrained(W_NAME).to(DEVICE).eval()

# Prepare one example (16k array -> log-mel happens in processor)
wav = ex["audio"]["array"]   # from the same ds you used above (16k)
inputs = proc_w.feature_extractor(
    wav, sampling_rate=16_000, return_tensors="pt"
).to(DEVICE)

gen_cfg = dict(
    language="en", task="transcribe",
    num_beams=5, temperature=0.0,        # accuracy > speed
    return_dict_in_generate=True
)

with torch.no_grad():
    out = whisp.generate(**inputs, **gen_cfg)

text_w = proc_w.tokenizer.batch_decode(out.sequences, skip_special_tokens=True)[0]
print("Whisper:", text_w)

Whisper:  Concord returned to its place amidst the tents.
