In [1]:
!pip install -q transformers datasets jiwer torch torchaudio soundfile

In [2]:
import time
import torch
import torchaudio
import numpy as np
from datasets import load_dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from jiwer import wer as jiwer_wer, cer as jiwer_cer
from tqdm.auto import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

def resample_if_needed(batch, target_sr=16_000):
    """Hugging Face `Audio(decode=True)` gives numpy float32 and sample_rate. Resample if needed."""
    wav = batch["audio"]["array"]
    sr  = batch["audio"]["sampling_rate"]
    if sr != target_sr:
        wav = torchaudio.functional.resample(
            torch.from_numpy(wav), sr, target_sr
        ).numpy()
    batch["audio_16k"] = wav
    return batch

def normalize_text(s: str):
    """Lightweight normalizer so WER/CER are comparable across models."""
    # lower + strip; remove leading/trailing punctuation lumps but keep digits/letters/spaces
    s = s.lower().strip()
    # collapse whitespace
    s = " ".join(s.split())
    return s

Device: cpu


In [3]:
# ==== Wav2Vec2: LibriSpeech test (accuracy-first) ====
import torch
import numpy as np
from datasets import load_dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from jiwer import wer

# ---- Config ----
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID  = "facebook/wav2vec2-large-960h-lv60-self"   # English CTC model
TARGET_SR = 16000
SPLIT     = "test[:100%]"   # change to "test[:10%]" while experimenting

print("Device:", DEVICE)

# ---- Load dataset (and let Datasets handle resampling to 16 kHz) ----
ds = load_dataset("librispeech_asr", "clean", split=SPLIT)
ds = ds.cast_column("audio", Audio(sampling_rate=TARGET_SR))  # <-- resamples on the fly

# Optional: simple normalizer to match typical WER setup
import re
def normalize_text(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"[^a-z' ]+", " ", s)  # keep letters and apostrophes
    s = re.sub(r"\s+", " ", s).strip()
    return s

references = [normalize_text(ex["text"]) for ex in ds]

# ---- Load model + processor ----
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE).eval()

# ---- Inference (greedy CTC) ----
def transcribe_one(example):
    wav = example["audio"]["array"]  # float32, 16 kHz (already resampled)
    inputs = processor(wav, sampling_rate=TARGET_SR, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]
    text = processor.decode(pred_ids)
    return text

hypotheses = []
audio_seconds_total = 0.0

for ex in ds:
    hypotheses.append(normalize_text(transcribe_one(ex)))
    audio_seconds_total += len(ex["audio"]["array"]) / TARGET_SR

# ---- Compute WER ----
score = wer(references, hypotheses) * 100.0
print(f"Items: {len(ds)}   audio: {audio_seconds_total/60:.2f} min")
print(f"WER: {score:.2f}%")

Device: cpu


Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Items: 2620   audio: 324.21 min
WER: 1.86%


In [4]:
# ===== Hindi (FLEURS) — optional =====
# MODEL_ID  = "jonatasgrosman/wav2vec2-large-xlsr-53-hindi"   # Hindi CTC model
# TARGET_SR = 16_000
# # Pick the Hindi subset and a small portion for demo
# lang_code = "hi_in"
# fleurs = load_dataset("google/fleurs", lang_code, split="test[:100%]")
# fleurs = fleurs.cast_column("audio", Audio(decode=True))
# fleurs = fleurs.map(resample_if_needed, fn_kwargs={"target_sr": TARGET_SR})
# 
# # make it look like LibriSpeech fields expected below
# ds = fleurs.rename_columns({"transcription": "text"})
# references = [normalize_text(x["text"]) for x in ds]
# 
# processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
# model      = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE).eval()
# 
# print("Items:", len(ds))

In [6]:
# === Setup ===
import torch, numpy as np, re, time
from datasets import load_dataset, Audio
from jiwer import wer
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "facebook/wav2vec2-large-960h-lv60-self"
SPLIT    = "test[:100%]"     # change to test[:10%] for a quick run

print("Device:", DEVICE)

# === Load LibriSpeech and let 'Audio' do resampling to 16 kHz ===
TARGET_SR = 16_000
ds = load_dataset("librispeech_asr", "clean", split=SPLIT)
ds = ds.cast_column("audio", Audio(sampling_rate=TARGET_SR))  # resample-on-the-fly

# === Normalizer to mirror typical WER setups ===
def normalize_text(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"[^a-z' ]+", " ", s)  # keep letters + apostrophes
    s = re.sub(r"\s+", " ", s).strip()
    return s

references = [normalize_text(x["text"]) for x in ds]

# === Load processor + model ===
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model     = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE).eval()

# === Greedy inference helper ===
@torch.no_grad()
def transcribe_one(example):
    wav = example["audio"]["array"]               # float32, 16kHz
    inputs = processor(wav, sampling_rate=TARGET_SR, return_tensors="pt").to(DEVICE)
    logits  = model(**inputs).logits              # [B, T, vocab]
    pred_ids = torch.argmax(logits, dim=-1)[0]
    text = processor.decode(pred_ids)
    return text

# === Run inference & eval ===
start = time.time()
hyps = []
audio_sec_total = 0.0

for ex in ds:
    hyp = transcribe_one(ex)
    hyps.append(normalize_text(hyp))
    audio_sec_total += len(ex["audio"]["array"]) / TARGET_SR

mins = audio_sec_total / 60.0
score = wer(references, hyps) * 100.0
wall  = time.time() - start

print(f"Items: {len(ds)}   audio: {audio_sec_total:.2f}s  (~{mins:.2f} min)   wall: {wall:.2f}s")
print(f"WER:  {score:.2f}%   RTF: {wall / audio_sec_total:.3f}")


Device: cpu


Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Items: 2620   audio: 19452.48s  (~324.21 min)   wall: 1321.34s
WER:  1.86%   RTF: 0.068


In [12]:
import torch, numpy as np, re, time
from datasets import load_dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from jiwer import wer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "facebook/wav2vec2-large-960h-lv60-self"
SPLIT = "test[:10%]"       # use full 'test' when you're ready
TARGET_SR = 16_000

print("Device:", DEVICE)

# --- dataset (let Datasets resample on-the-fly) ---
ds = load_dataset("librispeech_asr", "clean", split=SPLIT)
ds = ds.cast_column("audio", Audio(sampling_rate=TARGET_SR))   # <— IMPORTANT

# --- clean refs (to match your Whisper WER setup) ---
def normalize_text(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"[^a-z' ]+", " ", s)   # keep letters + apostrophes
    s = re.sub(r"\s+", " ", s).strip()
    return s

references = [normalize_text(x["text"]) for x in ds]

# --- models ---
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model     = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE).eval()

# --- batched inference ---
BATCH_SIZE = 8
hypotheses = []
audio_seconds_total = 0.0

t0 = time.time()
with torch.no_grad():
    for i in range(0, len(ds), BATCH_SIZE):
        batch = ds.select(range(i, min(i+BATCH_SIZE, len(ds))))

        # ❶ pass a LIST of 1-D arrays to the processor (do NOT build a 2-D tensor yourself)
        wavs = [ex["audio"]["array"] for ex in batch]
        audio_seconds_total += sum(len(w) for w in wavs) / TARGET_SR

        inputs = processor(
            wavs,
            sampling_rate=TARGET_SR,
            return_tensors="pt",
            padding=True
        )
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        logits = model(**inputs).logits                 # [B, T, vocab]
        pred_ids = torch.argmax(logits, dim=-1)         # [B, T]
        texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
        hypotheses.extend(normalize_text(t) for t in texts)

wall = time.time() - t0
rtf = wall / audio_seconds_total
print(f"Items: {len(ds)}  audio: {audio_seconds_total/60:.2f} min  wall: {wall:.2f}s  RTF: {rtf:.3f}")

score = wer(references, hypotheses) * 100.0
print(f"WER: {score:.2f}%")

Device: cpu


Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Items: 262  audio: 32.41 min  wall: 205.00s  RTF: 0.105
WER: 10.64%
