### 🧩 Cell 1 – Imports & Config

In [1]:


from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import random
import io
import json

import numpy as np
import librosa
from tqdm import tqdm
from multiprocessing.dummy import Pool as ThreadPool  # threads (Jupyter-friendly)
from multiprocessing import cpu_count

import torch
from datasets import load_dataset


In [3]:
# Reproducibility: seed every RNG this notebook imports (stdlib random, NumPy, PyTorch)
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

@dataclass
class Config:
    """
    Central settings for this notebook: which dataset split to stream, how
    clips are sampled and filtered, which Whisper checkpoint to load, and
    where logs, the index and feature files are written.
    """

    # HF LibriSpeech (passed to load_dataset as name, config and split)
    hf_name: str = "openslr/librispeech_asr"
    hf_config: str = "all"                # dataset configuration name
    hf_split: str = "train.clean.360"     # subset for alignment

    # Sampling & filtering
    librispeech_max_samples: int = 1000   # how many to take from streaming
    audio_sample_rate: int = 16_000       # target sample rate (Hz) used when loading waveforms
    max_audio_duration_s: float = 12.0    # clips longer than this (seconds) are dropped

    # Whisper encoder
    whisper_model_name: str = "openai/whisper-base"  # or tiny / small etc.

    # Paths
    # root_dir holds the skip logs and the split index JSON;
    # features_dir holds the per-example .pt feature files.
    root_dir: Path = Path("./data/librispeech")
    features_dir: Path = Path("./data/librispeech/features")

    # Device
    # The default is evaluated once, when the class body runs, not per instance.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate the settings and make sure both output directories exist.
cfg = Config()
for _out_dir in (cfg.root_dir, cfg.features_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

print(cfg)


Config(hf_name='openslr/librispeech_asr', hf_config='all', hf_split='train.clean.360', librispeech_max_samples=1000, audio_sample_rate=16000, max_audio_duration_s=12.0, whisper_model_name='openai/whisper-base', root_dir=PosixPath('data/librispeech'), features_dir=PosixPath('data/librispeech/features'), device='cpu')


### 🧩 Cell 2 – Load Whisper Encoder

In [4]:
from transformers import WhisperModel, WhisperProcessor

# Load the processor and model for the checkpoint named in the config.
checkpoint_name = cfg.whisper_model_name
print("Loading Whisper encoder:", checkpoint_name)

audio_processor = WhisperProcessor.from_pretrained(checkpoint_name)
audio_model = WhisperModel.from_pretrained(checkpoint_name)

# Inference only: switch to eval mode, then move the weights to the target device.
audio_model.eval()
audio_model.to(cfg.device)

# Hidden size reported by the model config.
d_audio = audio_model.config.d_model
print("Whisper encoder dim (d_audio):", d_audio)


Loading Whisper encoder: openai/whisper-base


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

Whisper encoder dim (d_audio): 512


### 🧩 Cell 3 – Streaming LibriSpeech, Subset & Duration Filter

In [5]:
print("\nLoading LibriSpeech ASR in streaming mode...")

# Streaming mode iterates examples lazily instead of materialising the split.
librispeech_raw = load_dataset(
    cfg.hf_name,
    cfg.hf_config,
    streaming=True,
    split=cfg.hf_split,
)

print("Loaded streaming dataset:", librispeech_raw)

# Disable automatic decoding -> we want raw bytes for librosa
audio_stream = librispeech_raw.decode(False)

max_samples = cfg.librispeech_max_samples

print(f"\nTaking up to {max_samples} examples in streaming mode...")

# take() caps the stream at exactly max_samples items. The previous manual loop
# appended before checking the limit, so a limit of 0 still collected one example.
# Negative limits are clamped to 0 rather than raising.
subset = list(audio_stream.take(max(0, max_samples)))

print("\nSubset collected:", len(subset))
if subset:
    first_example = subset[0]
    print("Keys:", first_example.keys())
    # Leave out the raw audio payload so the preview stays readable.
    print("Example 0 (truncated):", {k: v for k, v in first_example.items() if k != "audio"})



Loading LibriSpeech ASR in streaming mode...


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Loaded streaming dataset: IterableDataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_shards: 48
})

Taking up to 1000 examples in streaming mode...

Subset collected: 1000
Keys: dict_keys(['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'])
Example 0 (truncated): {'file': '/home/albert/.cache/huggingface/datasets/downloads/extracted/41321bb258f165f6163b1b0335f41c1ae65f86988d636209307b7ed94af929a4/1487-133273-0000.flac', 'text': 'THE SECOND IN IMPORTANCE IS AS FOLLOWS SOVEREIGNTY MAY BE DEFINED TO BE THE RIGHT OF MAKING LAWS IN FRANCE THE KING REALLY EXERCISES A PORTION OF THE SOVEREIGN POWER SINCE THE LAWS HAVE NO WEIGHT', 'speaker_id': 1487, 'chapter_id': 133273, 'id': '1487-133273-0000'}


In [6]:
def load_waveform_from_streaming_example(example, target_sr=16_000):
    """
    Decode the raw audio bytes of one streaming example with librosa.

    Args:
        example: mapping whose "audio" entry is a mapping with a "bytes" field.
        target_sr: sampling rate handed to librosa.load (default 16 kHz).

    Returns:
        (waveform, sampling_rate) as returned by librosa.load.

    Raises:
        ValueError: if the example carries no audio bytes.
    """
    raw_bytes = example["audio"]["bytes"]
    if raw_bytes is None:
        raise ValueError("No audio bytes in example.")

    # librosa accepts a file-like object, so wrap the bytes in an in-memory buffer.
    waveform, rate = librosa.load(io.BytesIO(raw_bytes), sr=target_sr)
    return waveform, rate

def compute_duration(wav, sr):
    """Return the clip length in seconds: sample count divided by the sampling rate."""
    n_samples = len(wav)
    return n_samples / float(sr)


In [7]:
filtered = []

# Every decode failure goes to the same log file; the path does not change per clip.
decode_log_path = cfg.root_dir / "decode_skipped.log"

print("\nFiltering by duration ≤", cfg.max_audio_duration_s, "seconds...")

for ex in subset:
    try:
        wav, sr = load_waveform_from_streaming_example(ex, cfg.audio_sample_rate)
    except Exception as e:
        # Best-effort: record the failure and skip the clip instead of aborting the run.
        # Explicit UTF-8 keeps the log independent of the platform default encoding.
        with open(decode_log_path, "a", encoding="utf-8") as f:
            f.write(f"decode_error\ttext={ex.get('text', '')[:80]!r}\terror={repr(e)}\n")
        continue

    dur = compute_duration(wav, sr)

    # Keep only clips at or under the configured duration limit.
    if dur <= cfg.max_audio_duration_s:
        filtered.append({
            "waveform": wav,
            "sampling_rate": sr,
            "duration": dur,
            "text": ex["text"],
        })

print("After duration filtering:", len(filtered), "examples")



Filtering by duration ≤ 12.0 seconds...
After duration filtering: 338 examples


In [8]:
print("\nShowing a few filtered samples...")

# Preview at most the first five clips that passed the duration filter.
for i, ex in enumerate(filtered[:5]):
    print(f"\nSample {i}:")
    print("  Duration:", round(ex["duration"], 2), "s")
    print("  Transcript:", ex["text"])
    print("  Waveform shape:", ex["waveform"].shape)



Showing a few filtered samples...

Sample 0:
  Duration: 11.79 s
  Transcript: IN THE EXERCISE OF THE EXECUTIVE POWER THE PRESIDENT OF THE UNITED STATES IS CONSTANTLY SUBJECT TO A JEALOUS SCRUTINY HE MAY MAKE BUT HE CANNOT CONCLUDE A TREATY
  Waveform shape: (188560,)

Sample 1:
  Duration: 10.94 s
  Transcript: PUBLIC OPINION IS THE PREDOMINANT AUTHORITY IN BOTH OF THEM THE FUNDAMENTAL PRINCIPLE OF LEGISLATION A PRINCIPLE ESSENTIALLY REPUBLICAN IS THE SAME IN BOTH COUNTRIES
  Waveform shape: (174960,)

Sample 2:
  Duration: 11.47 s
  Transcript: THE CONTRAST WOULD HAVE BEEN RENDERED STILL MORE STRIKING I HAVE REMARKED THAT THE AUTHORITY OF THE PRESIDENT IN THE UNITED STATES IS ONLY EXERCISED WITHIN THE LIMITS OF A PARTIAL SOVEREIGNTY
  Waveform shape: (183600,)

Sample 3:
  Duration: 7.39 s
  Transcript: AND THOSE WHICH IT WOULD CARRY INTO EFFECT THE PRESIDENT OF THE UNITED STATES IS THE COMMANDER IN CHIEF OF THE ARMY
  Waveform shape: (118240,)

Sample 4:
  Duration: 3.29 s
  Transc

### 🧩 Cell 4 – Whisper Encoding Helper

In [9]:
def whisper_encode_sequence(wav: np.ndarray, sr: int) -> torch.Tensor:
    """
    Run one waveform through the Whisper encoder and return its hidden states.

    Args:
        wav: 1D numpy array (time,)
        sr:  sampling rate of `wav`, forwarded to the processor (expected 16k)

    Returns:
        Tensor (T_enc, d_audio), float16, on CPU.

    NOTE(review): the cached sample inspected at the end of this notebook has
    1500 frames for an ~11.8 s clip, which suggests T_enc does not track clip
    length (fixed-size padding in the processor?) - confirm before relying on
    frame counts.
    """
    processed = audio_processor(wav, sampling_rate=sr, return_tensors="pt")
    # NOTE(review): presumably (1, n_mels, T_mel) for the Whisper processor - confirm
    mel_batch = processed["input_features"].to(cfg.device)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        last_hidden = audio_model.encoder(mel_batch).last_hidden_state  # (1, T_enc, d_audio)

    # Drop the batch axis, cast to half precision, then move to host memory.
    return last_hidden.squeeze(0).to(torch.float16).cpu()


### 🧩 Cell 5 – Per-Example Processing & Saving (with quiet logging)

In [10]:
def process_and_save_audio_example(ex, idx: int, split_name: str = "train"):
    """
    Encode one filtered example with Whisper and cache the result on disk.

    Args:
        ex: entry from the `filtered` list; must provide "waveform",
            "sampling_rate", "duration" and "text".
        idx: index of the example within that list; used in the file name.
        split_name: prefix for the feature file and skip log (default "train").

    Side effects:
        Writes <cfg.features_dir>/<split_name>_feat_<idx>.pt on success, or
        appends one line to <cfg.root_dir>/<split_name>_skipped.log when
        encoding fails.

    Returns:
        A metadata dict (orig_idx, file, num_frames, feat_dim, duration),
        or None if encoding raised.
    """
    wav, sr = ex["waveform"], ex["sampling_rate"]
    dur, text = ex["duration"], ex["text"]

    try:
        feats = whisper_encode_sequence(wav, sr)  # (T_enc, d_audio)
    except Exception as e:
        # Record the failure and let the caller carry on with the other examples.
        with open(cfg.root_dir / f"{split_name}_skipped.log", "a") as f:
            f.write(
                f"idx={idx}\tduration={dur:.3f}\ttext={text[:80]!r}\terror={repr(e)}\n"
            )
        return None

    fpath = cfg.features_dir / f"{split_name}_feat_{idx}.pt"

    # Everything needed to use the features later travels in one file.
    payload = {
        "features": feats,
        "text": text,
        "duration": dur,
        "sampling_rate": sr,
        "orig_idx": idx,
    }
    torch.save(payload, fpath)

    return dict(
        orig_idx=idx,
        file=str(fpath),
        num_frames=feats.shape[0],
        feat_dim=feats.shape[1],
        duration=dur,
    )


### 🧩 Cell 6 – Threaded Builder (train split)

In [11]:
def _process_audio(args):
    """
    Adapter for pool map-style APIs, which hand each task over as one argument.

    `args` is an (index, example, split_name) tuple; the result is whatever
    process_and_save_audio_example returns for it.
    """
    idx, example, split = args
    return process_and_save_audio_example(example, idx=idx, split_name=split)


In [12]:
def build_audio_split_index_threaded(
    examples,
    split_name: str = "train",
    workers: Optional[int] = None,
    chunksize: int = 16,
):
    """
    Extract & cache Whisper features for LibriSpeech examples using threads.

    Args:
        examples: sized sequence of filtered examples (must support len()).
        split_name: prefix for the index, feature files and skip log.
        workers: thread count; defaults to min(8, cpu_count()).
        chunksize: number of tasks handed to a worker at a time.

    Creates:
      - <cfg.root_dir>/<split_name>_index.json
      - .pt feature files under cfg.features_dir

    Returns:
        List of metadata dicts for the examples that were encoded, sorted by
        "orig_idx".
    """
    if workers is None:
        workers = min(8, cpu_count())

    total = len(examples)
    tasks = ((i, ex, split_name) for i, ex in enumerate(examples))
    split_index = []

    with ThreadPool(processes=workers) as pool:
        for meta in tqdm(
            pool.imap_unordered(_process_audio, tasks, chunksize=chunksize),
            total=total,
            desc=f"Extracting {split_name} audio features (threads)",
        ):
            # None marks an example the worker skipped (and logged).
            if meta is not None:
                split_index.append(meta)

    # imap_unordered yields in completion order, which varies between runs;
    # sort so the saved index is reproducible.
    split_index.sort(key=lambda meta: meta["orig_idx"])

    index_path = cfg.root_dir / f"{split_name}_index.json"
    with open(index_path, "w") as f:
        json.dump(split_index, f, indent=2)

    # Quiet summary. Count skips from this run's results: the skip log is
    # append-only, so its line count also includes failures from earlier runs.
    n_skipped = total - len(split_index)
    if n_skipped:
        log_path = cfg.root_dir / f"{split_name}_skipped.log"
        print(
            f"Saved {split_name} index with {len(split_index)} items "
            f"(skipped {n_skipped} examples, see {log_path})"
        )
    else:
        print(f"Saved {split_name} index with {len(split_index)} items (no skips).")

    return split_index


### 🧩 Cell 7 – Run the Builder

In [13]:
print("\nBuilding audio feature dataset from filtered LibriSpeech samples...")
print(f"Filtered examples: {len(filtered)}")

# Encode every filtered clip and write the "train" index plus feature files.
builder_kwargs = dict(split_name="train", workers=4, chunksize=64)
audio_index = build_audio_split_index_threaded(filtered, **builder_kwargs)



Building audio feature dataset from filtered LibriSpeech samples...
Filtered examples: 338


Extracting train audio features (threads): 100%|██████████| 338/338 [00:25<00:00, 13.01it/s]

Saved train index with 338 items (no skips).





### 🧩 Cell 8 – Sanity Check a Saved Feature File

In [14]:
index_path = cfg.root_dir / "train_index.json"

# Read the index back from disk to confirm what was written, independent of memory state.
with open(index_path, "r", encoding="utf-8") as f:
    train_index = json.load(f)

print("Loaded", len(train_index), "train items")

if train_index:
    first_meta = train_index[0]
    print("First meta:", first_meta)

    sample_path = first_meta["file"]
    # The blob holds only a tensor plus str/float/int fields, so the restricted
    # unpickler is sufficient. Passing weights_only explicitly gives the same
    # behaviour across torch versions; map_location keeps the check device-independent.
    blob = torch.load(sample_path, map_location="cpu", weights_only=True)
    feats = blob["features"]
    text = blob["text"]
    duration = blob["duration"]

    print("Loaded features shape:", feats.shape)
    print("Duration:", round(duration, 2), "s")
    print("Transcript snippet:", text[:200], "...")
else:
    print("No train examples cached – check earlier logs for errors.")


Loaded 338 train items
First meta: {'orig_idx': 0, 'file': 'data/librispeech/features/train_feat_0.pt', 'num_frames': 1500, 'feat_dim': 512, 'duration': 11.785}
Loaded features shape: torch.Size([1500, 512])
Duration: 11.79 s
Transcript snippet: IN THE EXERCISE OF THE EXECUTIVE POWER THE PRESIDENT OF THE UNITED STATES IS CONSTANTLY SUBJECT TO A JEALOUS SCRUTINY HE MAY MAKE BUT HE CANNOT CONCLUDE A TREATY ...
