## Audio–Text with Whisper + text encoder

### Basic Import Statements

In [16]:
from datasets import load_dataset, Audio
from transformers import WhisperProcessor, WhisperModel

import torch
import torchaudio
import librosa
import numpy as np
import io
import pandas as pd

from pathlib import Path

print("Torch:", torch.__version__)
print("CUDA:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


Torch: 2.9.0+cu128
CUDA: True


device(type='cuda')

In [10]:
from datasets import load_dataset

# Config "all" gives all standard LibriSpeech splits in one DatasetDict
librispeech = load_dataset("openslr/librispeech_asr", "all", streaming=True, split="train.clean.100")
audio_ds = librispeech.decode(False)
five_examples = list(audio_ds.take(5))

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

In [11]:
for i, ex in enumerate(five_examples):
    print(f"\nSample {i+1}")
    print("ID:", ex["id"])
    print("Text:", ex["text"])
    # ex["audio"] is now a dict with path/bytes instead of a decoded array
    print("Audio keys:", ex["audio"].keys())


Sample 1
ID: 374-180298-0000
Text: CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED
Audio keys: dict_keys(['bytes', 'path'])

Sample 2
ID: 374-180298-0001
Text: MARGUERITE TO BE UNABLE TO LIVE APART FROM ME IT WAS THE DAY AFTER THE EVENING WHEN SHE CAME TO SEE ME THAT I SENT HER MANON LESCAUT FROM THAT TIME SEEING THAT I COULD NOT CHANGE MY MISTRESS'S LIFE I CHANGED MY OWN
Audio keys: dict_keys(['bytes', 'path'])

Sample 3
ID: 374-180298-0002
Text: I WISHED ABOVE ALL NOT TO LEAVE MYSELF TIME TO THINK OVER THE POSITION I HAD ACCEPTED FOR IN SPITE OF MYSELF IT WAS A GREAT DISTRESS TO ME THUS MY LIFE GENERALLY SO CALM
Audio keys: dict_keys(['bytes', 'path'])

Sample 4
ID: 374-180298-0003
Text: ASSUMED ALL AT ONCE AN APPEARANCE OF NOISE AND DISORDER NEVER BELIEVE HOWEVER DISINTERESTED THE LOVE OF A KEPT WOMAN MAY BE THAT IT WILL COST ONE NOTHING
Audio keys: dic

### Load Whisper Model

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [23]:
# Choose a small model for speed
model_name = "openai/whisper-tiny"   # or "openai/whisper-base"

# Load the processor (handles log-mel, padding)
processor = WhisperProcessor.from_pretrained(model_name)

# Load ONLY encoder (decoder not needed for embeddings)
model = WhisperModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

for p in model.parameters():
    p.requires_grad = False

print("Loaded Whisper model:", model_name)


Loaded Whisper model: openai/whisper-tiny


### Helper: load waveform from file

In [17]:

# Since we are streaming and not decoding, we need to load the audio ourselves
def load_waveform_from_example(example, target_sr=16000):
    audio_info = example["audio"]
    audio_path = audio_info["path"]      # path on disk
    
    # Load with librosa, resample to 16k
    wav, sr = librosa.load(audio_path, sr=target_sr)
    return wav, target_sr

# Load from streaming example
def load_waveform_from_streaming_example(example, target_sr=16000):
    audio_info = example["audio"]

    # MUST use bytes if streaming
    audio_bytes = audio_info["bytes"]
    if audio_bytes is None:
        raise ValueError("No audio bytes found in the streaming example.")

    # Wrap bytes into a file-like object
    audio_file = io.BytesIO(audio_bytes)

    # librosa can load directly from file-like objects
    wav, sr = librosa.load(audio_file, sr=target_sr)  # resample to target_sr

    return wav, target_sr



In [24]:
def whisper_embed_waveform(wav, sr):
    # 1. Convert raw audio → log-Mel features
    inputs = processor(
        wav,
        sampling_rate=sr,
        return_tensors="pt"
    )
    
    input_features = inputs["input_features"].to(device)  # (1, T', 80)

    # 2. Run only the encoder
    with torch.no_grad():
        enc_out = model.encoder(input_features)
        hidden = enc_out.last_hidden_state   # (1, T_enc, d)

    # 3. Average over time → (1, d)
    emb = hidden.mean(dim=1)                # (1, d)

    # 4. L2-normalize
    emb = emb / emb.norm(dim=-1, keepdim=True)

    return emb.squeeze(0).cpu().numpy()     # (d,)


In [25]:
audio_embeddings = []
texts = []

for ex in five_examples:
    wav, sr = load_waveform_from_streaming_example(ex, target_sr=16000)
    emb = whisper_embed_waveform(wav, sr)
    audio_embeddings.append(emb)
    texts.append(ex["text"])

audio_embeddings = np.stack(audio_embeddings)
print("Embeddings shape:", audio_embeddings.shape)
print("First text:", texts[0])



Embeddings shape: (5, 384)
First text: CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED


### Checking Similarity Matrix

In [26]:
import numpy as np

# audio_embeddings: (N, d)
N = audio_embeddings.shape[0]

# Since they are normalized, dot product = cosine similarity
sim_matrix = audio_embeddings @ audio_embeddings.T
print(sim_matrix)


[[1.0000006  0.987994   0.988766   0.9757626  0.9871941 ]
 [0.987994   0.99999994 0.9843736  0.967333   0.98079574]
 [0.988766   0.9843736  1.0000002  0.9858894  0.98897105]
 [0.9757626  0.967333   0.9858894  0.9999998  0.9812445 ]
 [0.9871941  0.98079574 0.98897105 0.9812445  1.0000005 ]]


In [27]:
short_texts = [t[:40] + "..." if len(t) > 40 else t for t in texts]

df_sim = pd.DataFrame(
    sim_matrix,
    index=[f"ex{i}: " + short_texts[i] for i in range(N)],
    columns=[f"ex{i}" for i in range(N)]
)
df_sim


Unnamed: 0,ex0,ex1,ex2,ex3,ex4
ex0: CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF...,1.000001,0.987994,0.988766,0.975763,0.987194
ex1: MARGUERITE TO BE UNABLE TO LIVE APART FR...,0.987994,1.0,0.984374,0.967333,0.980796
ex2: I WISHED ABOVE ALL NOT TO LEAVE MYSELF T...,0.988766,0.984374,1.0,0.985889,0.988971
ex3: ASSUMED ALL AT ONCE AN APPEARANCE OF NOI...,0.975763,0.967333,0.985889,1.0,0.981245
ex4: NOTHING IS SO EXPENSIVE AS THEIR CAPRICE...,0.987194,0.980796,0.988971,0.981245,1.0


### Transcribe the same audio clips

In [28]:
from transformers import WhisperForConditionalGeneration

asr_model_name = "openai/whisper-tiny"  # same as encoder for now

asr_model = WhisperForConditionalGeneration.from_pretrained(asr_model_name).to(device)
asr_model.eval()

for p in asr_model.parameters():
    p.requires_grad = False

print("Loaded ASR model:", asr_model_name)

generation_config.json: 0.00B [00:00, ?B/s]

Loaded ASR model: openai/whisper-tiny


In [None]:
def whisper_transcribe_waveform(wav, sr, language="en"):
    # 1. Processor: waveform -> input_features
    inputs = processor(
        wav,
        sampling_rate=sr,
        return_tensors="pt"
    )

    input_features = inputs["input_features"].to(device)

    # 2. Optional: set language + task for Whisper
    forced_decoder_ids = processor.get_decoder_prompt_ids(
        language=language,
        task="transcribe"
    )

    # 3. Generate token ids
    with torch.no_grad():
        pred_ids = asr_model.generate(
            input_features,
            forced_decoder_ids=forced_decoder_ids
        )

    # 4. Decode ids to text
    pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    return pred_text


In [30]:
for i, ex in enumerate(five_examples):
    wav, sr = load_waveform_from_streaming_example(ex, target_sr=16000)

    gt_text = ex["text"]                      # ground truth from LibriSpeech
    pred_text = whisper_transcribe_waveform(wav, sr, language="en")

    print(f"\n=== Example {i} ===")
    print("GT:   ", gt_text)
    print("ASR:  ", pred_text)


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



=== Example 0 ===
GT:    CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED
ASR:    Chapter 16 I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I to agree to whatever Mark Reid wished.

=== Example 1 ===
GT:    MARGUERITE TO BE UNABLE TO LIVE APART FROM ME IT WAS THE DAY AFTER THE EVENING WHEN SHE CAME TO SEE ME THAT I SENT HER MANON LESCAUT FROM THAT TIME SEEING THAT I COULD NOT CHANGE MY MISTRESS'S LIFE I CHANGED MY OWN
ASR:    Margarit to be unable to live apart from me. It was the day after the evening when she came to see me that I sent her a mononless scut. From that time, seeing that I could not change my mistress's life, I changed my own.

=== Example 2 ===
GT:    I WISHED ABOVE ALL NOT TO LEAVE MYSELF TIME TO THINK OVER THE POSITION I HAD ACCEPTED FOR IN SPITE OF MYSELF IT WAS

In [31]:
import numpy as np
import pandas as pd

N = audio_embeddings.shape[0]

sim_matrix = audio_embeddings @ audio_embeddings.T  # (N, N)

short_texts = [t[:40] + "..." if len(t) > 40 else t for t in texts]

df_sim = pd.DataFrame(
    sim_matrix,
    index=[f"ex{i}: " + short_texts[i] for i in range(N)],
    columns=[f"ex{i}" for i in range(N)]
)
df_sim


Unnamed: 0,ex0,ex1,ex2,ex3,ex4
ex0: CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF...,1.000001,0.987994,0.988766,0.975763,0.987194
ex1: MARGUERITE TO BE UNABLE TO LIVE APART FR...,0.987994,1.0,0.984374,0.967333,0.980796
ex2: I WISHED ABOVE ALL NOT TO LEAVE MYSELF T...,0.988766,0.984374,1.0,0.985889,0.988971
ex3: ASSUMED ALL AT ONCE AN APPEARANCE OF NOI...,0.975763,0.967333,0.985889,1.0,0.981245
ex4: NOTHING IS SO EXPENSIVE AS THEIR CAPRICE...,0.987194,0.980796,0.988971,0.981245,1.0


In [32]:
i, j = 0, 1  # maybe similar?
k, l = 0, 4  # maybe dissimilar?

print("sim(i,j) =", float(sim_matrix[i, j]))
print("sim(k,l) =", float(sim_matrix[k, l]))
print("\nText i:", texts[i])
print("Text j:", texts[j])
print("\nText k:", texts[k])
print("Text l:", texts[l])


sim(i,j) = 0.9879940152168274
sim(k,l) = 0.9871941208839417

Text i: CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED
Text j: MARGUERITE TO BE UNABLE TO LIVE APART FROM ME IT WAS THE DAY AFTER THE EVENING WHEN SHE CAME TO SEE ME THAT I SENT HER MANON LESCAUT FROM THAT TIME SEEING THAT I COULD NOT CHANGE MY MISTRESS'S LIFE I CHANGED MY OWN

Text k: CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED
Text l: NOTHING IS SO EXPENSIVE AS THEIR CAPRICES FLOWERS BOXES AT THE THEATRE SUPPERS DAYS IN THE COUNTRY WHICH ONE CAN NEVER REFUSE TO ONE'S MISTRESS AS I HAVE TOLD YOU I HAD LITTLE MONEY


### Comparing with a Text Encoder

In [33]:
from sentence_transformers import SentenceTransformer

text_model = SentenceTransformer("all-MiniLM-L6-v2")  # small & fast


In [34]:
text_embeddings = text_model.encode(
    texts,
    normalize_embeddings=True  # so dot products are cosine
)
text_embeddings.shape  # (N, d_text)


(5, 384)

In [35]:
# (N, d_audio) @ (d_text, N) won't work directly because dims differ,
# so we can use cosine similarity from sentence-transformers OR
# re-normalize and compute pairwise dot using broadcasting.

from numpy.linalg import norm

# normalize (just to be safe)
A = audio_embeddings / norm(audio_embeddings, axis=1, keepdims=True)
T = text_embeddings / norm(text_embeddings, axis=1, keepdims=True)

# audio-text similarity: (N, d_a) and (N, d_t)
# we want (N, N) where sim[i, j] = cos(audio_i, text_j)
audio_text_sim = A @ T.T

df_at = pd.DataFrame(
    audio_text_sim,
    index=[f"audio_ex{i}" for i in range(N)],
    columns=[f"text_ex{j}" for j in range(N)]
)
df_at


Unnamed: 0,text_ex0,text_ex1,text_ex2,text_ex3,text_ex4
audio_ex0,-0.012998,-0.032815,0.005903,-0.012439,0.014672
audio_ex1,-0.020162,-0.029687,-0.006506,-0.015001,0.015842
audio_ex2,-0.023884,-0.037766,-0.001717,-0.005053,0.006256
audio_ex3,-0.019939,-0.030535,0.006547,0.005151,0.00733
audio_ex4,-0.029884,-0.033821,-0.010715,-0.000369,0.015179
