In [2]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

import torch

In [3]:
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

# generate token ids
predicted_ids = model.generate(input_features, num_beams=100, output_logits=True, return_dict_in_generate=True, language='en')
# decode token ids to text
# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)

# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


You have passed language=en, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=en.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [43]:
processor.batch_decode(predicted_ids.sequences, skip_special_tokens=True)

[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']

In [16]:
sample['array']
sample['sampling_rate']

16000

In [17]:
processor.tokenizer.added_tokens_decoder

{50257: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50258: AddedToken("<|startoftranscript|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50259: AddedToken("<|en|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50260: AddedToken("<|zh|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50261: AddedToken("<|de|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50262: AddedToken("<|es|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50263: AddedToken("<|ru|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50264: AddedToken("<|ko|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 50265: AddedToken("<|fr|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True

In [18]:
predicted_ids

GenerateBeamEncoderDecoderOutput(sequences=tensor([[50258, 50259, 50359, 50363,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,   293,   321,   366,  5404,
           281,  2928,   702, 14943,    13, 50257]]), sequences_scores=None, scores=None, logits=(tensor([[ 2.3034,  3.3756, -2.0326,  ..., -2.3806, -2.2708, -3.7729]]), tensor([[12.7420,  6.6603,  3.4696,  ...,  4.3090,  3.6956,  2.0107]]), tensor([[ 1.8605, -0.2553, -3.9642,  ..., -2.8485, -3.0380, -4.5694]]), tensor([[ 1.9904,  2.4086, -1.6515,  ...,  1.0623,  0.0069,  0.4268]]), tensor([[7.9403, 9.6854, 5.9965,  ..., 7.0070, 6.9622, 5.0162]]), tensor([[39.0155, 40.3875, 35.0645,  ..., 33.1898, 33.1835, 29.9459]]), tensor([[34.0002, 36.2443, 31.4101,  ..., 32.1790, 31.5591, 28.3007]]), tensor([[ 2.4926,  5.2185, -0.9671,  ..., -0.9002, -1.2156, -5.0328]]), tensor([[39.8245, 38.6122, 32.4190,  ..., 34.5664, 33.8170, 30.7992]]), tensor([[19.7878, 21.1284, 17.0484,  ..., 15.8762, 15.6596,

In [19]:
predicted_ids.sequences

tensor([[50258, 50259, 50359, 50363,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,   293,   321,   366,  5404,
           281,  2928,   702, 14943,    13, 50257]])

In [20]:
processor.batch_decode(predicted_ids.sequences, skip_special_tokens=False)

['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|>']

In [21]:
processor.batch_decode(predicted_ids.sequences, skip_special_tokens=True)

[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']

In [22]:
predicted_ids.logits[0].shape

torch.Size([1, 51865])

In [23]:
[torch.softmax(logits, dim=1).argmax() for i, logits in enumerate(predicted_ids.logits)]

[tensor(2221),
 tensor(13),
 tensor(2326),
 tensor(388),
 tensor(391),
 tensor(307),
 tensor(264),
 tensor(50244),
 tensor(295),
 tensor(264),
 tensor(2808),
 tensor(5359),
 tensor(293),
 tensor(321),
 tensor(366),
 tensor(5404),
 tensor(281),
 tensor(2928),
 tensor(702),
 tensor(14943),
 tensor(13),
 tensor(50257),
 tensor(50257),
 tensor(50257),
 tensor(50257)]

In [24]:
torch.softmax(predicted_ids.logits[0], dim=1)[:,2221]

tensor([0.8699])

In [25]:
50257 in processor.tokenizer.added_tokens_decoder.keys()

True

In [26]:
pred = [p for p in predicted_ids.sequences.squeeze() if p.item() not in processor.tokenizer.added_tokens_decoder.keys()]

In [27]:
len(pred)

21

In [28]:
len(predicted_ids.logits)

25

In [29]:
len(predicted_ids.sequences.squeeze())

26

In [37]:
probs = [torch.softmax(logits, dim=1)[:,pred[i]] for i, logits in enumerate(predicted_ids.logits[:len(pred)])]

In [38]:
len(processor.batch_decode(pred)), len(probs)

(21, 21)

In [40]:
list(zip(processor.batch_decode(pred), probs))

[(' Mr', tensor([0.8699])),
 ('.', tensor([0.9802])),
 (' Qu', tensor([0.4154])),
 ('il', tensor([0.7852])),
 ('ter', tensor([0.9616])),
 (' is', tensor([0.9590])),
 (' the', tensor([0.9836])),
 (' apostle', tensor([0.9467])),
 (' of', tensor([0.9971])),
 (' the', tensor([0.9846])),
 (' middle', tensor([0.8122])),
 (' classes', tensor([0.9145])),
 (' and', tensor([0.5238])),
 (' we', tensor([0.9824])),
 (' are', tensor([0.9553])),
 (' glad', tensor([0.9979])),
 (' to', tensor([0.9956])),
 (' welcome', tensor([0.9969])),
 (' his', tensor([0.9453])),
 (' gospel', tensor([0.8681])),
 ('.', tensor([0.8877]))]

In [32]:
''.join(processor.batch_decode(pred))

' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'

In [33]:
processor.batch_decode(predicted_ids.sequences, skip_special_tokens=True)

[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']

In [34]:
[(predicted_ids.sequences[:,i], logits) for i, logits in enumerate(predicted_ids.logits)]

[(tensor([50258]),
  tensor([[ 2.3034,  3.3756, -2.0326,  ..., -2.3806, -2.2708, -3.7729]])),
 (tensor([50259]),
  tensor([[12.7420,  6.6603,  3.4696,  ...,  4.3090,  3.6956,  2.0107]])),
 (tensor([50359]),
  tensor([[ 1.8605, -0.2553, -3.9642,  ..., -2.8485, -3.0380, -4.5694]])),
 (tensor([50363]),
  tensor([[ 1.9904,  2.4086, -1.6515,  ...,  1.0623,  0.0069,  0.4268]])),
 (tensor([2221]),
  tensor([[7.9403, 9.6854, 5.9965,  ..., 7.0070, 6.9622, 5.0162]])),
 (tensor([13]),
  tensor([[39.0155, 40.3875, 35.0645,  ..., 33.1898, 33.1835, 29.9459]])),
 (tensor([2326]),
  tensor([[34.0002, 36.2443, 31.4101,  ..., 32.1790, 31.5591, 28.3007]])),
 (tensor([388]),
  tensor([[ 2.4926,  5.2185, -0.9671,  ..., -0.9002, -1.2156, -5.0328]])),
 (tensor([391]),
  tensor([[39.8245, 38.6122, 32.4190,  ..., 34.5664, 33.8170, 30.7992]])),
 (tensor([307]),
  tensor([[19.7878, 21.1284, 17.0484,  ..., 15.8762, 15.6596, 13.6304]])),
 (tensor([264]),
  tensor([[ -4.5552,  -6.0434,  -8.9678,  ..., -11.4222, -11

In [35]:
len(predicted_ids.logits)

25

In [36]:
import re

text = "apple banana cherry"
delimiter = " "

# Build a regex pattern to split by the delimiter
pattern = re.compile(r'[^' + re.escape(delimiter) + r']+')

result = [(m.group(), m.start()) for m in pattern.finditer(text)]
print(result)

[('apple', 0), ('banana', 6), ('cherry', 13)]
