In [2]:
from transformers import WhisperForConditionalGeneration, WhisperTokenizerFast, WhisperFeatureExtractor,get_scheduler,WhisperProcessor
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, DatasetDict, concatenate_datasets
import datasets

from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field

import torch
import numpy as np


from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed

from torch.utils.data import DataLoader

from tqdm.notebook import tqdm
from transformers.models.whisper.english_normalizer import BasicTextNormalizer, EnglishTextNormalizer


# 1. Prepare Dataset

In [9]:
ds1 = load_dataset('facebook/multilingual_librispeech',
                    'german',
                    cache_dir="/media/hdd/.cache/huggingface",
            )

ds2 = load_dataset('mozilla-foundation/common_voice_16_0',
                    'de',
                    cache_dir="/media/hdd_old/.cache/huggingface",
                )

ds3 = load_dataset(
            'facebook/voxpopuli',
            'de',
            cache_dir="/media/hdd_old/.cache/huggingface",
            )

feature_extractor = WhisperFeatureExtractor.from_pretrained("distil-whisper/distil-medium.en")
tokenizer = WhisperTokenizerFast.from_pretrained("distil-whisper/distil-medium.en")

def rename_columns(ds, column_nammes):
    ds = ds.cast_column("audio", datasets.features.Audio(16000))

    ds = ds.rename_column(column_nammes, "text")
    
    dataset_features = ds['train'].features.keys()
    columns_to_keep = {"audio", "text"}
    ds = ds.remove_columns(set(dataset_features - columns_to_keep))
    
    return ds
    
ds1 = rename_columns(ds1, "transcript")
ds2 = rename_columns(ds2, "sentence")
ds3 = rename_columns(ds3, "raw_text")


dataset = DatasetDict()
dataset['train'] = concatenate_datasets([ds1['train'], ds2['train'], ds2['validation']])
dataset['ID_eval'] = concatenate_datasets([ds1['test'], ds2['test']])
dataset['OOD_eval'] = concatenate_datasets([ds3['validation'], ds3['test']])

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/63 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/63 [00:00<?, ?it/s]

In [27]:
dataset['train'] = dataset['train'].shuffle(seed=41).take(1000)

In [4]:
import time

In [5]:
def timeit(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds")
        return result
    return wrapper

In [6]:
raw_datasets_train_features = list(dataset["train"].features.keys())
raw_datasets_train_features

['audio', 'text']

In [15]:
@timeit
def prepare_train_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = [sample["array"] for sample in batch["audio"]]
    inputs = feature_extractor(audio, sampling_rate=16000, device='cuda')
    batch["input_features"] = inputs.input_features
    batch["input_length"] = [len(sample) for sample in audio]
    batch["labels"] = tokenizer(batch["text"]).input_ids
    
    return batch

In [24]:
import random

list = [10, 100, 1000]

[300, 800, 100, 500]

In [28]:
for batch_size in list:
    
    tic = time.time()
    dataset['train'].map(prepare_train_dataset, batched=True, batch_size=batch_size)
    toc = time.time() - tic
    
    print(f"batch_size: {batch_size}, time: {toc:.4f} seconds")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Function prepare_train_dataset took 5.2748 seconds
Function prepare_train_dataset took 5.0388 seconds
Function prepare_train_dataset took 5.1649 seconds
Function prepare_train_dataset took 1.7884 seconds
batch_size: 300, time: 117.6985 seconds


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Function prepare_train_dataset took 14.6061 seconds
Function prepare_train_dataset took 3.4520 seconds


FileNotFoundError: [Errno 2] No such file or directory: '/media/hdd/.cache/huggingface/facebook___multilingual_librispeech/german/0.0.0/2e83e61823b4c47dcbcb1980bb88601274127609/tmp7wp0rd39'

In [3]:
"3" * 10

'3333333333'

In [4]:
'30' * 10

'30303030303030303030'

In [5]:
"3" * 10 > '30' * 10

True

In [9]:
dataset['train'] = dataset['train'].map(prepare_train_dataset, batched=True, batch_size=1)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Function prepare_train_dataset took 0.0039 seconds
Function prepare_train_dataset took 0.0032 seconds
Function prepare_train_dataset took 0.0029 seconds
Function prepare_train_dataset took 0.0037 seconds
Function prepare_train_dataset took 0.0038 seconds
Function prepare_train_dataset took 0.0038 seconds
Function prepare_train_dataset took 0.0038 seconds
Function prepare_train_dataset took 0.0036 seconds
Function prepare_train_dataset took 0.0040 seconds
Function prepare_train_dataset took 0.0037 seconds
Function prepare_train_dataset took 0.0033 seconds
Function prepare_train_dataset took 0.0041 seconds
Function prepare_train_dataset took 0.0037 seconds
Function prepare_train_dataset took 0.0038 seconds
Function prepare_train_dataset took 0.0037 seconds
Function prepare_train_dataset took 0.0035 seconds
Function prepare_train_dataset took 0.0035 seconds
Function prepare_train_dataset took 0.0037 seconds
Function prepare_train_dataset took 0.0040 seconds
Function prepare_train_dataset 

In [14]:
[x / 16000 for x in dataset['train']['input_length']]

[17.19,
 17.62,
 10.991125,
 18.8,
 16.15,
 15.87,
 18.37,
 14.54,
 18.37,
 18.95,
 15.21,
 17.57,
 14.49,
 16.92,
 13.85,
 11.12,
 12.98,
 11.38,
 15.19,
 12.74,
 12.2,
 13.63,
 16.22,
 18.06,
 14.52,
 17.82,
 13.53,
 11.66,
 14.5,
 14.67,
 12.53,
 13.24,
 16.36,
 15.68,
 13.82,
 12.54,
 10.61,
 17.68,
 10.14,
 16.68,
 10.5,
 12.3,
 10.27,
 13.81,
 12.13,
 10.29,
 12.56,
 16.62,
 20.0,
 12.05,
 13.02,
 14.53,
 15.95,
 11.4,
 16.72,
 14.62,
 17.03,
 12.86,
 12.93,
 19.98,
 17.16,
 14.61,
 16.54,
 16.91,
 16.92,
 19.2,
 10.75,
 14.71,
 13.33,
 17.99,
 10.63,
 18.62,
 12.17,
 10.23,
 18.46,
 10.72,
 10.94,
 14.29,
 13.29,
 11.62,
 17.53,
 10.72,
 13.64,
 14.96,
 11.89,
 17.71,
 11.8,
 14.87,
 16.49,
 12.36,
 11.12,
 14.54,
 14.94,
 13.21,
 15.64,
 14.11,
 12.96,
 17.94,
 11.17,
 19.72,
 13.83,
 10.52,
 16.61,
 12.58,
 12.21,
 16.04,
 11.41,
 20.0,
 12.84,
 11.73,
 17.89,
 11.01,
 10.04,
 11.48,
 14.68,
 11.57,
 18.5,
 13.78,
 12.77,
 11.63,
 17.86,
 15.18,
 12.55,
 10.14,
 18.9,
 10.31,


In [87]:
raw_dataset = DatasetDict()

In [90]:
raw_dataset['train'] = dataset['train'].take(1000).map(prepare_dataset, num_proc=2)

In [None]:
raw_dataset['train'] = dataset['train'].take(1000)

In [95]:
raw_dataset = raw_dataset.remove_columns(['audio', 'text'])

# 2.load lora model

In [3]:
# 加载 distil-whisper 模型
model = WhisperForConditionalGeneration.from_pretrained("distil-whisper/distil-medium.en")

In [8]:
target_modules = []
keywords = ["q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2"]
for id, (name, param) in enumerate(model.named_modules()):
    if 'model.decoder' in name and (any(keyword in name for keyword in keywords)):
        target_modules.append(name)
        
target_modules

['model.decoder.layers.0.self_attn.k_proj',
 'model.decoder.layers.0.self_attn.v_proj',
 'model.decoder.layers.0.self_attn.q_proj',
 'model.decoder.layers.0.self_attn.out_proj',
 'model.decoder.layers.0.self_attn_layer_norm',
 'model.decoder.layers.0.encoder_attn.k_proj',
 'model.decoder.layers.0.encoder_attn.v_proj',
 'model.decoder.layers.0.encoder_attn.q_proj',
 'model.decoder.layers.0.encoder_attn.out_proj',
 'model.decoder.layers.0.encoder_attn_layer_norm',
 'model.decoder.layers.0.fc1',
 'model.decoder.layers.0.fc2',
 'model.decoder.layers.0.final_layer_norm',
 'model.decoder.layers.1.self_attn.k_proj',
 'model.decoder.layers.1.self_attn.v_proj',
 'model.decoder.layers.1.self_attn.q_proj',
 'model.decoder.layers.1.self_attn.out_proj',
 'model.decoder.layers.1.self_attn_layer_norm',
 'model.decoder.layers.1.encoder_attn.k_proj',
 'model.decoder.layers.1.encoder_attn.v_proj',
 'model.decoder.layers.1.encoder_attn.q_proj',
 'model.decoder.layers.1.encoder_attn.out_proj',
 'model.dec

In [4]:
# 定义 LoRA 配置
lora_config = LoraConfig(
    r=8,                    # Rank 参数
    lora_alpha=32,           # alpha乘数
    target_modules=target_modules,  # 目标模块，应用LoRA的部分
    lora_dropout=0.1,        # dropout概率
    bias="none",             # 不应用到 bias
)

# 将 LoRA 配置应用到模型
model = get_peft_model(model, lora_config)

In [5]:
model.print_trainable_parameters()

trainable params: 425,984 || all params: 394,801,152 || trainable%: 0.1079


In [10]:
lora_layers = filter(lambda p: p.requires_grad, model.parameters())

In [11]:
lora_layers

<filter at 0x73e542657670>

In [15]:
sum([p.numel() for p in model.parameters() if p.requires_grad])

425984

In [12]:
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-8,
)

In [19]:
sum(p.numel() for group in optimizer.param_groups for p in group['params'] if p.requires_grad)

425984

# 3. training args

In [96]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./model",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=1,
    evaluation_strategy="steps",
    fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    logging_steps=1000,
    # max_steps=100, # only for testing purposes, remove this from your final run :)
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)



In [11]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor ([`Wav2Vec2Processor`])
            The processor used for proccessing the data.
        decoder_start_token_id (:obj: `int`)
            The start-of-sequence token id of the decoder.
        decoder_prev_token_id (:obj: `int`)
            The start-of-prompt token id of the decoder
        input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned input sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        target_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned target sequences (according to the model's padding side and padding index).
            See above for details.
        max_target_length (:obj:`int`, `optional`):
            Maximum length of the ``labels`` of the returned list and optionally padding length (see above).
    """

    processor: Any
    decoder_start_token_id: int
    decoder_prev_token_id: int
    input_padding: Union[bool, str] = "max_length"
    target_padding: Union[bool, str] = "max_length"
    max_target_length: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods

        # dataloader returns a list of features which we convert to a dict
        input_features = {"input_features": [feature["input_features"] for feature in features]}
        label_features = {"input_ids": [feature["labels"] for feature in features]}

        # reformat list to dict and set to pytorch format
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.input_padding,
            return_tensors="pt",
        )

        labels_batch = self.processor.tokenizer.pad(
            label_features,
            max_length=self.max_target_length,
            padding=self.target_padding,
            return_tensors="pt",
        )

        # shift labels to the right to get decoder input ids
        labels = labels_batch["input_ids"]
        decoder_input_ids = labels[:, :-1]
        labels = labels[:, 1:]
        labels_mask = labels_batch.attention_mask[:, 1:]

        # replace padding with -100 to ignore correctly when computing the loss
        labels = labels.masked_fill(labels_mask.ne(1), -100)

        # replace initial prompt tokens with -100 to ignore correctly when computing the loss
        bos_index = torch.argmax((labels == self.decoder_start_token_id).long(), dim=1)
        bos_index = torch.where(bos_index > 0, bos_index + 1, bos_index)
        prompt_mask = torch.arange(labels.shape[1]) < bos_index[:, None]
        labels = torch.where(prompt_mask, -100, labels)

        batch["labels"] = labels
        batch["decoder_input_ids"] = decoder_input_ids

        return batch

# Eval

In [2]:
from preprocess import preprocess_datasets

In [3]:
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer
from itertools import islice

In [7]:
peft_model_id = "./checkpoint-25000-epoch-1" # Use the same model ID as before.
peft_config = PeftConfig.from_pretrained(peft_model_id)


In [82]:
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path
)
model = PeftModel.from_pretrained(model, peft_model_id)

In [83]:
model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1024)
          (layers): ModuleList(
            (0-23): 24 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (activation_fn): GELUActivation()
              (fc1): Linea

In [4]:
dataset = preprocess_datasets()

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/63 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/63 [00:00<?, ?it/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'input_features', 'input_length', 'labels'],
        num_rows: 1061465
    })
    ID_eval: Dataset({
        features: ['audio', 'text', 'input_features', 'input_length', 'labels'],
        num_rows: 19567
    })
    OOD_eval: Dataset({
        features: ['audio', 'text', 'input_features', 'input_length', 'labels'],
        num_rows: 4077
    })
})

In [7]:
base_model = "openai/whisper-medium"

In [6]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium").to("cuda")

In [8]:
processor = WhisperProcessor.from_pretrained(base_model)
tokenizer = WhisperTokenizerFast.from_pretrained(base_model)

In [9]:
decoder_start_token_id = model.config.decoder_start_token_id  # <|startoftranscript|>
decoder_prev_token_id = tokenizer.all_special_ids[-3]  # <|startofprev|>

In [12]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=decoder_start_token_id,
    decoder_prev_token_id=decoder_prev_token_id,
    input_padding="longest",
    target_padding="max_length",
    max_target_length=448,
)

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'input_features', 'input_length', 'labels'],
        num_rows: 1061465
    })
    ID_eval: Dataset({
        features: ['audio', 'text', 'input_features', 'input_length', 'labels'],
        num_rows: 19567
    })
    OOD_eval: Dataset({
        features: ['audio', 'text', 'input_features', 'input_length', 'labels'],
        num_rows: 4077
    })
})

In [110]:
train_dataloader = DataLoader(
    dataset['train'],
    collate_fn=data_collator,
    batch_size=72,
    drop_last=False,
    num_workers=8,
    # pin_memory=training_args.dataloader_pin_memory,
)

ID_dataloader = DataLoader(
    dataset['ID_eval'],
    collate_fn=data_collator,
    batch_size=72,
    drop_last=False,
    num_workers=8,
    # pin_memory=training_args.dataloader_pin_memory,
)

OOD_dataloader = DataLoader(
    dataset['OOD_eval'],
    collate_fn=data_collator,
    batch_size=72,
    drop_last=False,
    num_workers=8,
    # pin_memory=training_args.dataloader_pin_memory,
)


In [102]:
gen_kwargs = {
    "max_length": 128,
    "num_beams": 5,
    # "language": 'de', 
    # "task": 'transcription',
}

In [94]:
tokenizer.pad_token_id

50256

In [95]:
model = model.to("cuda")

In [16]:
import evaluate

metric = evaluate.load("wer")

In [111]:
train_predictions = []
train_references = []
train_normalized_predictions = []
train_normalized_references = []

normalizer = BasicTextNormalizer()

for batch in tqdm(
    islice(train_dataloader, 100),
    desc=f"Evaluating {'train_eval'}...",
    ):
    generated_ids = model.generate(batch["input_features"].to('cuda'), **gen_kwargs)
    labels = batch["labels"]
    labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
    
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    train_predictions.extend(decoded_preds)
    train_references.extend(decoded_labels)
    train_normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
    train_normalized_references.extend([normalizer(label).strip() for label in decoded_labels])

Evaluating train_eval...: 0it [00:00, ?it/s]

In [85]:
model = model.to("cuda")

In [104]:
id_predictions = []
id_references = []
id_normalized_predictions = []
id_normalized_references = []

normalizer = BasicTextNormalizer()

ID_dataloader = islice(ID_dataloader, 100)

for batch in tqdm(
    ID_dataloader,
    desc=f"Evaluating {'ID_eval'}...",
    ):
    generated_ids = model.generate(batch["input_features"].to("cuda"), **gen_kwargs)
    labels = batch["labels"]
    labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
    
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    id_predictions.extend(decoded_preds)
    id_references.extend(decoded_labels)
    id_normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
    id_normalized_references.extend([normalizer(label).strip() for label in decoded_labels])
    # del generated_ids, labels, batch

Evaluating ID_eval...: 0it [00:00, ?it/s]

In [105]:
ood_predictions = []
ood_references = []
ood_normalized_predictions = []
ood_normalized_references = []

normalizer = BasicTextNormalizer()

OOD_dataloader = islice(OOD_dataloader, 100)

for batch in tqdm(
    OOD_dataloader,
    desc=f"Evaluating {'OOD_eval'}...",
    ):
    generated_ids = model.generate(batch["input_features"].to('cuda'), **gen_kwargs)
    labels = batch["labels"]
    labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
    
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    ood_predictions.extend(decoded_preds)
    ood_references.extend(decoded_labels)
    ood_normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
    ood_normalized_references.extend([normalizer(label).strip() for label in decoded_labels])

Evaluating OOD_eval...: 0it [00:00, ?it/s]

In [112]:
wer = 100 * metric.compute(predictions=train_predictions, references=train_references)
normalized_wer = 100 * metric.compute(predictions=train_normalized_predictions, references=train_normalized_references)

print(f"train: WER: {wer}, Normalized WER: {normalized_wer}")

train: WER: 29.05259283117839, Normalized WER: 27.846659532181622


In [107]:
wer = 100 * metric.compute(predictions=id_predictions, references=id_references)
normalized_wer = 100 * metric.compute(predictions=id_normalized_predictions, references=id_normalized_references)

print(f"ID: WER: {wer}, Normalized WER: {normalized_wer}")

WER: 38.64809810011698, Normalized WER: 36.10389349812899


4077

In [146]:
ood_predictions_new, ood_references_new = zip(*[(x, y) for x, y in zip(ood_predictions, ood_references) if x != "" and y != ""])
ood_normalized_predictions_new, ood_normalized_references_new = zip(*[(x, y) for x, y in zip(ood_normalized_predictions, ood_normalized_references) if x != "" and y != ""])

In [148]:
wer = 100 * metric.compute(predictions=ood_predictions_new, references=ood_references_new)
normalized_wer = 100 * metric.compute(predictions=ood_normalized_predictions_new, references=ood_normalized_references_new)

print(f"OOD: WER: {wer}, Normalized WER: {normalized_wer}")

OOD: WER: 63.56350110984462, Normalized WER: 50.20510483135825


In [33]:
torch.cuda.empty_cache()

In [35]:
model = WhisperForConditionalGeneration.from_pretrained("distil-whisper/distil-medium.en").to("cuda")

tokenizer = WhisperTokenizerFast.from_pretrained("distil-whisper/distil-medium.en")

In [36]:
train_dataloader = DataLoader(
    dataset['train'],
    collate_fn=data_collator,
    batch_size=72,
    drop_last=False,
    num_workers=8,
    # pin_memory=training_args.dataloader_pin_memory,
)

In [38]:
train_predictions = []
train_references = []
train_normalized_predictions = []
train_normalized_references = []

normalizer = BasicTextNormalizer()

gen_kwargs = {
    "max_length": 128,
    "num_beams": 5,
    # "language": 'de', 
    # "task": 'transcription',
}

for batch in tqdm(
    islice(train_dataloader, 100),
    desc=f"Evaluating {'train_eval'}...",
    ):
    generated_ids = model.generate(batch["input_features"].to('cuda'), **gen_kwargs)
    labels = batch["labels"]
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    train_predictions.extend(decoded_preds)
    train_references.extend(decoded_labels)
    train_normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
    train_normalized_references.extend([normalizer(label).strip() for label in decoded_labels])

Evaluating train_eval...: 0it [00:00, ?it/s]

KeyboardInterrupt: 

In [39]:
wer = 100 * metric.compute(predictions=train_predictions, references=train_references)
normalized_wer = 100 * metric.compute(predictions=train_normalized_predictions, references=train_normalized_references)

print(f"train: WER: {wer}, Normalized WER: {normalized_wer}")

train: WER: 113.99646630496858, Normalized WER: 118.0535665044787


In [19]:
model1 = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium").to("cpu")
model2 = WhisperForConditionalGeneration.from_pretrained("distil-whisper/distil-medium.en").to("cpu")

In [24]:
model1.model.decoder.embed_tokens

Embedding(51865, 1024, padding_idx=50257)

In [32]:
model2.config

WhisperConfig {
  "_name_or_path": "distil-whisper/distil-medium.en",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50256
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 2,
  "decoder_start_token_id": 50257,
  "dropout": 0.0,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 24,
  "eos_token_id": 50256,
  "forced_decoder_ids": [
    [
      1,
      50362
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "max_length": 448,
  "max_source

In [21]:
model2

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [6]:
tokenizer1 = WhisperTokenizerFast.from_pretrained("openai/whisper-medium")
tokenizer2 = WhisperTokenizerFast.from_pretrained("openai/whisper-medium.en")
tokenizer3 = WhisperTokenizerFast.from_pretrained("distil-whisper/distil-medium.en")

In [8]:
tokenizer4 = WhisperTokenizerFast.from_pretrained("openai/whisper-large-v3")

In [15]:
set(tokenizer1.vocab.keys()) - set(tokenizer2.vocab.keys())

{'',
 'ĠìļĶì¦ĺ',
 'Ġgleich',
 'Ġludzi',
 'Ãĩ',
 'Ġez',
 'Ġresponder',
 'ĠÃ©conomique',
 'ĠnasÄ±l',
 'ĠÐ½ÑĢÐ°Ð²',
 'Ġbutts',
 'Ġìĥī',
 'æĹ¥',
 'ÑĭÐµ',
 'éĤĦæľī',
 'ĠÑģÐ¾Ð²',
 'ĠOke',
 'owym',
 'ĠÐ±Ñĭ',
 'aphrag',
 'ë¥',
 'Ġë§ŀìķĦìļĶ',
 'ĠÐ³Ð¾ÑĢÐ¾Ð´Ð°',
 'Ġleva',
 'ìħĺ',
 'ìĹĲê²Į',
 'Ã¡b',
 'Ġtes',
 'Ġbastards',
 'ÐµÐºÑĤ',
 'è¹',
 'ĠJeg',
 'Ġpasst',
 'æĽ¸',
 'çĽ¸',
 'Ġging',
 'Ġsegue',
 'è®ĵ',
 'ãģ¾ãģĽ',
 'ÐµÑĢÑĥ',
 'ĠìĻĢ',
 'Ġë¹Ħ',
 'Ġirrespons',
 'ÐĹÐ´',
 'ĠItÃŃs',
 'onya',
 'Ġbana',
 'ĠØ¨Øª',
 'ĠÐĹÐ°',
 'ç´¯',
 'íķľëį°',
 'riÃ¨re',
 'ĠAvo',
 'ãĤĪãģĨ',
 '?...',
 'ĠÐ·Ð°Ð´',
 'ãģĿãģĹãģ¦',
 'ÑĪÑĮ',
 'æ¶',
 'ĠDlatego',
 'Ġhu',
 'atif',
 'ÏĢÏĮÎ½',
 'ÃŃas',
 'Ġprobabil',
 'ê²łë',
 'quoi',
 'ìĿ¸ìĿĦ',
 'Ġëĵ¤ìĸ´ë',
 'ÐµÑĢÑĤÐ²',
 'Ã¢nd',
 'ĠdoÅĽwiad',
 'Ġimaginar',
 'Ġblev',
 'Ã¦r',
 'Ã§on',
 'Ġgogg',
 'erkt',
 'Ġcontrat',
 'ĠJeez',
 'ĠSchÃ¶n',
 'Ġwidz',
 'ĠÐ¾ÑĤÐ²ÐµÑĩ',
 'ĠëŃĲë',
 'ĠëŃĶê°Ģ',
 'Ġastronom',
 'á»±c',
 'ĠëĬĲë',
 'uen',
 'ãģĦãģĨãģĵãģ¨',
 'Ð»Ð¾ÑģÑĮ',
 'Ġih',
 'ĠZiel',
 'ÙĪØ§',
 'ÏĨ',


In [16]:
len(set(tokenizer2.vocab.keys()) - set(tokenizer1.vocab.keys()))

20877

In [11]:
len(set(tokenizer2.vocab.keys()) - set(tokenizer4.vocab.keys()))

20878

In [59]:
len(tokenizer2.vocab)

51864

In [56]:
tokenizer3.vocab['xi']

29992