In [3]:
import torch
import numpy as np
import torchaudio
from datasets import load_dataset, Audio
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor
from transformers.models.whisper.tokenization_whisper import WhisperTokenizer

In [4]:
model_name = "openai/whisper-small"

In [5]:
processor = WhisperProcessor.from_pretrained(model_name)
feature_extractor = processor.feature_extractor
tokenizer = processor.tokenizer
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [6]:
dataset = load_dataset("csv", data_files="Dataset/full_dataset.csv")["train"]

In [7]:
def preprocess(batch):
    waveform, sr = torchaudio.load(batch["path"])
    waveform = waveform.mean(dim=0).numpy()

    inputs = feature_extractor(
        waveform,
        sampling_rate=16000,
        return_tensors="pt"
    )

    labels = tokenizer(
        batch["text"],
        return_tensors="pt"
    ).input_ids

    batch["input_features"] = inputs.input_features[0]
    batch["labels"] = labels[0]

    return batch


In [8]:
dataset = dataset.map(preprocess)

Map: 100%|██████████| 28111/28111 [12:06<00:00, 38.72 examples/s]  


In [9]:
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Коллатор специально для Whisper: делает padding
    и для input_features, и для labels.
    """

    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        # входные признаки (мел-спектрограммы)
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(
            input_features,
            return_tensors="pt"
        )

        # текстовые метки
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            return_tensors="pt"
        )

        # заменяем паддинг на -100 для PyTorch loss
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        batch["labels"] = labels
        return batch

In [11]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor)

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper_finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=500,
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_steps=100,
    predict_with_generate=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,              # ВАЖНО!
    data_collator=data_collator       # ВАЖНО!
)

trainer.train()

  trainer = Seq2SeqTrainer(
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,8.5278
20,8.0288
30,5.3706


KeyboardInterrupt: 