In [1]:
import torch
import numpy as np
import torchaudio
from datasets import load_dataset, Audio
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor
from transformers.models.whisper.tokenization_whisper import WhisperTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(model_name)
feature_extractor = processor.feature_extractor
tokenizer = processor.tokenizer
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [3]:
data_files = {
    "train": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\train.csv",
    "validation": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\val.csv",
    "test": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\test.csv"
}

dataset = load_dataset("csv", data_files=data_files)

train_ds = dataset["train"]
val_ds   = dataset["validation"]
test_ds  = dataset["test"]

print("Train size:", len(train_ds))
print("Validation size:", len(val_ds))
print("Test size:", len(test_ds))

Generating train split: 220901 examples [00:00, 276756.33 examples/s]
Generating validation split: 27613 examples [00:00, 270656.22 examples/s]
Generating test split: 27613 examples [00:00, 276069.05 examples/s]

Train size: 220901
Validation size: 27613
Test size: 27613





In [None]:
def preprocess(batch):
    
    waveform, sr = torchaudio.load(batch["path"])
    waveform = waveform.mean(dim=0).numpy() 


    if sr != 16000:
        waveform = torchaudio.functional.resample(
            torch.tensor(waveform), sr, 16000
        ).numpy()


    inputs = processor(
        waveform,
        sampling_rate=16000,
        return_tensors="pt"
    )


    labels = processor.tokenizer(
        batch["text"],
        return_tensors="pt"
    ).input_ids


    batch["input_features"] = inputs.input_features[0]
    batch["labels"] = labels[0]
    return batch

In [None]:
train_ds = train_ds.map(preprocess)
val_ds   = val_ds.map(preprocess)
test_ds  = test_ds.map(preprocess)

Map:   3%|▎         | 6269/220901 [02:13<1:16:07, 46.99 examples/s] 


ValueError: You need to specify either `text` or `text_target`.

In [None]:
class DataCollatorSpeechSeq2SeqWithPadding:

    def __init__(self, processor):
        self.processor = processor


    def __call__(self, features):
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(
            input_features,
            return_tensors="pt"
        )


        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            return_tensors="pt"
        )


        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )


        batch["labels"] = labels
        return batch

In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper_LoRA",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=41000, 
    logging_steps=10,
    save_steps=100,
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,
    generation_max_length=225,
    push_to_hub=False,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
)


trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
10,7.7326
20,6.5912
30,6.0358
40,4.2228
50,3.183
60,2.504
70,1.8875
80,1.6795
90,1.1872
100,1.188


TrainOutput(global_step=100, training_loss=3.6211781311035156, metrics={'train_runtime': 311.3659, 'train_samples_per_second': 2.569, 'train_steps_per_second': 0.321, 'total_flos': 1.9695108096e+16, 'train_loss': 3.6211781311035156, 'epoch': 0.03557452863749555})

In [None]:
metrics = trainer.evaluate(val_ds)
print(metrics)

{'eval_loss': 1.2308405637741089, 'eval_runtime': 601.1059, 'eval_samples_per_second': 4.676, 'eval_steps_per_second': 0.586, 'epoch': 0.03557452863749555}


In [None]:
model.save_pretrained("./whisper_LoRA")      
processor.save_pretrained("./whisper_LoRA")