In [None]:
!pip3 install transformers datasets accelerate librosa
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124   
!pip install "tensorboard==1.14.0"

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://download.pytorch.org/whl/cu124, https://pypi.ngc.nvidia.com


In [6]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch
from datasets import Audio, load_dataset
from transformers import (
    WhisperFeatureExtractor, 
    WhisperTokenizer, 
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)

# 1. Load the dataset from the Hugging Face Hub
dataset = load_dataset("Tarakeshwaran/Whisper-train-data")

# 2. Load pre-trained Whisper model and processors
model_name = "openai/whisper-small"  # You can choose different sizes
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# The feature extractor rejects audio whose sampling rate differs from its own,
# so let `datasets` resample every clip to that rate when it is decoded.
# assumes the recordings live in an 'audio' column of Audio type — TODO confirm
dataset = dataset.cast_column(
    "audio", Audio(sampling_rate=feature_extractor.sampling_rate)
)

# 3. Prepare the dataset
def prepare_dataset(batch):
    """Turn one raw example into the fields the model trains on.

    Reads ``batch['audio']`` (a mapping with ``'array'`` and
    ``'sampling_rate'``) and ``batch['text']``, and relies on the
    ``feature_extractor`` and ``tokenizer`` objects defined at file level.

    Returns:
        dict with ``"input_features"`` (the extractor output after
        ``squeeze()``) and ``"labels"`` (the tokenizer's ``input_ids``).
    """
    # Column names are an assumption about the dataset layout; adjust if needed.
    clip = batch['audio']
    transcript = batch['text']

    # Run the audio through the feature extractor, asking for PyTorch tensors.
    extracted = feature_extractor(
        clip['array'],
        sampling_rate=clip['sampling_rate'],
        return_tensors="pt",
    )
    # squeeze() drops every size-1 dimension — presumably just the leading
    # batch dimension added for this single clip.
    features = extracted.input_features.squeeze()

    # Token ids of the transcript become the training labels.
    token_ids = tokenizer(transcript).input_ids

    return {"input_features": features, "labels": token_ids}

# Apply the preprocessing to every split, dropping the raw columns
# (taken from the 'train' split) so only the model inputs remain.
raw_columns = dataset['train'].column_names
prepared_dataset = dataset.map(
    prepare_dataset,
    remove_columns=raw_columns,
)

# 4. Data Collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into a padded training batch.

    Each example must provide ``"input_features"`` and ``"labels"``. Audio
    features are padded with ``processor.feature_extractor.pad`` and label ids
    with ``processor.tokenizer.pad``; padded label positions are set to -100.

    If every label row starts with the decoder start token, that column is
    removed. Otherwise the start token would appear twice in the decoder
    input once the model shifts the labels right and prepends it again.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, both with `.pad`.
    processor: Any
    # The next three values are forwarded unchanged to both `pad` calls.
    padding: bool = True
    max_length: Union[int, None] = None
    pad_to_multiple_of: Union[int, None] = None
    # Id of the decoder start token to strip from the labels. When None it is
    # looked up in the tokenizer as "<|startoftranscript|>".
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the tensors for one training batch.

        Args:
            features: examples with ``"input_features"`` and ``"labels"``.

        Returns:
            dict with ``"input_features"`` and ``"labels"``, plus
            ``"attention_mask"`` when the feature extractor produced one.
        """
        # Prepare input features
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(
            input_features,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of
        )

        # Prepare labels
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of
        )

        # Replace padding with -100 to ignore loss on padding tokens
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Drop the leading start token when the whole batch carries it, since
        # the model adds it back itself when building the decoder inputs.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
        if labels.shape[1] > 0 and (labels[:, 0] == start_token_id).all().item():
            labels = labels[:, 1:]

        collated = {
            "input_features": batch["input_features"],
            "labels": labels,
        }
        # Only add attention_mask if it exists
        if "attention_mask" in batch:
            collated["attention_mask"] = batch.get("attention_mask")
        return collated

def _word_error_rate(references, predictions):
    """Return corpus-level WER: total word edits / total reference words.

    Words are obtained with ``str.split()``; the edit distance counts
    substitutions, insertions and deletions. Returns 0.0 when the references
    contain no words at all.
    """
    total_edits = 0
    total_words = 0
    for reference, prediction in zip(references, predictions):
        ref_words = reference.split()
        hyp_words = prediction.split()
        # Word-level Levenshtein distance, keeping only one row of the table.
        previous = list(range(len(hyp_words) + 1))
        for i, ref_word in enumerate(ref_words, start=1):
            current = [i]
            for j, hyp_word in enumerate(hyp_words, start=1):
                current.append(min(
                    previous[j] + 1,                           # deletion
                    current[j - 1] + 1,                        # insertion
                    previous[j - 1] + (ref_word != hyp_word),  # substitution
                ))
            previous = current
        total_edits += previous[-1]
        total_words += len(ref_words)
    return total_edits / total_words if total_words else 0.0


def compute_metrics(pred):
    """Decode generated and reference ids and report WER as a percentage.

    The result is returned under the key ``"wer"``, the name that
    ``metric_for_best_model`` refers to.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    # Work on copies so the arrays owned by the trainer are left untouched.
    pred_ids = pred_ids.copy()
    label_ids = pred.label_ids.copy()
    # -100 marks ignored/padded positions and is not a decodable token id.
    pred_ids[pred_ids == -100] = pad_token_id
    label_ids[label_ids == -100] = pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    return {"wer": 100 * _word_error_rate(label_str, pred_str)}


data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# 5. Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-fine-tuned",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    # NOTE(review): newer transformers releases call this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    logging_steps=200,
    report_to=["tensorboard"],
    # WER is computed on generated text, so evaluation must call generate().
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False
)

# 6. Trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=prepared_dataset["train"],
    # assumes the dataset ships a 'test' split — TODO confirm
    eval_dataset=prepared_dataset["test"],
    data_collator=data_collator,
    # Provides the "wer" value that metric_for_best_model selects on; without
    # it the best-model bookkeeping cannot find the metric at evaluation time.
    compute_metrics=compute_metrics,
)

# 7. Train the model
trainer.train()

# 8. Save the fine-tuned model
trainer.save_model("./whisper-fine-tuned")
# Store the processor next to the weights so the directory can be loaded for
# inference on its own.
processor.save_pretrained("./whisper-fine-tuned")

max_steps is given, it will override any value given in num_train_epochs
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


KeyboardInterrupt: 