In [None]:
# --- Installation ---
!pip install -U transformers
!pip install -U accelerate
!pip install datasets
!pip install evaluate
!pip install rouge-score
!pip install bitsandbytes

# --- Imports (using only what's available) ---
import torch
import torchaudio
import numpy as np
import os
import json
from datasets import load_dataset, DatasetDict
import evaluate
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import autocast, GradScaler
from torch.optim.swa_utils import SWALR
import gc
import bitsandbytes as bnb
from rouge_score import rouge_scorer # Import rouge_scorer here

# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): this is set after `import torch`; it presumably still takes
# effect because no CUDA memory has been allocated yet — confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Report whether a GPU is visible and, if so, its name and total memory in GiB
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# --- Transformers import (the processor and model are instantiated later) ---
# Print the installed transformers version so runs can be reproduced
import transformers
print(f"Transformers version: {transformers.__version__}")

# Import only the two Whisper classes this script uses
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# --- Dataset Loading and Preparation ---
# Download the corpus from the Hugging Face Hub and show its structure.
dataset = load_dataset("krishan23/indian_english")
print(dataset)
print(dataset['train'][0])
print("Dataset splits:", dataset.keys())

# Use the shipped 'test' split for validation when it exists; otherwise hold
# out 10% of 'train' (fixed seed so the split is reproducible).
if 'test' not in dataset:
    train_val = dataset['train'].train_test_split(test_size=0.1, seed=42)
    dataset = DatasetDict({'train': train_val['train'], 'validation': train_val['test']})
else:
    dataset = DatasetDict({'train': dataset['train'], 'validation': dataset['test']})

# Upper bounds on the number of examples used for this quick experiment.
MAX_TRAIN_EXAMPLES = 200  # Reduced from 500
MAX_VAL_EXAMPLES = 20  # Reduced from 50

# Clamp the subset sizes to the split sizes: `select` raises IndexError when
# asked for indices beyond the end of a split.
num_train_examples = min(MAX_TRAIN_EXAMPLES, len(dataset['train']))
num_val_examples = min(MAX_VAL_EXAMPLES, len(dataset['validation']))

train_data = dataset['train'].shuffle(seed=42).select(range(num_train_examples))
val_data = dataset['validation'].shuffle(seed=42).select(range(num_val_examples))

# --- Model and Processor Loading ---
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Pin generation to English transcription through the generation config.
# Assigning `model.config.forced_decoder_ids` is the legacy mechanism; recent
# transformers releases take language/task from `generation_config` and
# otherwise fall back to automatic language detection at `generate()` time.
model.generation_config.language = "en"
model.generation_config.task = "transcribe"
# Clear the legacy forced ids so they cannot conflict with the settings above.
model.generation_config.forced_decoder_ids = None
model.config.forced_decoder_ids = None

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# --- Dataset Class ---
class WhisperDataset(torch.utils.data.Dataset):
    """Map-style dataset holding pre-computed Whisper inputs and label ids.

    Every example of ``dataset`` is converted eagerly in ``__init__``; examples
    that fail to convert are reported on stdout and skipped, so ``len(self)``
    may be smaller than ``len(dataset)``.

    Args:
        dataset: Sized iterable of dicts with an ``"audio"`` entry (a dict with
            ``"array"`` and ``"sampling_rate"``) and a ``"transcription"`` string.
        processor: Callable used both as feature extractor
            (``processor(array, sampling_rate=..., return_tensors="pt")``) and
            as tokenizer (``processor(text=...)``).
    """

    # Sampling rate the audio is resampled to before feature extraction.
    TARGET_SAMPLING_RATE = 16000
    # Audio longer than this many seconds (at the target rate) is truncated.
    MAX_AUDIO_SECONDS = 30

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor
        self.processed_data = []

        # Process all examples in advance
        for i, item in enumerate(dataset):
            try:
                processed = self.process_item(item)
            except Exception as e:
                print(f"Error processing example {i}: {e}")
                continue

            # process_item signals a rejected example with None.
            if processed is not None:
                self.processed_data.append(processed)

            # Print progress
            if (i + 1) % 50 == 0:
                print(f"Processed {i+1}/{len(dataset)} examples")

        print(f"Successfully processed {len(self.processed_data)}/{len(dataset)} examples")

    def process_item(self, item):
        """Convert one raw example into model inputs.

        The input ``item`` (including its audio dict) is never modified.

        Returns:
            A dict with ``"input_features"`` (first element of the processor's
            ``input_features``), ``"labels"`` (tensor of token ids) and the raw
            ``"transcription"``; or ``None`` if feature extraction or
            tokenization raised.
        """
        audio = item["audio"]
        # Work on local names so the caller's audio dict is left untouched.
        waveform = audio["array"]
        sampling_rate = audio["sampling_rate"]

        # Resample if needed
        if sampling_rate != self.TARGET_SAMPLING_RATE:
            try:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sampling_rate,
                    new_freq=self.TARGET_SAMPLING_RATE,
                )
                waveform = resampler(torch.as_tensor(waveform, dtype=torch.float32)).numpy()
                sampling_rate = self.TARGET_SAMPLING_RATE
            except Exception as e:
                # Best effort: continue with the original audio and rate.
                print(f"Warning: Error resampling audio: {e}")

        # Extract features and labels
        try:
            # Truncate audio to a fixed maximum number of samples.
            max_length = self.MAX_AUDIO_SECONDS * self.TARGET_SAMPLING_RATE
            if len(waveform) > max_length:
                waveform = waveform[:max_length]

            # Use the processor given to this dataset, not a module-level global.
            input_features = self.processor(
                waveform,
                sampling_rate=sampling_rate,
                return_tensors="pt",
            ).input_features[0]

            labels = self.processor(text=item["transcription"]).input_ids
            return {
                "input_features": input_features,
                "labels": torch.tensor(labels),
                "transcription": item["transcription"],
            }
        except Exception as e:
            print(f"Error processing features or labels: {e}")
            return None

    def __len__(self):
        """Return the number of successfully processed examples."""
        return len(self.processed_data)

    def __getitem__(self, idx):
        """Return the processed example stored at position ``idx``."""
        return self.processed_data[idx]

# --- Collate Function ---
def collate_fn(batch):
    """Merge a list of processed examples into one batch dict.

    Args:
        batch: List of dicts with ``"input_features"`` and ``"labels"`` tensors
            and a ``"transcription"`` string.

    Returns:
        Dict with ``"input_features"`` (zero-padded along the first dimension;
        the features are expected to share a shape, so this amounts to
        stacking), ``"labels"`` (padded with -100) and ``"transcriptions"``
        (list of the raw strings, in batch order).
    """
    features, label_ids, texts = [], [], []
    for example in batch:
        features.append(example["input_features"])
        label_ids.append(example["labels"])
        texts.append(example["transcription"])

    return {
        "input_features": pad_sequence(features, batch_first=True),
        # -100 marks padded label positions.
        "labels": pad_sequence(label_ids, batch_first=True, padding_value=-100),
        "transcriptions": texts,
    }

# --- Prepare Datasets ---
# Feature extraction happens eagerly inside the WhisperDataset constructor.
print("Preparing training dataset...")
train_dataset = WhisperDataset(train_data, processor)
print("Preparing validation dataset...")
val_dataset = WhisperDataset(val_data, processor)

# --- Create DataLoaders ---
# Both loaders share batch size and collation; only the shuffling differs.
loader_options = dict(
    batch_size=1,  # Reduced from 2
    collate_fn=collate_fn,
)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# --- Training Loop ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Mixed precision (autocast + loss scaling) is only enabled on CUDA; on CPU both
# are switched off explicitly so training runs in full precision without warnings.
use_amp = device.type == "cuda"

# Optimizer: 8-bit AdamW from bitsandbytes (used unconditionally)
optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=1e-5, weight_decay=0.01)

# Gradient scaler for mixed precision (a pass-through when disabled)
scaler = GradScaler(enabled=use_amp)

# Training parameters
num_epochs = 3
grad_accum_steps = 8  # Increased from 4
step = 0  # number of optimizer updates performed so far
best_val_loss = float('inf')

# Create output directory for the best checkpoint
output_dir = "./whisper_indian_english"
os.makedirs(output_dir, exist_ok=True)

print(f"Starting training for {num_epochs} epochs...")

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0

    for batch_idx, batch in enumerate(train_loader):
        # Move batch to device
        input_features = batch["input_features"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass (mixed precision when enabled)
        with torch.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(input_features=input_features, labels=labels)
            loss = outputs.loss / grad_accum_steps  # Normalize loss for gradient accumulation

        # Scale the loss and call backward
        scaler.scale(loss).backward()

        # Update weights every grad_accum_steps or at the last batch
        if (batch_idx + 1) % grad_accum_steps == 0 or batch_idx == len(train_loader) - 1:
            # Unscale first so clipping sees the true gradient magnitudes
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            step += 1

            # Free up GPU memory
            torch.cuda.empty_cache()
            gc.collect()

        # Log; multiply back to report the un-normalized per-batch loss
        batch_loss = loss.item() * grad_accum_steps
        train_loss += batch_loss
        if batch_idx % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {batch_loss:.4f}")
            # Print GPU memory usage
            if torch.cuda.is_available():
                print(f"GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f}GB / {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f}GB")

    # max(..., 1) avoids ZeroDivisionError if no training example was processed
    avg_train_loss = train_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

    # Validation (full precision, no gradients)
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_features = batch["input_features"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_features=input_features, labels=labels)
            val_loss += outputs.loss.item()

            # Free up GPU memory
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / max(len(val_loader), 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")

    # Save checkpoint if validation loss improved. The model and processor are
    # written together so the directory can be reloaded with from_pretrained().
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)
        print(f"Saved best checkpoint to {output_dir} (validation loss {avg_val_loss:.4f})")


def generate_predictions(model, val_loader, device, text_processor=None):
    """Transcribe every batch of ``val_loader`` and decode the references.

    Args:
        model: Model exposing ``eval()`` and ``generate(input_features)``.
        val_loader: Iterable of batch dicts with ``"input_features"`` and
            ``"labels"`` tensors.
        device: Device the input features are moved to before generation.
        text_processor: Object providing ``batch_decode`` and
            ``tokenizer.pad_token_id``. Defaults to the module-level
            ``processor`` when omitted, which keeps existing callers working.

    Returns:
        Tuple ``(predictions, references)`` of decoded strings, in loader order.
    """
    if text_processor is None:
        text_processor = processor
    pad_token_id = text_processor.tokenizer.pad_token_id

    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in val_loader:
            input_features = batch["input_features"].to(device)

            # Generate predictions
            generated_ids = model.generate(input_features)
            predictions.extend(
                text_processor.batch_decode(generated_ids, skip_special_tokens=True)
            )

            # Labels are padded with -100, which is not a valid token id.
            # Swap it for the pad token so padded batches can be decoded.
            labels = batch["labels"]
            label_ids = labels.masked_fill(labels == -100, pad_token_id)
            references.extend(
                text_processor.batch_decode(label_ids, skip_special_tokens=True)
            )

    return predictions, references

# Transcribe the validation set with the fine-tuned model; the references are
# the validation label ids decoded back to text.
finetuned_predictions, finetuned_references = generate_predictions(model, val_loader, device)


# --- ROUGE Calculation ---
def calculate_rouge_score(predictions, references):
    """Average ROUGE-1/2/L F-measures over paired predictions and references.

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, paired with ``predictions``
            by position (pairing stops at the shorter of the two).

    Returns:
        Dict mapping each ROUGE variant name to the mean F-measure across all
        pairs; an empty dict when there are no pairs.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # Collect per-example F-measures, keyed by metric name.
    f_measures = {}
    for prediction, reference in zip(predictions, references):
        # RougeScorer.score takes the reference (target) first.
        for metric, result in scorer.score(reference, prediction).items():
            f_measures.setdefault(metric, []).append(result.fmeasure)

    return {metric: np.mean(values) for metric, values in f_measures.items()}

# Score the fine-tuned model's transcriptions against the references and print
# the averaged ROUGE F-measures.
finetuned_rouge = calculate_rouge_score(finetuned_predictions, finetuned_references)
print("ROUGE Score (Fine-tuned model):", finetuned_rouge)

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed transformers-4.49.0
Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/399 [00:00<?, ?B/s]

train-00000-of-00008.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

train-00001-of-00008.parquet:   0%|          | 0.00/345M [00:00<?, ?B/s]

train-00002-of-00008.parquet:   0%|          | 0.00/253M [00:00<?, ?B/s]

train-00003-of-00008.parquet:   0%|          | 0.00/349M [00:00<?, ?B/s]

train-00004-of-00008.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

train-00005-of-00008.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00006-of-00008.parquet:   0%|          | 0.00/371M [00:00<?, ?B/s]

train-00007-of-00008.parquet:   0%|          | 0.00/545M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6765 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'id', 'transcription', 'speaker_id'],
        num_rows: 6765
    })
})
{'audio': {'path': 'train_hindifullmale_00001.wav', 'array': array([0., 0., 0., ..., 0., 0., 0.]), 'sampling_rate': 48000}, 'id': 0, 'transcription': 'There was once a merchant who employed many carpenters and masons to build a temple in his garden. ', 'speaker_id': 1}
Dataset splits: dict_keys(['train'])


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

Preparing training dataset...
Processed 50/200 examples
Processed 100/200 examples
Processed 150/200 examples
Processed 200/200 examples
Successfully processed 200/200 examples
Preparing validation dataset...
Successfully processed 20/20 examples


  scaler = GradScaler()
  with autocast():


Starting training for 3 epochs...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Epoch 1/3, Batch 0/200, Loss: 5.2840
GPU Memory: 0.30GB / 14.74GB
Epoch 1/3, Batch 10/200, Loss: 2.7216
GPU Memory: 0.31GB / 14.74GB
Epoch 1/3, Batch 20/200, Loss: 2.2548
GPU Memory: 0.31GB / 14.74GB
Epoch 1/3, Batch 30/200, Loss: 3.7981
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 40/200, Loss: 2.3843
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 50/200, Loss: 1.9970
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 60/200, Loss: 2.2036
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 70/200, Loss: 1.6875
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 80/200, Loss: 1.6051
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 90/200, Loss: 0.9871
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 100/200, Loss: 0.7297
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 110/200, Loss: 1.5936
GPU Memory: 0.37GB / 14.74GB
Epoch 1/3, Batch 120/200, Loss: 1.0216
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 130/200, Loss: 1.0048
GPU Memory: 0.38GB / 14.74GB
Epoch 1/3, Batch 140/200, Loss: 0.8284
GPU Memory: 0.38GB /

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Epoch 3/3, Validation Loss: 0.4903
ROUGE Score (Fine-tuned model): {'rouge1': 0.9377789485700235, 'rouge2': 0.8928060120376007, 'rougeL': 0.9377789485700235}
