<a href="https://colab.research.google.com/github/Patience3/WhisperSmall-Finetuned-For-Afrispeech/blob/main/WhisperModel_Finetuning_for_English_With_Yoruba_Accent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Part 1

In [1]:
# ============================================================================
# COMPLETE WHISPER FINE-TUNING PIPELINE FOR Yoruba ASR
# ============================================================================

# ============================================================================
# PART 1: INSTALLATIONS & IMPORTS
# ============================================================================

!pip install huggingface_hub datasets transformers accelerate -q
!pip install torchaudio jiwer sentencepiece -q

import os
import tarfile
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchaudio
from huggingface_hub import snapshot_download
from jiwer import wer, cer
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import time
import soundfile as sf

# Transformers imports for Whisper
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    WhisperConfig
)





[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m142.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import drive

# Mount Google Drive
if not os.path.ismount("/content/drive"):
    print("Mounting Google Drive...")
    drive.mount("/content/drive")
else:
    print("Google Drive is already mounted.")

Mounting Google Drive...
Mounted at /content/drive


In [3]:
# ============================================================================
# PART 2: DATA DOWNLOAD & DATA LOADING
# ============================================================================

print("="*80)
print("DOWNLOADING AFRISPEECH DATA")
print("="*80)

snapshot_download(
    repo_id="intronhealth/afrispeech-200",
    repo_type="dataset",
    allow_patterns=["audio/yoruba*", "transcripts/yoruba*", 'accents.json'],
    local_dir="/content/afrispeech200"
)

print("Extracting audio files...")
splits = ['train', 'dev', 'test']
audio_root = "/content/afrispeech200/audio/yoruba"

for split in splits:
    # Loop through all tar files for the split
    for i in range(10):  # Adjust max number if needed
        tar_path = f"{audio_root}/{split}/{split}_yoruba_{i}.tar.gz"
        if os.path.exists(tar_path):
            extract_dir = f"{audio_root}/{split}"
            os.makedirs(extract_dir, exist_ok=True)
            print(f"  Extracting {tar_path}...")
            with tarfile.open(tar_path, "r:gz") as tar:
                tar.extractall(path=extract_dir)

print("Dataset download and extraction complete")

# ---------------------------------------------------------------------------
# Function to load AfriSpeech dataset with recursive audio discovery
# ---------------------------------------------------------------------------
def load_afrispeech_data_improved(accent, split, sample_percentage=100,
                                  min_duration=0.5, max_duration=None):
    """Load AfriSpeech dataset with all subfolders recursively included"""
    try:
        split_map = {'train': 'train', 'validation': 'dev', 'test': 'test'}
        dataset_split = split_map.get(split, split)

        base_dir = "/content/afrispeech200"
        csv_path = f"{base_dir}/transcripts/{accent}/{dataset_split}.csv"

        df = pd.read_csv(csv_path)
        print(f"Loaded {len(df)} rows from {csv_path}")

        audio_root = f"{base_dir}/audio/{accent}/{dataset_split}"

        # Map CSV filenames to transcripts
        filename_to_transcript = {os.path.basename(p): t for p, t in zip(df['audio_paths'], df['transcript'])}

        # Recursively scan audio_root for all files
        all_audio_files = []
        for root, _, files in os.walk(audio_root):
            for file in files:
                if file.endswith(".wav") or file.endswith(".flac"):
                    if file in filename_to_transcript:
                        all_audio_files.append({
                            'audio_path': os.path.join(root, file),
                            'transcript': filename_to_transcript[file]
                        })

        print(f"Found {len(all_audio_files)} valid audio files in {audio_root}")

        # Apply sampling if needed
        if sample_percentage < 100:
            sample_size = int(len(all_audio_files) * sample_percentage / 100)
            all_audio_files = np.random.choice(all_audio_files, sample_size, replace=False).tolist()
            print(f"Using {len(all_audio_files)} samples ({sample_percentage}%)")

        # Filter by duration
        filtered_samples = []
        durations = []
        skipped_short = 0
        skipped_long = 0

        print(f"Analyzing durations (min={min_duration}s, max={max_duration or 'None'}s)...")
        for sample in tqdm(all_audio_files, total=len(all_audio_files), desc="Processing"):
            try:
                data, sr = sf.read(sample['audio_path'])
                duration = len(data) / sr

                if duration < min_duration:
                    skipped_short += 1
                    continue
                if max_duration and duration > max_duration:
                    skipped_long += 1
                    continue

                filtered_samples.append({
                    'audio_path': sample['audio_path'],
                    'transcript': sample['transcript'],
                    'duration': duration,
                    'sampling_rate': sr
                })
                durations.append(duration)

            except Exception:
                continue

        print(f"Loaded {len(filtered_samples)} samples")
        print(f"  Skipped {skipped_short} too short (< {min_duration}s)")
        if max_duration:
            print(f"  Skipped {skipped_long} too long (> {max_duration}s)")
        if durations:
            print(f"  Duration range: {min(durations):.2f}s - {max(durations):.2f}s")
            print(f"  Mean duration: {np.mean(durations):.2f}s")
            print(f"  Total audio: {sum(durations)/3600:.2f} hours")

        return filtered_samples

    except Exception as e:
        print(f"Error loading dataset: {e}")
        import traceback
        traceback.print_exc()
        return None


DOWNLOADING AFRISPEECH DATA


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

audio/yoruba/train/train_yoruba_3.tar.gz:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

audio/yoruba-hausa/test/test_yoruba-haus(…):   0%|          | 0.00/80.3M [00:00<?, ?B/s]

audio/yoruba/train/train_yoruba_0.tar.gz:   0%|          | 0.00/1.40G [00:00<?, ?B/s]

audio/yoruba/train/train_yoruba_2.tar.gz:   0%|          | 0.00/1.49G [00:00<?, ?B/s]

audio/yoruba/test/test_yoruba_0.tar.gz:   0%|          | 0.00/548M [00:00<?, ?B/s]

audio/yoruba/dev/dev_yoruba_0.tar.gz:   0%|          | 0.00/286M [00:00<?, ?B/s]

audio/yoruba/train/train_yoruba_1.tar.gz:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

accents.json: 0.00B [00:00, ?B/s]

audio/yoruba/train/train_yoruba_4.tar.gz:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

audio/yoruba/train/train_yoruba_5.tar.gz:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

audio/yoruba/train/train_yoruba_6.tar.gz:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

audio/yoruba/train/train_yoruba_7.tar.gz:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

audio/yoruba/train/train_yoruba_8.tar.gz:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

transcripts/yoruba-hausa/test.csv:   0%|          | 0.00/30.8k [00:00<?, ?B/s]

transcripts/yoruba/dev.csv:   0%|          | 0.00/121k [00:00<?, ?B/s]

transcripts/yoruba/test.csv:   0%|          | 0.00/216k [00:00<?, ?B/s]

transcripts/yoruba/train.csv:   0%|          | 0.00/5.00M [00:00<?, ?B/s]

Extracting audio files...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_0.tar.gz...


  tar.extractall(path=extract_dir)


  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_1.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_2.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_3.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_4.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_5.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_6.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_7.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/train/train_yoruba_8.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/dev/dev_yoruba_0.tar.gz...
  Extracting /content/afrispeech200/audio/yoruba/test/test_yoruba_0.tar.gz...
Dataset download and extraction complete


In [4]:
# Set device - use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set default tensor device to GPU if available
# torch.cuda.set_device(0)

In [5]:

# ============================================================================
# PART 3: WHISPER MODEL LOADING
# ============================================================================

def load_whisper_model(model_name="openai/whisper-small", device="cpu"):
    """
    Load pretrained Whisper model and processor

    Model options (in order of size):
    - openai/whisper-tiny (39M params) - fastest, good for testing
    - openai/whisper-base (74M params) - balanced
    - openai/whisper-small (244M params) - recommended for fine-tuning
    - openai/whisper-medium (769M params) - better quality
    - openai/whisper-large-v3 (1.55B params) - best quality (needs GPU)
    """
    print(f"Loading Whisper model: {model_name}")

    # Load processor (handles audio preprocessing and tokenization)
    processor = WhisperProcessor.from_pretrained(model_name)

    # Load model
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    model.to(device)

    # Configure for fine-tuning
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False  # Required for training

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"✓ Model loaded successfully")
    print(f"  Total parameters: {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")

    return model, processor




In [6]:
# ============================================================================
# PART 4: WHISPER DATASET
# ============================================================================

class WhisperAfrispeechDataset(Dataset):
    """Dataset for Whisper fine-tuning on AfriSpeech"""

    def __init__(self, samples, processor, sample_rate=16000):
        self.samples = samples
        self.processor = processor
        self.sample_rate = sample_rate

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]



        # Load audio using soundfile
        audio_data, sr = sf.read(sample['audio_path'])
        audio = torch.from_numpy(audio_data).float()

        # Resample if needed
        if sr != self.sample_rate:
            audio = torchaudio.transforms.Resample(sr, self.sample_rate)(audio)

        # Convert to mono if stereo
        if audio.dim() == 2:
            audio = audio.mean(dim=1)

        # Convert to numpy (Whisper expects this)
        audio_array = audio.squeeze().numpy()

        # Process audio - Whisper processor handles mel spectrogram
        input_features = self.processor(
            audio_array,
            sampling_rate=self.sample_rate,
            return_tensors="pt"
        ).input_features.squeeze(0)

        # Tokenize transcript
        labels = self.processor.tokenizer(
            sample['transcript'],
            return_tensors="pt"
        ).input_ids.squeeze(0)

        return {
            'input_features': input_features,
            'labels': labels
        }



In [7]:
# ============================================================================
# PART 5: DATA COLLATOR
# ============================================================================

@dataclass
class WhisperDataCollator:
    """Data collator that handles padding for Whisper"""

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input features
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad labels and replace padding with -100 (ignored by loss function)
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding token id with -100 so it's ignored by loss
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # If bos token is present, remove it (Whisper adds it automatically)
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch



In [8]:
# ============================================================================
# PART 6: TRAINING FUNCTION (WITH EARLY STOPPING & PLOTTING)
# ============================================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_whisper(model, processor, train_loader, val_loader, epochs,
                  learning_rate, device,patience, save_path="whisper_finetuned",
                  ):
    """Fine-tune Whisper model with early stopping and visualization"""

    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    best_val_loss = float('inf')
    wait = 0  # counter for patience
    training_history = {'train_loss': [], 'val_loss': []}

    for epoch in range(1, epochs + 1):
        # ==================== TRAINING ====================
        model.train()
        train_loss = 0.0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} [Train]")
        for batch_idx, batch in enumerate(pbar):
            input_features = batch["input_features"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_features=input_features, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += loss.item()
            avg_loss = train_loss / (batch_idx + 1)
            pbar.set_postfix({'loss': f"{avg_loss:.4f}"})

        avg_train_loss = train_loss / len(train_loader)
        training_history['train_loss'].append(avg_train_loss)

        # ==================== VALIDATION ====================
        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch}/{epochs} [Val]"):
                input_features = batch["input_features"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_features=input_features, labels=labels)
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        training_history['val_loss'].append(avg_val_loss)

        # ==================== EPOCH SUMMARY ====================
        print(f"\n{'='*80}")
        print(f"Epoch {epoch}/{epochs} Summary:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss:   {avg_val_loss:.4f}")

        # ==================== EARLY STOPPING CHECK ====================
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            wait = 0
            model.save_pretrained(save_path)
            processor.save_pretrained(save_path)
            print(f"  ✅ Validation loss improved. Model saved to {save_path}")
        else:
            wait += 1
            print(f"  ⚠️ No improvement for {wait} epoch(s).")
            if wait >= patience:
                print(f"  ⛔ Early stopping triggered at epoch {epoch}.")
                print(f"{'='*80}\n")
                break

        print(f"{'='*80}\n")

    # ==================== PLOT TRAINING CURVES ====================
    plot_training_curves(training_history)

    return training_history


def plot_training_curves(history):
    """Plot training and validation loss curves"""
    epochs_range = range(1, len(history['train_loss']) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(epochs_range, history['train_loss'], label="Train Loss", marker='o')
    plt.plot(epochs_range, history['val_loss'], label="Validation Loss", marker='o')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Whisper Fine-tuning Loss on Twi ASR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("training_curves.png", dpi=150)
    plt.show()
    print("📊 Training curves saved to 'training_curves.png'")

In [9]:

# ============================================================================
# PART 7: EVALUATION FUNCTION
# ============================================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def evaluate_whisper(model, processor, test_loader, device=device, num_samples=10):
    """Evaluate fine-tuned Whisper model"""

    model.eval()
    model.to(device)

    all_predictions = []
    all_references = []

    print("Generating predictions...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_features = batch["input_features"].to(device)
            labels = batch["labels"]

           # Generate predictions
            predicted_ids = model.generate(input_features, max_length=225, repetition_penalty=1.2,  # Penalize repeated tokens
            no_repeat_ngram_size=3  ) # Prevent 3-gram repetition)

            # Decode predictions
            predictions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

            # Decode references
            labels[labels == -100] = processor.tokenizer.pad_token_id
            references = processor.batch_decode(labels, skip_special_tokens=True)

            all_predictions.extend(predictions)
            all_references.extend(references)

    # Compute metrics
    wer_score = wer(all_references, all_predictions)
    cer_score = cer(all_references, all_predictions)

    # Print results
    print(f"\n{'='*80}")
    print(f"EVALUATION RESULTS")
    print(f"{'='*80}")
    print(f"Total samples: {len(all_predictions)}")
    print(f"WER (Word Error Rate): {wer_score:.3%}")
    print(f"CER (Character Error Rate): {cer_score:.3%}")
    print(f"{'='*80}\n")

    # Print sample predictions
    print("Sample Predictions:")
    print("-" * 80)
    for i in range(min(num_samples, len(all_predictions))):
        print(f"\nSample {i+1}:")
        print(f"  Reference:  {all_references[i]}")
        print(f"  Prediction: {all_predictions[i]}")
        print("-" * 80)

    return wer_score, cer_score, all_predictions, all_references



In [10]:

# ============================================================================
# PART 8: INFERENCE FUNCTION
# ============================================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def transcribe_audio(audio_path, model_path, device=device):
    """Transcribe a single audio file using fine-tuned Whisper"""

    # Load model and processor
    processor = WhisperProcessor.from_pretrained(model_path)
    model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)

    # Load audio using soundfile (not torchaudio)
    audio_data, sr = sf.read(audio_path)
    audio = torch.from_numpy(audio_data).float()

    # Convert to mono if stereo
    if audio.dim() == 2:
        audio = audio.mean(dim=1)

    # Resample if needed
    if sr != 16000:
        audio = audio.unsqueeze(0)
        audio = torchaudio.transforms.Resample(sr, 16000)(audio)
        audio = audio.squeeze(0)

    audio_array = audio.numpy()

    # Process
    input_features = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features.to(device)

    # Generate
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    return transcription

In [None]:
# ============================================================================
# PART 9: MAIN EXECUTION PIPELINE
# ============================================================================

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\n{'='*80}")
print(f"WHISPER FINE-TUNING FOR yoruba ASR")
print(f"{'='*80}")
print(f"Device: {device}")
print(f"{'='*80}\n")

# ==================== LOAD DATA ====================
print("="*80)
print("STEP 1: LOADING DATA")
print("="*80)

train_samples = load_afrispeech_data_improved('yoruba', 'train', sample_percentage=25)
val_samples = load_afrispeech_data_improved('yoruba', 'validation', sample_percentage=25)
test_samples = load_afrispeech_data_improved('yoruba', 'test', sample_percentage=25)

# ==================== LOAD WHISPER ====================
print("\n" + "="*80)
print("STEP 2: LOADING WHISPER MODEL")
print("="*80)

model, processor = load_whisper_model(
    model_name="openai/whisper-small",  # Change to "tiny" for faster testing
    device=device
)

# ==================== CREATE DATASETS ====================
print("\n" + "="*80)
print("STEP 3: CREATING DATASETS")
print("="*80)

train_dataset = WhisperAfrispeechDataset(train_samples, processor)
val_dataset = WhisperAfrispeechDataset(val_samples, processor)
test_dataset = WhisperAfrispeechDataset(test_samples, processor)

print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# ==================== CREATE DATA LOADERS ====================
data_collator = WhisperDataCollator(processor=processor)

train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=0,  # Set to 0 to avoid GPU/CPU transfer issues
    pin_memory=True if device.type == 'cuda' else False
)

val_loader = DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=0,
    pin_memory=True if device.type == 'cuda' else False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=0,
    pin_memory=True if device.type == 'cuda' else False
)

# ==================== TRAIN MODEL ====================
print("\n" + "="*80)
print("STEP 4: FINE-TUNING WHISPER")
print("="*80)

save_path = "/content/drive/MyDrive/whisper_twi_finetuned"

history = train_whisper(
    model=model,
    processor=processor,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=3,  # Increase for better results
    learning_rate=1e-5,
    device=device,
    save_path=save_path,
    patience=1
)

# ==================== EVALUATE ====================
print("\n" + "="*80)
print("STEP 5: EVALUATING FINE-TUNED MODEL")
print("="*80)

# Load best model
best_model = WhisperForConditionalGeneration.from_pretrained(save_path).to(device)
processor = WhisperProcessor.from_pretrained(save_path)

wer_score, cer_score, predictions, references = evaluate_whisper(
    model=best_model,
    processor=processor,
    test_loader=test_loader,
    device=device,
    num_samples=10
)

print("\n✅ WHISPER FINE-TUNING COMPLETE!")
print(f"Model saved to: {save_path}")
print(f"Final WER: {wer_score:.3%}")
print(f"Final CER: {cer_score:.3%}")


WHISPER FINE-TUNING FOR yoruba ASR
Device: cuda

STEP 1: LOADING DATA
Loaded 14369 rows from /content/afrispeech200/transcripts/yoruba/train.csv
Found 14369 valid audio files in /content/afrispeech200/audio/yoruba/train
Using 3592 samples (25%)
Analyzing durations (min=0.5s, max=Nones)...


Processing: 100%|██████████| 3592/3592 [00:12<00:00, 296.07it/s]


Loaded 3592 samples
  Skipped 0 too short (< 0.5s)
  Duration range: 1.01s - 118.44s
  Mean duration: 10.59s
  Total audio: 10.57 hours
Loaded 361 rows from /content/afrispeech200/transcripts/yoruba/dev.csv
Found 361 valid audio files in /content/afrispeech200/audio/yoruba/dev
Using 90 samples (25%)
Analyzing durations (min=0.5s, max=Nones)...


Processing: 100%|██████████| 90/90 [00:00<00:00, 320.07it/s]


Loaded 90 samples
  Skipped 0 too short (< 0.5s)
  Duration range: 2.11s - 22.35s
  Mean duration: 9.98s
  Total audio: 0.25 hours
Loaded 648 rows from /content/afrispeech200/transcripts/yoruba/test.csv
Found 648 valid audio files in /content/afrispeech200/audio/yoruba/test
Using 162 samples (25%)
Analyzing durations (min=0.5s, max=Nones)...


Processing: 100%|██████████| 162/162 [00:00<00:00, 338.07it/s]


Loaded 162 samples
  Skipped 0 too short (< 0.5s)
  Duration range: 1.02s - 63.77s
  Mean duration: 9.40s
  Total audio: 0.42 hours

STEP 2: LOADING WHISPER MODEL
Loading Whisper model: openai/whisper-small
✓ Model loaded successfully
  Total parameters: 241,734,912
  Trainable parameters: 240,582,912

STEP 3: CREATING DATASETS
Train samples: 3592
Val samples: 90
Test samples: 162

STEP 4: FINE-TUNING WHISPER


Epoch 1/3 [Train]:   0%|          | 0/898 [00:00<?, ?it/s]You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Epoch 1/3 [Train]: 100%|██████████| 898/898 [59:40<00:00,  3.99s/it, loss=1.9346]
Epoch 1/3 [Val]: 100%|██████████| 45/45 [01:20<00:00,  1.79s/it]



Epoch 1/3 Summary:
  Train Loss: 1.9346
  Val Loss:   1.5736




  ✅ Validation loss improved. Model saved to /content/drive/MyDrive/whisper_twi_finetuned



Epoch 2/3 [Train]: 100%|██████████| 898/898 [59:22<00:00,  3.97s/it, loss=0.9134]
Epoch 2/3 [Val]: 100%|██████████| 45/45 [01:19<00:00,  1.77s/it]



Epoch 2/3 Summary:
  Train Loss: 0.9134
  Val Loss:   1.5453
  ✅ Validation loss improved. Model saved to /content/drive/MyDrive/whisper_twi_finetuned



Epoch 3/3 [Train]:  68%|██████▊   | 609/898 [39:59<19:29,  4.05s/it, loss=0.5039]