<a href="https://colab.research.google.com/github/Patience3/WhisperSmall-Finetuned-For-Afrispeech/blob/main/Transformers_With_Whisper_For_Afrispeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 2

In [None]:
# ============================================================================
# COMPLETE WHISPER FINE-TUNING PIPELINE FOR TWI ASR
# ============================================================================

# ============================================================================
# PART 1: INSTALLATIONS & IMPORTS
# ============================================================================

!pip install huggingface_hub datasets transformers accelerate -q
!pip install torchaudio jiwer sentencepiece -q

import os
import tarfile
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchaudio
from huggingface_hub import snapshot_download
from jiwer import wer, cer
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import time

# Transformers imports for Whisper
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    WhisperConfig
)

from google.colab import drive

# Attach Google Drive to the runtime unless a previous cell already did so
if os.path.ismount("/content/drive"):
    print("Google Drive is already mounted.")
else:
    print("Mounting Google Drive...")
    drive.mount("/content/drive")



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m2.9/3.2 MB[0m [31m94.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounting Google Drive...
Mounted at /content/drive


In [None]:

# ============================================================================
# PART 2: DATA DOWNLOAD & LOADING
# ============================================================================

print("="*80)
print("DOWNLOADING AFRISPEECH DATA")
print("="*80)

# Fetch only the Twi audio archives, the Twi transcripts and the accent index
snapshot_download(
    repo_id="intronhealth/afrispeech-200",
    repo_type="dataset",
    allow_patterns=["audio/twi*", "transcripts/twi*", 'accents.json'],
    local_dir="/content/afrispeech200"
)

print("Extracting audio files...")
splits = ['train', 'dev', 'test']
audio_root = "/content/afrispeech200/audio/twi"

# The archives come from the network, so extract them with the restrictive
# "data" filter (blocks absolute paths, path traversal and special files).
# Interpreters without extraction-filter support do not expose
# `tarfile.data_filter`; fall back to the plain call there.
_extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

for split in splits:
    extract_dir = f"{audio_root}/{split}"
    tar_path = f"{extract_dir}/{split}_twi_0.tar.gz"
    os.makedirs(extract_dir, exist_ok=True)

    if not os.path.exists(tar_path):
        # Make a missing archive visible instead of skipping it silently
        print(f"  Warning: archive not found, skipping {split}: {tar_path}")
        continue

    print(f"  Extracting {split}...")
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(path=extract_dir, **_extract_kwargs)

print("Dataset download and extraction complete")


def load_afrispeech_data_improved(accent, split, sample_percentage=10,
                                  min_duration=0.5, max_duration=None):
    """Load one AfriSpeech split as a list of sample dicts.

    Args:
        accent: Accent folder name used under ``transcripts/`` and ``audio/``.
        split: 'train', 'validation' or 'test'. 'validation' is mapped to the
            on-disk 'dev' split; any other value is used unchanged.
        sample_percentage: Percentage of the valid rows to keep. Values below
            100 draw a random subsample with a fixed seed (42).
        min_duration: Clips shorter than this many seconds are skipped.
        max_duration: Clips longer than this many seconds are skipped. A falsy
            value (None or 0) disables the upper bound.

    Returns:
        A list of dicts with the keys 'audio_path', 'transcript', 'duration'
        and 'sampling_rate'. Returns None when none of the listed audio files
        exist on disk, or when loading fails (the error and its traceback are
        printed).
    """
    try:
        split_map = {'train': 'train', 'validation': 'dev', 'test': 'test'}
        dataset_split = split_map.get(split, split)

        base_dir = "/content/afrispeech200"
        csv_path = f"{base_dir}/transcripts/{accent}/{dataset_split}.csv"

        df = pd.read_csv(csv_path)
        print(f"Loaded {len(df)} rows from {csv_path}")

        # Rebuild each local path from the last two components of the path
        # stored in the CSV, rooted at the extracted archive layout.
        audio_root = f"{base_dir}/audio/{accent}/{dataset_split}"
        df['audio_path'] = df['audio_paths'].apply(
            lambda x: f"{audio_root}/data/data/intron/{'/'.join(x.split('/')[-2:])}"
        )
        df = df.drop(columns=['audio_paths'])

        valid_df = df[df['audio_path'].apply(os.path.exists)]
        print(f"Found {len(valid_df)}/{len(df)} valid files")

        if valid_df.empty:
            return None

        if sample_percentage < 100:
            sample_size = int(len(valid_df) * sample_percentage / 100)
            valid_df = valid_df.sample(sample_size, random_state=42)
            print(f"Using {len(valid_df)} samples ({sample_percentage}%)")

        filtered_samples = []
        durations = []
        unreadable = []  # (path, error) for files whose metadata could not be read
        skipped_short = 0
        skipped_long = 0

        print(f"Analyzing durations (min={min_duration}s, max={max_duration or 'None'}s)...")

        rows = zip(valid_df['audio_path'], valid_df['transcript'])
        for audio_path, transcript in tqdm(rows, total=len(valid_df), desc="Processing"):
            # Only the metadata probe is guarded; a bad file is recorded and
            # reported below instead of disappearing silently.
            try:
                info = torchaudio.info(audio_path)
                duration = info.num_frames / info.sample_rate
            except Exception as err:
                unreadable.append((audio_path, err))
                continue

            durations.append(duration)

            if duration < min_duration:
                skipped_short += 1
                continue

            if max_duration and duration > max_duration:
                skipped_long += 1
                continue

            filtered_samples.append({
                'audio_path': audio_path,
                'transcript': transcript,
                'duration': duration,
                'sampling_rate': info.sample_rate
            })

        print(f"Loaded {len(filtered_samples)} samples")
        print(f"  Skipped {skipped_short} too short (< {min_duration}s)")
        if max_duration:
            print(f"  Skipped {skipped_long} too long (> {max_duration}s)")
        if unreadable:
            print(f"  Skipped {len(unreadable)} unreadable files")
            for bad_path, err in unreadable[:5]:
                print(f"    {bad_path}: {err}")
        if durations:
            # Statistics cover every readable file, including filtered-out ones
            print(f"  Duration range: {min(durations):.2f}s - {max(durations):.2f}s")
            print(f"  Mean duration: {np.mean(durations):.2f}s")
            print(f"  Total audio: {sum(durations)/3600:.2f} hours")

        return filtered_samples

    except Exception as e:
        print(f"Error loading dataset: {e}")
        import traceback
        traceback.print_exc()
        return None



DOWNLOADING AFRISPEECH DATA


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

accents.json: 0.00B [00:00, ?B/s]

audio/twi/dev/dev_twi_0.tar.gz:   0%|          | 0.00/152M [00:00<?, ?B/s]

audio/twi/test/test_twi_0.tar.gz:   0%|          | 0.00/45.9M [00:00<?, ?B/s]

transcripts/twi/dev.csv:   0%|          | 0.00/61.0k [00:00<?, ?B/s]

transcripts/twi/test.csv:   0%|          | 0.00/19.2k [00:00<?, ?B/s]

audio/twi/train/train_twi_0.tar.gz:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

transcripts/twi/train.csv:   0%|          | 0.00/427k [00:00<?, ?B/s]

Extracting audio files...
  Extracting train...


  tar.extractall(path=extract_dir)


  Extracting dev...
  Extracting test...
Dataset download and extraction complete


In [None]:

# ============================================================================
# PART 3: WHISPER MODEL LOADING
# ============================================================================

def load_whisper_model(model_name="openai/whisper-small", device="cpu"):
    """
    Fetch a pretrained Whisper checkpoint together with its processor.

    Checkpoints, smallest to largest:
    - openai/whisper-tiny (39M params) - fastest, good for testing
    - openai/whisper-base (74M params) - balanced
    - openai/whisper-small (244M params) - recommended for fine-tuning
    - openai/whisper-medium (769M params) - better quality
    - openai/whisper-large-v3 (1.55B params) - best quality (needs GPU)

    Returns a ``(model, processor)`` tuple; the model is moved to ``device``
    and its config is adjusted for fine-tuning.
    """
    print(f"Loading Whisper model: {model_name}")

    # The processor bundles the feature extractor and the tokenizer
    processor = WhisperProcessor.from_pretrained(model_name)

    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    model.to(device)

    # Fine-tuning setup: no forced decoder prompt, no suppressed tokens,
    # and no key/value cache during training.
    config = model.config
    config.forced_decoder_ids = None
    config.suppress_tokens = []
    config.use_cache = False

    # Tally all parameters and the trainable subset in a single pass
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    print("✓ Model loaded successfully")
    print(f"  Total parameters: {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")

    return model, processor




In [None]:
# ============================================================================
# PART 4: WHISPER DATASET
# ============================================================================

class WhisperAfrispeechDataset(Dataset):
    """Map-style dataset turning AfriSpeech sample dicts into Whisper inputs.

    Each sample dict must provide 'audio_path' and 'transcript'. Items are
    returned as ``{'input_features': ..., 'labels': ...}``.
    """

    def __init__(self, samples, processor, sample_rate=16000):
        self.samples, self.processor, self.sample_rate = samples, processor, sample_rate

    def __len__(self):
        return len(self.samples)

    def _load_waveform(self, path):
        """Read an audio file and return it as a numpy array at ``self.sample_rate``."""
        waveform, native_rate = torchaudio.load(path)

        # Bring the clip to the target sampling rate first
        if native_rate != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, native_rate, self.sample_rate)

        # Average the channels when there is more than one
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0)

        return waveform.squeeze().numpy()

    def __getitem__(self, idx):
        entry = self.samples[idx]
        waveform = self._load_waveform(entry['audio_path'])

        # The processor produces the model's audio input features
        encoded_audio = self.processor(
            waveform,
            sampling_rate=self.sample_rate,
            return_tensors="pt",
        )

        # The tokenizer turns the transcript into label ids
        encoded_text = self.processor.tokenizer(
            entry['transcript'],
            return_tensors="pt",
        )

        # Drop the leading batch dimension added by return_tensors="pt"
        return {
            'input_features': encoded_audio.input_features.squeeze(0),
            'labels': encoded_text.input_ids.squeeze(0),
        }



In [None]:
# ============================================================================
# PART 5: DATA COLLATOR
# ============================================================================

@dataclass
class WhisperDataCollator:
    """Collate Whisper samples into a padded batch with loss-masked labels.

    Each feature must provide 'input_features' and 'labels'. The returned
    batch is whatever the feature extractor's ``pad`` produces, with a
    'labels' entry added in which padded positions are set to -100.

    A leading start token shared by every label sequence is removed. The
    comparison uses the start-of-transcript id, because Whisper's
    ``bos_token_id`` is a different token and comparing against it alone
    leaves the start token in the labels.
    """

    processor: Any
    # Id of the token expected at the start of every label sequence.
    # None means: look up "<|startoftranscript|>" in the tokenizer.
    decoder_start_token_id: Union[int, None] = None

    def _leading_token_ids(self) -> List[int]:
        """Return the token ids that count as a removable leading start token."""
        tokenizer = self.processor.tokenizer
        start_id = self.decoder_start_token_id
        if start_id is None:
            start_id = tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
        # bos_token_id is kept as a candidate so the previous check still applies
        return [tid for tid in (start_id, tokenizer.bos_token_id) if tid is not None]

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio features and label ids are padded by different components
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Padded positions become -100 so the loss ignores them
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # Strip a start token shared by every row (guarding zero-width labels)
        if labels.shape[1] > 0:
            first_column = labels[:, 0]
            if any((first_column == tid).all().item() for tid in self._leading_token_ids()):
                labels = labels[:, 1:]

        batch["labels"] = labels

        return batch



In [None]:


# ============================================================================
# PART 6: TRAINING FUNCTION
# ============================================================================

def train_whisper(model, processor, train_loader, val_loader, epochs=10,
                  learning_rate=1e-5, device="cpu", save_path="whisper_finetuned"):
    """Run the fine-tuning loop and return the per-epoch loss history.

    Every epoch performs one pass over ``train_loader`` (AdamW, gradient norm
    clipped to 1.0) followed by one pass over ``val_loader`` without
    gradients. Whenever the mean validation loss improves, the model and the
    processor are saved to ``save_path``.

    Returns a dict with the lists 'train_loss' and 'val_loss', one entry per
    epoch.
    """
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    separator = '=' * 80
    lowest_val_loss = float('inf')
    train_curve = []
    val_curve = []

    for epoch in range(1, epochs + 1):
        # ---------------- training pass ----------------
        model.train()
        running_loss = 0.0

        progress = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} [Train]")
        for step, batch in enumerate(progress, start=1):
            loss = model(
                input_features=batch["input_features"].to(device),
                labels=batch["labels"].to(device),
            ).loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            running_loss += loss.item()

            # Show the running mean loss on the progress bar
            progress.set_postfix({'loss': f"{running_loss / step:.4f}"})

        avg_train_loss = running_loss / len(train_loader)
        train_curve.append(avg_train_loss)

        # ---------------- validation pass ----------------
        model.eval()
        val_total = 0.0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch}/{epochs} [Val]"):
                result = model(
                    input_features=batch["input_features"].to(device),
                    labels=batch["labels"].to(device),
                )
                val_total += result.loss.item()

        avg_val_loss = val_total / len(val_loader)
        val_curve.append(avg_val_loss)

        # ---------------- epoch report ----------------
        print(f"\n{separator}")
        print(f"Epoch {epoch}/{epochs} Summary:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss:   {avg_val_loss:.4f}")

        # Checkpoint only on a strict validation-loss improvement
        if avg_val_loss < lowest_val_loss:
            lowest_val_loss = avg_val_loss
            model.save_pretrained(save_path)
            processor.save_pretrained(save_path)
            print(f"  ✓ Saved best model to {save_path} (Val Loss: {avg_val_loss:.4f})")

        print(f"{separator}\n")

    return {'train_loss': train_curve, 'val_loss': val_curve}

In [None]:

# ============================================================================
# PART 7: EVALUATION FUNCTION
# ============================================================================

def evaluate_whisper(model, processor, test_loader, device="cpu", num_samples=10):
    """Transcribe every batch of ``test_loader`` and report WER / CER.

    Args:
        model: Model exposing ``generate``; it is switched to eval mode and
            moved to ``device``.
        processor: Processor used to decode predicted and reference ids.
        test_loader: Iterable of batches with 'input_features' and 'labels'
            (labels use -100 at ignored positions).
        device: Device the model and the input features are moved to.
        num_samples: Maximum number of reference/prediction pairs to print.

    Returns:
        ``(wer_score, cer_score, all_predictions, all_references)``.
    """

    model.eval()
    model.to(device)

    pad_token_id = processor.tokenizer.pad_token_id
    all_predictions = []
    all_references = []

    print("Generating predictions...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_features = batch["input_features"].to(device)

            # Generate and decode predictions
            predicted_ids = model.generate(input_features, max_length=225)
            predictions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

            # Swap the -100 loss mask back to the pad id before decoding.
            # masked_fill returns a new tensor, so the caller's batch is left
            # untouched and can be reused.
            labels = batch["labels"]
            reference_ids = labels.masked_fill(labels == -100, pad_token_id)
            references = processor.batch_decode(reference_ids, skip_special_tokens=True)

            all_predictions.extend(predictions)
            all_references.extend(references)

    # Compute metrics
    wer_score = wer(all_references, all_predictions)
    cer_score = cer(all_references, all_predictions)

    # Print results
    print(f"\n{'='*80}")
    print(f"EVALUATION RESULTS")
    print(f"{'='*80}")
    print(f"Total samples: {len(all_predictions)}")
    print(f"WER (Word Error Rate): {wer_score:.3%}")
    print(f"CER (Character Error Rate): {cer_score:.3%}")
    print(f"{'='*80}\n")

    # Print sample predictions
    print("Sample Predictions:")
    print("-" * 80)
    shown = zip(all_references[:num_samples], all_predictions[:num_samples])
    for i, (reference, prediction) in enumerate(shown, start=1):
        print(f"\nSample {i}:")
        print(f"  Reference:  {reference}")
        print(f"  Prediction: {prediction}")
        print("-" * 80)

    return wer_score, cer_score, all_predictions, all_references



In [None]:

# ============================================================================
# PART 8: INFERENCE FUNCTION
# ============================================================================

def transcribe_audio(audio_path, model_path, device="cpu"):
    """Return the transcription of one audio file.

    The processor and the model are loaded from ``model_path`` on every call;
    the audio is resampled to 16 kHz and averaged across channels when needed.
    """
    target_rate = 16000

    processor = WhisperProcessor.from_pretrained(model_path)
    model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)

    # Read the clip and bring it to the rate and channel layout used below
    waveform, native_rate = torchaudio.load(audio_path)
    if native_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, native_rate, target_rate)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0)

    # Turn the raw samples into model input features
    encoded = processor(
        waveform.squeeze().numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    )
    input_features = encoded.input_features.to(device)

    # Decode without tracking gradients
    with torch.no_grad():
        token_ids = model.generate(input_features)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]


In [None]:


# ============================================================================
# PART 9: MAIN EXECUTION PIPELINE
# ============================================================================

# Set device: use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\n{'='*80}")
print(f"WHISPER FINE-TUNING FOR TWI ASR")
print(f"{'='*80}")
print(f"Device: {device}")
print(f"{'='*80}\n")

# ==================== LOAD DATA ====================
print("="*80)
print("STEP 1: LOADING DATA")
print("="*80)

train_samples = load_afrispeech_data_improved('twi', 'train', sample_percentage=100)
val_samples = load_afrispeech_data_improved('twi', 'validation', sample_percentage=100)
test_samples = load_afrispeech_data_improved('twi', 'test', sample_percentage=100)

# Fail fast: the loader returns None on failure, and an empty split would only
# surface later as an opaque error inside training or evaluation.
for split_name, split_samples in (("train", train_samples),
                                  ("validation", val_samples),
                                  ("test", test_samples)):
    if not split_samples:
        raise RuntimeError(
            f"No usable samples for the '{split_name}' split; "
            "check the download and extraction step."
        )

# ==================== LOAD WHISPER ====================
print("\n" + "="*80)
print("STEP 2: LOADING WHISPER MODEL")
print("="*80)

model, processor = load_whisper_model(
    model_name="openai/whisper-small",  # Change to "openai/whisper-tiny" for faster testing
    device=device
)

# ==================== CREATE DATASETS ====================
print("\n" + "="*80)
print("STEP 3: CREATING DATASETS")
print("="*80)

train_dataset = WhisperAfrispeechDataset(train_samples, processor)
val_dataset = WhisperAfrispeechDataset(val_samples, processor)
test_dataset = WhisperAfrispeechDataset(test_samples, processor)

print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# ==================== CREATE DATA LOADERS ====================
data_collator = WhisperDataCollator(processor=processor)

# Settings shared by all three loaders; only the training loader shuffles
loader_kwargs = dict(
    batch_size=1,  # Reduce if you get OOM errors
    collate_fn=data_collator,
    num_workers=0,
)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# ==================== TRAIN MODEL ====================
print("\n" + "="*80)
print("STEP 4: FINE-TUNING WHISPER")
print("="*80)

save_path = "/content/drive/MyDrive/whisper_twi_finetuned"

history = train_whisper(
    model=model,
    processor=processor,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=5,  # Increase for better results
    learning_rate=1e-5,
    device=device,
    save_path=save_path
)

# ==================== EVALUATE ====================
print("\n" + "="*80)
print("STEP 5: EVALUATING FINE-TUNED MODEL")
print("="*80)

# Reload the checkpoint with the lowest validation loss rather than reusing
# the in-memory model, which holds the weights of the final epoch.
best_model = WhisperForConditionalGeneration.from_pretrained(save_path).to(device)
processor = WhisperProcessor.from_pretrained(save_path)

wer_score, cer_score, predictions, references = evaluate_whisper(
    model=best_model,
    processor=processor,
    test_loader=test_loader,
    device=device,
    num_samples=10
)

print("\n✅ WHISPER FINE-TUNING COMPLETE!")
print(f"Model saved to: {save_path}")
print(f"Final WER: {wer_score:.3%}")
print(f"Final CER: {cer_score:.3%}")


WHISPER FINE-TUNING FOR TWI ASR
Device: cuda

STEP 1: LOADING DATA
Loaded 1315 rows from /content/afrispeech200/transcripts/twi/train.csv
Found 1315/1315 valid files
Analyzing durations (min=0.5s, max=Nones)...


  info = torchaudio.info(row['audio_path'])
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  return AudioMetaData(
Processing: 100%|██████████| 1315/1315 [00:42<00:00, 30.72it/s]


Loaded 1315 samples
  Skipped 0 too short (< 0.5s)
  Duration range: 1.19s - 77.31s
  Mean duration: 9.16s
  Total audio: 3.35 hours
Loaded 186 rows from /content/afrispeech200/transcripts/twi/dev.csv
Found 186/186 valid files
Analyzing durations (min=0.5s, max=Nones)...


  info = torchaudio.info(row['audio_path'])
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  return AudioMetaData(
Processing: 100%|██████████| 186/186 [00:03<00:00, 59.96it/s]


Loaded 186 samples
  Skipped 0 too short (< 0.5s)
  Duration range: 1.07s - 33.65s
  Mean duration: 9.05s
  Total audio: 0.47 hours
Loaded 58 rows from /content/afrispeech200/transcripts/twi/test.csv
Found 58/58 valid files
Analyzing durations (min=0.5s, max=Nones)...


  info = torchaudio.info(row['audio_path'])
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  return AudioMetaData(
Processing: 100%|██████████| 58/58 [00:00<00:00, 68.37it/s]


Loaded 58 samples
  Skipped 0 too short (< 0.5s)
  Duration range: 2.98s - 32.01s
  Mean duration: 9.42s
  Total audio: 0.15 hours

STEP 2: LOADING WHISPER MODEL
Loading Whisper model: openai/whisper-small


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

✓ Model loaded successfully
  Total parameters: 241,734,912
  Trainable parameters: 240,582,912

STEP 3: CREATING DATASETS
Train samples: 1315
Val samples: 186
Test samples: 58

STEP 4: FINE-TUNING WHISPER


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Epoch 1/5 [Train]: 100%|██████████| 1315/1315 [12:30<00:00,  1.75it/s, loss=1.1474]
Epoch 1/5 [Val]: 100%|██████████| 186/186 [00:32<00:00,  5.71it/s]



Epoch 1/5 Summary:
  Train Loss: 1.1474
  Val Loss:   0.9238




  ✓ Saved best model to /content/drive/MyDrive/whisper_twi_finetuned (Val Loss: 0.9238)



Epoch 2/5 [Train]: 100%|██████████| 1315/1315 [12:25<00:00,  1.76it/s, loss=0.3470]
Epoch 2/5 [Val]: 100%|██████████| 186/186 [00:32<00:00,  5.76it/s]



Epoch 2/5 Summary:
  Train Loss: 0.3470
  Val Loss:   0.9929



Epoch 3/5 [Train]: 100%|██████████| 1315/1315 [12:24<00:00,  1.77it/s, loss=0.1411]
Epoch 3/5 [Val]: 100%|██████████| 186/186 [00:32<00:00,  5.70it/s]



Epoch 3/5 Summary:
  Train Loss: 0.1411
  Val Loss:   1.0896



Epoch 4/5 [Train]: 100%|██████████| 1315/1315 [12:28<00:00,  1.76it/s, loss=0.0681]
Epoch 4/5 [Val]: 100%|██████████| 186/186 [00:33<00:00,  5.62it/s]



Epoch 4/5 Summary:
  Train Loss: 0.0681
  Val Loss:   1.2716



Epoch 5/5 [Train]: 100%|██████████| 1315/1315 [12:24<00:00,  1.77it/s, loss=0.0441]
Epoch 5/5 [Val]: 100%|██████████| 186/186 [00:33<00:00,  5.59it/s]



Epoch 5/5 Summary:
  Train Loss: 0.0441
  Val Loss:   1.3121


STEP 5: EVALUATING FINE-TUNED MODEL
Generating predictions...


Evaluating:   0%|          | 0/58 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



EVALUATION RESULTS
Total samples: 58
WER (Word Error Rate): 24.605%
CER (Character Error Rate): 12.036%

Sample Predictions:
--------------------------------------------------------------------------------

Sample 1:
  Reference:  Proteins break down to release amino acids which are used as fuel for hepatic gluconeogenesis to maintain the glucose needs of the brain.
  Prediction: Proteins breakdown to release amino acids which are used as fob for hepatic gluconeogen acids so as to maintain glucosinase of the brain.
--------------------------------------------------------------------------------

Sample 2:
  Reference:  Aspiration is a potential risk in a patient who subsequently loses consciousness or fits and vomits. 
  Prediction: Aspiration is a potential risk in operation to subsurface when tree loses consciousness or feeds and vomit.
--------------------------------------------------------------------------------

Sample 3:
  Reference:  Rhinoplasty anatomy 1885P PLASTIC AND RECO


