# **Data Preprocessing/Cleaning**

#### **Installing Dependencies**

In [None]:
!pip install torch torchvision torchaudio torch_xla[tpu] -f https://storage.googleapis.com/tpu-pytorch/wheels/colab.html
!pip install opencv-python
!pip install google-colab
!pip install transformers
!pip install accelerate
!pip install soundfile
!pip install datasets
!pip install evaluate
!pip install librosa
!pip install jiwer

Looking in links: https://storage.googleapis.com/tpu-pytorch/wheels/colab.html
Collecting torch
  Using cached torch-2.6.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.21.0-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Using cached torchaudio-2.6.0-cp311-cp311-win_amd64.whl.metadata (6.7 kB)


ERROR: Could not find a version that satisfies the requirement torch_xla[tpu] (from versions: none)
ERROR: No matching distribution found for torch_xla[tpu]

[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting opencv-python
  Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement google-colab (from versions: none)
ERROR: No matching distribution found for google-colab

[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


^C
^C


#### **Preprocessing The Dataset**

*The dataset consists of these languages:*


*   "ig"  -     **Igbo**
*   "ha" - **Hausa**
*   "yo" - **Yoruba**
*   "en" - **English**

In [None]:
import os
import re
import shutil
import unicodedata

import torch
import torchaudio
import torchaudio.transforms as T
import torch_xla.core.xla_model as xm
from datasets import load_dataset
from google.colab import userdata

# === 1️⃣ Hugging Face authentication ===
# The token is read from Colab's userdata store and, when present, exported
# as the HF_TOKEN environment variable. A missing/empty token aborts the cell.
HF_TOKEN = userdata.get("HF_TOKEN")
if HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN
else:
    raise ValueError("❌ Hugging Face token not found in Colab userdata.")

# === 2️⃣ Output directory for the cleaned datasets ===
# Created eagerly; an already existing directory is not an error.
BASE_DIR = "/content/preprocessed_languages_datasets"
os.makedirs(BASE_DIR, exist_ok=True)

# === 3️⃣ Language codes to preprocess, in processing order ===
LANGUAGES = ["en", "yo", "ha", "ig"]

# === 4️⃣ Accelerator handle ===
# NOTE(review): `device` is only printed in the code visible here — confirm
# whether any later step actually places work on it.
device = xm.xla_device()
print(f"✅ Using device: {device}")

# === 5️⃣ Define Special Characters and Contractions ===
# Extra (non a-z) letters that are kept for each language code. The strings
# are only used for membership tests, so repeated characters are harmless.
# Text is lower-cased before filtering, so upper-case entries never match.
# NOTE(review): the "yo" and "ig" sets contain no ì/í/ò/ó (and no combining
# tone marks), so such letters are stripped from sentences — confirm these
# sets against the target orthographies before training on the output.
special_characters = {
    "yo": "àèéẹọùúńṣáéẹọùúńṣ",
    "ha": "ɓɗƙƴƁƊƘƳ",
    "ig": "ṅọụịịụọńń",
    "fr": "çàéèêôùûâëïüæœÿéèêëîïôûùàâç"
}

# Lower-case English contractions and their expansions (applied to "en" only).
contractions = {
    "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
    "it's": "it is", "we're": "we are", "they're": "they are",
    "i'll": "i will", "you'll": "you will", "he'll": "he will",
    "she'll": "she will", "it'll": "it will", "we'll": "we will",
    "they'll": "they will", "i've": "i have", "you've": "you have",
    "we've": "we have", "they've": "they have", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
    "won't": "will not", "wouldn't": "would not", "don't": "do not",
    "doesn't": "does not", "didn't": "did not", "can't": "cannot",
    "couldn't": "could not", "shouldn't": "should not",
    "mightn't": "might not", "mustn't": "must not"
}

# Characters every language keeps: a-z, basic punctuation, apostrophe, space.
_BASE_CHARS = "abcdefghijklmnopqrstuvwxyz.,?!' "

# Typographic apostrophes are folded to ASCII ' so that they survive the
# character filter and so that contractions written with them still match.
_APOSTROPHES = str.maketrans({"\u2019": "'", "\u02bc": "'"})

# Single pass, whole-word contraction matcher. Word boundaries stop a
# contraction from matching inside a longer word (e.g. "it's" in "spirit's").
_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(key) for key in sorted(contractions, key=len, reverse=True))
    + r")\b"
)

# === 6️⃣ Text Preprocessing ===
def preprocess_text(text, lang):
    """Normalise one transcript for the given language code.

    Steps: Unicode NFC normalisation, apostrophe folding, trimming and
    lower-casing; for ``lang == "en"`` whole-word contractions are expanded;
    finally every character outside a-z, ``.,?!'``, space and the language's
    entry in ``special_characters`` is removed.

    Args:
        text: Raw sentence. Anything that is not a non-blank ``str`` yields "".
        lang: Language code used to look up the extra allowed characters;
            unknown codes keep only the base character set.

    Returns:
        The cleaned, stripped sentence (possibly the empty string).
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # NFC first: a letter typed as base + combining mark becomes the single
    # precomposed code point that the per-character filter below can match.
    text = unicodedata.normalize("NFC", text).translate(_APOSTROPHES)
    text = text.strip().lower()

    if lang == "en":
        text = _CONTRACTION_RE.sub(lambda match: contractions[match.group(0)], text)

    # The allowed set is normalised the same way as the text so both sides of
    # the membership test use the same representation.
    extra_chars = unicodedata.normalize("NFC", special_characters.get(lang, ""))
    allowed_chars = set(_BASE_CHARS + extra_chars)
    return "".join(c for c in text if c in allowed_chars).strip()

# === 7️⃣ Download → clean → save, for one language ===
def load_and_preprocess_dataset(lang):
    """Load one Common Voice 13.0 language, clean its sentences and save it.

    Any existing directory ``BASE_DIR/<lang>`` is deleted before the dataset
    is loaded. Every example's ``sentence`` field is passed through
    ``preprocess_text`` and the result is written back to that directory.

    Args:
        lang: Dataset configuration name / language code (e.g. "yo").

    Returns:
        None. The cleaned dataset is persisted with ``save_to_disk``.
    """
    print(f"\n🚀 Processing {lang.upper()}...")

    # Start from a clean output folder for this language.
    target_dir = os.path.join(BASE_DIR, lang)
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)

    splits = load_dataset(
        "mozilla-foundation/common_voice_13_0",
        lang,
        token=HF_TOKEN,
        cache_dir="/content/dataset_cache",
    )
    print(f"✅ Loaded splits: {list(splits.keys())}")

    def _clean_sentence(example):
        # Called once per example (map is not run in batched mode here).
        example["sentence"] = preprocess_text(example["sentence"], lang)
        return example

    cleaned = splits.map(_clean_sentence, desc=f"Preprocessing {lang} dataset")
    cleaned.save_to_disk(target_dir)
    print(f"✅ Saved preprocessed dataset to {target_dir}")

# === 8️⃣ Run Preprocessing for All Languages ===
# Languages are handled one after another, in LANGUAGES order. Each call
# first removes any previously saved output for that language.
# The loop variable `lang` remains bound as a notebook-level name afterwards.
for lang in LANGUAGES:
    load_and_preprocess_dataset(lang)

# Only reached when every call above returned without raising.
print("🎉 All datasets preprocessed and saved successfully!")

# **MODEL FINE-TUNING**

#### **Fine-tuning The Model**

In [None]:
import os
import shutil
import torch
import torchaudio
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import load_from_disk
import evaluate
from google.colab import drive
from dataclasses import dataclass
from typing import Any, Dict, List
import time

# === Mount Google Drive ===
# Must run before the Drive-backed paths defined below are used.
drive.mount('/content/drive')

# === Paths & Config ===
# NOTE(review): the preprocessing cell saves to
# /content/preprocessed_languages_datasets, not to this Drive folder —
# confirm the datasets are copied here before training.
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/CommonVoice/preprocessed_languages_datasets"
OUTPUT_DIR = "/content/whisper_models"
FINAL_MODEL_DIR = "/content/drive/My Drive/Colab Notebooks/Models/whisper_multilingual"

# === Languages to Train ===
# Training order; each language gets its own model.
LANGUAGES = ["yo", "ha", "ig", "en"]

# Textual language tag placed in front of every transcript, one per language.
# NOTE(review): verify each tag exists as a single token in the tokenizer's
# vocabulary (Igbo in particular may not be a pretrained language token).
LANG_TAGS = {code: f"<|{code}|>" for code in LANGUAGES}

# === Clean Conflicts ===
# Local folders that are removed up front when they exist.
conflict_paths = ["/content/whisper-base"]
for path in conflict_paths:
    if not os.path.exists(path):
        continue
    shutil.rmtree(path)
    print(f"Deleted conflicting folder: {path}")

# === Metric ===
# Word error rate, used by compute_metrics.
wer_metric = evaluate.load("wer")

# === Data Collator ===
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of examples into one training batch.

    Audio features and label ids are padded separately: the features go
    through the processor's feature extractor, the labels through its
    tokenizer. Padded label positions are replaced with -100 so they are
    ignored by the loss instead of being trained on as real targets.
    """

    # Processor providing both `feature_extractor.pad` and `tokenizer.pad`.
    processor: WhisperProcessor

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate `features` (dicts with "input_features" and "labels").

        Returns:
            The padded feature batch with an added "labels" tensor in which
            every padding position holds -100.
        """
        input_features = [{"input_features": f["input_features"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Mask by attention mask rather than by token id: the pad token may
        # share its id with a real end-of-text token that must stay a target.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )
        # NOTE(review): labels still start with whatever prefix tokens the
        # tokenizer added; confirm whether a leading start token should be
        # stripped here before the model shifts the labels.
        batch["labels"] = labels
        return batch

# === Training Args ===
def get_training_args(lang):
    """Return the ``Seq2SeqTrainingArguments`` used to train one language.

    All hyper-parameters are fixed; only the output and logging directories
    depend on ``lang`` (``OUTPUT_DIR/<lang>`` and ``OUTPUT_DIR/<lang>/logs``).
    """
    run_dir = f"{OUTPUT_DIR}/{lang}"

    settings = dict(
        # Where checkpoints and logs go.
        output_dir=run_dir,
        logging_dir=f"{run_dir}/logs",
        # Batch size and optimisation.
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        learning_rate=3e-4,
        weight_decay=0.005,
        num_train_epochs=3,
        fp16=True,
        # Logging / evaluation / checkpoint cadence, in optimizer steps.
        # NOTE(review): no evaluation strategy is set here — confirm that
        # eval_steps has any effect with the library defaults.
        logging_steps=500,
        eval_steps=500,
        save_steps=1000,
        save_total_limit=2,
        # Evaluation decodes with generate().
        predict_with_generate=True,
        generation_max_length=128,
        # Misc.
        dataloader_num_workers=1,
        report_to="none",
    )
    return Seq2SeqTrainingArguments(**settings)

# === Dataset Preprocessing ===
def prepare_dataset(lang, processor, split="train"):
    """Load one saved split and turn it into model-ready features.

    Each example's audio is resampled to 16 kHz from the sampling rate stored
    with that example, converted to input features by ``processor``, and its
    sentence (prefixed with the language tag from ``LANG_TAGS``) is tokenized
    into ``labels``. Rows with a blank sentence are dropped beforehand.

    Args:
        lang: Language code; selects the dataset folder and the language tag.
        processor: Processor used for feature extraction and tokenization.
        split: Name of the saved split directory to load.

    Returns:
        A torch-formatted dataset with ``input_features`` and ``labels``.
    """
    target_rate = 16000
    dataset = load_from_disk(f"{DATA_DIR}/{lang}/{split}")

    # Blank transcripts would still produce non-empty labels (the tag and the
    # tokenizer's own special tokens), so the length check further down can
    # never catch them. Only the sentence column is passed to the predicate.
    dataset = dataset.filter(
        lambda sentence: isinstance(sentence, str) and bool(sentence.strip()),
        input_columns=["sentence"],
    )

    # One Resample transform per source rate, built on first use and reused.
    resamplers = {}

    def preprocess_batch(batch):
        audio = batch["audio"]
        source_rate = audio["sampling_rate"]
        waveform = torch.tensor(audio["array"], dtype=torch.float32).unsqueeze(0)

        # Use the clip's own rate instead of assuming 48 kHz; resampling with
        # a wrong source rate would silently change speed and pitch.
        if source_rate != target_rate:
            if source_rate not in resamplers:
                resamplers[source_rate] = torchaudio.transforms.Resample(
                    orig_freq=source_rate, new_freq=target_rate
                )
            waveform = resamplers[source_rate](waveform)

        # squeeze(0) removes only the channel axis added above, so even a
        # one-sample clip stays one-dimensional.
        resampled = waveform.squeeze(0).numpy()
        inputs = processor(resampled, sampling_rate=target_rate)
        batch["input_features"] = inputs["input_features"][0]

        # NOTE(review): the tag is added as plain text and the tokenizer may
        # add its own prefix tokens as well — confirm this label layout is
        # what the model should learn.
        sentence = LANG_TAGS[lang] + " " + batch["sentence"]
        batch["labels"] = processor.tokenizer(sentence).input_ids
        return batch

    dataset = dataset.map(preprocess_batch, remove_columns=dataset.column_names)
    dataset = dataset.filter(lambda x: x["input_features"] is not None and len(x["labels"]) > 0)
    dataset.set_format(type="torch")
    return dataset

# === Metrics ===
def compute_metrics(eval_pred):
    """Compute word error rate for one evaluation pass.

    Reads the notebook-level ``processor`` (for its tokenizer) and
    ``wer_metric``, so it must only be called while ``processor`` is bound.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids`` arrays
            of token ids.

    Returns:
        ``{"wer": <value returned by wer_metric.compute>}``.
    """
    tokenizer = processor.tokenizer

    pred_ids = eval_pred.predictions
    # Some models hand back a tuple; the generated ids are its first element.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    label_ids = eval_pred.label_ids

    # -100 is a padding/ignore marker, not a vocabulary id, and cannot be
    # decoded. Swap it for the pad token, which skip_special_tokens removes.
    # (The arrays are modified in place.)
    pad_id = tokenizer.pad_token_id
    pred_ids[pred_ids == -100] = pad_id
    label_ids[label_ids == -100] = pad_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    return {"wer": wer_metric.compute(predictions=pred_str, references=label_str)}

# === Loop Over Languages ===
# Trains, evaluates and saves one model per language. A failure in one
# language is reported and the loop moves on to the next one.
for lang in LANGUAGES:
    print(f"\n🚀 Starting training for: {lang}")

    # Bind every per-language name up front. Without this, an exception
    # raised before all of them exist (e.g. while loading the model or the
    # dataset) would make the `del` below raise NameError and abort the whole
    # loop. Rebinding also drops the previous language's objects.
    model = processor = trainer = train_dataset = test_dataset = None

    try:
        # `processor` is also read as a notebook-level name by compute_metrics.
        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base", device_map="auto")

        train_dataset = prepare_dataset(lang, processor, split="train")

        if len(train_dataset) == 0:
            print(f"⚠️ No training data for {lang}. Skipping...")
            continue

        data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
        training_args = get_training_args(lang)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            tokenizer=processor,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Evaluate
        print(f"\n🔍 Evaluating model for: {lang}")
        test_dataset = prepare_dataset(lang, processor, split="test")
        eval_results = trainer.evaluate(test_dataset)
        print(f"✅ Evaluation Results ({lang}):", eval_results)

        # Save
        model.save_pretrained(f"{FINAL_MODEL_DIR}/{lang}")
        processor.save_pretrained(f"{FINAL_MODEL_DIR}/{lang}")
        print(f"✅ Model saved for: {lang}")

    except Exception as e:
        # Deliberate best-effort: report and continue with the next language.
        print(f"❌ Error occurred while training {lang}: {e}")

    # Clear memory
    # All names are guaranteed to be bound (possibly to None) at this point.
    del model, processor, trainer, train_dataset, test_dataset
    torch.cuda.empty_cache()
    time.sleep(10)
    print(f"✅ Finished {lang}, GPU memory cleared. Moving to next language...\n{'='*50}")