### **Install All Dependencies**

In [None]:
!pip install git+https://github.com/huggingface/peft.git transformers torchaudio datasets jiwer

Collecting git+https://github.com/huggingface/peft.git
  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-sb9kakgs
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-sb9kakgs
  Resolved https://github.com/huggingface/peft.git to commit 8feea9031981153408dde5353394b4e805abb0c1
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft==0.15.2.dev0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft==0.15.2.dev0)
  Downloading nvidia_cuda_runtime_cu12-

In [None]:
from peft import LoraConfig

# Initialize DoRA configuration
config = LoraConfig(
    use_dora=True,
)

# ClozeGER + DoRA Full Pipeline
# Assumes use of HuggingFace Transformers, torchaudio, DoRA, jiwer, and Common Voice

## Import all necessary libraries

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
from transformers import AutoProcessor, AutoModelForSeq2SeqLM
from datasets import load_dataset
from jiwer import wer
import random
import warnings
import logging

warnings.filterwarnings("ignore", message=".*Xet Storage is enabled.*")
logging.getLogger("huggingface_hub.file_download").setLevel(logging.ERROR)

## Step 1: Load Common Voice and Preprocess

In [None]:
# === Step 1: Load Common Voice and Preprocess ===
# ✅ Load dummy LibriSpeech dataset with explicit config
dataset = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# ✅ Load Whisper ASR pipeline
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0)

# ✅ Generate dummy N-best hypotheses
processed_dataset = []
for example in dataset:
    audio = example["audio"]["array"]
    ground_truth = example["text"]

    result = asr_pipe(audio, chunk_length_s=5, return_timestamps=False)
    hyp = result["text"]

    example["hypotheses"] = [hyp, hyp.replace(" the ", " a "), hyp.replace(" a ", " the ")]
    example["ground_truth"] = ground_truth
    processed_dataset.append(example)

print(f"✅ Processed {len(processed_dataset)} samples.")

Device set to use cpu


✅ Processed 73 samples.


## Step 2: Dataset Class with Random Sampling

In [32]:
# === Step 2: Dataset Class with Random Sampling ===
class CommonVoiceClozeDataset(Dataset):
    def __init__(self, data, processor, n_best=3):
        self.data = data
        self.processor = processor
        self.n_best = n_best

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        waveform = torch.tensor(sample["audio"]["array"])
        hyp = random.choice(sample['hypotheses'][:self.n_best])
        tokens = hyp.split()
        if len(tokens) < 2:
            masked_hyp = hyp
            label = ""
        else:
            mask_idx = random.randint(0, len(tokens) - 1)
            label = tokens[mask_idx]
            tokens[mask_idx] = '<mask>'
            masked_hyp = ' '.join(tokens)

        inputs = self.processor(audio=waveform, text=masked_hyp, return_tensors="pt", padding=True)
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs['labels'] = self.processor.tokenizer(label, return_tensors="pt").input_ids.squeeze(0)
        return inputs, sample['ground_truth'], hyp

## Step 3: Load SpeechGPT or Placeholder

In [33]:
# === Step 3: Load SpeechGPT or Placeholder ===
model_name = "openai/whisper-small"  # Replace with multimodal SpeechGPT if available
processor = AutoProcessor.from_pretrained(model_name)
# Use AutoModelForSpeechSeq2Seq instead of AutoModelForSeq2SeqLM
from transformers import AutoModelForSpeechSeq2Seq
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)

## Step 4: Apply DoRA

In [34]:
# === Step 4: Apply DoRA via PEFT ===
from peft import LoraConfig, TaskType, get_peft_model

dora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q_proj", "v_proj"], # Add target modules for Lora
    use_dora=True
)

model = get_peft_model(model, dora_config)

## Step 5: Training

In [35]:
# === Step 5: Training ===
def train_model(model, dataloader, epochs=1):
    model.train()
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
    for epoch in range(epochs):
        for batch, _, _ in dataloader:
            input_ids = batch['input_ids'].unsqueeze(0)
            attention_mask = batch['attention_mask'].unsqueeze(0)
            labels = batch['labels'].unsqueeze(0)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

## Step 6: Evaluation with WER

In [36]:
# === Step 6: Evaluation with WER ===
def evaluate_model(model, dataloader):
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch, reference, original_hyp in dataloader:
            input_ids = batch['input_ids'].unsqueeze(0)
            attention_mask = batch['attention_mask'].unsqueeze(0)
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
            pred = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"Original Hyp: {original_hyp}")
            print(f"Predicted Fix: {pred}")
            print(f"Reference: {reference}")
            predictions.append(pred)
            references.append(reference)
    print("WER:", wer(references, predictions))

## Step 7: Run Full Pipeline

In [44]:
# === Step 7: Run Full Pipeline ===

split_dataset = dataset.train_test_split(test_size=0.2)
test_data = CommonVoiceClozeDataset(split_dataset['test'], processor)

train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

train_model(model, train_loader)
evaluate_model(model, test_loader)

✅ Ready dataset size: 2 samples
📦 Training...
Hypothesis: (' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.',)
Ground Truth: ('MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',)
----
Hypothesis: (" Nor is Mr. Quilter's manner less interesting than his matter.",)
Ground Truth: ("NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER",)
----
