Whisper Qlora with Training Arguments

In [None]:
!pip install -q torch torchaudio transformers accelerate peft datasets evaluate jiwer bitsandbytes

import torch
from datasets import load_dataset, Audio, Dataset
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import numpy as np

assert torch.cuda.is_available(), "GPU not available!"
print(f"GPU: {torch.cuda.get_device_name(0)}")

dataset = load_dataset("parquet",data_files="/kaggle/input/dataseeeeeet/train-00000-of-00010.parquet")["train"]
print(f"Original dataset size: {len(dataset)}")
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["audio", "text"]])
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

model_name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(model_name, language="hi", task="transcribe")

def prepare_example(example):
    audio = example["audio"]["array"]
    text = example["text"]
    inputs = processor(
        audio=audio,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=True,
        max_length=30*16000
    )
    
    labels = processor.tokenizer(
        text,
        truncation=True,
        max_length=448
    ).input_ids
    
    return {
        "input_features": inputs.input_features[0].numpy(),
        "labels": labels
    }

processed_data = {"input_features": [], "labels": []}
for example in dataset:
    try:
        result = prepare_example(example)
        processed_data["input_features"].append(result["input_features"])
        processed_data["labels"].append(result["labels"])
    except Exception as e:
        print(f"Skipping example due to error: {e}")
        continue

print(f"Successfully processed {len(processed_data['input_features'])}/{len(dataset)} examples")
dataset = Dataset.from_dict(processed_data)

@dataclass
class WhisperDataCollator:
    processor: Any
    
    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]):
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
  
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        
        batch["labels"] = labels
        return batch

data_collator = WhisperDataCollator(processor=processor)

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = WhisperForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.float16,
    attn_implementation="eager"
)

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

peft_model = get_peft_model(model, peft_config)
original_base_model_forward = peft_model.base_model.forward

def patched_base_model_forward(*args, **kwargs):
    kwargs.pop('input_ids', None)
    kwargs.pop('attention_mask', None)
    kwargs.pop('inputs_embeds', None)
    if 'input_features' not in kwargs and len(args) == 0:
        raise ValueError("input_features must be provided")
    
    return original_base_model_forward(*args, **kwargs)

peft_model.base_model.forward = patched_base_model_forward

original_peft_model_forward = peft_model.forward

def patched_peft_model_forward(*args, **kwargs):
    if len(args) == 1 and isinstance(args[0], dict):
        kwargs = args[0]
        args = ()
    input_features = kwargs.pop('input_features', None)
    
    if len(args) > 0 and input_features is None:
        input_features = args[0]
        args = args[1:]
    
    if input_features is None:
        raise ValueError("input_features must be provided")
    return original_peft_model_forward(input_features=input_features, **kwargs)

peft_model.forward = patched_peft_model_forward

peft_model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="hi", task="transcribe")
peft_model.config.suppress_tokens = []
peft_model.config.use_cache = False

peft_model.print_trainable_parameters()

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-hindi-qlora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    warmup_steps=50,
    num_train_epochs=6,
    fp16=True,
    logging_steps=10,
    save_steps=200,
    evaluation_strategy="no",
    report_to="none",
    remove_unused_columns=False,
    gradient_checkpointing=True,
    predict_with_generate=True,
    generation_max_length=225,
    optim="paged_adamw_8bit"
)

trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)

print("Starting training...")
peft_model.config.use_cache = False
train_result = trainer.train()
print("Training completed!")

peft_model.save_pretrained("whisper-hindi-qlora-final")
processor.save_pretrained("whisper-hindi-qlora-final")
print("Model saved successfully!")

In [3]:
model.config.save_pretrained("whisper-hindi-qlora-final")

In [4]:
!zip -r whisper-hindi-qlora-final.zip whisper-hindi-qlora-final/

!ls -lh *.zip

  adding: whisper-hindi-qlora-final/ (stored 0%)
  adding: whisper-hindi-qlora-final/vocab.json (deflated 69%)
  adding: whisper-hindi-qlora-final/preprocessor_config.json (deflated 42%)
  adding: whisper-hindi-qlora-final/config.json (deflated 62%)
  adding: whisper-hindi-qlora-final/merges.txt (deflated 54%)
  adding: whisper-hindi-qlora-final/normalizer.json (deflated 81%)
  adding: whisper-hindi-qlora-final/added_tokens.json (deflated 80%)
  adding: whisper-hindi-qlora-final/README.md (deflated 66%)
  adding: whisper-hindi-qlora-final/tokenizer_config.json (deflated 96%)
  adding: whisper-hindi-qlora-final/special_tokens_map.json (deflated 80%)
  adding: whisper-hindi-qlora-final/adapter_config.json (deflated 54%)
  adding: whisper-hindi-qlora-final/adapter_model.safetensors (deflated 22%)
-rw-r--r-- 1 root root 2.4M Apr 15 05:59 whisper-hindi-qlora-final.zip


In [None]:
from IPython.display import FileLink
FileLink('whisper-hindi-qlora-final.zip')  

Inference and Evaluation on Traindataset

In [6]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, Audio
import torchaudio
import evaluate
from IPython.display import display, Audio as IPythonAudio

model_path = "/kaggle/working/whisper-hindi-qlora-final"
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

dataset = load_dataset("parquet", data_files="/kaggle/input/dataseeeeeet/train-00000-of-00010.parquet", split="train")

def resample_audio(batch):
    if batch["audio"]["sampling_rate"] != 16000:
        audio_array = torch.tensor(batch["audio"]["array"], dtype=torch.float32)
        resampler = torchaudio.transforms.Resample(
            orig_freq=batch["audio"]["sampling_rate"],
            new_freq=16000
        )
        audio_array = resampler(audio_array)
        batch["audio"]["array"] = audio_array.numpy()
        batch["audio"]["sampling_rate"] = 16000
    return batch

eval_dataset = dataset.select(range(5)).map(resample_audio)

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def evaluate_samples(samples):
    results = []
    for i, sample in enumerate(samples):
        
        input_features = processor(
            sample["audio"]["array"],
            sampling_rate=sample["audio"]["sampling_rate"],
            return_tensors="pt"
        ).input_features.to(model.device)
        
        predicted_ids = model.generate(input_features)
        prediction = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        reference = sample["text"]
        
        wer = wer_metric.compute(predictions=[prediction], references=[reference])
        cer = cer_metric.compute(predictions=[prediction], references=[reference])
    
        results.append({
            "sample_num": i+1,
            "reference": reference,
            "prediction": prediction,
            "wer": wer,
            "cer": cer,
            "audio": sample["audio"]["array"],
            "sample_rate": sample["audio"]["sampling_rate"]
        })
    return results

print("Evaluating samples...\n")
results = evaluate_samples(eval_dataset)

for result in results:
    print(f"Sample {result['sample_num']}:")
    print(f"  - Ground Truth: {result['reference']}")
    print(f"  - Predicted: {result['prediction']}")
    print(f"  - WER: {result['wer']:.4f}")
    print(f"  - CER: {result['cer']:.4f}\n")
    display(IPythonAudio(result["audio"], rate=result["sample_rate"]))
    print("\n" + "="*80 + "\n")

avg_wer = sum(r["wer"] for r in results) / len(results)
avg_cer = sum(r["cer"] for r in results) / len(results)
print(f"\nAverage Metrics:")
print(f"  - Average WER: {avg_wer:.4f}")
print(f"  - Average CER: {avg_cer:.4f}")

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Evaluating samples...

Sample 1:
  - Ground Truth: प्रसिद्द कबीर अध्येता, पुरुषोत्तम अग्रवाल का यह शोध आलेख, उस रामानंद की खोज करता है
  - Predicted: प्रसिद्ध कभीर अध्धेता, पुरुशोट्तम अग्र्वाल का यहशोध आलेक, उस रामानन्द की कौज करता है
  - WER: 0.6667
  - CER: 0.1446





Sample 2:
  - Ground Truth: किन्तु आधुनिक पांडित्य, न सिर्फ़ एक ब्राह्मण रामानंद के, एक जुलाहे कबीर का गुरु होने से, बल्कि दोनों के समकालीन होने से भी, इनकार करता है
  - Predicted: किंतु आदूनिक पांडित्या, नसर्फ एक ब्राम्मन रामानन्द के, एक जुलाहे कभीर का घूव होने से, बलकी दोनों के समकालीन होने से भी, इंकार करता है
  - WER: 0.4231
  - CER: 0.1460





Sample 3:
  - Ground Truth: उस पर, इन चार कवियों का गहरा असर है
  - Predicted:  उस्पर इंचार कवियों का गेहरा असर है
  - WER: 0.5556
  - CER: 0.1429





Sample 4:
  - Ground Truth: इसे कई बार मंचित भी किया गया है
  - Predicted:  इसे कई बार मन्चित भी की आगया है
  - WER: 0.3750
  - CER: 0.1935





Sample 5:
  - Ground Truth: यहाँ प्रस्तुत है, हिन्दी कवि कथाकार, तेजी ग्रोवर के अंग्रेज़ी के मार्फ़त किए गए अनुवाद के, कुछ अंश
  - Predicted: यहां प्रस्तुत है, हिंदी कवी कठाकार, तेजी ग्रोवर के अंग्रेजी के मार्पत किये गय अनुवाद के कुछ अंच
  - WER: 0.5556
  - CER: 0.1327






Average Metrics:
  - Average WER: 0.5152
  - Average CER: 0.1519


Qlora Adapter Size

In [5]:
import os

def get_folder_size(path):
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total += os.path.getsize(fp)
    return total / (1024 ** 2)  # in MB

lora_path = "./whisper-hindi-qlora-final"
lora_size = get_folder_size(lora_path)
print(f"QLoRA Adapter Size on Disk: {lora_size:.2f} MB")


QLoRA Adapter Size on Disk: 15.34 MB


Inference and Evaluation on testdataset

In [7]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, Audio
import torchaudio
import evaluate
from IPython.display import display, Audio as IPythonAudio

model_path = "/kaggle/working/whisper-hindi-qlora-final"
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

dataset = load_dataset("parquet", data_files="/kaggle/input/datatest/train-00006-of-00010.parquet", split="train")

def resample_audio(batch):
    if batch["audio"]["sampling_rate"] != 16000:
        audio_array = torch.tensor(batch["audio"]["array"], dtype=torch.float32)
        resampler = torchaudio.transforms.Resample(
            orig_freq=batch["audio"]["sampling_rate"],
            new_freq=16000
        )
        audio_array = resampler(audio_array)
        batch["audio"]["array"] = audio_array.numpy()
        batch["audio"]["sampling_rate"] = 16000
    return batch

eval_dataset = dataset.select(range(5)).map(resample_audio)

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def evaluate_samples(samples):
    results = []
    for i, sample in enumerate(samples):
        
        input_features = processor(
            sample["audio"]["array"],
            sampling_rate=sample["audio"]["sampling_rate"],
            return_tensors="pt"
        ).input_features.to(model.device)
        
        predicted_ids = model.generate(input_features)
        prediction = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        reference = sample["text"]
        
        wer = wer_metric.compute(predictions=[prediction], references=[reference])
        cer = cer_metric.compute(predictions=[prediction], references=[reference])
    
        results.append({
            "sample_num": i+1,
            "reference": reference,
            "prediction": prediction,
            "wer": wer,
            "cer": cer,
            "audio": sample["audio"]["array"],
            "sample_rate": sample["audio"]["sampling_rate"]
        })
    return results

print("Evaluating samples...\n")
results = evaluate_samples(eval_dataset)

for result in results:
    print(f"Sample {result['sample_num']}:")
    print(f"  - Ground Truth: {result['reference']}")
    print(f"  - Predicted: {result['prediction']}")
    print(f"  - WER: {result['wer']:.4f}")
    print(f"  - CER: {result['cer']:.4f}\n")
    display(IPythonAudio(result["audio"], rate=result["sample_rate"]))
    print("\n" + "="*80 + "\n")

avg_wer = sum(r["wer"] for r in results) / len(results)
avg_cer = sum(r["cer"] for r in results) / len(results)
print(f"\nAverage Metrics:")
print(f"  - Average WER: {avg_wer:.4f}")
print(f"  - Average CER: {avg_cer:.4f}")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Evaluating samples...

Sample 1:
  - Ground Truth: क्योंकि कई बार, आपके आने-जाने का रास्ता अलग होता है.
  - Predicted:  कुंकी कई बार, आपके आने जाने का रास्ता अलग होता है
  - WER: 0.4000
  - CER: 0.1154





Sample 2:
  - Ground Truth: फिर भी थोड़ी कोशिश करके, आप अपने वर्चुअल असिस्टेंट से ये काम ले सकते हैं.
  - Predicted: फर भी थोडी कोशिष करके, आप अपने वर्च्वल अस्सिस्टन्छ से ये काम ले सकते हैं
  - WER: 0.4000
  - CER: 0.1507





Sample 3:
  - Ground Truth: आपका वर्चुअल सहायक, आपके लिए टाइमर सेट करने का काम भी आसानी से कर सकता है.
  - Predicted: अपका वर्चुल सहायक, अपकिलिए ताईमर सेट करने का काम भी आसानी से कर सकता है
  - WER: 0.3750
  - CER: 0.1081





Sample 4:
  - Ground Truth: बस आपको छोटे से वाक्य में उसको बताना है, कि मेरे लिए एक घंटे का टाइमर सेट कर दो.
  - Predicted: बस आप को चोटी से वाक्के में उसको बताना है, कि मेरेल ये एक घन्ते का तामर्ट सेट कर दो
  - WER: 0.4737
  - CER: 0.2125





Sample 5:
  - Ground Truth: आप अपने वर्चुअल सहायक से, किसी ख़ास दिन के लिए पोशाक चुनने में भी मदद मांग सकते हैं.
  - Predicted: आप अपने वर्चौल सहायक से, किसी खास दिन के लिय पोशाक चुनने में भी मदद मांक सकते हैं
  - WER: 0.2778
  - CER: 0.0714






Average Metrics:
  - Average WER: 0.3853
  - Average CER: 0.1316
