Full FineTune Model With Training Arguments

In [1]:
!pip install datasets transformers[torch] soundfile librosa accelerate evaluate jiwer --quiet
!pip install --upgrade transformers

import torch
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import load_dataset, Audio
import torchaudio
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
import numpy as np

model_name = "openai/whisper-tiny"
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, language="hindi", task="transcribe")
processor = WhisperProcessor.from_pretrained(model_name, language="hindi", task="transcribe")

model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="hindi", task="transcribe")
model.config.suppress_tokens = []
model.config.use_cache = False
model.gradient_checkpointing_enable()

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

dataset = load_dataset("parquet", data_files="/kaggle/input/dataseeet/train-00000-of-00010.parquet", split="train")

dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(dataset)

def resample_audio(batch):
    if batch["audio"]["sampling_rate"] != 16000:
        audio_array = torch.tensor(batch["audio"]["array"], dtype=torch.float32)
        resampler = torchaudio.transforms.Resample(
            orig_freq=batch["audio"]["sampling_rate"],
            new_freq=16000
        )
        audio_array = resampler(audio_array)
        batch["audio"]["array"] = audio_array.numpy()
        batch["audio"]["sampling_rate"] = 16000
    return batch

print("Resampling audio...")
dataset = dataset.cast_column("audio", Audio())
dataset["train"] = dataset["train"].map(resample_audio)
dataset["test"] = dataset["test"].map(resample_audio)

def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

print("Preparing dataset features...")
for split in dataset:
    dataset[split] = dataset[split].map(
        prepare_dataset,
        remove_columns=dataset[split].column_names
    )

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if torch.all(labels == -100):
            labels = None
            
        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/whisper-hindi",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    gradient_checkpointing=True,
    fp16=True if device == "cuda" else False,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=225,
    save_total_limit=2,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    remove_unused_columns=False,
    report_to="none",
    logging_dir=None,
)

print("Initializing trainer...")
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

print("Starting training...")
trainer.train()

print("Saving final model...")
trainer.save_model("/kaggle/working/whisper-hindi-final")
processor.save_pretrained("/kaggle/working/whisper-hindi-final")

print("✅ Training complete!")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hDownloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collec

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'gender'],
        num_rows: 1064
    })
    test: Dataset({
        features: ['audio', 'text', 'gender'],
        num_rows: 119
    })
})
Resampling audio...


Map:   0%|          | 0/1064 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Preparing dataset features...


Map:   0%|          | 0/1064 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Initializing trainer...
Starting training...


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Wer
1,1.476,0.816273,65.684647
2,0.5411,0.487886,57.842324
3,0.2781,0.29626,42.323651
4,0.2227,0.26251,37.883817
5,0.1663,0.247801,36.348548


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Saving final model...
✅ Training complete!


In [8]:
model.config.save_pretrained("whisper-hindi-final")

Inference And Evaluation

In [12]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, Audio
import torchaudio
import evaluate
from IPython.display import display, Audio as IPythonAudio

model_path = "/kaggle/working/whisper-hindi-final"
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

model.config.forced_decoder_ids = None
model.generation_config.forced_decoder_ids = None

dataset = load_dataset("parquet", data_files="/kaggle/input/dataseeet/train-00000-of-00010.parquet", split="train")

def resample_audio(batch):
    if batch["audio"]["sampling_rate"] != 16000:
        audio_array = torch.tensor(batch["audio"]["array"], dtype=torch.float32)
        resampler = torchaudio.transforms.Resample(
            orig_freq=batch["audio"]["sampling_rate"],
            new_freq=16000
        )
        audio_array = resampler(audio_array)
        batch["audio"]["array"] = audio_array.numpy()
        batch["audio"]["sampling_rate"] = 16000
    return batch

eval_dataset = dataset.select(range(5)).map(resample_audio)

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def evaluate_samples(samples):
    results = []
    for i, sample in enumerate(samples):
        try:
            input_features = processor(
                sample["audio"]["array"],
                sampling_rate=sample["audio"]["sampling_rate"],
                return_tensors="pt"
            ).input_features.to(model.device)
            predicted_ids = model.generate(input_features)
            prediction = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            reference = sample["text"]
            
            wer = wer_metric.compute(predictions=[prediction], references=[reference])
            cer = cer_metric.compute(predictions=[prediction], references=[reference])
        
            results.append({
                "sample_num": i+1,
                "reference": reference,
                "prediction": prediction,
                "wer": wer,
                "cer": cer,
                "audio": sample["audio"]["array"],
                "sample_rate": sample["audio"]["sampling_rate"]
            })
        except Exception as e:
            print(f"Error processing sample {i+1}: {str(e)}")
            results.append({
                "sample_num": i+1,
                "error": str(e)
            })
    return results

print("Evaluating samples...\n")
results = evaluate_samples(eval_dataset)

for result in results:
    if "error" not in result:
        print(f"Sample {result['sample_num']}:")
        print(f"  - Ground Truth: {result['reference']}")
        print(f"  - Predicted: {result['prediction']}")
        print(f"  - WER: {result['wer']:.4f}")
        print(f"  - CER: {result['cer']:.4f}\n")
        display(IPythonAudio(result["audio"], rate=result["sample_rate"]))
        print("\n" + "="*80 + "\n")

successful_results = [r for r in results if "error" not in r]
if successful_results:
    avg_wer = sum(r["wer"] for r in successful_results) / len(successful_results)
    avg_cer = sum(r["cer"] for r in successful_results) / len(successful_results)
    print(f"\nAverage Metrics:")
    print(f"  - Average WER: {avg_wer:.4f}")
    print(f"  - Average CER: {avg_cer:.4f}")
else:
    print("\nNo successful evaluations completed.")

`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProce

Evaluating samples...

Sample 1:
  - Ground Truth: प्रसिद्द कबीर अध्येता, पुरुषोत्तम अग्रवाल का यह शोध आलेख, उस रामानंद की खोज करता है
  - Predicted: प्रसेद्ध कबीर अध्हेता, पुरुषोत्तम अग्रवाल कायहशोध आलेक, उस रामानन्द की खोच करता है
  - WER: 0.5333
  - CER: 0.1084





Sample 2:
  - Ground Truth: किन्तु आधुनिक पांडित्य, न सिर्फ़ एक ब्राह्मण रामानंद के, एक जुलाहे कबीर का गुरु होने से, बल्कि दोनों के समकालीन होने से भी, इनकार करता है
  - Predicted: ग्टुए़ुन्तुआधुनिक पांडित्य, न सिर्फ्फ़, एक ब्राम्मण, रामानद के, एक जुला हे, कबीर का, गुरू होने से, बल्किद, दोनों के समकालीन होने से, भी, इन कार करता हैं, एएए
  - WER: 0.5769
  - CER: 0.2044





Sample 3:
  - Ground Truth: उस पर, इन चार कवियों का गहरा असर है
  - Predicted: ँ़्पर इन्चार कवियोंका गहरा असर है
  - WER: 0.6667
  - CER: 0.1714





Sample 4:
  - Ground Truth: इसे कई बार मंचित भी किया गया है
  - Predicted: ँवववार्मन्चित्भी किया गया है
  - WER: 0.6250
  - CER: 0.3871





Sample 5:
  - Ground Truth: यहाँ प्रस्तुत है, हिन्दी कवि कथाकार, तेजी ग्रोवर के अंग्रेज़ी के मार्फ़त किए गए अनुवाद के, कुछ अंश
  - Predicted: ँव्वस्थ हिन्दी कवी कथाकार, तेजी ग्रोवर के अंग्रेज़ी के मार्फ़त की एगये अनुवाद के, कुछ अंश
  - WER: 0.3333
  - CER: 0.1939






Average Metrics:
  - Average WER: 0.5471
  - Average CER: 0.2130


In [13]:
!zip -r whisper-hindi-final.zip whisper-hindi-final/

!ls -lh *.zip

  adding: whisper-hindi-final/ (stored 0%)
  adding: whisper-hindi-final/training_args.bin (deflated 52%)
  adding: whisper-hindi-final/vocab.json (deflated 69%)
  adding: whisper-hindi-final/merges.txt (deflated 54%)
  adding: whisper-hindi-final/generation_config.json (deflated 73%)
  adding: whisper-hindi-final/normalizer.json (deflated 81%)
  adding: whisper-hindi-final/model.safetensors (deflated 8%)
  adding: whisper-hindi-final/tokenizer_config.json (deflated 96%)
  adding: whisper-hindi-final/special_tokens_map.json (deflated 80%)
  adding: whisper-hindi-final/preprocessor_config.json (deflated 44%)
  adding: whisper-hindi-final/config.json (deflated 60%)
  adding: whisper-hindi-final/added_tokens.json (deflated 80%)
-rw-r--r-- 1 root root 133M Apr 14 19:48 whisper-hindi-final.zip


In [14]:
from IPython.display import FileLink
FileLink('whisper-hindi-final.zip') 