In [3]:
!pip install jiwer

import torch
import torchaudio
from datasets import load_dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from jiwer import wer, cer

dataset_path = "/kaggle/input/testing/train-00000-of-00010.parquet"
dataset = load_dataset("parquet", data_files=dataset_path)["train"]

model_name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

processor.tokenizer.language = "hi"
processor.tokenizer.set_prefix_tokens = lambda *args, **kwargs: []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

def transcribe(batch):
    inputs = processor(batch["audio"]["array"], sampling_rate=16000, return_tensors="pt")
    input_features = inputs.input_features.to(device)

    forced_decoder_ids = processor.get_decoder_prompt_ids(language="hi", task="transcribe")
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    batch["prediction"] = transcription
    return batch

result = dataset.map(transcribe)

references = result["text"]
predictions = result["prediction"]

wer_score = wer(references, predictions)
cer_score = cer(references, predictions)

print(f"\nWER (Hindi): {wer_score:.4f}")
print(f"CER (Hindi): {cer_score:.4f}")

print("\n--- Sample Inference (First 5) ---")
for i in range(5):
    print(f"\nReference {i+1}:  {references[i]}")
    print(f"Prediction {i+1}: {predictions[i]}")

param_size = sum(p.numel() for p in model.parameters()) * 4 / (1024 ** 2)
print(f"\nModel Size: {param_size:.2f} MB")

model.save_pretrained("/kaggle/working/whisper-hindi-model")
processor.save_pretrained("/kaggle/working/whisper-hindi-model")





Map:   0%|          | 0/1183 [00:00<?, ? examples/s]


WER (Hindi): 3.9311
CER (Hindi): 3.7089

--- Sample Inference (First 5) ---

Reference 1:  प्रसिद्द कबीर अध्येता, पुरुषोत्तम अग्रवाल का यह शोध आलेख, उस रामानंद की खोज करता है
Prediction 1:  Precise de cabir adheta, purushottam agraval kaihashod alik usramanandaki khuchkarthahe.

Reference 2:  किन्तु आधुनिक पांडित्य, न सिर्फ़ एक ब्राह्मण रामानंद के, एक जुलाहे कबीर का गुरु होने से, बल्कि दोनों के समकालीन होने से भी, इनकार करता है
Prediction 2:  But the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, the next day, th



[]