## Setting Up the Environment

In [None]:
!pip install uv

In [None]:
!uv pip install --quiet transformers datasets torch librosa jiwer evaluate soundfile

In [None]:
import torch
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline
)
import librosa
import soundfile as sf
from datasets import load_dataset
import evaluate
import numpy as np
from tqdm import tqdm

In [None]:
!uv pip install pandas pyarrow datasets

In [None]:
from huggingface_hub import notebook_login

# Start the Hugging Face Hub login flow for this notebook session.
# NOTE(review): presumably needed so later cells can access the Hub with the
# user's credentials — confirm whether the model/dataset actually require it.
notebook_login()

In [None]:
from huggingface_hub import hf_hub_download
import os
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Download directory, located inside the mounted Google Drive.
local_dir = f"{drive_mount_point}/MyDrive/Whisper_Finetune_NEW/fleurs_hindi_parquet_data"

# exist_ok=True: no error is raised when the directory is already there,
# so this cell can be re-run.
os.makedirs(local_dir, exist_ok=True)

print(f"Google Drive directory created: {local_dir}")

In [None]:
!uv pip install "torchcodec==0.7.*"

In [None]:
# NOTE(review): this removes the torchcodec package that the previous cell
# installs, yet the evaluation cell further down still runs
# `import torchcodec`. Executing the notebook top to bottom would therefore
# fail at that import. This looks like a troubleshooting leftover — confirm,
# and skip this cell (or re-run the install cell) before evaluating.
!uv pip uninstall torchcodec

In [None]:
from datasets.packaged_modules import parquet
from datasets import load_dataset

# Dataset loading on Hugging Face has changed recently: loading scripts
# have been deprecated.
#
# Workaround for the FLEURS Hindi dataset: load the parquet file for the
# hi_in test set directly from the repo's `refs/convert/parquet` revision.
fleurs_hi_test_parquet = (
    "https://huggingface.co/datasets/google/fleurs/resolve/"
    "refs%2Fconvert%2Fparquet/hi_in/test/0000.parquet"
)

dataset = load_dataset(
    "parquet",
    data_files=fleurs_hi_test_parquet,
    # The parquet file is exposed under the 'train' split name by default,
    # even though its contents are the FLEURS test data.
    split="train",
)

In [None]:
## Ensuring GPU cache is cleared up before loading my model
import torch
# Call torch.cuda.empty_cache() when CUDA is available; otherwise only report
# that there is no GPU whose cache could be cleared.
gpu_present = torch.cuda.is_available()
if not gpu_present:
    print("No GPU available to clear cache.")
else:
    torch.cuda.empty_cache()
    print("GPU cache cleared.")

In [None]:
from datasets import Audio
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa
import torch
import evaluate
import torchcodec


# Evaluate the fine-tuned Whisper checkpoint on `dataset` and report the
# word error rate (WER).

# Processor and model are both loaded from the same Hub repository.
model_id = "Pranav13/whisper-small-hi-custom-final-new"
processor = WhisperProcessor.from_pretrained(model_id)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

wer_metric = evaluate.load("wer")
predictions, references = [], []

# Explicitly cast the "audio" column to an Audio feature at 16 kHz, so each
# example yields its waveform together with that sampling rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))


def _transcribe(samples, rate):
    """Return the model's transcription for one waveform sampled at `rate`."""
    features = processor(samples, sampling_rate=rate, return_tensors="pt")
    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        token_ids = model.generate(features.input_features.to(model.device))
    # One clip in, one sequence out: take the single decoded string.
    return processor.batch_decode(token_ids, skip_special_tokens=True)[0]


for example in dataset:
    # The audio entry provides both the sample array and its sampling rate.
    clip = example["audio"]
    reference_text = example["transcription"]

    hypothesis = _transcribe(clip["array"], clip["sampling_rate"])

    predictions.append(hypothesis)
    references.append(reference_text)

    # Optional: for a quick test run, stop after N samples, e.g.
    #   if len(predictions) >= 100: break

# Corpus-level WER over all collected (prediction, reference) pairs.
wer = wer_metric.compute(predictions=predictions, references=references)
print(f"WER: {wer * 100:.2f}%")