## Setting Up the Environment

In [None]:
# Install the `uv` package manager; the following cells use `uv pip` for all
# further dependency installs.
!pip install uv



In [None]:
# Install the notebook's third-party dependencies with uv.
# `--quiet` suppresses installer output; note that no versions are pinned.
!uv pip install --quiet transformers datasets torch librosa jiwer evaluate soundfile

In [None]:
import torch
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline
)
import librosa
import soundfile as sf
from datasets import load_dataset
import evaluate
import numpy as np
from tqdm import tqdm

In [None]:
# Install pandas and pyarrow (`datasets` is listed again here although it was
# already installed by the earlier dependency cell).
!uv pip install pandas pyarrow datasets

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m3 packages[0m [2min 89ms[0m[0m


In [None]:
from huggingface_hub import notebook_login

# Display the Hugging Face login widget so this session can authenticate
# with the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# NOTE(review): `hf_hub_download` is imported but never called in this cell.
from huggingface_hub import hf_hub_download
import os
# Colab-only module: this cell will not import outside Google Colab.
from google.colab import drive

# Mounting Google Drive at /content/drive
drive.mount('/content/drive')

# Defining download directory within Google Drive (under MyDrive);
# `exist_ok=True` makes the cell safe to re-run when the folder already exists.
local_dir = "/content/drive/MyDrive/Whisper_Finetune_NEW/fleurs_hindi_parquet_data"
os.makedirs(local_dir, exist_ok=True)

# Confirm which directory is being used.
print(f"Google Drive directory created: {local_dir}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive directory created: /content/drive/MyDrive/Whisper_Finetune_NEW/fleurs_hindi_parquet_data
Note: Direct parquet file download via hf_hub_download for FLEURS dataset is commented out. Dataset will be loaded using datasets.load_dataset.


In [None]:
# Pin torchcodec to the 0.7.x series.
# NOTE(review): presumably chosen for compatibility with the installed torch
# build — confirm before changing the pin.
!uv pip install "torchcodec==0.7.*"

[2mUsing Python 3.12.12 environment at: /usr[0m
[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mtorchcodec==0.7.0                                                             [0m[2K[37m⠙[0m [2m                                                                              [0m[2K[2mResolved [1m1 package[0m [2min 18ms[0m[0m
[37m⠋[0m [2mPreparing packages...[0m (0/0)                                                   [2K[37m⠋[0m [2mPreparing packages...[0m (0/1)                                                   [2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   [2K[37m⠙[0m [2m[0m (1/1)                                                          

In [None]:
!uv pip uninstall torchcodec

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mUninstalled [1m1 package[0m [2min 88ms[0m[0m
 [31m-[39m [1mtorchcodec[0m[2m==0.8.1[0m


In [None]:
from datasets.packaged_modules import parquet
from datasets import load_dataset

# Script-based dataset loaders on the Hugging Face Hub have been deprecated, so
# as a workaround the FLEURS Hindi test data is read directly from the Hub's
# auto-converted parquet file.
fleurs_hi_test_parquet_url = "https://huggingface.co/datasets/google/fleurs/resolve/refs%2Fconvert%2Fparquet/hi_in/test/0000.parquet"

# A standalone parquet file is exposed under the 'train' split by default,
# even though this file holds the FLEURS test data.
dataset = load_dataset("parquet", data_files=fleurs_hi_test_parquet_url, split="train")

In [None]:
# Release cached GPU memory before the model is loaded; without a CUDA device
# there is nothing to free, so only a message is printed.
import torch

if not torch.cuda.is_available():
    print("No GPU available to clear cache.")
else:
    torch.cuda.empty_cache()
    print("GPU cache cleared.")

GPU cache cleared.


In [None]:
from datasets import Audio
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa
import torch
import evaluate
import torchcodec


# Load the fine-tuned Whisper checkpoint and its processor, placing the model
# on the GPU when one is available and on the CPU otherwise.
model_id = "Pranav13/whisper-small-hi-custom-final-new"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

# Word error rate metric plus the accumulators it will be computed from.
wer_metric = evaluate.load("wer")
predictions, references = [], []

# Explicitly cast the 'audio' column to an Audio feature so each clip is
# decoded and resampled to 16 kHz when it is accessed.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

for example in dataset:
    # The Audio feature yields the decoded waveform and its sampling rate.
    audio_data = example["audio"]
    waveform = audio_data["array"]
    sr = audio_data["sampling_rate"]

    ref_text = example["transcription"]

    # Also request the attention mask: Whisper's pad and EOS tokens are the
    # same, so `generate` cannot infer the mask itself and warns when it is
    # missing.
    inputs = processor(
        waveform,
        sampling_rate=sr,
        return_tensors="pt",
        return_attention_mask=True,
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(model.device),
            attention_mask=inputs.attention_mask.to(model.device),
        )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    predictions.append(transcription)
    references.append(ref_text)

    # Optional: Stop after N samples for quick testing
    # if len(predictions) >= 100: break

# Calculate WER over all collected (prediction, reference) pairs and report it
# as a percentage.
wer = wer_metric.compute(predictions=predictions, references=references)
print(f"WER: {wer * 100:.2f}%")

`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logit

WER: 39.95%
