# Import necessary modules

In [None]:
!pip install jiwer

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os

import librosa
import librosa.display
from pydub import AudioSegment
import IPython.display as ipd
from collections import Counter
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import torch
import torchaudio

from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict
from datasets import Dataset as DS

from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    TrainerCallback,
    TrainingArguments,
    TrainerState,
    TrainerControl,
    EarlyStoppingCallback,
    pipeline
)

from jiwer import wer, cer

In [3]:
def pretty_sort(filename):
    """Return a ``(name, number)`` sort key for names shaped like ``"clip (12).wav"``.

    Sorting with this key orders numbered files numerically, so ``"a (2)"``
    comes before ``"a (10)"``.

    Args:
        filename: File name to build the key for.

    Returns:
        ``(name, number)`` where ``name`` is the text before the last ``" ("``
        and ``number`` is the integer inside the parentheses. File names that
        do not follow the pattern yield ``(filename, -1)`` so they can still be
        sorted together with numbered ones instead of raising.
    """
    # rpartition splits on the LAST " (", so names that contain extra
    # parentheses earlier on no longer break the two-way unpacking.
    name, separator, number_str = filename.rpartition(" (")
    if not separator:
        return filename, -1

    try:
        number = int(number_str.split(")")[0])
    except ValueError:
        # Parentheses are present but do not wrap an integer.
        return filename, -1

    return name, number

# Defining data directory paths

In [4]:
# Root folder of the Nilphamari dataset; every path below is derived from it.
BASE_DIR = '/kaggle/input/nilphamari-data/nilphamari'
# Folder paths keep their trailing slash so a file name can be appended directly.
train_data_dir = f"{BASE_DIR}/train/"
test_data_dir = f"{BASE_DIR}/test/"
# Spreadsheet stored next to the train folder (presumably the training transcripts; verify).
data_path = f"{BASE_DIR}/train.xlsx"

In [None]:
# Speech-recognition pipeline built from the locally stored model weights.
# It runs on the first CUDA device when one is available and otherwise falls
# back to the CPU (device=-1) instead of failing on a GPU-less session.
pipe = pipeline(
    "automatic-speech-recognition",
    model='/kaggle/input/trained-model-weights/trained_model',  # model path
    chunk_length_s=30,  # audio is processed in chunks of 30 seconds
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Transcribe every file found under the test-audio folder and collect the
# results as (file name, predicted sentence) pairs.
ids = []
preds = []

for root, _dirs, files in os.walk("/kaggle/input/interspeech-2025/test"):

    # Sort files based on the custom pretty_sort key.
    files = sorted(files, key=pretty_sort)

    for file in tqdm(files):
        # Load from the directory that was actually walked (`root`), so the
        # file listed here is the file that gets transcribed.
        composed_path = os.path.join(root, file)
        audio, sr = librosa.load(composed_path, sr=16_000)  # resample to 16 kHz
        preds.append(pipe(audio)["text"])
        # Record the id together with its prediction so both lists always have
        # the same length, even when the walk visits more than one directory.
        ids.append(file)


# Dataframe of inferences: one row per transcribed file.
inferences = pd.DataFrame({"id": ids, "sentence": preds})

inferences.head(20)

In [None]:
# inferences = pd.read_excel("/kaggle/working/inferences.xlsx")
# Order the predictions by file name. The ground-truth table is sorted by its
# own file-name column as well, and the metric loop pairs the two tables row by
# row, so both must end up in the same order.
inferences = inferences.sort_values('id')
inferences

In [None]:
# Load the ground-truth sheet, keep only the file-name and transcript columns,
# and order the rows by file name.
ground_truths = (
    pd.read_excel("/kaggle/input/interspeech-2025/test.xlsx")[["file_name", "transcripts"]]
    .sort_values('file_name')
)
ground_truths

# Calculating WER & CER

In [None]:
# Score every (reference, prediction) pair. Rows are paired by position, so
# both tables must already be in the same order.
text_pairs = [
    (str(truth), str(guess))
    for truth, guess in zip(ground_truths["transcripts"], inferences["sentence"])
]

# Per-file error rates, rounded to two decimals.
WERS = [round(wer(truth, guess), 2) for truth, guess in text_pairs]
CERS = [round(cer(truth, guess), 2) for truth, guess in text_pairs]

# Attach predictions and scores to the ground-truth table.
ground_truths["predictions"] = inferences["sentence"].to_list()
ground_truths["WER"] = WERS
ground_truths["CER"] = CERS

ground_truths = ground_truths[["file_name", "transcripts", "predictions", "WER", "CER"]]
ground_truths

In [12]:
# ground_truths.to_excel("ground_truths.xlsx", index=False)

# Calculating the average WER & CER

In [None]:
# Mean of the per-file scores stored in the results table.
avg_wer, avg_cer = (np.average(ground_truths[metric]) for metric in ("WER", "CER"))

print(f"Average WER: {round(avg_wer,2)} | Average CER: {round(avg_cer,2)}")

# Testing inferences

In [None]:
def infer(audio_file_name):
    """Transcribe one file from the test-audio folder with the global ``pipe``.

    Args:
        audio_file_name: Name of the audio file inside the test folder.

    Returns:
        The ``"text"`` entry of the pipeline output for that file.
    """
    composed_path = f"/kaggle/input/interspeech-2025/test/{audio_file_name}"
    # Load the waveform at 16 kHz; the returned sample rate is not needed.
    waveform, _sample_rate = librosa.load(composed_path, sr=16_000)
    return pipe(waveform)["text"]


def get_truth(audio_file_name):
    """Return the reference transcript for one test audio file.

    Args:
        audio_file_name: File name as listed in the ``file_name`` column of
            the test spreadsheet.

    Returns:
        The transcript cell of the first row whose ``file_name`` matches.

    Raises:
        KeyError: If the sheet has neither a ``transcripts`` nor a
            ``transcriptions`` column.
        ValueError: If ``audio_file_name`` is not listed in the sheet.
    """
    valid_df = pd.read_excel("/kaggle/input/interspeech-2025/test.xlsx")

    # This file reads the same sheet's text column as "transcripts" elsewhere,
    # so accept either spelling rather than failing on one of them.
    for column in ("transcripts", "transcriptions"):
        if column in valid_df.columns:
            break
    else:
        raise KeyError("transcripts")

    # Select by boolean mask and take the first hit by position, so the lookup
    # does not depend on the dataframe's index labels.
    matches = valid_df.loc[valid_df["file_name"] == audio_file_name, column]
    if matches.empty:
        raise ValueError(f"{audio_file_name!r} is not in list")
    return matches.iloc[0]


def calc_wer_cer(reference, hypothesis):
    """Compute word and character error rates for one transcript pair.

    Args:
        reference: Ground-truth text; converted with ``str()`` before scoring.
        hypothesis: Predicted text; converted with ``str()`` before scoring.

    Returns:
        ``(WER, CER)``, each rounded to two decimals.
    """
    # Imported locally so the function keeps working even when the
    # module-level names `wer` / `cer` have been rebound to something else.
    from jiwer import wer, cer

    truth_text, predicted_text = str(reference), str(hypothesis)
    return (
        round(wer(truth_text, predicted_text), 2),
        round(cer(truth_text, predicted_text), 2),
    )


def play_audio(audio_file_name):
    """Display one file from the test-audio folder in the notebook output.

    Relies on ``display`` being available without an import, as it is in
    IPython/Jupyter sessions.

    Args:
        audio_file_name: Name of the audio file inside the test folder.

    Returns:
        Whatever ``display`` returns for the loaded ``AudioSegment``.
    """
    audio_file_path = f"/kaggle/input/interspeech-2025/test/{audio_file_name}"
    clip = AudioSegment.from_file(audio_file_path)
    return display(clip)

In [None]:
# Spot-check a single file: play it, then show truth, prediction and scores.
audio_file_name = "test_tangail_1014.wav"

reference = get_truth(audio_file_name)
hypothesis = infer(audio_file_name)
# Store the scores under their own names. Binding them to `wer` / `cer` would
# overwrite the jiwer functions imported at the top of the notebook and break
# the metric cell if it is run again afterwards.
sample_wer, sample_cer = calc_wer_cer(reference, hypothesis)

print("Audio:")
print()
play_audio(audio_file_name)
print()
print(f"Truth value: {reference}")
print()
print(f"Prediction: {hypothesis}")
print()
print(f"Word Error Rate: {sample_wer} | Character Error Rate: {sample_cer}")