In [1]:
!pip install jiwer -q

In [None]:
import pandas as pd
from tqdm import tqdm
from glob import glob
import os
import librosa
tqdm.pandas()
import requests
from transformers import pipeline
from pathlib import Path
from jiwer import wer, cer
import numpy as np

In [None]:
# Build the speech-recognition pipeline once; `infer_tugstugi` below calls this
# module-level `asr` object for every audio file.
asr = pipeline(
    "automatic-speech-recognition",
    model="bengaliAI/tugstugi_bengaliai-asr_whisper-medium",
)

In [4]:
def infer_tugstugi(aud_path):
    """Transcribe a single audio file with the module-level ``asr`` pipeline.

    Args:
        aud_path: Path of the audio file, passed to ``asr`` unchanged.

    Returns:
        str: The ``'text'`` entry of the pipeline result, coerced with ``str()``.
    """
    result = asr(aud_path)
    return str(result['text'])

In [None]:
def infer_dis(dis):
    """Transcribe every file of one district's test split and export the predictions.

    Lists ``/kaggle/input/interspeech-2025/district_wise/<dis>/test/``, runs
    ``infer_tugstugi`` on each entry and writes a two-column sheet
    (``file_name``, ``predictions``) to
    ``/kaggle/working/predictions/5.<serial>: <dis>_tugstugi_inferences.xlsx``.

    A file whose transcription raises is kept in the sheet with the placeholder
    prediction ``"<>"``; the failures are reported after the loop instead of
    being dropped silently.

    Args:
        dis: District name; must be one of the keys of the serial table below.

    Raises:
        ValueError: If ``dis`` is not a known district. This is checked before
            any inference runs, so a typo cannot waste a full inference pass.
    """
    # Serial number used in the exported file name.
    dis_serial_dict = {
        'Rangpur': 1,
        'Kishoreganj': 2,
        'Narail': 3,
        'Chittagong': 4,
        'Narsingdi': 5,
        'Tangail': 6,
        'Habiganj': 7,
        'Barishal': 8,
        'Sylhet': 9,
        'Sandwip': 10,
        'Cumilla': 11,
        'Noakhali': 12,
    }
    if dis not in dis_serial_dict:
        raise ValueError(
            f"Unknown district {dis!r}; expected one of {sorted(dis_serial_dict)}"
        )
    serial = dis_serial_dict[dis]

    data_dir = f'/kaggle/input/interspeech-2025/district_wise/{dis}/test/'
    out_dir = "/kaggle/working/predictions/"

    # os.listdir returns entries in arbitrary order; sort so the exported rows
    # come out in the same order on every run.
    files = sorted(os.listdir(data_dir))

    preds = []
    failures = []
    for file_name in tqdm(files):
        try:
            preds.append(infer_tugstugi(f"{data_dir}{file_name}"))
        except Exception as e:
            # Best effort: keep the row so the sheet still lists every file,
            # but remember what went wrong so it can be reported.
            preds.append("<>")
            failures.append((file_name, e))

    df = pd.DataFrame({"file_name": files, "predictions": preds})

    print()
    print("============================================================================")
    print(f"Dataset length of {dis}: {len(df)}")
    print("============================================================================")
    print()
    if failures:
        print(f"{len(failures)} file(s) could not be transcribed and were exported as '<>':")
        for file_name, error in failures:
            print(f"  {file_name}: {error!r}")
        print()

    # Create the export folder here so the function does not depend on another cell.
    os.makedirs(out_dir, exist_ok=True)
    df.to_excel(f"{out_dir}5.{serial}: {dis}_tugstugi_inferences.xlsx",index = False)
    print()
    print("=============================== Dataframe Exported ======================================")
    print()

In [None]:
# Districts to transcribe, in processing order.
dists = [
    'Rangpur',
    'Kishoreganj',
    'Narail',
    'Chittagong',
    'Narsingdi',
    'Tangail',
    'Habiganj',
    'Barishal',
    'Sylhet',
    'Sandwip',
    'Cumilla',
    'Noakhali',
]

# The export folder has to exist before the first district sheet is written.
Path('/kaggle/working/predictions/').mkdir(exist_ok=True, parents=True)

for dis in dists:
    infer_dis(dis)

In [9]:
def calc_wer_cer(ground_truth,prediction):
    """Score one prediction against its reference transcript.

    Both arguments are coerced with ``str()`` and passed, reference first, to
    jiwer's ``wer`` and ``cer``; each score is rounded to three decimals.

    NOTE(review): because of the ``str()`` coercion a missing value
    (``None`` / NaN) is scored as the literal text "None" / "nan" -- confirm
    that callers never pass one.

    Args:
        ground_truth: Reference transcript.
        prediction: Model output to evaluate.

    Returns:
        tuple: ``(WER, CER)``.
    """
    reference, hypothesis = str(ground_truth), str(prediction)
    return tuple(round(metric(reference, hypothesis), 3) for metric in (wer, cer))

In [None]:
# Score every per-district prediction sheet against its ground-truth transcripts
# and export one sheet per district with per-file WER / CER columns.
pred_excel_data_dir = "/kaggle/working/predictions/"
pred_excel_sheets = os.listdir(pred_excel_data_dir)

Path('/kaggle/working/predictions_with_wer_cer/').mkdir(parents=True, exist_ok=True)

for sheet in pred_excel_sheets:

    if not sheet.endswith(".xlsx"):
        continue

    # Sheets are named "5.<serial>: <District>_tugstugi_inferences.xlsx"; the
    # district is the text between the space and the first underscore.
    name_parts = sheet.split(" ")
    if len(name_parts) < 2:
        print(f"Skipping {sheet!r}: file name does not follow the prediction sheet pattern")
        continue
    district = name_parts[1].split("_")[0]

    pred_df = pd.read_excel(f"{pred_excel_data_dir}{sheet}")

    gd_df = pd.read_excel(f'/kaggle/input/interspeech-2025/district_wise/{district}/{district}_test.xlsx')
    gd_df = gd_df[["file_name","transcripts","district"]]

    # The inner join below drops predictions that have no ground-truth row;
    # report how many so the loss is visible rather than silent.
    unmatched = int((~pred_df["file_name"].isin(gd_df["file_name"])).sum())
    if unmatched:
        print(f"{district}: {unmatched} prediction(s) have no ground-truth row and are not scored")

    merged_df = pd.merge(pred_df, gd_df, on='file_name', how='inner')
    merged_df['model'] = 'Tugstugi'

    # One (WER, CER) pair per merged row, in row order.
    scores = [
        calc_wer_cer(gd, pr)
        for gd, pr in zip(merged_df['transcripts'], merged_df['predictions'])
    ]
    merged_df['wer'] = [row_wer for row_wer, _ in scores]
    merged_df['cer'] = [row_cer for _, row_cer in scores]

    merged_df = merged_df[["model","district","file_name","predictions","transcripts","wer","cer"]]
    merged_df.to_excel(f"/kaggle/working/predictions_with_wer_cer/tugstugi_{sheet}", index = False)

In [None]:
# Folder the scoring cell above writes its per-district WER/CER sheets to.
wer_cer_pred_excel_data_dir = "/kaggle/working/predictions_with_wer_cer"
# Names of everything in that folder; os.listdir gives no ordering guarantee.
wer_cer_pred_excel_sheets = os.listdir(wer_cer_pred_excel_data_dir)
# Bare last expression so the notebook displays the list.
wer_cer_pred_excel_sheets

# District-wise WER, CER

In [None]:
# Average WER / CER per district, one entry per scored sheet.
dists = ['Rangpur', 'Kishoreganj', 'Narail', 'Chittagong', 'Narsingdi', 'Tangail','Habiganj','Barishal','Sylhet','Sandwip','Cumilla','Noakhali']

avg_wer = []
avg_cer = []
sheet_districts = []  # district that the matching avg_wer / avg_cer entry belongs to

for sheet_name in wer_cer_pred_excel_sheets:
    if not sheet_name.endswith(".xlsx"):
        continue

    df = pd.read_excel(f"{wer_cer_pred_excel_data_dir}/{sheet_name}")
    avg_wer.append(round(np.average(df['wer']), 3))
    avg_cer.append(round(np.average(df['cer']), 3))

    # The sheets were listed with os.listdir, whose order is arbitrary, so the
    # label must come from the sheet name itself
    # ("tugstugi_5.<serial>: <District>_tugstugi_inferences.xlsx"), not from
    # the position in `dists`.
    sheet_districts.append(
        next((name for name in dists if f" {name}_" in sheet_name), sheet_name)
    )

# Print in the canonical `dists` order; unrecognised sheets go last.
display_rank = {name: rank for rank, name in enumerate(dists)}
district_rows = sorted(
    zip(sheet_districts, avg_wer, avg_cer),
    key=lambda row: display_rank.get(row[0], len(dists)),
)

for district, district_wer, district_cer in district_rows:
    print()
    print(f'{district}: Avg. WER: {district_wer} | Avg. CER: {district_cer}')
    print()

# Model WER, CER

In [None]:
model = "tugstugi"

# Macro average: unweighted mean of the per-district averages collected in
# `avg_wer` / `avg_cer`, so every district counts equally regardless of size.
model_avg_wer = np.average(avg_wer)
model_avg_cer = np.average(avg_cer)

print(f"{model}")
print()
print(f"Average WER: {round(model_avg_wer,3)} | Average CER: {round(model_avg_cer,3)}")
print()
print("==========================================================================================================")

# Read every scored sheet and concatenate once; growing a frame with
# pd.concat inside the loop re-copies all earlier rows on each iteration.
sheet_frames = [
    pd.read_excel(f"{wer_cer_pred_excel_data_dir}/{sheet_name}")
    for sheet_name in wer_cer_pred_excel_sheets
    if sheet_name.endswith(".xlsx")
]
if sheet_frames:
    concat_df = pd.concat(sheet_frames, ignore_index=True, axis=0)
else:
    # pd.concat rejects an empty list; keep the expected schema instead.
    concat_df = pd.DataFrame(columns=['model', 'district', 'file_name', 'predictions', 'transcripts', 'wer','cer'])

concat_df.to_excel(f"{model}_inferences.xlsx",index =False)

# Micro average: mean over every scored file of every district together.
model_avg_wer_concat = np.average(concat_df['wer'])
model_avg_cer_concat = np.average(concat_df['cer'])
print()
print("All Together")
print()
print(f"Average WER: {round(model_avg_wer_concat,3)} | Average CER: {round(model_avg_cer_concat,3)}")