In [None]:
# Copy the bundled package directory from ../input into the current working
# directory; the archives inside it are unpacked and installed in the next cell.
!cp -r ../input/python-packages2 ./

In [None]:
# Offline installation: each archive is unpacked and its wheel/sdist installed
# with --no-index, so pip resolves only from local files (-f ./).

# jiwer provides the wer/cer metrics used later in the notebook.
!tar xvfz ./python-packages2/jiwer.tgz
!pip install ./jiwer/jiwer-2.3.0-py3-none-any.whl -f ./ --no-index
# bnunicodenormalizer provides the Normalizer used by normalize().
!tar xvfz ./python-packages2/normalizer.tgz
!pip install ./normalizer/bnunicodenormalizer-0.0.24.tar.gz -f ./ --no-index
# pyctcdecode and the packages shipped alongside it are installed one by one
# with --no-deps, so pip will not pull in anything that is not listed here.
# NOTE(review): the numpy wheel below is tagged cp37 (CPython 3.7) and will be
# rejected by pip on any other interpreter version -- confirm the kernel's Python.
!tar xvfz ./python-packages2/pyctcdecode.tgz
!pip install ./pyctcdecode/attrs-22.1.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/exceptiongroup-1.0.0rc9-py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/hypothesis-6.54.4-py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/pygtrie-2.5.0.tar.gz -f ./ --no-index --no-deps
!pip install ./pyctcdecode/sortedcontainers-2.4.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/pyctcdecode-0.4.0-py2.py3-none-any.whl -f ./ --no-index --no-deps

# kenlm Python bindings, built from the bundled source distribution.
!tar xvfz ./python-packages2/pypikenlm.tgz
!pip install ./pypikenlm/pypi-kenlm-0.1.20220713.tar.gz -f ./ --no-index --no-deps



In [None]:
# !pip install transformers==4.20.0 
# NOTE(review): jiwer 2.3.0 is already installed from a local wheel in the
# previous cell. This unpinned install presumably needs network access and may
# replace that version -- confirm it is still required, and pin it if so.
!pip install jiwer

In [1]:
import os
import numpy as np
from tqdm.auto import tqdm
from glob import glob
from transformers import AutoFeatureExtractor, pipeline
import pandas as pd
import librosa
import IPython
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import torch
import gc
import wave
from scipy.io import wavfile
import scipy.signal as sps
import pyctcdecode
from transformers import WhisperProcessor
from bnunicodenormalizer import Normalizer 

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")


# Load Model

In [2]:
class CFG:
    """Filesystem locations of the Whisper model and processor used by this notebook."""
    # Path passed as `model=` when the ASR pipeline is built.
    my_model_name = '/kaggle/input/whisper-medium-finetuning/whisper-medium-trained/model'
    # Path passed to `WhisperProcessor.from_pretrained` below.
    processor_name = '/kaggle/input/whisper-medium-finetuning/whisper-medium-trained/processor'
    
# Load the processor; its tokenizer and feature extractor are handed to the
# ASR pipeline, and its sampling rate is used when audio files are loaded.
processor = WhisperProcessor.from_pretrained(CFG.processor_name)

In [3]:
# from safetensors.torch import safe_open

# try:
#     with safe_open("/kaggle/input/whisper-medium-trained/whisper-medium-trained/model/model.safetensors", framework="pt") as f:
#         metadata = f.metadata()
#         print("Safetensors metadata:", metadata)
# except Exception as e:
#     print("Safetensors file is corrupted:", e)


In [4]:
# Build the automatic-speech-recognition pipeline around the model stored at
# CFG.my_model_name, reusing the tokenizer and feature extractor of the
# processor loaded above so that pre-processing matches the saved processor.
# Use GPU 0 when CUDA is available and fall back to CPU (-1) otherwise,
# instead of failing outright on a kernel that has no GPU attached.
my_asrLM = pipeline(
    "automatic-speech-recognition",
    model=CFG.my_model_name,
    tokenizer=processor.tokenizer,  # Tokenizer from the processor
    feature_extractor=processor.feature_extractor,  # Feature extractor from the processor
    device=0 if torch.cuda.is_available() else -1,
)

Device set to use cuda:0


# Function declarations

In [5]:
from jiwer import wer,cer

def infer(audio_path):
    """Transcribe a single audio file and return the predicted text.

    The file is loaded with librosa at the sampling rate of the processor's
    feature extractor and passed to the `my_asrLM` pipeline.

    args:
      audio_path : path to one audio file <string>
    returns:
      the 'text' entry of the pipeline output
    """
    target_sr = processor.feature_extractor.sampling_rate
    waveform, _sr = librosa.load(audio_path, sr=target_sr)
    result = my_asrLM(waveform, chunk_length_s=112, stride_length_s=None)
    return result['text']


def batch_infer(audio_paths, batch_size):
    '''
    infers on a batch of audio with a single batched pipeline call

    Previously `batch_size` was accepted but ignored and every file was sent
    through the pipeline on its own. The waveforms of the whole batch are now
    loaded first and handed to `my_asrLM` together, with `batch_size`
    forwarded to the pipeline, using the same chunking arguments as `infer`.

    args:
      audio_paths  : list of path to audio files <list of string>
      batch_size   : number of inputs the pipeline groups per forward pass <int>
    returns:
      bangla predicted texts <list of string>, in the order of `audio_paths`
    '''
    audio_paths = list(audio_paths)
    if not audio_paths:
        # Nothing to transcribe; skip the pipeline call entirely.
        return []

    target_sr = processor.feature_extractor.sampling_rate
    # librosa.load returns (samples, sample_rate); only the samples are needed
    # because every file is resampled to the feature extractor's rate.
    waveforms = [librosa.load(path, sr=target_sr)[0] for path in audio_paths]

    outputs = my_asrLM(
        waveforms,
        batch_size=batch_size,
        chunk_length_s=112,
        stride_length_s=None,
    )
    return [output['text'] for output in outputs]


bnorm = Normalizer()
def normalize(sen):
    """Normalize a sentence word by word with `bnorm`.

    The sentence is split on whitespace, each word is passed to `bnorm`, and
    the 'normalized' entry of each result is kept unless it is None. The
    surviving words are joined back with single spaces.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)['normalized']
        if cleaned is not None:
            kept.append(cleaned)
    return " ".join(kept)

def dari(sentence):
    """Make sure a sentence ends with the danda character '।'.

    args:
      sentence : text to terminate <string>
    returns:
      the sentence with '।' appended when it did not already end with it.
      Empty strings and non-string values (e.g. None) are returned unchanged;
      these cases used to be caught by a bare `except:` that only printed the
      value, which also hid any unrelated error.
    """
    if not isinstance(sentence, str) or not sentence:
        return sentence
    if not sentence.endswith("।"):
        sentence += "।"
    return sentence


def directory_infer(folder_path, batch_size):
    """Transcribe every '.wav' file in a folder and return the results as a DataFrame.

    args:
      folder_path : directory containing the audio files <string>
      batch_size  : number of files handed to `batch_infer` per step <int>
    returns:
      DataFrame with columns 'file_name' and 'transcriptions'; each
      transcription is passed through `normalize` and then `dari`.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    audios = sorted(f for f in os.listdir(folder_path) if f.endswith('.wav'))

    sentences = []
    for idx in tqdm(range(0, len(audios), batch_size)):
        batch_paths = [os.path.join(folder_path, audio) for audio in audios[idx:idx + batch_size]]
        sentences += batch_infer(batch_paths, batch_size)

    df = pd.DataFrame({'file_name': audios, 'transcriptions': sentences})
    df['transcriptions'] = df['transcriptions'].apply(normalize).apply(dari)

    return df


def calc_wer_cer(ground_truth,prediction):
    """Return the (WER, CER) pair for one reference/prediction pair.

    Both arguments are converted with `str()` before being passed to jiwer's
    `wer` and `cer`, with the ground truth as the first argument.
    """
    reference, hypothesis = str(ground_truth), str(prediction)
    return wer(reference, hypothesis), cer(reference, hypothesis)

# Inference

In [7]:
# Districts processed in this run, mapped to the serial number used as the
# prefix of the exported file name. Commented-out entries are skipped.
district_serial = {
    # 'Rangpur': 1,
    # 'Kishoreganj': 2,
    # 'Narail': 3,
    # 'Chittagong': 4,
    # 'Narsingdi': 5,
    # 'Tangail': 6,
    # 'Habiganj': 7,
    # 'Barishal': 8,

    'Sylhet': 9,

    # 'Sandwip': 10,
    # 'Cumilla': 11,
    # 'Noakhali': 12,
    # 'Lakshmipur': 13,
    # 'Nilphamari': 14,
    # 'Jhenaidah': 15,
}
BATCH_SIZE = 16

for district in district_serial:
    input_path = f"/kaggle/input/interspeech-2025/district_wise/{district}/test"
    print("====================================================================")
    print("Current District:", district)

    # Model predictions for every .wav file in the district's test folder.
    submission = directory_infer(input_path, BATCH_SIZE)

    # Reference sheet: its 'transcriptions' column is renamed to 'transcripts'
    # so it does not collide with the predicted 'transcriptions' column.
    df = pd.read_excel(f"/kaggle/input/interspeech-2025/district_wise/{district}/{district}_test.xlsx")
    df = df.rename(columns={'transcriptions': 'transcripts'})

    # Join references and predictions on file name, tag the model, keep the
    # report columns and rename the predicted column to 'predictions'.
    merged_df = (
        pd.merge(df, submission, on="file_name")
        .assign(model='Whisper-medium')
        [['model', 'district', 'file_name', 'transcriptions', 'transcripts']]
        .rename(columns={'transcriptions': 'predictions'})
    )

    # Per-row (WER, CER) pairs, reference first and prediction second.
    scores = [
        calc_wer_cer(reference, predicted)
        for reference, predicted in zip(merged_df['transcripts'], merged_df['predictions'])
    ]
    merged_df['wer'] = [pair[0] for pair in scores]
    merged_df['cer'] = [pair[1] for pair in scores]

    export_name = f"2.{district_serial[district]}: {district}-Whisper-medium_test_inference.xlsx"
    print("Exporting:", export_name)
    merged_df.to_excel(export_name, index=False)

    print("====================================================================")
    print()

# BATCH_SIZE = 16
# input_path = "/kaggle/input/final-splits/final_splits/test"
# submission = directory_infer(input_path, BATCH_SIZE)
# submission.to_csv("test_inference.csv", index=False)
# submission

Current District: Sylhet


  0%|          | 0/48 [00:00<?, ?it/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Exporting: 2.9: Sylhet-Whisper-medium_test_inference.xlsx



In [None]:
# submission = pd.read_csv('/kaggle/working/test_inference.csv')
# submission

In [None]:

# df = pd.read_excel('/kaggle/input/final-splits/final_splits/test/test.xlsx')
# df = df.rename(columns={'transcriptions': 'transcripts'})
# df

In [8]:
# merged_df = pd.merge(df, submission, on="file_name")
# merged_df['Model_Name'] = 'Yellowking'
# merged_df = merged_df[['district','Model_Name','file_name','transcriptions', 'transcripts']]
# merged_df = merged_df.rename(columns={'transcriptions': 'Predictions','district': 'District'})


# Display the merged frame left over from the last district processed by the
# inference loop above (this cell depends on that loop having been run).
merged_df

Unnamed: 0,model,district,file_name,predictions,transcripts,wer,cer
0,Whisper-medium,Sylhet,test_sylhet_0001.wav,কর র রর ত র ই র র র র ক র।,ইটাই তো আর ওটাই আরকি যা নিলো। ওখন বেশি কারন তা...,1.000000,0.916667
1,Whisper-medium,Sylhet,test_sylhet_0002.wav,কর র রর ত র ই র র র র ক র।,"<> হুম আরো, আরো কতা ওইছে যেছাই যেকুনু জিনিস আম...",1.000000,0.904564
2,Whisper-medium,Sylhet,test_sylhet_0003.wav,কর র রর ত র ই র র র র ক র।,<> এখ গতো এখটা গরু ফালছোইন মানে ওউ গরু খয় না ...,1.000000,0.903670
3,Whisper-medium,Sylhet,test_sylhet_0004.wav,কর র রর ত র ই র র র র ক র।,বাট যখন দুধও দেয় না কিচ্ছু নায় ইগুরে ফালতে ফ...,1.000000,0.895735
4,Whisper-medium,Sylhet,test_sylhet_0005.wav,কর র রর ত র ই র র র র ক র।,তে শেষ খরছোইন ওখন বুলে। খামোখা না নি ইগুর তলে ...,1.000000,0.916364
...,...,...,...,...,...,...,...
758,Whisper-medium,Sylhet,test_sylhet_0759.wav,কর র রর ত র ই র র র র ক র।,"ইগু খাইত খরতাম না, যেমন ধরো <> আর ডঙ্গের আলাফে...",1.000000,0.810000
759,Whisper-medium,Sylhet,test_sylhet_0760.wav,কর র রর ত র ই র র র র ক র।,<> ফারে না কুন সময়? মুরব্বি <> আর কিতা খইতায়...,1.000000,0.907692
760,Whisper-medium,Sylhet,test_sylhet_0761.wav,কর র রর ত র ই র র র র ক র।,<> অর নানি? ওহ। অত্তো দুলাবাই আছোইন। ডং কইরা এ...,0.979167,0.900415
761,Whisper-medium,Sylhet,test_sylhet_0762.wav,কর র রর ত র ই র র র র ক র।,<> মাসি। আমি অইলাম গিয়া মাসি। তাই অইলো বইনজি।...,1.000000,0.898551


# Calculating WER & CER

In [None]:
# WERS = []
# CERS = []

# for gd, pr in zip(merged_df['transcripts'], merged_df['Predictions']):
#     WER, CER  = calc_wer_cer(gd,pr)
#     WERS.append(WER)
#     CERS.append(CER)

# merged_df['WER'] = WERS
# merged_df['CER'] = CERS  

# merged_df

In [None]:
# merged_df.to_csv('Yellowking_test_predictions.csv',index=False)

In [9]:
# Serial numbers of all districts; together with the district name they form
# the names of the per-district result files read back below.
district_serial = {'Rangpur':1,
                       'Kishoreganj':2,
                       'Narail':3,
                       'Chittagong':4,
                       'Narsingdi':5,
                       'Tangail':6,
                       'Habiganj':7,
                       'Barishal':8,
                       'Sylhet':9,
                       'Sandwip':10,
                       'Cumilla':11,
                       'Noakhali':12,
                       'Lakshmipur':13,
                       'Nilphamari':14,
                       'Jhenaidah':15
                      }

# Column layout of the combined report.
expected_columns = ['model', 'district', 'file_name', 'predictions', 'transcripts', 'wer', 'cer']

avg_wer = []
avg_cer = []
district_frames = []

for district in district_serial:
    df = pd.read_excel(f"2.{district_serial[district]}: {district}-Whisper-medium_test_inference.xlsx")
    district_frames.append(df)

    # Per-district mean WER/CER, rounded to 3 decimals for the report below.
    avg_wer.append(round(np.average(df['wer']), 3))
    avg_cer.append(round(np.average(df['cer']), 3))

# Concatenate once after the loop instead of growing a frame per iteration
# (which re-copies all previous rows each time and relies on concatenating
# with an empty seed frame).
concat_df = pd.concat(district_frames, ignore_index=True, axis=0)
# Keep the expected columns first (created as NaN if a file lacks one),
# followed by any extra columns found in the files.
extra_columns = [col for col in concat_df.columns if col not in expected_columns]
concat_df = concat_df.reindex(columns=expected_columns + extra_columns)

for district_name, district_wer, district_cer in zip(district_serial, avg_wer, avg_cer):
    print(district_name)
    print(f'Avg. WER: {district_wer} | Avg. CER: {district_cer}')
    print()

concat_df.to_excel("Whisper-medium_inferences.xlsx",index =False)
concat_df.head()


Rangpur
Avg. WER: 1.002 | Avg. CER: 0.877

Kishoreganj
Avg. WER: 1.06 | Avg. CER: 0.891

Narail
Avg. WER: 1.0 | Avg. CER: 0.888

Chittagong
Avg. WER: 1.0 | Avg. CER: 0.885

Narsingdi
Avg. WER: 0.999 | Avg. CER: 0.895

Tangail
Avg. WER: 0.998 | Avg. CER: 0.883

Habiganj
Avg. WER: 1.028 | Avg. CER: 0.879

Barishal
Avg. WER: 1.005 | Avg. CER: 0.901

Sylhet
Avg. WER: 1.005 | Avg. CER: 0.88

Sandwip
Avg. WER: 1.008 | Avg. CER: 0.886

Cumilla
Avg. WER: 1.0 | Avg. CER: 0.899

Noakhali
Avg. WER: 1.143 | Avg. CER: 0.846

Lakshmipur
Avg. WER: 1.0 | Avg. CER: 0.911

Nilphamari
Avg. WER: 1.0 | Avg. CER: 0.851

Jhenaidah
Avg. WER: 1.022 | Avg. CER: 0.871



Unnamed: 0,model,district,file_name,predictions,transcripts,wer,cer
0,Whisper-medium,Rangpur,test_rangpur_0001.wav,কর র রর ত র ই র র র র ক র।,"সুমন তারপর হইলো রানা, তারপর জসিম, জাকারিয়া এর...",1.0,0.753425
1,Whisper-medium,Rangpur,test_rangpur_0002.wav,কর র রর ত র ই র র র র ক র।,"আপা, কেমন আচেন? আচোং তো ভালো। ইয়্যা, আপা, আপন...",1.0,0.889447
2,Whisper-medium,Rangpur,test_rangpur_0003.wav,কর র রর ত র ই র র র র ক র।,"মোর নাম মোছাম্মত রেণু। ও, আপনার বয়স কতো চইলবা...",1.0,0.889401
3,Whisper-medium,Rangpur,test_rangpur_0004.wav,কর র রর ত র ই র র র র ক র।,"আলু। ও, ব্যাটাক যে বিয়্যা-শাদি করাইনেন, সংসার...",1.0,0.893401
4,Whisper-medium,Rangpur,test_rangpur_0005.wav,কর র রর ত র ই র র র র ক র।,ভালো মন্দ খালি চায়। মাছ-গোস্ত খাবার চায় খালি...,1.0,0.932353


In [10]:
model = "Whisper-medium"

# Mean of the per-district averages (each already rounded to 3 decimals when
# avg_wer / avg_cer were filled), so every district weighs the same.
model_avg_wer = np.average(avg_wer)
model_avg_cer = np.average(avg_cer)

print(f"{model}")
print()
# Round for display like the "All Together" figures below; printing the raw
# float showed representation noise such as 1.0179999999999998.
print(f"Average WER: {round(model_avg_wer,3)} | Average CER: {round(model_avg_cer,3)}")
print()
print("==========================================================================================================")


# Mean over all rows of the combined frame, so districts with more files
# weigh more.
model_avg_wer_concat = np.average(concat_df['wer'])
model_avg_cer_concat = np.average(concat_df['cer'])
print()
print(f"All Together")
print()
print(f"Average WER: {round(model_avg_wer_concat,3)} | Average CER: {round(model_avg_cer_concat,3)}")

Whisper-medium

Average WER: 1.0179999999999998 | Average CER: 0.8828666666666667


All Together

Average WER: 1.012 | Average CER: 0.884
