# TRANSCRIPT ANALYSIS

The following code performs analysis on the transcripts of the corpus. This notebook was written for the **INTERSPEECH 2025** datasets. It is compatible with both the Regional Speech corpus and the Ben10 corpus made for the INTERSPEECH 2025 conference, as well as *any* dataset that follows a similar file structure:


```
|- dev
|    |- all.xlsx/dev.xlsx
|    |- train.xlsx/dev_train.xlsx
|    |- ...
|- train
|    |- train_barishal.wav
|    |- ...
|- test
|- valid
|- ...
```

In [3]:
!pip install bnlp-toolkit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1

In [4]:
from tqdm import tqdm
from glob import glob
import pandas as pd
import os
import librosa
from typing import Optional
import string
from IPython.display import display

from bnlp import NLTKTokenizer
tokenizer = NLTKTokenizer()

# Functions

In [None]:
##############################
# DURATION FUNCTIONS
##############################
def hms_format(s):
    """Format a duration in seconds as ``H:MM:SS``.

    The value is rounded to whole seconds *before* it is split into hours,
    minutes and seconds. Rounding only the leftover seconds afterwards lets a
    value such as 59.6 come out as ``0:00:60`` instead of ``0:01:00``.

    Args:
        s: Duration in seconds (int or float).

    Returns:
        A string with unpadded hours and zero-padded minutes and seconds.
    """
    total = int(round(s))
    h, remainder = divmod(total, 3600)
    m, sec = divmod(remainder, 60)
    return f"{h}:{m:02}:{sec:02}"

def duration(dataframe, disable_tqdm:bool = False) -> dict[str, float | str]:
    """Sum the audio length of every file listed in ``dataframe.path``.

    Args:
        dataframe: Frame with a ``path`` column; each entry is passed to
            ``librosa.get_duration(path=...)``.
        disable_tqdm: When True, the progress bar is suppressed.

    Returns:
        A dict with the total length in seconds, hours and minutes (each
        rounded to 2 decimals), the average length per row, and the total
        formatted by ``hms_format``. ``avg_duration`` is 0.0 for a frame with
        no rows instead of raising ``ZeroDivisionError``.
    """
    total_seconds: float = 0.0
    for path in tqdm(dataframe.path, disable=disable_tqdm):
        total_seconds += librosa.get_duration(path=path)

    n_files = dataframe.shape[0]

    return {
        "total_sec" : round(total_seconds, 2),
        "total_hours" : round(total_seconds / 3600, 2),
        "total_min" : round(total_seconds / 60, 2),
        # An empty selection has no meaningful average; report 0.0.
        "avg_duration" : round(total_seconds / n_files, 2) if n_files else 0.0,
        "total_duration" : hms_format(total_seconds)
    }


##############################
# OOD FUNCTIONS
##############################

# Translation table mapping each listed punctuation character (ASCII symbols
# plus the danda '।') to a single space. Built once at import time rather than
# on every call, and derived from the punctuation string alone so there is no
# second, hand-counted string of spaces that has to stay the same length.
# Quotes and the hyphen are not in the list and are therefore left untouched.
_PUNCT_TO_SPACE = str.maketrans(
    dict.fromkeys('!#$%&()*+,./:;<=>?@[\\]^_`{|}~।', " ")
)


def tokenize_word(sentence:str) -> list:
    """Split a sentence into word tokens after blanking out punctuation.

    Args:
        sentence: Text to tokenize.

    Returns:
        The list produced by the module-level ``tokenizer.word_tokenize`` for
        the sentence with the listed punctuation replaced by spaces.
    """
    return tokenizer.word_tokenize(sentence.translate(_PUNCT_TO_SPACE))


def map_word_to_frequency(dataframe, disable_tqdm:bool = False) -> dict[str, int]:
    """Count how often each token occurs across ``dataframe.transcripts``.

    Args:
        dataframe: Frame with a ``transcripts`` column; every entry is passed
            to ``tokenize_word``.
        disable_tqdm: When True, the progress bar is suppressed.

    Returns:
        A dict mapping each token to its integer occurrence count, with keys
        in order of first appearance.
    """
    result: dict[str, int] = {}
    for sen in tqdm(dataframe.transcripts, disable=disable_tqdm):
        for word in tokenize_word(sen):
            # dict.get avoids raising and catching KeyError for every new word.
            result[word] = result.get(word, 0) + 1
    return result


def find_ood(
    word_dict:dict, standard:set,
) -> tuple[dict, dict]:
    """Split out the words of ``word_dict`` that do not occur in ``standard``.

    Args:
        word_dict: Mapping of word -> occurrence count.
        standard: Collection of words regarded as in-distribution.

    Returns:
        A 2-tuple ``(ood_words, stats)``:

        * ``ood_words`` holds the entries of ``word_dict`` whose key is not in
          ``standard``, in the same order as ``word_dict``.
        * ``stats`` holds the unique and total out-of-distribution counts and
          their percentages of the unique / total counts in ``word_dict``.
          Percentages are 0.0 when ``word_dict`` is empty (or all its counts
          are zero) instead of raising ``ZeroDivisionError``.
    """
    # Filtering in word_dict order keeps the result deterministic; iterating
    # a set difference would order the words arbitrarily from run to run.
    ood_words = {
        word: count for word, count in word_dict.items() if word not in standard
    }

    ood_unique = len(ood_words)
    ood_total = sum(ood_words.values())
    all_unique = len(word_dict)
    all_total = sum(word_dict.values())

    return (
        ood_words,
        {
            "ood_unq_word_count" : ood_unique,
            "ood_total_word_count" : ood_total,
            "ood_unq_word_percent" : ood_unique / all_unique * 100 if all_unique else 0.0,
            "ood_total_word_percent" : ood_total / all_total * 100 if all_total else 0.0,
        }
    )

# Load standard Bengali words

In [None]:
# Reference corpus: its `sentence` column supplies the vocabulary that the
# regional transcripts are compared against.
standard_bangla = pd.read_csv('/kaggle/input/bengaliai-train-csv/train.csv')

# Every token of every reference sentence, duplicates included.
words = []
for sentence in tqdm(standard_bangla.sentence):
    words.extend(tokenize_word(sentence))

# De-duplicated vocabulary used as the `standard` argument of find_ood.
STANDARD_BANGLA_WORDS = set(words)

# District wise

In [8]:
# Root directory of the dataset; audio paths are built relative to it.
path = "/kaggle/input/interspeech-2025"

# Districts for which per-district statistics are reported.
districts = [
    "Rangpur",
    "Kishoreganj",
    "Narail",
    "Chittagong",
    "Narsingdi",
    "Tangail",
    "Habiganj",
    "Barishal",
    "Sandwip",
    "Sylhet",
    "Comilla",
    "Noakhali",
]

# Splits to report; "all" is handled downstream as "no filter on split_type".
splits = ["train", "test", "valid", "all"]

# Metadata sheet for the corpus.
df = pd.read_excel("/kaggle/input/interspeech-2025/dev/dev.xlsx")

In [None]:
# Audio path of every row: <path>/<split_type, lower-cased>/<file_name>.
df["path"] = df[["file_name", "split_type"]].apply(
    lambda x: os.path.join(path, x.split_type.lower(), x.file_name), axis=1
)

for district in districts:
    print(f"====================================================== {district} ======================================================")
    # Per-split statistics are collected here and turned into one DataFrame
    # per district, instead of concatenating onto an empty frame every pass.
    records = []
    labels = []
    for split in splits:
        print(split, ": ", end="")
        if split == "all":
            df2 = df.query("district == @district").copy()
        else:
            df2 = df.query("district == @district and split_type == @split").copy()

        if df2.empty:
            # No rows for this district/split: the per-row and per-minute
            # averages below would divide by zero, so skip the row.
            print("Skipped (no rows)")
            continue

        duration_result = duration(df2, disable_tqdm=True)
        regional_words = map_word_to_frequency(df2, disable_tqdm=True)
        ood_words, ood_stats = find_ood(regional_words, STANDARD_BANGLA_WORDS)

        total_words = sum(regional_words.values())

        data = {**duration_result, **ood_stats}
        data["total_words"] = total_words
        data["wpm"] = total_words / duration_result["total_min"]  # words per minute of audio
        data["wps"] = total_words / df2.shape[0]  # words per row of the sheet
        data["count"] = df2.shape[0]

        records.append(data)
        labels.append(split)
        print("Done")

    result = pd.DataFrame(records, index=labels)
    display(result)
    print()
        

# TOTAL CORPUS

In [None]:
df = pd.read_excel("/kaggle/input/interspeech-2025/dev/dev.xlsx")
# Audio path of every row: <dataset root>/<split_type, lower-cased>/<file_name>.
df["path"] = df[["file_name", "split_type"]].apply(
    lambda x: os.path.join("/kaggle/input/interspeech-2025", x.split_type.lower(), x.file_name), axis=1
)


# Per-split statistics are collected here and turned into one DataFrame at
# the end, instead of concatenating onto an empty frame every pass.
records = []
labels = []
for split in ["train", "test", "valid", "all"]:
    print(split)
    if split == "all":
        df2 = df.copy()
    else:
        df2 = df.query("split_type == @split")

    if df2.empty:
        # No rows in this split: the averages below would divide by zero.
        print("Skipped (no rows)")
        continue

    duration_result = duration(df2)
    # Word -> count mapping for the split. Calling tokenize_word here would
    # hand it a DataFrame instead of a sentence and fail, and the code below
    # needs a dict (.values(), and find_ood's word_dict) in any case.
    regional_words = map_word_to_frequency(df2)
    ood_words, ood_stats = find_ood(regional_words, STANDARD_BANGLA_WORDS)

    total_words = sum(regional_words.values())

    data = {**duration_result, **ood_stats}
    data["total_words"] = total_words
    data["wpm"] = total_words / duration_result["total_min"]  # words per minute of audio
    data["wps"] = total_words / df2.shape[0]  # words per row of the sheet
    data["count"] = df2.shape[0]

    records.append(data)
    labels.append(split)

result = pd.DataFrame(records, index=labels)

display(result)