In [None]:
!pip install bnlp-toolkit

In [None]:
from bnlp import NLTKTokenizer
tokenizer = NLTKTokenizer()

from tqdm import tqdm
from glob import glob
import pandas as pd
import os
import librosa

In [None]:
# Load the standard-Bangla reference table; its 'sentence' column is the text
# that the following cell tokenises to build the reference vocabulary.
standard_bangla = pd.read_csv('/kaggle/input/bengaliai-train-csv/train.csv')

# Last expression of the cell: show the 'sentence' column as a one-column frame.
standard_bangla[['sentence']]

In [None]:
# Tokenise every reference sentence and store the token lists next to the text.
token_list = [tokenizer.word_tokenize(sen) for sen in standard_bangla['sentence']]
standard_bangla['Tokens'] = token_list

# Punctuation tokens that are not counted as vocabulary words.
skip_tokens = {"?", "!", "<", ">", "'", "।", ".", ","}

# word -> number of occurrences in the reference corpus
vocab_bangla = {}

for tokens in tqdm(standard_bangla.Tokens):
    for tok in tokens:
        if tok in skip_tokens:
            continue
        # dict.get instead of a bare try/except: a bare except would also
        # swallow KeyboardInterrupt and silently reset the count to 1.
        vocab_bangla[tok] = vocab_bangla.get(tok, 0) + 1

# Distinct reference words, in first-seen order.
standard_words = list(vocab_bangla.keys())

len(standard_words)

# Get Duration

In [None]:
def hms_format(seconds:float, explicit_format=False) -> str:
    """Return a duration given in seconds as hours, minutes and seconds.

    The duration is rounded to whole seconds *before* it is split, so the
    seconds/minutes fields never show 60 (e.g. 59.6 -> "0:01:00").

    Args:
        seconds: duration in seconds.
        explicit_format: when True, return
            "h hours mmm minutes sss seconds" instead of "h:mm:ss".

    Returns:
        The formatted duration string.
    """
    whole_seconds = int(round(seconds))
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if explicit_format:
        return "{} hours {:03} minutes {:03} seconds".format(hours, minutes, secs)
    return "{}:{:02}:{:02}".format(hours, minutes, secs)


def _audio_seconds(audio_path):
    """Return the duration in seconds of one audio file, read via librosa."""
    try:
        return librosa.get_duration(path=audio_path)
    except TypeError:
        # Older librosa releases only accept the former keyword name.
        return librosa.get_duration(filename=audio_path)


def duration(base_dir,dataset,total_number_of_words_in_the_dataset):
    """Sum the audio length of a dataset and derive per-sample speech statistics.

    Only files directly inside ``base_dir`` whose base name appears in
    ``dataset['file_name']`` are measured.

    Args:
        base_dir: directory that is globbed with the pattern ``*.*``.
        dataset: table with a ``file_name`` column; its length is the sample count.
        total_number_of_words_in_the_dataset: word count used for the rates.

    Returns:
        Tuple ``(total_sec, total_hours, avg_duration, wpm, hms, wps)``:
        total seconds, total hours (3 decimals), mean seconds per sample,
        words per minute, the ``h:mm:ss`` string and words per sample.
        A rate whose denominator is zero (no matching audio, empty dataset)
        is returned as NaN instead of raising ZeroDivisionError.
    """
    dataset_size = len(dataset)
    # Set lookup keeps the per-file membership test constant-time.
    wanted_files = set(dataset['file_name'])

    total_sec = 0
    for wav in tqdm(glob(os.path.join(base_dir, "*.*"))):
        if os.path.basename(wav) in wanted_files:
            total_sec += _audio_seconds(wav)

    hms = hms_format(seconds=total_sec, explicit_format=False)
    total_hours = round(total_sec / 3600, 3)

    # Exact (fractional) minutes: flooring them inflates words-per-minute and
    # yields a zero denominator for anything shorter than one minute.
    minutes = total_sec / 60
    nan = float("nan")

    avg_duration = round(total_sec / dataset_size, 3) if dataset_size else nan
    wpm = round(total_number_of_words_in_the_dataset / minutes, 3) if minutes else nan
    wps = round(total_number_of_words_in_the_dataset / dataset_size, 3) if dataset_size else nan

    return (total_sec,
            total_hours,
            avg_duration,
            wpm,
            hms,
            wps)

# Unique Words Functions

In [None]:
def unique_words(path, dis):
    """Tokenise a dataset's transcriptions and compare its words with the reference vocabulary.

    Args:
        path: either the path of an Excel workbook (str) or an already loaded
            DataFrame. It must have a ``transcriptions`` column. A DataFrame
            argument is copied, so the caller's frame is not modified.
        dis: label used in the column name of the OOD-word table.

    Returns:
        A 9-tuple:
            dataset                 - the frame with an added ``Tokens`` column
            vocab                   - dict word -> occurrences (punctuation excluded)
            all_words               - every token in order, punctuation included
            total_number_words      - sum of the counts in ``vocab``
            unique_words_frequency  - number of distinct words in ``vocab``
            dataset_of_ood_words    - one-column DataFrame of the OOD words
            ood_words_total_freq    - total occurrences of OOD words
            ood_words_unique        - number of distinct OOD words
            ood                     - list of the distinct OOD words

    A word is out-of-distribution (OOD) when it is absent from the
    module-level ``standard_words`` list.
    """
    if isinstance(path, str):
        dataset = pd.read_excel(path)
    else:
        dataset = path.copy()

    # Punctuation tokens that are not counted as vocabulary words.
    skip_tokens = {"?", "!", "<", ">", "'", "।", ".", ","}

    # str() lets non-string cells (e.g. missing values) pass through the tokenizer.
    dataset['Tokens'] = [
        tokenizer.word_tokenize(str(sen)) for sen in dataset['transcriptions']
    ]

    vocab = {}
    all_words = []

    for tokens in tqdm(dataset.Tokens):
        for tok in tokens:
            if tok not in skip_tokens:
                vocab[tok] = vocab.get(tok, 0) + 1
            all_words.append(tok)

    total_number_words = sum(vocab.values())
    unique_words_frequency = len(vocab)

    # Set lookup instead of scanning the reference list once per word.
    known_words = set(standard_words)

    # Carry over the real occurrence count of each OOD word so the total OOD
    # frequency reflects occurrences rather than the number of distinct words.
    ood_dict = {word: count for word, count in vocab.items() if word not in known_words}

    ood = list(ood_dict)
    dataset_of_ood_words = pd.DataFrame(ood, columns=[f'Unique words ({dis})'])
    ood_words_unique = len(ood)
    ood_words_total_freq = sum(ood_dict.values())

    return (dataset,
            vocab,
            all_words,
            total_number_words,
            unique_words_frequency,
            dataset_of_ood_words,
            ood_words_total_freq,
            ood_words_unique,
            ood)

# Main datasets

In [None]:
# Folder with one transcript workbook per district.
DIS_FRAME_DIR = "/kaggle/input/only-dis/dis_only_dataframes/dis_only_dataframes"

rangpur_df = pd.read_excel(f"{DIS_FRAME_DIR}/rangpur.xlsx")
kishoreganj_df = pd.read_excel(f"{DIS_FRAME_DIR}/kishoreganj.xlsx")
narail_df = pd.read_excel(f"{DIS_FRAME_DIR}/narail.xlsx")
chittagong_df = pd.read_excel(f"{DIS_FRAME_DIR}/chittagong.xlsx")
narsingdi_df = pd.read_excel(f"{DIS_FRAME_DIR}/narsingdi.xlsx")
tangail_df = pd.read_excel(f"{DIS_FRAME_DIR}/tangail.xlsx")
habiganj_df = pd.read_excel(f"{DIS_FRAME_DIR}/habiganj.xlsx")
barishal_df = pd.read_excel(f"{DIS_FRAME_DIR}/barishal.xlsx")
sylhet_df = pd.read_excel(f"{DIS_FRAME_DIR}/sylhet.xlsx")
sandwip_df = pd.read_excel(f"{DIS_FRAME_DIR}/sandwip.xlsx")
comilla_df = pd.read_excel(f"{DIS_FRAME_DIR}/comilla.xlsx")
noakhali_df = pd.read_excel(f"{DIS_FRAME_DIR}/noakhali.xlsx")

# The frames loaded above are reused below instead of reading each workbook again.
district_frames = {
    "rangpur": rangpur_df,
    "kishoreganj": kishoreganj_df,
    "narail": narail_df,
    "chittagong": chittagong_df,
    "narsingdi": narsingdi_df,
    "tangail": tangail_df,
    "habiganj": habiganj_df,
    "barishal": barishal_df,
    "sylhet": sylhet_df,
    "sandwip": sandwip_df,
    "comilla": comilla_df,
    "noakhali": noakhali_df,
}

dists = ["rangpur", "kishoreganj", "narail", "chittagong", "narsingdi", "tangail", "habiganj", "barishal", "sylhet", "sandwip", "comilla", "noakhali"]

# capitalised district label -> list of its distinct OOD words
dis_words = {}

for district in dists:

    if district == "all":
        # Only reached when "all" is added to `dists`; combines five districts.
        thesis_df = pd.concat([rangpur_df,kishoreganj_df,narail_df,chittagong_df,narsingdi_df], axis = 0)
        path = thesis_df
        base_dir = '/kaggle/input/thesis-copy/thesis copy/audios/all'
    else:
        path = district_frames[district]
        base_dir = f'/kaggle/input/only-dis/only_dis/only_dis/{district}/'

    dis = district.capitalize()

    output_1 = unique_words(path, dis)
    (dataset,
     vocab_dict,
     all_words_list,
     total_number_of_words_in_the_dataset,
     total_number_of_unique_words_in_the_dataset,
     ood_words_dataset,
     total_number_of_ood_words,
     total_number_of_unique_ood_words) = output_1[:8]
    dataset_size = len(dataset)
    dis_words[dis] = output_1[8]

    oov_perc_unq = total_number_of_unique_ood_words/total_number_of_unique_words_in_the_dataset
    oov_perc_tot = total_number_of_ood_words/total_number_of_words_in_the_dataset

    output_2 = duration(base_dir,dataset,total_number_of_words_in_the_dataset)
    total_sec, total_hours, avg_duration, wpm, hms, wps = output_2

    report_lines = [
        f"====================================================== {dis} ======================================================",
        "",
        "",
        "",
        f'Total Samples {dataset_size}',
        f'Total number of words present in {dis} dataset: {total_number_of_words_in_the_dataset}',
        f'Total number of unique words present in {dis} dataset: {total_number_of_unique_words_in_the_dataset}',
        f'Total number of OOD words present in {dis} dataset: {total_number_of_ood_words}',
        f'Total number of unique OOD words present in {dis} dataset: {total_number_of_unique_ood_words}',
        f'OOD% in {dis} dataset (Unique words): {round(oov_perc_unq*100, 3)} %',
        f'OOD% in {dis} dataset (Total words): {round(oov_perc_tot*100, 3)} %',
        "",
        f"Total Duration: {hms}",
        f"Total in seconds: {total_sec} seconds",
        f"Total in Hours: {total_hours} hours",
        f"Average record size: {avg_duration} seconds",
        f"Words per minute: {wpm} ",
        f'Words per sample: {wps} words',
        "",
        "",
        "",
        "###################################################################################################################",
        "",
        "",
        "",
    ]
    print("\n".join(report_lines))


# One column per district holding its OOD words. `dis_words` is keyed by the
# capitalised label, so look it up with that key (the lowercase name raised
# KeyError); columns keep the lowercase district name. Shorter columns are
# padded with NaN by the index alignment of concat.
output_df = pd.concat(
    [pd.Series(dis_words[district.capitalize()], name=district, dtype=object) for district in dists],
    axis = 1
)
output_df

# Subsets

In [None]:
# Root folder of the train/test/valid split workbooks.
_SPLIT_ROOT = "/kaggle/input/interspeech-2025"
_SPLIT_NAMES = ("train", "test", "valid")


def _read_district_splits(district):
    """Read the train, test and valid workbooks of one district, in that order."""
    folder = f"{_SPLIT_ROOT}/district_wise/{district}"
    return tuple(
        pd.read_excel(f"{folder}/{district}_{split}.xlsx") for split in _SPLIT_NAMES
    )


rangpur_train, rangpur_test, rangpur_valid = _read_district_splits("Rangpur")
kishoreganj_train, kishoreganj_test, kishoreganj_valid = _read_district_splits("Kishoreganj")
narail_train, narail_test, narail_valid = _read_district_splits("Narail")
chittagong_train, chittagong_test, chittagong_valid = _read_district_splits("Chittagong")
narsingdi_train, narsingdi_test, narsingdi_valid = _read_district_splits("Narsingdi")
tangail_train, tangail_test, tangail_valid = _read_district_splits("Tangail")
habiganj_train, habiganj_test, habiganj_valid = _read_district_splits("Habiganj")
barishal_train, barishal_test, barishal_valid = _read_district_splits("Barishal")
sylhet_train, sylhet_test, sylhet_valid = _read_district_splits("Sylhet")
sandwip_train, sandwip_test, sandwip_valid = _read_district_splits("Sandwip")
comilla_train, comilla_test, comilla_valid = _read_district_splits("Comilla")
noakhali_train, noakhali_test, noakhali_valid = _read_district_splits("Noakhali")

# Splits of the combined (all districts) dataset live directly under the root.
all_train, all_test, all_valid = (
    pd.read_excel(f"{_SPLIT_ROOT}/{split}.xlsx") for split in _SPLIT_NAMES
)

# Train

In [None]:
trains = [rangpur_train, kishoreganj_train, narail_train, chittagong_train, narsingdi_train, tangail_train, habiganj_train, barishal_train, sylhet_train, sandwip_train, comilla_train, noakhali_train, all_train]
dists = ["rangpur", "kishoreganj", "narail", "chittagong", "narsingdi", "tangail", "habiganj", "barishal", "sylhet", "sandwip", "comilla", "noakhali", "all"]


# `trains` and `dists` are parallel lists, so pairing them replaces the
# if/elif chain. The chain compared an already capitalised name with lowercase
# literals, so no branch ever matched and `df`/`base_dir` were never set.
for district, df in zip(dists, trains):

    dis = district.capitalize()

    if district == 'all':
        base_dir = '/kaggle/input/interspeech-2025/train/'
    else:
        # The capitalised label is used as the folder name, as in the original path.
        base_dir = f'/kaggle/input/interspeech-2025/district_wise/{dis}/train/'

    output_1 = unique_words(df, dis)
    (dataset,
     vocab_dict,
     all_words_list,
     total_number_of_words_in_the_dataset,
     total_number_of_unique_words_in_the_dataset,
     ood_words_dataset,
     total_number_of_ood_words,
     total_number_of_unique_ood_words) = output_1[:8]
    dataset_size = len(dataset)

    oov_perc_unq = total_number_of_unique_ood_words/total_number_of_unique_words_in_the_dataset
    oov_perc_tot = total_number_of_ood_words/total_number_of_words_in_the_dataset

    output_2 = duration(base_dir,dataset,total_number_of_words_in_the_dataset)
    total_sec, total_hours, avg_duration, wpm, hms, wps = output_2

    report_lines = [
        f"====================================================== {dis} ======================================================",
        "",
        f'Total Samples {dataset_size}',
        f'Total number of words present in {dis} dataset: {total_number_of_words_in_the_dataset}',
        f'Total number of unique words present in {dis} dataset: {total_number_of_unique_words_in_the_dataset}',
        f'Total number of OOD words present in {dis} dataset: {total_number_of_ood_words}',
        f'Total number of unique OOD words present in {dis} dataset: {total_number_of_unique_ood_words}',
        f'OOD% in {dis} dataset (Unique words): {round(oov_perc_unq*100, 3)} %',
        f'OOD% in {dis} dataset (Total words): {round(oov_perc_tot*100, 3)} %',
        "",
        f"Total Duration: {hms}",
        f"Total in seconds: {total_sec} seconds",
        f"Total in Hours: {total_hours} hours",
        f"Average record size: {avg_duration} seconds",
        f"Words per minute: {wpm} ",
        f'Words per sample: {wps} words',
        "",
        "",
        "",
        "###################################################################################################################",
        "",
        "",
    ]
    print("\n".join(report_lines))


# Test

In [None]:
tests = [rangpur_test, kishoreganj_test, narail_test, chittagong_test, narsingdi_test, tangail_test, habiganj_test, barishal_test, sylhet_test, sandwip_test, comilla_test, noakhali_test, all_test]
dists = ["rangpur", "kishoreganj", "narail", "chittagong", "narsingdi", "tangail", "habiganj", "barishal", "sylhet", "sandwip", "comilla", "noakhali", "all"]


# `tests` and `dists` are parallel lists: each district name is paired with
# its test-split frame.
for district, df in zip(dists, tests):

    if district == 'all':
        base_dir = '/kaggle/input/interspeech-2025/test/'
    else:
        # NOTE(review): the audio folder is built from the lowercase district
        # name directly under only-dis/ - confirm this is where the test audio lives.
        base_dir = f'/kaggle/input/only-dis/{district}/'

    dis = district.capitalize()

    output_1 = unique_words(df, dis)
    (dataset,
     vocab_dict,
     all_words_list,
     total_number_of_words_in_the_dataset,
     total_number_of_unique_words_in_the_dataset,
     ood_words_dataset,
     total_number_of_ood_words,
     total_number_of_unique_ood_words) = output_1[:8]
    dataset_size = len(dataset)

    oov_perc_unq = total_number_of_unique_ood_words/total_number_of_unique_words_in_the_dataset
    oov_perc_tot = total_number_of_ood_words/total_number_of_words_in_the_dataset

    output_2 = duration(base_dir,dataset,total_number_of_words_in_the_dataset)
    total_sec, total_hours, avg_duration, wpm, hms, wps = output_2

    # One print of the joined lines emits the same text as printing each line.
    report_lines = [
        f"====================================================== {dis} ======================================================",
        "",
        f'Total Samples {dataset_size}',
        f'Total number of words present in {dis} dataset: {total_number_of_words_in_the_dataset}',
        f'Total number of unique words present in {dis} dataset: {total_number_of_unique_words_in_the_dataset}',
        f'Total number of OOD words present in {dis} dataset: {total_number_of_ood_words}',
        f'Total number of unique OOD words present in {dis} dataset: {total_number_of_unique_ood_words}',
        f'OOD% in {dis} dataset (Unique words): {round(oov_perc_unq*100, 3)} %',
        f'OOD% in {dis} dataset (Total words): {round(oov_perc_tot*100, 3)} %',
        "",
        f"Total Duration: {hms}",
        f"Total in seconds: {total_sec} seconds",
        f"Total in Hours: {total_hours} hours",
        f"Average record size: {avg_duration} seconds",
        f"Words per minute: {wpm} ",
        f'Words per sample: {wps} words',
        "",
        "",
        "",
        "###################################################################################################################",
        "",
        "",
    ]
    print("\n".join(report_lines))


# Valid

In [None]:
valids = [rangpur_valid, kishoreganj_valid, narail_valid, chittagong_valid, narsingdi_valid, tangail_valid, habiganj_valid, barishal_valid, sylhet_valid, sandwip_valid, comilla_valid, noakhali_valid, all_valid]
dists = ["rangpur", "kishoreganj", "narail", "chittagong", "narsingdi", "tangail", "habiganj", "barishal", "sylhet", "sandwip", "comilla", "noakhali", "all"]


# `valids` and `dists` are parallel lists: each district name is paired with
# its validation-split frame.
for district, df in zip(dists, valids):

    if district == 'all':
        base_dir = '/kaggle/input/interspeech-2025/valid/'
    else:
        # NOTE(review): the audio folder is built from the lowercase district
        # name directly under only-dis/ - confirm this is where the validation audio lives.
        base_dir = f'/kaggle/input/only-dis/{district}/'

    dis = district.capitalize()

    output_1 = unique_words(df, dis)
    (dataset,
     vocab_dict,
     all_words_list,
     total_number_of_words_in_the_dataset,
     total_number_of_unique_words_in_the_dataset,
     ood_words_dataset,
     total_number_of_ood_words,
     total_number_of_unique_ood_words) = output_1[:8]
    dataset_size = len(dataset)

    oov_perc_unq = total_number_of_unique_ood_words/total_number_of_unique_words_in_the_dataset
    oov_perc_tot = total_number_of_ood_words/total_number_of_words_in_the_dataset

    output_2 = duration(base_dir,dataset,total_number_of_words_in_the_dataset)
    total_sec, total_hours, avg_duration, wpm, hms, wps = output_2

    # One print of the joined lines emits the same text as printing each line.
    report_lines = [
        f"====================================================== {dis} ======================================================",
        "",
        f'Total Samples {dataset_size}',
        f'Total number of words present in {dis} dataset: {total_number_of_words_in_the_dataset}',
        f'Total number of unique words present in {dis} dataset: {total_number_of_unique_words_in_the_dataset}',
        f'Total number of OOD words present in {dis} dataset: {total_number_of_ood_words}',
        f'Total number of unique OOD words present in {dis} dataset: {total_number_of_unique_ood_words}',
        f'OOD% in {dis} dataset (Unique words): {round(oov_perc_unq*100, 3)} %',
        f'OOD% in {dis} dataset (Total words): {round(oov_perc_tot*100, 3)} %',
        "",
        f"Total Duration: {hms}",
        f"Total in seconds: {total_sec} seconds",
        f"Total in Hours: {total_hours} hours",
        f"Average record size: {avg_duration} seconds",
        f"Words per minute: {wpm} ",
        f'Words per sample: {wps} words',
        "",
        "",
        "",
        "###################################################################################################################",
        "",
        "",
    ]
    print("\n".join(report_lines))
