In [19]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px
from jiwer import wer, cer

import glob
import os

In [2]:
metadata = pd.read_csv("/app/dataset/metadata/train_metadata.csv")
valid = pd.read_csv("/app/dataset/valid_data.csv")
train = pd.read_csv("/app/dataset/train_data.csv")
valid_subset = pd.read_csv("/app/dataset/valid_data_subset.csv")

In [126]:
VOCAB_NOSPECIAL = ['ও', ' ', 'ব', 'ল', 'ে', 'ছ', 'আ', 'প', 'ন', 'া', 'র', 'ঠ', 'ি', 'ক', 'ো', 'ম', 'হ', 'ষ', '্', 'ট', 'গ', 'ত', 'চ', 'ু', 'ঝ', 'এ', 'স', 'থ','শ', 'য', 'ী', 'ধ', 'ঙ', 'ভ', 'জ', 'ই', 'দ', 'খ', 'ফ', 'ং', 'উ', 'ণ', 'অ', 'ঁ', 'ড়', 'য়', 'ঢ', 'ড','ূ', 'ঘ', 'ৃ', 'ঞ', 'ৈ', 'ৌ', 'ৎ', 'ঃ','ঐ', 'ঈ', 'ঊ', 'ঋ','ঢ়', 'ঔ','—']
VOCAB = VOCAB_NOSPECIAL+['!', '?', ',', '।', '-', '‘', '’', '"', ';', '–', "'", ':', '/', '.', '“', '”']

# Separating consonants, vowels, and matras
consonants = ['ব', 'ল', 'য়','ছ', 'প', 'ন', 'র', 'ঠ', 'ক', 'ম', 'হ', 'ষ', 'ট', 'গ', 'ত', 'চ', 'ঝ', 'স', 'থ', 'শ', 'য', 'ধ', 'ঙ', 'ভ', 'জ', 'দ', 'খ', 'ফ', 'ণ', 'ড়', 'য়', 'ঢ', 'ড', 'ঘ', 'ঞ', 'ঢ়']
vowels = ['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ','ৎ']
matras = ['া', 'ি', 'ী', 'ে', 'ো', 'ু', 'ূ', 'ৃ', 'ৈ', 'ৌ', '্', 'ঃ']
matras_addition = ['ঁ','ং']
others = [' ','।','?','!','—']
special_chars = [',', '-', '‘', '’', '"', ';', '–', "'", ':', '/', '.', '“', '”']

# Creating combinations for consonants with matras
consonant_combinations = consonants+[consonant + matra for consonant in consonants for matra in matras]
consonant_combinations = consonant_combinations+[consonant + matra for consonant in consonant_combinations for matra in matras_addition]


# The vowels list already represents the standalone sounds, so we don't need to combine them with matras.
all_combinations = consonant_combinations + vowels+others+special_chars

print(len(all_combinations),sorted(all_combinations))


1434 [' ', '!', '"', "'", ',', '-', '.', '/', ':', ';', '?', '।', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'কঁ', 'কং', 'কঃ', 'কঃঁ', 'কঃং', 'কা', 'কাঁ', 'কাং', 'কি', 'কিঁ', 'কিং', 'কী', 'কীঁ', 'কীং', 'কু', 'কুঁ', 'কুং', 'কূ', 'কূঁ', 'কূং', 'কৃ', 'কৃঁ', 'কৃং', 'কে', 'কেঁ', 'কেং', 'কৈ', 'কৈঁ', 'কৈং', 'কো', 'কোঁ', 'কোং', 'কৌ', 'কৌঁ', 'কৌং', 'ক্', 'ক্ঁ', 'ক্ং', 'খ', 'খঁ', 'খং', 'খঃ', 'খঃঁ', 'খঃং', 'খা', 'খাঁ', 'খাং', 'খি', 'খিঁ', 'খিং', 'খী', 'খীঁ', 'খীং', 'খু', 'খুঁ', 'খুং', 'খূ', 'খূঁ', 'খূং', 'খৃ', 'খৃঁ', 'খৃং', 'খে', 'খেঁ', 'খেং', 'খৈ', 'খৈঁ', 'খৈং', 'খো', 'খোঁ', 'খোং', 'খৌ', 'খৌঁ', 'খৌং', 'খ্', 'খ্ঁ', 'খ্ং', 'গ', 'গঁ', 'গং', 'গঃ', 'গঃঁ', 'গঃং', 'গা', 'গাঁ', 'গাং', 'গি', 'গিঁ', 'গিং', 'গী', 'গীঁ', 'গীং', 'গু', 'গুঁ', 'গুং', 'গূ', 'গূঁ', 'গূং', 'গৃ', 'গৃঁ', 'গৃং', 'গে', 'গেঁ', 'গেং', 'গৈ', 'গৈঁ', 'গৈং', 'গো', 'গোঁ', 'গোং', 'গৌ', 'গৌঁ', 'গৌং', 'গ্', 'গ্ঁ', 'গ্ং', 'ঘ', 'ঘঁ', 'ঘং', 'ঘঃ', 'ঘঃঁ', 'ঘঃং', 'ঘা', 'ঘাঁ', 'ঘাং', 'ঘি', 'ঘিঁ', 'ঘিং', 'ঘী', 'ঘীঁ', 'ঘীং', 'ঘু', 'ঘুঁ'

In [129]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False
        self.value = None

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word, value):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True
        node.value = value

    def search(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                return None
            node = node.children[char]
        return node.value if node.is_end_of_word else None

class BengaliTokenizer:
    def __init__(self, vocabulary):
        self.trie = Trie()
        self.vocabulary = vocabulary 
        for idx, word in enumerate(vocabulary):
            self.trie.insert(word, idx)

    def encode(self, sentence):
        indices = []
        i = 0
        while i < len(sentence):
            max_len = -1
            token_idx = -1
            for j in range(i, len(sentence) + 1):
                idx = self.trie.search(sentence[i:j])
                if idx is not None:
                    if j - i > max_len:
                        max_len = j - i
                        token_idx = idx
            if max_len == -1:
                # indices.append(-1)
                i += 1
            else:
                indices.append(token_idx)
                i += max_len
        return indices

    def decode(self, indices):
        return ''.join(self.vocabulary[idx] if idx != -1 else '' for idx in indices)

# Create an instance of the tokenizer
tokenizer = BengaliTokenizer(all_combinations)

# Example usage
sentence = "আমি বাঙালি"
encoded = tokenizer.encode(sentence)
print(f"Encoded: {encoded}")
decoded = tokenizer.decode(encoded)
print(f"Decoded: {decoded}")


Encoded: [1405, 145, 1416, 36, 300, 49]
Decoded: আমি বাঙালি


In [134]:
encoded = []
decoded = []
counts = [0 for x in all_combinations]
for sentence in tqdm(train.sentence):
    en = tokenizer.encode(sentence)
    for i in en:
        counts[i]+=1

 17%|█▋        | 156842/934048 [02:03<10:08, 1276.59it/s]

In [120]:
wers = [wer(a,b) for a,b in zip(decoded,train.sentence)]

In [121]:
np.mean(wers)

0.012913005866682337

In [3]:
valid["mergeid"] = valid["id"].apply(lambda x: x.split(".")[0])
train["mergeid"] = train["id"].apply(lambda x: x.split(".")[0])
valid_subset["mergeid"] = valid_subset["id"].apply(lambda x: x.split(".")[0])

In [4]:
COLS = ['ggl_cer', 'ggl_wer','ykg_wer', 'ykg_cer']
metadata[COLS].mean()
# pd.merge(train,metadata[COLS],left_on="mergeid",right_on="id",how="inner")

ggl_cer    0.800538
ggl_wer    1.010345
ykg_wer    0.494660
ykg_cer    0.237798
dtype: float64

In [7]:
np.intersect1d(valid_subset.id,train.id)

array([], dtype=object)