In [2]:
import sentencepiece as spm
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


#### Step 1: This cell must be executed first.
It contains the definitions of the functions that are used in the later steps.

In [3]:
swar = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ',  'ए', 'ऐ', 'ओ', 'औ','ॲ', 'ऑ','अं', 'अः']

halant_vyanjana_map = {'क्': 'क', 'ख्': 'ख', 'ग्': 'ग', 'घ्': 'घ',
                        'ङ्': 'ङ', 'च्': 'च', 'छ्': 'छ', 'ज्': 'ज',
                        'झ्': 'झ', 'ञ्': 'ञ', 'ट्': 'ट', 'ठ्': 'ठ',
                        'ड्': 'ड', 'ढ्': 'ढ', 'ण्': 'ण', 'त्': 'त',
                        'थ्': 'थ', 'द्': 'द', 'ध्': 'ध', 'न्': 'न',
                        'प्': 'प', 'फ्': 'फ', 'ब्': 'ब', 'भ्': 'भ',
                        'म्': 'म', 'य्': 'य', 'र्': 'र', 'ल्': 'ल',
                        'व्': 'व', 'श्': 'श', 'ष्': 'ष', 'स्': 'स',
                        'ह्': 'ह', 'ळ्': 'ळ', 'क्ष्': 'क्ष', 'ज्ञ्': 'ज्ञ'}


swar_matra_map={'ा':'आ', 'ि':'इ', 'ी':'ई', 'ु':'उ', 'ू':'ऊ', 'े':'ए', 'ै':'ऐ', 'ो':'ओ', 'ौ': 'औ', 'ॅ':'ॲ', 'ॉ':'ऑ',  'ं':'अं', 'ः': 'अः'}
reversed_swar_matra_map = {'आ': 'ा', 'इ': 'ि', 'ई': 'ी', 'उ': 'ु', 'ऊ': 'ू', 'ए': 'े', 'ऐ': 'ै', 'ओ': 'ो', 'औ': 'ौ', 'ॲ': 'ॅ', 'ऑ': 'ॉ', 'अं': 'ं', 'अः': 'ः'}


specials= { 'ँ':['ॲ','अं']}


vyanjana_map={'क': 'क्', 'ख': 'ख्', 'ग': 'ग्', 'घ': 'घ्', 
                        'ङ': 'ङ्', 'च': 'च्', 'छ': 'छ्', 'ज': 'ज्', 
                        'झ': 'झ्', 'ञ': 'ञ्', 'ट': 'ट्', 'ठ': 'ठ्', 
                        'ड': 'ड्', 'ढ': 'ढ्', 'ण': 'ण्', 'त': 'त्', 
                        'थ': 'थ्', 'द': 'द्', 'ध': 'ध्', 'न': 'न्', 
                        'प': 'प्', 'फ': 'फ्', 'ब': 'ब्', 'भ': 'भ्', 
                        'म': 'म्', 'य': 'य्', 'र': 'र्', 'ल': 'ल्', 
                        'व': 'व्', 'श': 'श्', 'ष': 'ष्', 'स': 'स्', 
                        'ह': 'ह्', 'ळ': 'ळ्', 'क्ष': 'क्ष्', 'ज्ञ': 'ज्ञ्'}

#function of unicode correction for question 2. This is to handle " "spaces also
def unicode_correction(user_input):     #input can be word or sentence
    corrected_unicodes=[]
    length=len(user_input)
    for i in range (length):
        if(user_input[i] in swar):
            corrected_unicodes.append(user_input[i])
        elif(user_input[i] in vyanjana_map):   #if it is already a halant character e.g. त् is represented as 'त', '्' in given input
            if(i<length-1 and (user_input[i+1]=='्' or user_input[i+1] in swar_matra_map)):  #i.e. if vyanjan is either halant or it has matra then 
                corrected_unicodes.append(vyanjana_map[user_input[i]])                          #writing only halant vyanjan 

            elif(i<length-1 and user_input[i+1] in specials):       #to handle बँ i.e. 'ब', 'ँ' this kind of thing. In this case I just have to insert ब् and not 'ब्' 'अ'
                corrected_unicodes.append(vyanjana_map[user_input[i]])
            else:                                                   #if vyanjan is actually a full vyanjan then
                corrected_unicodes.append(vyanjana_map[user_input[i]]) #writing halant vyanjan and
                corrected_unicodes.append('अ')                      #also writing swar अ after that

        elif(user_input[i] in swar_matra_map):  #if character is some kind of matra then replacing it with swar
            corrected_unicodes.append(swar_matra_map[user_input[i]])

        elif (user_input[i] in specials):   #to handle 'ँ', we need to explicitly insert all swar associated with it. which is stored in specials
            for j in specials[user_input[i]]:
                corrected_unicodes.append(j)
        elif(user_input[i]==" "):
            corrected_unicodes.append(" ")
    
    return corrected_unicodes

def confusion_matrix(actual, predicted):
    true_positives=len(set(actual) & set(predicted))
    false_positives=len(set(predicted) - set(actual))
    false_negatives=len(set(actual) - set(predicted))
    
    precision=true_positives/(true_positives+false_positives) if (true_positives+false_positives)>0 else 0
    recall=true_positives/(true_positives+false_negatives) if (true_positives+false_negatives) > 0 else 0
    f1_score=2 *(precision*recall)/(precision+recall) if (precision+recall) > 0 else 0
    return precision, recall, f1_score

#### Step 2: Reading and tokenizing the (comma-separated) ground truth file

In [4]:
ground_truth_path = "corpus/question5_GroundTruth.txt"
# Open the file and read its content
with open(ground_truth_path, 'r', encoding='utf-8', errors='ignore') as file:
    content = file.read()
ground_truth = content.split(',')
#Removing whitespaces from each element in the list
ground_truth = [text.strip() for text in ground_truth]

In [5]:
#performing unicode correction ground truth token and storing it back
for i in range(len(ground_truth)):
    ground_truth[i]=unicode_correction(ground_truth[i])
    ground_truth[i] = ''.join(ground_truth[i])
#removing empty tokens if any
ground_truth = [token for token in ground_truth if token != '']
print("Ground truth Tokens after unicode correction: ",ground_truth)
print("Total ",len(ground_truth)," ground truth tokens")

Ground truth Tokens after unicode correction:  ['र्अव्इव्आर्ई', 'एन्अआर्अस्ईच्आ', 'ज्ओ प्अह्इल्आ', 'म्अस्उद्आ', 'ज्आह्ईर्अ झ्आल्आ', 'त्य्आन्उस्आर्अ', 'र्आज्य्आत्ईल्अ', 'क्ओट्ई ल्आख्अ', 'न्आग्अर्इक्आअंप्ऐक्ई', 'क्ओट्ई', 'ल्आख्अ', 'ल्ओक्आअंन्आ', 'अध्इक्अत्अ', 'न्आग्अर्इक्अत्व्आच्आ द्अर्ज्आ', 'ब्अह्आल्अ क्अर्अण्य्आत्अ', 'आल्आ आह्ए', 'ज्अर्ई श्अन्इव्आर्अ', 'व्अ', 'र्अव्इव्आर्अ', 'आप्अण्अ', 'च्अव्अर्अ', 'ज्ओर्अ व्अग्ऐर्ए', 'द्एत्अ', 'त्य्आअंच्य्आ', 'प्अर्इव्अर्त्अन्आच्य्आ', 'ब्ऐठ्अक्आ म्आर्अण्ए', 'स्उर्ऊ आह्ए', 'ब्ऐठ्अक्अ', 'ब्ओल्आव्अल्ई अस्अल्य्आच्ई', 'म्आह्इत्ई', 'आम्अद्आर्अ', 'अन्इल्अ ब्आब्अर्अ य्आअंन्ई', 'ढ्अव्अळ्ईत्अ', 'प्उर्अग्र्अस्त्आअंच्य्आ', 'म्अद्अत्अ', 'व्आट्अप्आच्ई', 'च्औक्अश्ई', 'स्उर्उ', 'ब्आज्आर्अ स्अम्इत्ईच्य्आ', 'आव्आर्आत्ईल्अ', 'ग्ओद्आम्आत्अ', 'व्अ', 'न्एम्आड्ए फ्ल्ऑट्अ', 'व्अस्त्ईत्ईल्अ', 'ग्ओद्आम्आत्अ', 'ब्आर्अद्आन्आच्ए', 'ग्अठ्ए', 'आढ्अळ्उन्अ आल्ए', 'न्इर्व्य्अस्अन्ई', 'आह्एत्अ', 'अम्एर्इक्एच्ए र्आष्ट्र्आध्य्अक्ष्अ', 'प्अह्आ', 'क्आय्अ', 'ब्अर्अळ्अल्आ', 'प्आक्अड्य्आअंच्आ

#### Step3: Reading question5 corpus (sentences from question 3)

In [6]:
corpus_path = "corpus/question5_corpus.txt"
# Open the file and read its content
with open(corpus_path, 'r', encoding='utf-8', errors='ignore') as file:
    corpus = file.read()


#### Step 4: Measuring the performance of each tokenizer's tokenization

##### Tokenizing with mBERT with max_length=1000

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#Tokenizing the corpus with max length = 1000
tokens_ids = tokenizer.encode(corpus, max_length=1000, truncation=True)
#getting tokens from ids
mBERT1000_tokens = tokenizer.convert_ids_to_tokens(tokens_ids)

#performing unicode correction on each token and storing it back
for i in range(len(mBERT1000_tokens)):
    mBERT1000_tokens[i]=unicode_correction(mBERT1000_tokens[i])
    mBERT1000_tokens[i] = ''.join(mBERT1000_tokens[i])

#removing empty tokens if any
mBERT1000_tokens = [token for token in mBERT1000_tokens if token != '']
print("Displaying some Tokens after unicode correction: ",mBERT1000_tokens[:10])
print("Total ",len(ground_truth)," ground truth tokens")
print("Total ",len(mBERT1000_tokens)," mBERT1000 calculated tokens")

precision, recall, f1_score=confusion_matrix(ground_truth,mBERT1000_tokens)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1_score",f1_score)


Displaying some Tokens after unicode correction:  ['र्अ', 'व्इ', 'व्आर्ई', 'एन्अ', 'आ', 'र्अ', 'स्ई', 'च्आ', 'ज्ओ', 'प्अ']
Total  203  ground truth tokens
Total  850  mBERT1000 calculated tokens
Precision:  0.0798611111111111
Recall:  0.116751269035533
F1_score 0.09484536082474226


##### Tokenizing with mBERT with max_length=2000

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#Tokenizing the corpus with max length = 1000
tokens_ids = tokenizer.encode(corpus, max_length=1000, truncation=True)
#getting tokens from ids
mBERT2000_tokens = tokenizer.convert_ids_to_tokens(tokens_ids)

#performing unicode correction on each token and storing it back
for i in range(len(mBERT2000_tokens)):
    mBERT2000_tokens[i]=unicode_correction(mBERT2000_tokens[i])
    mBERT2000_tokens[i] = ''.join(mBERT2000_tokens[i])

#removing empty tokens if any
mBERT2000_tokens = [token for token in mBERT2000_tokens if token != '']
print("Displaying some Tokens after unicode correction: ",mBERT2000_tokens[:10])
print("Total ",len(ground_truth)," ground truth tokens")
print("Total ",len(mBERT2000_tokens)," mBERT2000 calculated tokens")

precision, recall, f1_score=confusion_matrix(ground_truth,mBERT2000_tokens)
print("\nPrecision: ",precision)
print("Recall: ",recall)
print("F1_score",f1_score)

Displaying some Tokens after unicode correction:  ['र्अ', 'व्इ', 'व्आर्ई', 'एन्अ', 'आ', 'र्अ', 'स्ई', 'च्आ', 'ज्ओ', 'प्अ']
Total  203  ground truth tokens
Total  850  mBERT2000 calculated tokens

Precision:  0.0798611111111111
Recall:  0.116751269035533
F1_score 0.09484536082474226


##### Tokenization using BPE with V=1000

In [9]:

#Training BPE model with vocabulary size V = 1000

#This took around 4 minues to execute this cell
spm.SentencePieceTrainer.Train('--input=corpus/question5_corpus.txt --model_prefix=bpe_model_1000 --model_type=bpe --vocab_size=1000')
#Loading BPE model (V = 1000)
sp_bpe_1000 = spm.SentencePieceProcessor()
sp_bpe_1000.load('bpe_model_1000.model')
#Tokenizing with BPE model (V = 1000)
BPE1000_tokens = sp_bpe_1000.encode_as_pieces(corpus)

#performing unicode correction on each token and storing it back
for i in range(len(BPE1000_tokens)):
    BPE1000_tokens[i]=unicode_correction(BPE1000_tokens[i])
    BPE1000_tokens[i] = ''.join(BPE1000_tokens[i])

#removing empty tokens if any
BPE1000_tokens = [token for token in BPE1000_tokens if token != '']
print("Displaying some Tokens after unicode correction: ",BPE1000_tokens[:10])
print("Total ",len(ground_truth)," ground truth tokens")
print("Total ",len(BPE1000_tokens)," BPE1000 calculated tokens")

precision, recall, f1_score=confusion_matrix(ground_truth,BPE1000_tokens)
print("\nPrecision: ",precision)
print("Recall: ",recall)
print("F1_score",f1_score)

Displaying some Tokens after unicode correction:  ['र्अव्इव्आर्ई', 'एन्अआर्अस्ईच्आ', 'ज्ओ', 'प्अह्इल्आ', 'म्अस्उद्आ', 'ज्आह्ईर्अ', 'झ्आल्आ', 'त्य्आन्उस्आर्अ', 'र्आज्य्आत्ईल्अ', 'क्ओट्ई']
Total  203  ground truth tokens
Total  296  BPE1000 calculated tokens

Precision:  0.45318352059925093
Recall:  0.6142131979695431
F1_score 0.5215517241379309


##### Tokenization using BPE with V=2000

In [10]:

#Training BPE model with vocabulary size V = 2000

#This took around 4 minues to execute this cell
spm.SentencePieceTrainer.Train('--input=corpus/question5_corpus.txt --model_prefix=bpe_model_2000 --model_type=bpe --vocab_size=2000')
#Loading BPE model (V = 2000)
sp_bpe_2000 = spm.SentencePieceProcessor()
sp_bpe_2000.load('bpe_model_2000.model')
#Tokenizing with BPE model (V = 2000)
BPE2000_tokens = sp_bpe_2000.encode_as_pieces(corpus)

#performing unicode correction on each token and storing it back
for i in range(len(BPE2000_tokens)):
    BPE2000_tokens[i]=unicode_correction(BPE2000_tokens[i])
    BPE2000_tokens[i] = ''.join(BPE2000_tokens[i])

#removing empty tokens if any
BPE2000_tokens = [token for token in BPE2000_tokens if token != '']
print("Displaying some Tokens after unicode correction: ",BPE2000_tokens[:10])
print("Total ",len(ground_truth)," ground truth tokens")
print("Total ",len(BPE2000_tokens)," BPE2000 calculated tokens")

precision, recall, f1_score=confusion_matrix(ground_truth,BPE2000_tokens)
print("\nPrecision: ",precision)
print("Recall: ",recall)
print("F1_score",f1_score)

Displaying some Tokens after unicode correction:  ['र्अव्इव्आर्ई', 'एन्अआर्अस्ईच्आ', 'ज्ओ', 'प्अह्इल्आ', 'म्अस्उद्आ', 'ज्आह्ईर्अ', 'झ्आल्आ', 'त्य्आन्उस्आर्अ', 'र्आज्य्आत्ईल्अ', 'क्ओट्ई']
Total  203  ground truth tokens
Total  290  BPE2000 calculated tokens

Precision:  0.47509578544061304
Recall:  0.6294416243654822
F1_score 0.5414847161572052


##### Tokenization using Unigram

In [11]:
#training Unigram model with vocab size V = 1000

#if want to use another corpus. then provide path in input="" in below line
spm.SentencePieceTrainer.Train('--input=corpus/question5_corpus.txt --model_prefix=unigram_model --model_type=unigram --vocab_size=381')

sp_unigram = spm.SentencePieceProcessor()
sp_unigram.load('unigram_model.model')

#Tokenization with Unigram model
unigram_tokens = sp_unigram.encode_as_pieces(corpus)

print("Some of the extracted Tokens:", unigram_tokens[:10])
#performing unicode correction on each token and storing it back
for i in range(len(unigram_tokens)):
    unigram_tokens[i]=unicode_correction(unigram_tokens[i])
    unigram_tokens[i] = ''.join(unigram_tokens[i])

#removing empty tokens if any
unigram_tokens = [token for token in unigram_tokens if token != '']
print("Displaying some Tokens after unicode correction: ",unigram_tokens[:10])
print("Total ",len(ground_truth)," ground truth tokens")
print("Total ",len(unigram_tokens),"  tokens calculated using Unigram Tokenizer")

precision, recall, f1_score=confusion_matrix(ground_truth,unigram_tokens)
print("\nPrecision: ",precision)
print("Recall: ",recall)
print("F1_score",f1_score)

Some of the extracted Tokens: ['▁1', '.', '▁रविवार', 'ी', '▁ए', 'न', 'आ', 'रस', 'ी', 'चा']
Displaying some Tokens after unicode correction:  ['र्अव्इव्आर्अ', 'ई', 'ए', 'न्अ', 'आ', 'र्अस्अ', 'ई', 'च्आ', 'ज्ओ', 'प्अ']
Total  203  ground truth tokens
Total  852   tokens calculated using Unigram Tokenizer

Precision:  0.06060606060606061
Recall:  0.08121827411167512
F1_score 0.06941431670281994


##### Tokenization using Whitespace tokenizer

In [12]:
whiteSpaced_tokens = corpus.split()
print("Some of the extracted Tokens:", whiteSpaced_tokens[:10])
#performing unicode correction on each token and storing it back
for i in range(len(whiteSpaced_tokens)):
    whiteSpaced_tokens[i]=unicode_correction(whiteSpaced_tokens[i])
    whiteSpaced_tokens[i] = ''.join(whiteSpaced_tokens[i])

#removing empty tokens if any
whiteSpaced_tokens = [token for token in whiteSpaced_tokens if token != '']
print("Displaying some Tokens after unicode correction: ",whiteSpaced_tokens[:10])
print("Total ",len(ground_truth)," ground truth tokens")
print("Total ",len(whiteSpaced_tokens),"  tokens calculated using whiteSpace Tokenizer")

precision, recall, f1_score=confusion_matrix(ground_truth,whiteSpaced_tokens)
print("\nPrecision: ",precision)
print("Recall: ",recall)
print("F1_score",f1_score)

Some of the extracted Tokens: ['1.', 'रविवारी', 'एनआरसीचा', 'जो', 'पहिला', 'मसुदा', 'जाहीर', 'झाला', 'त्यानुसार', 'राज्यातील']
Displaying some Tokens after unicode correction:  ['र्अव्इव्आर्ई', 'एन्अआर्अस्ईच्आ', 'ज्ओ', 'प्अह्इल्आ', 'म्अस्उद्आ', 'ज्आह्ईर्अ', 'झ्आल्आ', 'त्य्आन्उस्आर्अ', 'र्आज्य्आत्ईल्अ', 'क्ओट्ई']
Total  203  ground truth tokens
Total  288   tokens calculated using whiteSpace Tokenizer

Precision:  0.4864864864864865
Recall:  0.6395939086294417
F1_score 0.5526315789473685
