In [1]:
import os
import json
from collections import Counter
import sentencepiece as spm

##### Step 1: Run the cells below first. They define functions that are also used in question 2 (they are repeated here so that this notebook does not depend on any other notebook).
(Functions for Unicode correction of tokens and for finding characters, syllables and their counts in the corpus.)

In [2]:
# Independent vowels (swar). The last two entries, 'अं' and 'अः', are each made of two code points
# (अ plus a sign), so they can only match whole elements of an already corrected list, never a
# single character taken from a raw string.
swar = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ',  'ए', 'ऐ', 'ओ', 'औ','ॲ', 'ऑ','अं', 'अः']

# Halant (vowel-less) consonant -> full consonant. The syllable functions use this map only for
# membership tests ("is this element a halant consonant?").
halant_vyanjana_map = {'क्': 'क', 'ख्': 'ख', 'ग्': 'ग', 'घ्': 'घ',
                        'ङ्': 'ङ', 'च्': 'च', 'छ्': 'छ', 'ज्': 'ज',
                        'झ्': 'झ', 'ञ्': 'ञ', 'ट्': 'ट', 'ठ्': 'ठ',
                        'ड्': 'ड', 'ढ्': 'ढ', 'ण्': 'ण', 'त्': 'त',
                        'थ्': 'थ', 'द्': 'द', 'ध्': 'ध', 'न्': 'न',
                        'प्': 'प', 'फ्': 'फ', 'ब्': 'ब', 'भ्': 'भ',
                        'म्': 'म', 'य्': 'य', 'र्': 'र', 'ल्': 'ल',
                        'व्': 'व', 'श्': 'श', 'ष्': 'ष', 'स्': 'स',
                        'ह्': 'ह', 'ळ्': 'ळ', 'क्ष्': 'क्ष', 'ज्ञ्': 'ज्ञ'}


# Dependent vowel sign (matra) -> the independent vowel it stands for, and the inverse mapping.
swar_matra_map={'ा':'आ', 'ि':'इ', 'ी':'ई', 'ु':'उ', 'ू':'ऊ', 'े':'ए', 'ै':'ऐ', 'ो':'ओ', 'ौ': 'औ', 'ॅ':'ॲ', 'ॉ':'ऑ',  'ं':'अं', 'ः': 'अः'}
reversed_swar_matra_map = {'आ': 'ा', 'इ': 'ि', 'ई': 'ी', 'उ': 'ु', 'ऊ': 'ू', 'ए': 'े', 'ऐ': 'ै', 'ओ': 'ो', 'औ': 'ौ', 'ॲ': 'ॅ', 'ऑ': 'ॉ', 'अं': 'ं', 'अः': 'ः'}


# Signs that expand to more than one vowel: unicode_correction emits every listed vowel for 'ँ'.
specials= { 'ँ':['ॲ','अं']}


# Full consonant -> halant consonant. unicode_correction looks keys up one character at a time,
# so when its input is a str the multi-code-point keys 'क्ष' and 'ज्ञ' are never hit; those
# conjuncts are processed as their component consonants instead.
vyanjana_map={'क': 'क्', 'ख': 'ख्', 'ग': 'ग्', 'घ': 'घ्', 
                        'ङ': 'ङ्', 'च': 'च्', 'छ': 'छ्', 'ज': 'ज्', 
                        'झ': 'झ्', 'ञ': 'ञ्', 'ट': 'ट्', 'ठ': 'ठ्', 
                        'ड': 'ड्', 'ढ': 'ढ्', 'ण': 'ण्', 'त': 'त्', 
                        'थ': 'थ्', 'द': 'द्', 'ध': 'ध्', 'न': 'न्', 
                        'प': 'प्', 'फ': 'फ्', 'ब': 'ब्', 'भ': 'भ्', 
                        'म': 'म्', 'य': 'य्', 'र': 'र्', 'ल': 'ल्', 
                        'व': 'व्', 'श': 'श्', 'ष': 'ष्', 'स': 'स्', 
                        'ह': 'ह्', 'ळ': 'ळ्', 'क्ष': 'क्ष्', 'ज्ञ': 'ज्ञ्'}

#function of unicode correction for question 2. This is to handle " "spaces also
def unicode_correction(user_input):     #input can be word or sentence
    """Split Devanagari text into a list of halant consonants, vowels and spaces.

    Each consonant is emitted in its halant form (via vyanjana_map); a consonant that
    carries no explicit halant, matra or special sign is followed by the inherent vowel 'अ'.
    Matras are replaced by the independent vowel they stand for (swar_matra_map), a sign in
    `specials` is expanded to all of its vowels, and spaces are kept. Every other character
    (punctuation, digits, tokenizer markers, the halant sign itself) is dropped.

    A raw 'अ' directly followed by 'ं' or 'ः' is emitted only once, as 'अं' / 'अः'.
    Previously it produced 'अ' followed by 'अं', which double-counted the vowel and meant
    that re-running the function on ''.join(result) inserted extra 'अ' elements.

    Args:
        user_input: a word or sentence (indexed one character at a time).

    Returns:
        list of str: the corrected units in input order.
    """
    corrected_unicodes = []
    length = len(user_input)
    for i, char in enumerate(user_input):
        # Look-ahead character; None at the end of the input (None is in none of the maps).
        next_char = user_input[i + 1] if i < length - 1 else None

        if char in swar:
            if char == 'अ' and next_char in ('ं', 'ः'):
                # The sign that follows is turned into 'अं' / 'अः' on the next iteration,
                # which already contains this 'अ'.
                continue
            corrected_unicodes.append(char)

        elif char in vyanjana_map:
            # The halant form is always written ...
            corrected_unicodes.append(vyanjana_map[char])
            # ... and the inherent 'अ' only when nothing replaces it: no halant, no matra and
            # no special sign (e.g. बँ is 'ब', 'ँ' and must become ब् + the vowels of 'ँ').
            vowel_supplied = (next_char == '्'
                              or next_char in swar_matra_map
                              or next_char in specials)
            if not vowel_supplied:
                corrected_unicodes.append('अ')

        elif char in swar_matra_map:    #a matra is replaced by its independent vowel
            corrected_unicodes.append(swar_matra_map[char])

        elif char in specials:          #'ँ' expands to every vowel listed in specials
            corrected_unicodes.extend(specials[char])

        elif char == " ":
            corrected_unicodes.append(" ")
    return corrected_unicodes

In [3]:
#Function to calculate count of characters
def character_count(user_input2):
    """Count how often each corrected character occurs in the input.

    The input is first passed through unicode_correction; spaces are not counted.

    Returns:
        dict mapping each corrected unit to its number of occurrences, in order of
        first appearance.
    """
    tally = {}
    for unit in unicode_correction(user_input2):
        if unit == " ":             #whitespace is not a character for our purposes
            continue
        tally[unit] = tally.get(unit, 0) + 1
    return tally

def bigram_characters(user_input):               #same as character Count function. Just some Modifications for bigram characters
    """Count bigrams of adjacent corrected characters inside each word.

    The input is passed through unicode_correction. Two neighbouring units of the same
    word form one bigram (their string concatenation); a space ends the word, so no
    bigram spans two words.

    Returns:
        tuple (bigram_char_count, sentence_Bigram_chars):
            bigram_char_count     - dict bigram -> number of occurrences
            sentence_Bigram_chars - list of all bigrams in order of occurrence
    """
    bigram_char_count = {}
    sentence_Bigram_chars = []      #all bigrams in sequential order of occurrence

    previous = ""                   #previous character of the current word; "" at a word start
    for current_character in unicode_correction(user_input):
        if current_character == " ":
            previous = ""           #word finished: its last character must not pair with the next word
            continue
        if current_character not in swar and current_character not in halant_vyanjana_map:
            # Unknown unit: skip it. The earlier index-based while loop never advanced past
            # such an element and would have looped forever.
            continue
        if previous:
            bigram_char = previous + current_character
            bigram_char_count[bigram_char] = bigram_char_count.get(bigram_char, 0) + 1
            sentence_Bigram_chars.append(bigram_char)
        previous = current_character    #current character opens the next bigram

    return (bigram_char_count,sentence_Bigram_chars) 


In [4]:
#part 2: Working with syllables. 

def _iter_syllables(corrected_units):
    """Yield the syllables of a corrected unit list, and None at every space.

    Segmentation rules (shared by syllable_count and bigram_syllable):
      * a run of consecutive vowels forms one syllable, written as the vowels themselves
        (e.g. ओ followed by अं);
      * a run of halant consonants plus all vowels that follow forms one syllable: the
        halant of the last consonant is removed so that it becomes a full consonant, 'अ'
        adds nothing, and every other vowel is appended as its matra;
      * a space marks a word boundary and is reported as None.
    Any other element is skipped (the earlier while loops never advanced past one).
    """
    total = len(corrected_units)
    i = 0
    while i < total:
        unit = corrected_units[i]
        if unit == " ":
            i += 1
            yield None
        elif unit in swar:
            start = i
            while i < total and corrected_units[i] in swar:
                i += 1
            yield "".join(corrected_units[start:i])
        elif unit in halant_vyanjana_map:
            start = i
            while i < total and corrected_units[i] in halant_vyanjana_map:
                i += 1
            # e.g. क् is stored as 'क' + '्': dropping the final '्' turns the last
            # consonant of the cluster into a full consonant that can take matras.
            syllable = "".join(corrected_units[start:i])[:-1]
            while i < total and corrected_units[i] in swar:
                if corrected_units[i] != 'अ':       #'अ' is already implied by the full consonant
                    syllable += reversed_swar_matra_map[corrected_units[i]]
                i += 1
            yield syllable
        else:
            i += 1


#Function to store count of Syllables or unigram Syllables
def syllable_count(user_input2):
    """Count how often each syllable occurs in the input.

    The input is passed through unicode_correction and then split into syllables.

    Returns:
        dict mapping syllable -> number of occurrences, in order of first appearance.
    """
    counts = {}
    for syllable in _iter_syllables(unicode_correction(user_input2)):
        if syllable is None:        #word boundary, nothing to count
            continue
        counts[syllable] = counts.get(syllable, 0) + 1
    return counts

def bigram_syllable(user_input2):               #same as Syllable Count function. Just some Modifications for bigram Syllables
    """Count bigrams of adjacent syllables inside each word.

    Two neighbouring syllables of the same word form one bigram (their string
    concatenation); a space ends the word, so no bigram spans two words.

    Returns:
        tuple (bigram_syllable_count, sentence_Bigram_syllables):
            bigram_syllable_count     - dict bigram -> number of occurrences
            sentence_Bigram_syllables - list of all bigrams in order of occurrence
    """
    bigram_syllable_count = {}
    sentence_Bigram_syllables = []  #all bigram syllables in sequential order of occurrence

    previous = None                 #previous syllable of the current word; None at a word start
    for syllable in _iter_syllables(unicode_correction(user_input2)):
        if syllable is None:
            previous = None         #word finished: its last syllable must not pair with the next word
            continue
        if previous:
            bigram_syl = previous + syllable
            bigram_syllable_count[bigram_syl] = bigram_syllable_count.get(bigram_syl, 0) + 1
            sentence_Bigram_syllables.append(bigram_syl)
        previous = syllable         #current syllable opens the next bigram

    return (bigram_syllable_count,sentence_Bigram_syllables) 

##### Step2: Reading the corpus

In [7]:
# Path of the corpus text file; later cells tokenize the `corpus` string read from it.
corpus_path='corpus/mr_100.txt'
# errors='ignore' makes the decoder drop bytes that are not valid UTF-8 instead of raising,
# so a wrongly encoded corpus would lose characters silently.
with open(corpus_path, 'r', encoding='utf-8',errors='ignore') as file:
    corpus = file.read()

### Step 3: Select only one tokenizer at a time.
If you want to try a different tokenizer, run exactly one of the tokenizer cells in Step 3 and then run all the cells in Step 4.

#### Tokenization using the BPE tokenizer (if you choose it, run only the BPE cells below)

##### Run below cell for BPE Tokenizer with V=1000:  
(if you want to use another corpus then provide the path in below cell also)

In [None]:
#Training BPE model with vocabulary size V = 1000

#This took around 4 minues to execute this cell
#Train on the same file that was read into `corpus` (corpus_path), so the model and the text
#being tokenized cannot silently come from two different corpora.
#NOTE(review): the trainer receives one space-separated flag string, so a corpus_path that
#contains spaces would presumably be mis-parsed - confirm before using such a path.
spm.SentencePieceTrainer.Train(f'--input={corpus_path} --model_prefix=bpe_model_1000 --model_type=bpe --vocab_size=1000')
#Loading BPE model (V = 1000)
sp_bpe_1000 = spm.SentencePieceProcessor()
sp_bpe_1000.load('bpe_model_1000.model')
#Tokenizing with BPE model (V = 1000)
extracted_tokens = sp_bpe_1000.encode_as_pieces(corpus)
print("Displaying some Tokens (BPE V=1000): ",extracted_tokens[:10])

#Sub-folder of frequencies/ that the Step 4 cells write their JSON files into
storage_path="BPE_V1000/"

##### Run below cell for BPE Tokenizer with V=2000 
if you want to use another corpus then provide the path in below cell also


In [15]:
#Training BPE model with vocabulary size V = 2000

#This took around 6 minues to execute this cell

#Train on the same file that was read into `corpus` (corpus_path), so the model and the text
#being tokenized cannot silently come from two different corpora.
#NOTE(review): the trainer receives one space-separated flag string, so a corpus_path that
#contains spaces would presumably be mis-parsed - confirm before using such a path.
spm.SentencePieceTrainer.Train(f'--input={corpus_path} --model_prefix=bpe_model_2000 --model_type=bpe --vocab_size=2000')

#Loading BPE model (V = 2000)
sp_bpe_2000 = spm.SentencePieceProcessor()
sp_bpe_2000.load('bpe_model_2000.model')

#Tokenizing with BPE model (V = 2000)
extracted_tokens = sp_bpe_2000.encode_as_pieces(corpus)
print("Displaying some Tokens (BPE V=2000): ",extracted_tokens[:10])
#Sub-folder of frequencies/ that the Step 4 cells write their JSON files into
storage_path="BPE_V2000/"

Displaying some Tokens (BPE V=2000):  ['▁स्व', 'प्', 'न', '▁दाख', 'वि', 'णे', '▁आणि', '▁आ', 'श्', 'वास']


#### Tokenization using the whitespace tokenizer (if you choose it, run only the cell below)

In [9]:
# str.split() with no argument splits on any run of whitespace (spaces, tabs, newlines), so
# punctuation stays attached to its word and line boundaries are not preserved.
extracted_tokens = corpus.split()
print("Tokens (Whitespace Tokenizer):", extracted_tokens[:10])
# Sub-folder of frequencies/ that the Step 4 cells write their JSON files into
storage_path="Whitespace_Tokenizer/"

Tokens (Whitespace Tokenizer): ['स्वप्न', 'दाखविणे', 'आणि', 'आश्वासने', 'देणारे', 'नेतेही', 'लोकांना', 'खूप', 'आवडतात.', 'मुलांसोबतच']


#### Tokenization using the Unigram tokenizer (if you choose it, run only the Unigram cells below)

##### Unigram tokenizer with V=1000 
(if you want to use another corpus then provide the path in below cell also)

In [26]:
#training Unigram model with vocab size V = 1000

#Train on the same file that was read into `corpus` (corpus_path), so the model and the text
#being tokenized cannot silently come from two different corpora.
#NOTE(review): the trainer receives one space-separated flag string, so a corpus_path that
#contains spaces would presumably be mis-parsed - confirm before using such a path.
#NOTE(review): the V=1000 and V=2000 Unigram cells both use model_prefix=unigram_model, so
#each run overwrites the model files written by the other.
spm.SentencePieceTrainer.Train(f'--input={corpus_path} --model_prefix=unigram_model --model_type=unigram --vocab_size=1000')

sp_unigram = spm.SentencePieceProcessor()
sp_unigram.load('unigram_model.model')

#Tokenization with Unigram model
extracted_tokens = sp_unigram.encode_as_pieces(corpus)

print("Some of the extracted Tokens:", extracted_tokens[:10])
#Sub-folder of frequencies/ that the Step 4 cells write their JSON files into
storage_path="Unigram_Tokenizer_V1000/"

Some of the extracted Tokens: ['▁स्व', 'प', '्', 'न', '▁दाखव', 'ि', 'णे', '▁आणि', '▁आ', 'श्']


##### Unigram tokenizer with V=2000 
(if you want to use another corpus then provide the path in below cell also)

In [36]:
#training Unigram model with vocab size V = 2000

#Train on the same file that was read into `corpus` (corpus_path), so the model and the text
#being tokenized cannot silently come from two different corpora.
#NOTE(review): the trainer receives one space-separated flag string, so a corpus_path that
#contains spaces would presumably be mis-parsed - confirm before using such a path.
#NOTE(review): the V=1000 and V=2000 Unigram cells both use model_prefix=unigram_model, so
#each run overwrites the model files written by the other.
spm.SentencePieceTrainer.Train(f'--input={corpus_path} --model_prefix=unigram_model --model_type=unigram --vocab_size=2000')

sp_unigram = spm.SentencePieceProcessor()
sp_unigram.load('unigram_model.model')

#Tokenization with Unigram model
extracted_tokens = sp_unigram.encode_as_pieces(corpus)

print("Some of the extracted Tokens:", extracted_tokens[:10])
#Sub-folder of frequencies/ that the Step 4 cells write their JSON files into
storage_path="Unigram_Tokenizer_V2000/"

Some of the extracted Tokens: ['▁स्वप्न', '▁दाखव', 'ि', 'णे', '▁आणि', '▁आश्वासन', 'े', '▁देणार', 'े', '▁नेते']


#### Tokenization using the mBERT tokenizer (if you choose it, run only the mBERT cells below)

##### Run below cell for mBERT with max_length=1000

In [6]:
from transformers import BertTokenizer

#Loading tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#Tokenizing the corpus with max length = 1000
#NOTE(review): with truncation=True, max_length limits how many token ids are returned, so
#presumably only the beginning of the corpus is tokenized here - confirm this is intended.
tokens_ids = tokenizer.encode(corpus, max_length=1000, truncation=True)

#getting tokens from ids
#The printed sample shows markers such as '[CLS]' and the '##' word-piece prefix; they contain
#no Devanagari characters, so unicode_correction in Step 4 removes them.
extracted_tokens = tokenizer.convert_ids_to_tokens(tokens_ids)
print("Some of the Tokens:", extracted_tokens[:10])
#Sub-folder of frequencies/ that the Step 4 cells write their JSON files into
storage_path="mBERT_word1000/"

  from .autonotebook import tqdm as notebook_tqdm


Some of the Tokens: ['[CLS]', 'स', '##्व', '##प', '##्न', 'द', '##ा', '##ख', '##वि', '##णे']


##### Run below line for mBERT with max_length=2000

In [16]:
from transformers import BertTokenizer

#Loading tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#Tokenizing the corpus with max length = 2000
#NOTE(review): with truncation=True, max_length limits how many token ids are returned, so
#presumably only the beginning of the corpus is tokenized here - confirm this is intended.
tokens_ids = tokenizer.encode(corpus, max_length=2000, truncation=True)

#getting tokens from ids
#The printed sample shows markers such as '[CLS]' and the '##' word-piece prefix; they contain
#no Devanagari characters, so unicode_correction in Step 4 removes them.
extracted_tokens = tokenizer.convert_ids_to_tokens(tokens_ids)
print("Some of the Tokens:", extracted_tokens[:10])
#Sub-folder of frequencies/ that the Step 4 cells write their JSON files into
storage_path="mBERT_word2000/"

Some of the Tokens: ['[CLS]', 'स', '##्व', '##प', '##्न', 'द', '##ा', '##ख', '##वि', '##णे']


### Step 4: Execute all cells below to get the required frequencies.
Tokenization has now been done with the tokenizer of your choice.
The calculated frequencies are stored under the `frequencies/` folder, in the sub-folder named by `storage_path` (set in the tokenizer cell you ran). Running these cells again with the same tokenizer overwrites the files in that sub-folder.
So, if you want to see the frequencies for another tokenizer, do the following:

1) Execute the cells of the tokenizer of your choice in Step 3 above
2) Run all cells below
            

In [10]:
#performing unicode correction on each token and storing it back
#(each token becomes the corrected units joined into one string)
extracted_tokens = [''.join(unicode_correction(raw_token)) for raw_token in extracted_tokens]

#removing empty tokens if any (tokens with no recognised character collapse to '')
extracted_tokens = [token for token in extracted_tokens if token != '']
print("Displaying some Tokens after unicode correction: ",extracted_tokens[:10])

Displaying some Tokens after unicode correction:  ['स्व्अप्न्अ', 'द्आख्अव्इण्ए', 'आण्इ', 'आश्व्आस्अन्ए', 'द्एण्आर्ए', 'न्एत्एह्ई', 'ल्ओक्आअंन्आ', 'ख्ऊप्अ', 'आव्अड्अत्आत्अ', 'म्उल्आअंस्ओब्अत्अच्अ']


##### Unigram frequencies of tokens

In [11]:
# Frequency of every (unicode-corrected) token
unigram_counter = Counter(extracted_tokens)
output_directory = "frequencies/"+storage_path
os.makedirs(output_directory, exist_ok=True)
BPE_V1000_path = os.path.join(output_directory, 'unigram_tokens.json')
# Most frequent tokens first
Desc_unigram_token_count = dict(sorted(unigram_counter.items(), key=lambda item: item[1], reverse=True))

readable_unigram_tokens = json.dumps(Desc_unigram_token_count, ensure_ascii=False, indent=2)
# ensure_ascii=False keeps the Devanagari text as-is, so the file must be opened as UTF-8
# explicitly; the platform default encoding may be unable to encode it.
with open(BPE_V1000_path, 'w', encoding='utf-8') as json_file:
    json_file.write(readable_unigram_tokens)

##### Bigram frequencies of tokens

In [12]:
# Pairs of consecutive tokens. The token list is flat, so pairs are also formed across
# whatever line or sentence boundaries the corpus had.
bigram_tokens = zip(extracted_tokens, extracted_tokens[1:])
# counting bigram frequencies
bigram_counter = Counter(bigram_tokens)
# Converting tuple keys to strings for JSON serialization
bigram_counter = {str(key): value for key, value in bigram_counter.items()}


#Storing bigram frequencies of tokens in a JSON file in local machine
output_directory = "frequencies/"+storage_path
os.makedirs(output_directory, exist_ok=True)
# Most frequent bigrams first
Desc_bigram_token_count = dict(sorted(bigram_counter.items(), key=lambda item: item[1], reverse=True))

readable_bigram_tokens = json.dumps(Desc_bigram_token_count, ensure_ascii=False, indent=2)
BPE_V1000_path = os.path.join(output_directory, 'bigram_tokens.json')
# ensure_ascii=False keeps the Devanagari text as-is, so the file must be opened as UTF-8
# explicitly; the platform default encoding may be unable to encode it.
with open(BPE_V1000_path, 'w', encoding='utf-8') as json_file:
    json_file.write(readable_bigram_tokens)

##### Bigram frequencies of characters

In [13]:
#execution of this cell takes some time (it took around 4 minutes on my laptop)
#The per-token character_count() call that used to run here was removed: its result was never
#used, so it only added runtime.
#NOTE(review): the tokens were already unicode-corrected and joined in an earlier cell, and
#bigram_characters() runs unicode_correction on them again, so these counts are only right if
#unicode_correction returns its own joined output unchanged.
Bigram_Char_counts={}                   #bigram character -> count over the whole corpus
sequential_bigram_char_corpus=[]        #per token: its bigram characters in order of occurrence

for token in extracted_tokens:
    if not token:        #empty token
        continue
    sentence_Bigramchar_count,bigram_chars_in_sentences=bigram_characters(token)

    sequential_bigram_char_corpus.append(bigram_chars_in_sentences)        #appending bigram characters of token to toal bigram characters of corpus list
    for key, count in sentence_Bigramchar_count.items():                    #merging the token's counts into the corpus totals
        Bigram_Char_counts[key]=Bigram_Char_counts.get(key, 0)+count

In [14]:
#sorting in descending order of their frequencies 
#(ties keep the order in which the bigrams were first seen, because the sort is stable)
Desc_bigramChar_count = {
    bigram: count
    for bigram, count in sorted(Bigram_Char_counts.items(), key=lambda pair: pair[1], reverse=True)
}
#To inspect: sequential_bigram_char_corpus[:10] holds the bigram characters of the first 10 tokens,
#and dict(list(Desc_bigramChar_count.items())[:20]) the 20 most frequent bigram characters.

In [15]:
#Storing bigram frequencies of characters in a JSON file in local machine
#(written to frequencies/<storage_path>/bigram_chars.json as human-readable UTF-8 JSON)
output_directory = f"frequencies/{storage_path}"
os.makedirs(output_directory, exist_ok=True)
BPE_V1000_path_bigram_chars = os.path.join(output_directory, 'bigram_chars.json')
formatted_content_bigram_chars = json.dumps(Desc_bigramChar_count, indent=2, ensure_ascii=False)
with open(BPE_V1000_path_bigram_chars, mode='w', encoding='utf-8') as json_file:
    json_file.write(formatted_content_bigram_chars)

##### Bigram frequencies of syllables

In [16]:
#This cell takes around 3 minues to run in my laptop 
#The per-token syllable_count() call that used to run here was removed: its result was never
#used, so it only added runtime.
#NOTE(review): the tokens were already unicode-corrected and joined in an earlier cell, and
#bigram_syllable() runs unicode_correction on them again, so these counts are only right if
#unicode_correction returns its own joined output unchanged.
Bigram_syl_counts={}        #to measure overall bigram syllable count of corpus
sequential_bigram_syl_corpus=[]     #to get all bigram syllables in same order of corpus

for token in extracted_tokens:
    if not token:        #empty line
        continue
    token_bigramSyl_count,bigram_syl_in_token=bigram_syllable(token)      

    sequential_bigram_syl_corpus.append(bigram_syl_in_token)        #appending bigram syllable of tokens to bigram syllables of corpus list
    
    for key, count in token_bigramSyl_count.items():                #merging the token's counts into the corpus totals
        Bigram_syl_counts[key]=Bigram_syl_counts.get(key, 0)+count

In [17]:
#sorting in descending order of their frequencies
#(ties keep the order in which the bigrams were first seen, because the sort is stable)
Desc_bigramSyl_count = {
    bigram: count
    for bigram, count in sorted(Bigram_syl_counts.items(), key=lambda pair: pair[1], reverse=True)
}
#To inspect: sequential_bigram_syl_corpus[:20] holds the bigram syllables of the first 20 tokens,
#and dict(list(Desc_bigramSyl_count.items())[:20]) the 20 most frequent bigram syllables.

In [18]:
#Storing bigram frequencies of syllables in a JSON file in local machine
#(written to frequencies/<storage_path>/bigram_syllables.json as human-readable UTF-8 JSON)
output_directory = f"frequencies/{storage_path}"
os.makedirs(output_directory, exist_ok=True)
BPE_V1000_path_bigram_syllables = os.path.join(output_directory, 'bigram_syllables.json')
formatted_content_bigram_syllables = json.dumps(Desc_bigramSyl_count, indent=2, ensure_ascii=False)
with open(BPE_V1000_path_bigram_syllables, mode='w', encoding='utf-8') as json_file:
    json_file.write(formatted_content_bigram_syllables)