# Compute vocabulary
Compute two sets of vocabulary
- **Mixed set**: vocabulary in which some words may be misspelled
- **Clean set**: vocabulary where all words are correctly spelled

In [1]:
import json
import os
from collections import Counter
from typing import List

from indicnlp.tokenize.indic_tokenize import trivial_tokenize_indic
from tqdm import tqdm

## Utilities

In [2]:
def tokenize_text(text: List[str]) -> List[List[str]]:
    """Tokenize each sentence in ``text`` with ``trivial_tokenize_indic``.

    A progress bar is displayed while the sentences are processed.

    Args:
        text: The sentences to tokenize, one string per sentence.

    Returns:
        One tokenizer result per input sentence, in input order.
    """
    progress = tqdm(text, desc='tokenize', unit=' sentences')
    return list(map(trivial_tokenize_indic, progress))

In [3]:
def build_vocab(tokenized_text: List[List[str]]) -> Counter:
    """Count token frequencies over tokenized sentences and print summary statistics.

    Args:
        tokenized_text: One list of tokens per sentence.

    Returns:
        A ``Counter`` mapping each distinct token to its number of occurrences.
    """
    vocab = Counter()
    progress = tqdm(tokenized_text, desc='build vocab and compute frequencies of tokens', unit=' sentences')
    for sentence_tokens in progress:
        vocab.update(sentence_tokens)

    # Summary figures: sentence count, distinct tokens, total token occurrences.
    n_sentences = len(tokenized_text)
    n_unique = len(vocab)
    n_running = sum(vocab.values())
    print(f'Number of sentences: {n_sentences:,}')
    print(f'Number of unique words or equivalently, the size of vocabulary: {n_unique:,}')
    print(f'Number of running words: {n_running:,}')
    return vocab

In [4]:
def write_vocab_to_json(counter: Counter, json_path: str) -> None:
    """Save the vocabulary counts to ``json_path`` as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than escaped.

    Args:
        counter: Mapping from token to frequency.
        json_path: Destination file; it is created or overwritten.
    """
    vocab = {token: freq for token, freq in counter.items()}
    # Serialize before opening the file, so a serialization error cannot
    # leave a truncated file behind.
    payload = json.dumps(vocab, indent=4, ensure_ascii=False)
    with open(json_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(payload)



## Mixed vocabulary

### Load mixed data and tokenize


In [5]:
# Path to the mixed corpus; each line of this file is treated as one sentence below.
mixed_data_filepath = os.path.join('data', 'or')
# Fail early with an explicit message; unlike `assert`, this check is not
# stripped when Python runs with -O.
if not os.path.isfile(mixed_data_filepath):
    raise FileNotFoundError(f'Mixed data file not found: {mixed_data_filepath}')
with open(mixed_data_filepath, 'r', encoding='utf-8') as f:
    # readlines() materialises the list first so tqdm knows the total and can show a percentage.
    lines = [s.strip() for s in tqdm(f.readlines(), desc='read lines from file')]

read lines from file: 100%|██████████| 3594672/3594672 [00:01<00:00, 1822412.56it/s]


In [6]:
# Tokenize every line of the mixed corpus; the result holds one token list per input line.
mixed_tokens = tokenize_text(lines)


tokenize: 100%|██████████| 3594672/3594672 [01:23<00:00, 43014.48 sentences/s]


### Build mixed vocabulary


In [7]:
# Count token frequencies over the mixed corpus; build_vocab also prints summary statistics.
mixed_vocab_counter = build_vocab(mixed_tokens)

build vocab and compute frequencies of tokens: 100%|██████████| 3594672/3594672 [00:21<00:00, 170270.43 sentences/s]


Number of sentences: 3,594,672
Number of unique words or equivalently, the size of vocabulary: 778,862
Number of running words: 51,151,273


In [8]:
# Show the 20 most frequent tokens of the mixed vocabulary together with their counts.
mixed_vocab_counter.most_common(n=20)

[('।', 3393061),
 (',', 1191253),
 ('ଓ', 534792),
 ('ଏହି', 437185),
 ('ପାଇଁ', 373726),
 ('ସେ', 240775),
 ('ବୋଲି', 239837),
 ('ପରେ', 224959),
 ('କରି', 221628),
 ('ଏକ', 213516),
 ('ମଧ୍ୟ', 210907),
 ('ଏବଂ', 198988),
 ('କରିଥିଲେ', 195168),
 ('ସହ', 177040),
 ('-', 174796),
 ('ଖବର', 169373),
 ('.', 166728),
 ('କରିବା', 166276),
 ('ନେଇ', 161728),
 ('ବେଳେ', 156327)]

In [9]:
# Persist the mixed vocabulary (token -> frequency) as JSON, relative to the working directory.
write_vocab_to_json(mixed_vocab_counter, 'mixed_vocab_counter.json')


## Clean vocabulary

### Load clean data and tokenize

In [10]:
# Path to the structured Odia word list (JSON) used as the source of the clean vocabulary.
clean_data_filepath = os.path.join('data', 'Odia_structured_wordlist.json')
# Fail early with an explicit message; unlike `assert`, this check is not
# stripped when Python runs with -O.
if not os.path.isfile(clean_data_filepath):
    raise FileNotFoundError(f'Clean data file not found: {clean_data_filepath}')
with open(clean_data_filepath, 'r', encoding='utf-8') as f:
    di = json.load(f)

In [11]:
# Peek at the value stored under the fifth key of the word list.
di[list(di)[4]]

{'gender': None,
 'pronunciation': 'Haḻka pāibā',
 'raw': ' (ଅମୁକର) ହଳକ ପାଇବା— Haḻka pāibā [synonym(s): হলক পাত্তয়া हलक पाना] ଦେ. ବି— 1। [କୌଣସି କାର୍ଯ୍ୟ କରିବାପାଇଁ] ସାହସ ଉପୁଜିବା। 1. Having the boldness or courage (to undertake a particular work). (ଯଥା—ଏଡ଼େ ଡେଙ୍ଗା ଗଛରେ ଚଢ଼ିବାକୁ ମୋର ହଲକ୍ ପାଉ ନାହିଁ।) ',
 'synonyms': '[synonym(s): হলক পাত্তয়া हलक पाना]',
 'word_details': [{'juktakhyara': 'ଦେ. ବି',
   'meaning': '1। [କୌଣସି କାର୍ଯ୍ୟ କରିବାପାଇଁ] ସାହସ ଉପୁଜିବା। 1. Having the boldness or courage (to undertake a particular work). (ଯଥା—ଏଡ଼େ ଡେଙ୍ଗା ଗଛରେ ଚଢ଼ିବାକୁ ମୋର ହଲକ୍ ପାଉ ନାହିଁ।)',
   'verse': None}]}

In [12]:
# Collect the text to tokenize: every top-level key of the word list, plus
# string values and the keys/values of the dicts found inside list values.
# NOTE(review): a value that is itself a dict matches neither branch below,
# so such an entry contributes only its key — confirm this is intended.
lines = []
for k, v in di.items():
    lines.append(k)
    if isinstance(v, str):
        lines.append(v)
    elif isinstance(v, list):
        for d in v:
            for _k, _v in d.items():
                lines.append(_k)
                lines.append(_v)

# remove None
lines = [line for line in lines if line is not None]

# tokenize
clean_tokens = tokenize_text(lines)

# Drop every token that contains at least one ASCII English letter.
# Tokens without ASCII letters (other scripts, digits, punctuation) are kept.
english_letters = [c for c in 'abcdefghijklmnopqrstuvwxyz' + 'abcdefghijklmnopqrstuvwxyz'.upper()]
# Build the lookup set once instead of once per token.
english_letter_set = frozenset(english_letters)
odia_clean_tokens = []
for tokens in clean_tokens:
    odia_clean_tokens.append([token for token in tokens if english_letter_set.isdisjoint(token)])

tokenize: 100%|██████████| 121658/121658 [00:00<00:00, 145652.49 sentences/s]


In [13]:
# Inspect a sample of 20 filtered token lists (indices 1000-1019).
odia_clean_tokens[1000:1020]

[['ଅଗଲ୍କଟା'],
 ['ଅଗଳା'],
 ['ଅଗଷ୍ଟ'],
 ['ଅଗସ୍ତ'],
 ['ଅଗସ୍ତି'],
 ['ଅଗସ୍ତ୍ୟ'],
 ['ଅଗସ୍ତ୍ୟ', 'ଯାତ୍ରା'],
 ['ଅଗସ୍ତ୍ୟ', 'ସଂହିତା'],
 ['ଅଗସ୍ତ୍ୟକୂଟ'],
 ['ଅଗସ୍ତ୍ୟସର'],
 ['ଅଗସ୍ତ୍ୟାଶ୍ରମ'],
 ['ଅଗସ୍ତ୍ୟୋଦୟ'],
 ['ଅଗା'],
 ['ଅଗାଣ୍ଡିଆ'],
 ['ଅଗାତ୍ମଜା'],
 ['ଅଗାଦ'],
 ['ଅଗାଧ'],
 ['ଅଗାଧୁ'],
 ['ଅଗାଧୁଆ'],
 ['ଅଗାଧୁଆ', 'ଅପାଧୁଆ']]

### Build clean vocabulary

In [14]:
# Count token frequencies over the filtered clean tokens; build_vocab also prints summary statistics.
clean_vocab_counter = build_vocab(odia_clean_tokens)

build vocab and compute frequencies of tokens: 100%|██████████| 121658/121658 [00:00<00:00, 312936.01 sentences/s]


Number of sentences: 121,658
Number of unique words or equivalently, the size of vocabulary: 93,378
Number of running words: 170,497


In [15]:
# Show the 20 most frequent tokens of the clean vocabulary together with their counts.
clean_vocab_counter.most_common(n=20)

[(')', 2750),
 ('(', 2739),
 ('କରିବା', 1623),
 ('ଇତ୍ୟାଦି', 1472),
 ('ଧାତୁ', 1147),
 ('ହେବା', 911),
 ('ମାରିବା', 704),
 ('ଦେବା', 567),
 ('ହାତ', 268),
 ('ପଡ଼ିବା', 253),
 ('ରକ୍ତ', 244),
 ('ପାଣି', 227),
 ('ମନ', 208),
 ('ମୁହଁ', 203),
 ('ଯିବା', 202),
 ('ବେ', 194),
 ('ରଖିବା', 177),
 ('ପକାଇବା', 176),
 ('ଧରିବା', 171),
 ('ରସ', 161)]

In [16]:
# Persist the clean vocabulary (token -> frequency) as JSON, relative to the working directory.
write_vocab_to_json(clean_vocab_counter, 'clean_vocab_counter.json')