In [1]:
import regex as re
import os
import pickle

import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

In [2]:
# Minimum number of occurrences a token needs in the training paragraphs to get
# its own vocabulary entry; passed as `min_freq` to build_vocab_from_iterator below.
min_freq_to_include_in_vocab = 50

In [3]:
def preProcessText(text):
    """Normalise raw Nepali text so that whitespace tokenisation yields clean tokens.

    Steps, applied in this order:
      1. Put a single space on each side of the danda (U+0964), '?' and ','
         so each becomes a separate token.
      2. Collapse any whitespace around line breaks into one newline.
      3. Delete every character that is not in the Devanagari block
         (U+0900-U+097F), a comma, a question mark or whitespace.
      4. Replace each run of Devanagari digits (U+0966-U+096F) with the
         token ``<num>``, padded with spaces.

    Line breaks are preserved: the punctuation and number rules only absorb
    horizontal whitespace, so paragraphs are never merged.

    Args:
        text: Raw text (str); may contain several newline-separated paragraphs.

    Returns:
        The cleaned text (str). Lines may keep leading/trailing spaces and
        double spaces; a whitespace tokenizer ignores them.
    """
    # Whitespace except '\n'. Using plain \s* here would swallow the line break
    # next to a danda/number and silently join two paragraphs.
    hspace = r'[^\S\n]*'

    # Replacement strings are plain literals (no backslash escapes) so they
    # behave the same under both the stdlib `re` and the `regex` package.
    # put space around the devanagari danda to make it a separate word
    text = re.sub(hspace + r'\u0964' + hspace, ' \u0964 ', text)
    # put space around the question mark to make it a separate word
    text = re.sub(hspace + r'\?' + hspace, ' ? ', text)
    # put space around the comma
    text = re.sub(hspace + r',' + hspace, ' , ', text)
    # remove space around the new line character (also collapses blank lines)
    text = re.sub(r'\s*\n\s*', '\n', text)
    # drop every non-Devanagari character; note there is no '+' in the class,
    # a '+' inside [...] is a literal and would keep plus signs in the text
    text = re.sub(r'[^\u0900-\u097F,?\s]', '', text)
    # replace Devanagari digit runs by <num>; this must come after the strip
    # above, which would otherwise delete the '<' and '>' of the token.
    # Only U+0966-U+096F are digits; U+0970 and above are signs/letters.
    text = re.sub(hspace + r'[\u0966-\u096F]+' + hspace, ' <num> ', text)
    return text

In [4]:
# Load the preprocessed corpus, building it from the raw dump on first run.
# The cached file is only regenerated when it is missing, so delete it after
# changing preProcessText to force a rebuild.
raw_path = 'data/ne_dedup.txt'
file_path = 'data/preprocessed_ne_dedup.txt'
if not os.path.exists(file_path):
    with open(raw_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # The raw file is closed before the (slow) preprocessing starts.
    print("Preprocessing file")
    text = preProcessText(text)
    # Write to the same path that the existence check above looks at.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)
else:
    print(f"Reading file  : {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

Reading file  : data/preprocessed_ne_dedup.txt


In [5]:
# Treat every line of the preprocessed corpus as one paragraph.
paragraph_list = text.split('\n')
print(len(paragraph_list))

319566


In [6]:
# The first `train_split` paragraphs feed the vocabulary; the rest is held out.
train_split = 300_000

train_iter, test_iter = paragraph_list[:train_split], paragraph_list[train_split:]
print(len(train_iter), len(test_iter))

300000 19566


In [7]:
# Peek at the first three training paragraphs to eyeball the preprocessing.
train_iter[:3]

['बर्दिबास नगरपालिकाको तेस्रो नगर परिषदबाट पारित आव <num> । <num> को संशोधित र <num> । <num> को प्रस्तावित नीति , कार्यक्रम तथा बजेट',
 'अार्थिक वर्ष <num> काे नदिजन्य पदार्थकाे उत्खनन् गरी बिक्रि वितरण तथा अान्तरिक निकासी गर्ने कार्यकाे बाेलपत्र सम्बन्धी सुचना',
 'सक्षार सप्तरी अभियानमा सप्तरीबासी सम्पूर्ण सरोकारवालाहरुको सहयोग र सहभागिताकाो लागि अनुराोध छ ।  सामुदायिक अध्ययन केन्द्रहरूको नविकरण सम्बन्धमा । ']

In [8]:
# Tokenise each training paragraph lazily and count tokens into a vocabulary.
tokenizer = get_tokenizer(None)
vocab = build_vocab_from_iterator(
    (tokenizer(paragraph) for paragraph in train_iter),
    specials=['<unk>'],
    min_freq=min_freq_to_include_in_vocab,
)
# Any token missing from the vocabulary is looked up as '<unk>'.
vocab.set_default_index(vocab['<unk>'])

In [9]:
# Vocabulary size, including the '<unk>' special token.
len(vocab)

60507

In [14]:
# Persist the tokenizer and the vocabulary under ./tokenizer.
tokenizer_dir = "tokenizer"
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.pth")
vocab_path = os.path.join(tokenizer_dir, "vocab.pkl")

## saving the tokenizer and vocab
torch.save(tokenizer, tokenizer_path)
with open(vocab_path, 'wb') as file:
    pickle.dump(vocab, file)

In [11]:
## loading the tokenizer and vocab
## loading the tokenizer and vocab
# NOTE(review): both calls below unpickle arbitrary Python objects; only load
# files that this notebook wrote itself, never files from an untrusted source.
loaded_tokenizer = torch.load(tokenizer_path)
with open(vocab_path, 'rb') as file:
    loaded_vocab = pickle.load(file)

In [12]:
# Show the type of the token -> index mapping and its first five entries.
stoi = vocab.vocab.get_stoi()
print(type(stoi))
for token, index in list(stoi.items())[:5]:
    print(f'{token}: {index}')

<class 'dict'>
हेर्नकै: 60506
हेफर: 60505
हुलहुज्जत: 60504
हीत: 60501
हिलटेक: 60500


## Sample Test for the tokenization process

In [15]:
# Render the code points used in preProcessText's patterns: danda, question mark,
# comma, and both ends of the U+0900-U+097F range; U+2020 is printed as well.
print("\u0964", "\u003f", "\u002c", "\u0900", "\u097F", "\u2020")

। ? , ऀ ॿ †


In [17]:
# Small hand-made sample mixing Devanagari, Latin letters, '?', ',', digits and '|'.
# NOTE: this rebinds `text`, which held the full corpus in the cells above;
# re-run the loading cell before reusing the corpus.
text = "आधिकारिकabcd निर्णयक? cbd निर्णयक२७, २७२७२७२७क |"
text = preProcessText(text)
print(text)

आधिकारिक निर्णयक ?  निर्णयक <num> , <num> क 


In [18]:
# Split the sample into paragraphs.
# NOTE: this rebinds `train_iter`, replacing the corpus training split.
train_iter = text.split("\n")
train_iter

['आधिकारिक निर्णयक ?  निर्णयक <num> , <num> क ']

In [19]:
# Build a vocabulary from the current `train_iter` with no frequency cut-off.
# NOTE: this rebinds `tokenizer` and `vocab`.
tokenizer = get_tokenizer(None)
vocab = build_vocab_from_iterator(
    (tokenizer(paragraph) for paragraph in train_iter),
    specials=['<unk>'],
)
# Tokens missing from the vocabulary are looked up as '<unk>'.
vocab.set_default_index(vocab['<unk>'])

In [20]:
# Index -> token list of the sample vocabulary, preceded by its length.
itos = vocab.get_itos()
print(len(itos), itos)

7 ['<unk>', '<num>', 'निर्णयक', ',', '?', 'आधिकारिक', 'क']


In [21]:
# Token -> index mapping of the sample vocabulary.
vocab.get_stoi()

{'क': 6, 'आधिकारिक': 5, '?': 4, ',': 3, 'निर्णयक': 2, '<num>': 1, '<unk>': 0}

In [22]:
# Tokenise a two-line string; the output below shows the embedded newline and
# the repeated spaces acting purely as separators.
tokenizer("आधिकारिक निर्णयक ?  निर्णयक \n आधिकारिक निर्णयक ?  निर्णयक ")

['आधिकारिक', 'निर्णयक', '?', 'निर्णयक', 'आधिकारिक', 'निर्णयक', '?', 'निर्णयक']

In [23]:
# Size of the sample vocabulary, including '<unk>'.
len(vocab)

7

In [24]:
# Display the vocabulary object's repr.
vocab

Vocab()

In [32]:
## save tokenizer and vocab
## save tokenizer and vocab
# Round-trip the tokenizer through a file in the current working directory.
# NOTE: this rebinds `loaded_tokenizer` from the earlier loading cell.
torch.save(tokenizer, 'tokenizer.pth')
loaded_tokenizer = torch.load('tokenizer.pth')

In [33]:
# Pickle the current vocabulary to vocab.pkl in the working directory.
with open('vocab.pkl', 'wb') as file:
    pickle.dump(vocab, file)

In [34]:
# Read the pickled vocabulary back; this rebinds `loaded_vocab`.
# NOTE(review): pickle.load executes arbitrary code from the file, so only use
# it on files written by this notebook.
with open('vocab.pkl', 'rb') as file:
    loaded_vocab = pickle.load(file)

In [35]:
# Mapping of the reloaded vocabulary, to compare against the in-memory one.
loaded_vocab.get_stoi()

{'क': 6, 'आधिकारिक': 5, '?': 4, ',': 3, 'निर्णयक': 2, '<num>': 1, '<unk>': 0}

In [36]:
# Mapping of the in-memory vocabulary, for comparison with the reloaded copy.
vocab.get_stoi()

{'क': 6, 'आधिकारिक': 5, '?': 4, ',': 3, 'निर्णयक': 2, '<num>': 1, '<unk>': 0}