# Training BPE and WordPiece tokenizer for Nepali

### Dataset used here is mixture of [Oscar Corpus](https://www.kaggle.com/datasets/hsebarp/oscar-corpus-nepali), [NepCov19Tweets dataset](https://www.kaggle.com/datasets/mathew11111/nepcov19tweets), [Nepali News dataset large](https://www.kaggle.com/datasets/ashokpant/nepali-news-dataset-large), [Nepali News dataset](https://www.kaggle.com/datasets/lotusacharya/nepalinewsdataset), [nepali-wikipedia-articles](https://www.kaggle.com/datasets/disisbig/nepali-wikipedia-articles), [urdu-nepali-parallel-corpus](https://www.kaggle.com/datasets/rtatman/urdunepali-parallel-corpus).
> ### I cleaned Oscar corpus (as much as possible) in this [Notebook](https://www.kaggle.com/code/reganmaharjan/cleaning-oscar-nepali-dataset). Corpus is loaded from the output of that notebook.
> ### NepCov19Tweets data is loaded from this [Notebook](https://www.kaggle.com/code/reganmaharjan/preprocess-nepcov19tweets). Data is explored and cleaned in that notebook.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datasets
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus.csv
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/state.json
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/cache-5c88516ef37922dc.arrow
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/dataset_info.json
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/cache-d5f58b6fc0961879.arrow
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/cache-39791c99b32f9043.arrow
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/cache-182889154f7c98fe.arrow
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/cache-2f6e5ab7053b1330.arrow
/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus/dataset.arrow


In [2]:
data = datasets.Dataset.load_from_disk("/kaggle/input/nepali-corpus-and-tokenizer/NepaliCombinedCorpus")
data

Dataset({
    features: ['text'],
    num_rows: 1672420
})

In [3]:
nepali_alpha = ['क', 'का', 'कि', 'की', 'कु', 'कू', 'के', 'कै', 'को', 'कौ', 'क्', 'ख', 'खा', 'खि', 'खी', 'खु', 'खू', 'खे', 'खै', 'खो', 'खौ', 'ख्', 'ग', 'गा', 'गि', 'गी', 'गु', 'गू', 'गे', 'गै', 'गो', 'गौ', 'ग्', 'घ', 'घा', 'घि', 'घी', 'घु', 'घू', 'घे', 'घै', 'घो', 'घौ', 'घ्', 'ङ', 'ङा', 'ङि', 'ङी', 'ङु', 'ङू', 'ङे', 'ङै', 'ङो', 'ङौ', 'ङ्', 'च', 'चा', 'चि', 'ची', 'चु', 'चू', 'चे', 'चै', 'चो', 'चौ', 'च्', 'छ', 'छा', 'छी', 'छी', 'छु', 'छू', 'छे', 'छै', 'छो', 'छौ', 'छ्', 'ज', 'जा', 'जि', 'जी', 'जु', 'जू', 'जे', 'जै', 'जो', 'जौ', 'ज्', 'झ', 'झा', 'झि', 'झी', 'झु', 'झू', 'झे', 'झै', 'झो', 'झौ', 'झ्', 'ञ', 'ञ', 'ञि', 'ञी', 'ञु', 'ञू', 'ञे', 'ञै', 'ञो', 'ञौ', 'ञ्', 'ट', 'टा', 'टि', 'टी', 'टु', 'टू', 'टे', 'टै', 'टो', 'टौ', 'ट्', 'त', 'ता', 'ति', 'ती', 'तु', 'तू', 'ते', 'तै', 'तो', 'तौ', 'त्', 'ठ', 'ठा', 'ठि', 'ठी', 'ठु', 'ठू', 'ठे', 'ठै', 'ठो', 'ठौ', 'ठ्', 'थ', 'था', 'थि', 'थी', 'थु', 'थू', 'थे', 'थै', 'थो', 'थौ', 'थ्', 'ड', 'डा', 'डि', 'डी', 'डु', 'डू', 'डे', 'डै', 'डो', 'डौ', 'ड्', 'दा', 'दा', 'दि', 'दी', 'दु', 'दू', 'दे', 'दै', 'दो', 'दौ', 'द्', 'ढ', 'ढा', 'ढि', 'ढी', 'ढु', 'ढू', 'ढे', 'ढै', 'ढो', 'ढौ', 'ढ्', 'ध', 'धा', 'धि', 'धी', 'धु', 'धू', 'धे', 'धै', 'धो', 'धौ', 'ध्', 'न', 'ना', 'नि', 'नी', 'नु', 'नू', 'ने', 'नै', 'नो', 'नौ', 'न', 'न्', 'ण', 'णा', 'णि', 'णी', 'णु', 'णू', 'णे', 'णै', 'णो', 'णौ', 'ण्', 'म', 'मा', 'मि', 'मी', 'मु', 'मू', 'मे', 'मै', 'मो', 'मौ', 'म्', 'पा', 'पा', 'पि', 'पी', 'पु', 'पू', 'पे', 'पाई', 'पो', 'पौ', 'प्', 'फ', 'फा', 'फि', 'फी', 'फु', 'फू', 'फे', 'फै', 'फो', 'फौ', 'फ्', 'बा', 'बा', 'बि', 'बी', 'बु', 'बू', 'बे', 'बै', 'बो', 'बौ', 'ब्', 'भ', 'भा', 'भि', 'भी', 'भु', 'भू', 'भे', 'भै', 'भो', 'भौ', 'भ्', 'य', 'या', 'यि', 'यी', 'यु', 'यू', 'य्-ए', 'यै', 'यो', 'यौ', 'य्', 'रा', 'रा', 'रि', 'री', 'रु', 'रू', 'रे', 'रै', 'रो', 'रौ', 'र्', 'ला', 'ला', 'लि', 'ली', 'लु', 'लू', 'ले', 'लौ', 'लो', 'लौ', 'ल्', 'वा', 'वा', 'वि', 'वी', 'वु', 'वू', 'वे', 'वै', 'वो', 'वौ', 'व्', 'सा', 'सा', 'सि', 'सी', 'सु', 'सू', 'से', 'सै', 'सो', 'सौ', 'स्', 'शा', 'शा', 'शि', 'शी', 'शु', 'शू', 'शे', 'शै', 'शो', 'शौ', 'श्', 'षा', 'षा', 'षि', 'षी', 'षु', 'षू', 'षे', 'षै', 'षो', 'षौ', 'ष्', 'क्ष', 'क्षा', 'क्षि', 'क्षी', 'क्षु', 'क्षू', 'क्षे', 'क्षै', 'क्षो', 'क्षौ', 'क्ष्', 'त्र', 'त्रा', 'त्रि', 'त्री', 'त्रु', 'त्रू', 'त्रे', 'त्रै', 'त्रो', 'त्रौ', 'त्र्', 'ज्ञ', 'ज्ञा', 'ज्ञि', 'ज्ञी', 'ज्ञु', 'ज्ञू', 'ज्ञे', 'ज्ञै', 'ज्ञो', 'ज्ञौ', 'ज्ञ्', 'त्त', 'त्त', 'त्ति', 'त्ती', 'त्तु', 'त्तू', 'त्ते', 'त्तै', 'त्तो', 'त्तौ', 'त्त्', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ॠ', 'ॐ', 'ए', 'ऐ', 'ओ', 'औ', 'अ्']
nepali_alpha_tokens = {nepali_alpha[i]:i+1 for i in range(len(nepali_alpha))}
print(nepali_alpha_tokens)
len(nepali_alpha)

{'क': 1, 'का': 2, 'कि': 3, 'की': 4, 'कु': 5, 'कू': 6, 'के': 7, 'कै': 8, 'को': 9, 'कौ': 10, 'क्': 11, 'ख': 12, 'खा': 13, 'खि': 14, 'खी': 15, 'खु': 16, 'खू': 17, 'खे': 18, 'खै': 19, 'खो': 20, 'खौ': 21, 'ख्': 22, 'ग': 23, 'गा': 24, 'गि': 25, 'गी': 26, 'गु': 27, 'गू': 28, 'गे': 29, 'गै': 30, 'गो': 31, 'गौ': 32, 'ग्': 33, 'घ': 34, 'घा': 35, 'घि': 36, 'घी': 37, 'घु': 38, 'घू': 39, 'घे': 40, 'घै': 41, 'घो': 42, 'घौ': 43, 'घ्': 44, 'ङ': 45, 'ङा': 46, 'ङि': 47, 'ङी': 48, 'ङु': 49, 'ङू': 50, 'ङे': 51, 'ङै': 52, 'ङो': 53, 'ङौ': 54, 'ङ्': 55, 'च': 56, 'चा': 57, 'चि': 58, 'ची': 59, 'चु': 60, 'चू': 61, 'चे': 62, 'चै': 63, 'चो': 64, 'चौ': 65, 'च्': 66, 'छ': 67, 'छा': 68, 'छी': 70, 'छु': 71, 'छू': 72, 'छे': 73, 'छै': 74, 'छो': 75, 'छौ': 76, 'छ्': 77, 'ज': 78, 'जा': 79, 'जि': 80, 'जी': 81, 'जु': 82, 'जू': 83, 'जे': 84, 'जै': 85, 'जो': 86, 'जौ': 87, 'ज्': 88, 'झ': 89, 'झा': 90, 'झि': 91, 'झी': 92, 'झु': 93, 'झू': 94, 'झे': 95, 'झै': 96, 'झो': 97, 'झौ': 98, 'झ्': 99, 'ञ': 101, 'ञि': 102, 'ञी': 103, 'ञु':

411

In [4]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Strip,  StripAccents
normalizer = normalizers.Sequence([NFD(),Strip()])

t = 'कोरोनाको बढ्दो त्रास बेलायतबाट अमेरिका उड्नु अघि कोभिड को नेगेटिभ रिपोर्ट अनिवार्य! 9896983 '
normalizer.normalize_str(t)

'कोरोनाको बढ्दो त्रास बेलायतबाट अमेरिका उड्नु अघि कोभिड को नेगेटिभ रिपोर्ट अनिवार्य! 9896983'

In [5]:
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace, Digits, Punctuation
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(),Digits(individual_digits=True), Punctuation()])

t = 'कोरोनाको बढ्दो त्रास "बेलायतबाट" अमेरिका उड्नु अघि कोभिड को नेगेटिभ रिपोर्ट अनिवार्य! 9896983 '
print(pre_tokenizer.pre_tokenize_str(t))

[('कोरोनाको', (0, 8)), ('बढ्दो', (9, 14)), ('त्रास', (15, 20)), ('"', (21, 22)), ('बेलायतबाट', (22, 31)), ('"', (31, 32)), ('अमेरिका', (33, 40)), ('उड्नु', (41, 46)), ('अघि', (47, 50)), ('कोभिड', (51, 56)), ('को', (57, 59)), ('नेगेटिभ', (60, 67)), ('रिपोर्ट', (68, 75)), ('अनिवार्य', (76, 84)), ('!', (84, 85)), ('9', (86, 87)), ('8', (87, 88)), ('9', (88, 89)), ('6', (89, 90)), ('9', (90, 91)), ('8', (91, 92)), ('3', (92, 93))]


In [6]:
%%time
## Training BPE tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import WordPiece as wp_decoder
from tokenizers.trainers import BpeTrainer

print("Training BPE Tokenizer using HuggingFace Library")
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
bpe_tokenizer.normalizer = normalizer
bpe_tokenizer.pre_tokenizer = pre_tokenizer
bpe_tokenizer.decoder = wp_decoder() ## Here using wp_decoder because we use "##" as continuing_subword_prefix as in WordPiece.


def bpe_batch_iterator(dataset,batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]
        
bpe_trainer = BpeTrainer(vocab_size=30000,
                         show_progress=True,
                         special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                         initial_alphabet=nepali_alpha,
                         continuing_subword_prefix="##")

bpe_tokenizer.train_from_iterator(bpe_batch_iterator(data['text']),bpe_trainer,length=len(data))
bpe_tokenizer.save("/kaggle/working/Nepali_BPE.tokenizer")

print(bpe_tokenizer.get_vocab_size())
# print(bpe_tokenizer.get_vocab(with_added_tokens=True))

Training BPE Tokenizer using HuggingFace Library



30000
CPU times: user 15min 5s, sys: 9.02 s, total: 15min 14s
Wall time: 4min 37s


In [7]:
%%time
## Training WordPiece Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

print("Training WordPiece Tokenizer using HuggingFace Library")
wp_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
wp_tokenizer.normalizer = normalizer
wp_tokenizer.pre_tokenizer = pre_tokenizer
wp_tokenizer.decoder = wp_decoder()

wp_trainer = WordPieceTrainer(vocab_size=30000,
                              special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                              show_progress=True,
                              initial_alphabet=nepali_alpha
                             )

def wp_batch_iterator(dataset,batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]
        
wp_tokenizer.train_from_iterator(wp_batch_iterator(data['text']),wp_trainer,length=len(data))
wp_tokenizer.save("/kaggle/working/Nepali_Wordpiece.tokenizer")

print(wp_tokenizer.get_vocab_size())
# print(wp_tokenizer.get_vocab())

Training WordPiece Tokenizer using HuggingFace Library



30000
CPU times: user 16min 10s, sys: 7.12 s, total: 16min 17s
Wall time: 4min 43s


In [8]:
tokens = bpe_tokenizer.encode(data['text'][20000])
print(tokens.tokens)
print([x for x in data['text'][2000]])
print(tokens.attention_mask)
print(tokens.ids)

['कोभिड', 'को', 'जम्मा', 'पुष्टी', 'भएको', 'संख्या', 'जस', 'मध्ये', 'आज', 'मात्रै', 'जनामा', 'नयाँ', 'पुष्टी', 'भएको', 'छ', 'जस', 'मध्ये', 'उदयपुर', 'जिल्लाका', 'वटा', 'जनै', 'पुरुष', 'र', 'बर्षिय', 'र', 'चितवन', 'जिल्लाको', 'वटा', 'केस', 'पुरुष', 'र', 'बर्षिय']
['अ', 'म', 'े', 'र', 'ि', 'क', 'ा', 'म', 'ा', ' ', 'क', 'ो', 'र', 'ो', 'न', 'ा', ' ', 'म', 'ह', 'ा', 'म', 'ा', 'र', 'ी', 'स', 'ँ', 'ग', ' ', 'ल', 'ड', '्', 'न', 'क', 'ा', ' ', 'ल', 'ा', 'ग', 'ि', ' ', 'फ', 'े', 'र', 'ि', ' ', 'क', 'ो', 'भ', 'ि', 'ड', ' ', 'ट', 'ा', 'स', '्', 'क', ' ', 'फ', 'ो', 'र', '्', 'स', ' ', 'ग', 'ठ', 'न']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1568, 355, 2677, 16424, 333, 1575, 862, 2793, 719, 1188, 5869, 722, 16424, 333, 76, 862, 2793, 8136, 3036, 1158, 13408, 1790, 96, 12644, 96, 2663, 1998, 1158, 15763, 1790, 96, 12644]


In [9]:
### test data ids : 100, 1000, 10000, 100000, 1000000
tokens = wp_tokenizer.encode(data['text'][20000])
print(tokens.tokens)
print(data['text'][2000])
print(tokens.attention_mask)
print(tokens.ids)

['कोभिड', 'को', 'जम्मा', 'पुष्टी', 'भएको', 'संख्या', 'जस', 'मध्ये', 'आज', 'मात्रै', 'जनामा', 'नयाँ', 'पुष्टी', 'भएको', 'छ', 'जस', 'मध्ये', 'उदयपुर', 'जिल्लाका', 'वटा', 'जनै', 'पुरुष', 'र', 'बर्षिय', 'र', 'चितवन', 'जिल्लाको', 'वटा', 'केस', 'पुरुष', 'र', 'बर्षिय']
अमेरिकामा कोरोना महामारीसँग लड्नका लागि फेरि कोभिड टास्क फोर्स गठन
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1568, 355, 2677, 16424, 333, 1575, 862, 2793, 719, 1188, 5869, 722, 16424, 333, 76, 862, 2793, 8136, 3036, 1158, 13408, 1790, 96, 12644, 96, 2663, 1998, 1158, 15763, 1790, 96, 12644]


In [10]:
%%time
print("Computing Total distinct words and characters in the whole corpus")
### Bag of words calculation
# bow = set([t[0] for t in pre_tokenizer.pre_tokenize_str(" ".join(data['text']))])
# print(len(bow))

## Bag of Characters Calculation
boc = set("".join(data['text']))
print(len(boc))

Computing Total distinct words and characters in the whole corpus
172
CPU times: user 41.9 s, sys: 899 ms, total: 42.8 s
Wall time: 42.5 s


In [11]:
gc.collect()
" ".join(sorted(list(boc)))

'  ! " # $ % & \' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ [ \\ ] ^ _ ` { | } ¢ ʺ ʻ ʼ ˙ ँ ं ः अ आ इ ई उ ऊ ऋ ऌ ऍ ऎ ए ऐ ऑ ऒ ओ औ क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न ऩ प फ ब भ म य र ऱ ल ळ ऴ व श ष स ह ़ ा ि ी ु ू ृ ॄ ॅ ॆ े ै ॉ ॊ ो ौ ् ॐ ॑ ॒ ॓ ॔ क़ ख़ ग़ ज़ ड़ ढ़ फ़ य़ ॠ ॢ ॣ । ॥ ० १ २ ३ ४ ५ ६ ७ ८ ९ ॰ ॲ ॽ ঁ ং ਂ ਸ਼ ਜ਼ ੱ ં ‐ ‑ ‒ ‘ ’ ‚ ‛ “ ” „ ′ ″ 〜 ・'