# Nepali Pretrained Tokenizers examples

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Hugging Face Hub repository IDs of the tokenizers compared in this notebook.
BERT_SHUSHANT = 'Shushant/nepaliBERT'
BERT_NOWALAB = 'nowalab/nepali-bert-npvec1'
BERT_RAJAN = 'Rajan/NepaliBERT'
DistilBERT_SAKONII = 'Sakonii/distilbert-base-nepali'
RoBERTa_AMITNESS = 'amitness/nepbert'
DeBERTa_SAKONII = 'Sakonii/deberta-base-nepali'
XLM_RoBERTa_BASE = 'xlm-roberta-base'
XLM_BERT_BASE = 'bert-base-multilingual-uncased'

# Order matters: later cells pick tokenizers by their position in this list.
tokenizer_ids = [
    BERT_SHUSHANT,
    BERT_NOWALAB,
    BERT_RAJAN,
    DistilBERT_SAKONII,
    RoBERTa_AMITNESS,
    DeBERTa_SAKONII,
    XLM_RoBERTa_BASE,
    XLM_BERT_BASE,
]

# Nepali example sentences that every tokenizer is run on.
sentences = [
    'नेपाल आफ्नै संस्कृतिका कारण विश्वमा सबैतिर चिनिएको हो ।',
    'स्वास्थ्य तथा जनसंख्या मन्त्रालयले गत असार ९ गतेदेखि १५ गतेसम्म खोप लगाएका बालबालिकालाई आजदेखि दोस्रो मात्रा लगाउन थालेको हो ।',
    'युरोपमा केही दिनयताको उच्च तापक्रमसँगै फैलिएको डढेलोका कारण जनजीवन अस्तव्यस्त बनेको छ ।',
    'काठमाडौं महानगरपालिकाले साउन १ गतेदेखि कुहिने र नकुहिने फोहोरलाई छुट्टाछुट्टै दिनमा संकलन गर्ने भएको छ ।',
    'काठमाडौंको नागार्जुनमा चितुवाको आक्रमणबाट पाँच जना घाइते भएका छन् ।',
]

## Load pretrained tokenizers

In [6]:
# Load every tokenizer once, in the same order as `tokenizer_ids`, so that
# `tokenizers[i]` always corresponds to `tokenizer_ids[i]`.
# The loop variable is deliberately not called `id`: at cell level that name
# would shadow the builtin `id()` for the rest of the kernel session.
tokenizers = []
for tokenizer_id in tokenizer_ids:
    print('Loading', tokenizer_id, '...')
    tokenizers.append(AutoTokenizer.from_pretrained(tokenizer_id))

def tokenize_sentence(text):
    """Print how each loaded tokenizer splits ``text``.

    Iterates over the module-level ``tokenizers`` list and, for each entry,
    prints a header naming the tokenizer (taken from ``tokenizer_ids`` at the
    same position), the space-joined tokens, and a trailing blank line.

    Args:
        text: The sentence to tokenize.

    Returns:
        None. The results are only printed.
    """
    for position in range(len(tokenizers)):
        # Header first, so the tokenizer is identified before its output.
        print('Tokenizing using', tokenizer_ids[position])
        pieces = tokenizers[position].tokenize(text)
        print(' '.join(pieces))
        print()

Loading Shushant/nepaliBERT ...
Loading nowalab/nepali-bert-npvec1 ...
Loading Rajan/NepaliBERT ...
Loading Sakonii/distilbert-base-nepali ...
Loading amitness/nepbert ...
Loading Sakonii/deberta-base-nepali ...
Loading xlm-roberta-base ...
Loading bert-base-multilingual-uncased ...


Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 9.35kB/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 206kB/s]
Downloading: 100%|██████████| 851k/851k [00:02<00:00, 403kB/s]  
Downloading: 100%|██████████| 1.64M/1.64M [00:03<00:00, 560kB/s] 


### Tokenizer info

In [7]:
# Print each tokenizer's repr followed by a blank line.
# A plain loop is used instead of a list comprehension: a comprehension built
# only for its side effects also makes the cell echo a list of `None` values.
for tokenizer in tokenizers:
    print(f'{tokenizer}\n')

PreTrainedTokenizerFast(name_or_path='Shushant/nepaliBERT', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

PreTrainedTokenizerFast(name_or_path='nowalab/nepali-bert-npvec1', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

PreTrainedTokenizerFast(name_or_path='Rajan/NepaliBERT', vocab_size=50000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

PreTrainedTokenizerFast(name_or_path='Sakonii/di

[None, None, None, None, None, None, None, None]

In [23]:
# For each example sentence, show how every loaded tokenizer splits it.
for idx, sentence in enumerate(sentences):
    print(f'Sentence {idx+1}: {sentence}', end='\n\n')  # header plus one blank line
    tokenize_sentence(sentence)
    print('=' * 150)  # ruler separating consecutive sentences

Sentence 1: नेपाल आफ्नै संस्कृतिका कारण विश्वमा सबैतिर चिनिएको हो ।

Tokenizing using Shushant/nepaliBERT
नपा ##ल आफ ##न सस ##कत ##िका कारण विश ##वमा सब ##तिर चिनिएको हो ।

Tokenizing using nowalab/nepali-bert-npvec1
नपा ##ल आफ ##न सस ##कत ##िका कारण [UNK] सब ##तिर चिनिए ##को हो ।

Tokenizing using Rajan/NepaliBERT
नपा ##ल आफ ##न सस ##कति ##का कारण विश ##वमा सब ##तिर चिनिएको हो ।

Tokenizing using Sakonii/distilbert-base-nepali
▁नेपाल ▁आफ्नै ▁संस्कृति का ▁कारण ▁विश्वमा ▁सबैतिर ▁चिनिए को ▁हो ▁।

Tokenizing using amitness/nepbert
à¤¨ à¥ĩ à¤ª à¤¾ à¤² Ġà¤Ĩà¤« à¥į à¤¨ à¥Ī Ġà¤¸ à¤Ĥ à¤¸ à¥į à¤ķ à¥ĥ à¤¤ à¤¿ à¤ķ à¤¾ Ġà¤ķ à¤¾ à¤°à¤£ Ġà¤µ à¤¿ à¤¶ à¥į à¤µà¤® à¤¾ Ġà¤¸à¤¬ à¥Ī à¤¤ à¤¿ à¤° Ġà¤ļ à¤¿ à¤¨ à¤¿ à¤ıà¤ķ à¥ĭ Ġà¤¹ à¥ĭ Ġà¥¤

Tokenizing using Sakonii/deberta-base-nepali
▁नेपाल ▁आफ्नै ▁संस्कृति का ▁कारण ▁विश्वमा ▁सबैतिर ▁चिनिए को ▁हो ▁।

Tokenizing using xlm-roberta-base
▁नेपाल ▁आफ्नै ▁संस्कृति का ▁कारण ▁विश्व मा ▁सबै तिर ▁चिन िएको ▁हो ▁।

Tokenizing using bert-base-multilingual-uncased
नपाल आ ##

In [10]:
# `TEXT_1` is not defined anywhere in this notebook, so the original call fails
# on a fresh kernel. The outputs recorded for the neighbouring cells match the
# second example sentence, so that entry is used directly.
tokenize_sentence(sentences[1])

NameError: name 'tokenize_sentence' is not defined

In [2]:
# Load a single tokenizer for a closer look. The repository ID comes from the
# shared constant rather than a repeated string literal, matching how the other
# cells refer to checkpoints.
tokenizer = AutoTokenizer.from_pretrained(BERT_NOWALAB)

Downloading: 100%|██████████| 466/466 [00:00<00:00, 155kB/s]
Downloading: 100%|██████████| 527k/527k [00:02<00:00, 255kB/s]  


In [5]:
# Space-joined tokens for the second example sentence. The original referenced
# `TEXT_1`, which is never defined in this notebook; the recorded output below
# corresponds to `sentences[1]`.
' '.join(tokenizer.tokenize(sentences[1]))

'[UNK] तथा जनस ##ख ##या मन ##तरा ##लय ##ल गत असार [UNK] गत ##द ##खि [UNK] गत ##सम ##म खोप लगाए ##का [UNK] आज ##द ##खि दोस ##रो मात ##रा लगाउन थाल ##को हो ।'

In [7]:
# Same sentence, tokenized with the Shushant/nepaliBERT tokenizer for comparison.
# The original referenced `TEXT_1`, which is never defined in this notebook; the
# recorded output below corresponds to `sentences[1]`.
tokenizer = AutoTokenizer.from_pretrained(BERT_SHUSHANT)
' '.join(tokenizer.tokenize(sentences[1]))

Downloading: 100%|██████████| 589/589 [00:00<00:00, 589kB/s]
Downloading: 100%|██████████| 516k/516k [00:02<00:00, 206kB/s]  


'सवा ##स ##थ ##य तथा जनस ##ख ##या मन ##तर ##ालय ##ल गत असार ९ गत ##द ##खि १५ गत ##सम ##म खोप लगाएका बालबालिकालाई आज ##द ##खि दोस ##रो मा ##तर ##ा लगाउन थाल ##को हो ।'

## Load pre-trained models

In [12]:
from transformers import AutoModelForSequenceClassification

In [13]:
# Load the `amitness/nepbert` checkpoint with a sequence-classification head.
# The checkpoint's config lists `RobertaForMaskedLM` as its architecture, so the
# warning recorded below (unused `lm_head.*` weights, newly initialized
# `classifier.*` weights) is the expected outcome of this cell.
np_roberta = AutoModelForSequenceClassification.from_pretrained(RoBERTa_AMITNESS)

Some weights of the model checkpoint at amitness/nepbert were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at amitness/nepbert and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.o

In [15]:
# Every token string in the vocabulary of the tokenizer at position 4
# (the `RoBERTa_AMITNESS` entry of `tokenizer_ids`).
all_roberta_tokens = list(tokenizers[4].get_vocab())

In [16]:
# Number of tokens in the vocabulary list built in the previous cell.
len(all_roberta_tokens)

52000

In [6]:
# Display the module tree of the loaded sequence-classification model.
np_roberta

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [18]:
# Display the configuration object attached to the loaded model.
np_roberta.config

RobertaConfig {
  "_name_or_path": "amitness/nepbert",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

In [22]:
# Token ids for the first example sentence, shown as a string.
# `''.join(str(x))` only split the string into characters and glued them back
# together, so plain `str(...)` gives the identical result.
str(tokenizers[4].encode(sentences[0]))

'[0, 267, 270, 281, 264, 272, 409, 265, 267, 290, 279, 308, 278, 265, 268, 350, 274, 269, 268, 264, 286, 264, 358, 304, 269, 307, 265, 620, 264, 455, 290, 274, 269, 266, 336, 269, 267, 269, 314, 271, 302, 271, 296, 2]'

In [23]:
# Display the backbone wrapped by the classification model (the recorded output
# below is a `RobertaModel`). The attribute is `base_model`; `base_mode` does not
# exist on the model and raises AttributeError.
np_roberta.base_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(52000, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Drop

In [8]:
# Swap the example list for one short sentence.
# NOTE: this rebinds `sentences`; cells below this point see only this entry.
sentences = [
    'म घर जाँदैगर्दा रुखबाट स्याउ खस्यो',
]

In [9]:
# Tokenize all sentences
for idx, sentence in enumerate(sentences):
    print(f'Sentence {idx+1}: {sentence}')
    print()
    tokenize_sentence(sentence)
    print('='*150)

Sentence 1: म घर जाँदैगर्दा रुखबाट स्याउ खस्यो

Tokenizing using Shushant/nepaliBERT
म घर जा ##द ##गर ##दा र ##ख ##बाट सय ##ाउ खस ##यो

Tokenizing using nowalab/nepali-bert-npvec1
म घर जा ##द ##गर ##दा रख ##बाट सय ##ाउ खस ##यो

Tokenizing using Rajan/NepaliBERT
म घर जा ##द ##गर ##दा रख ##बाट सय ##ाउ खस ##यो

Tokenizing using Sakonii/distilbert-base-nepali
▁म ▁घर ▁जाँदै गर्दा ▁रुख बाट ▁स्याउ ▁खस ्यो

Tokenizing using amitness/nepbert
à¤® Ġà¤ĺà¤° Ġà¤ľ à¤¾à¤ģ à¤¦ à¥Ī à¤Ĺà¤° à¥į à¤¦ à¤¾ Ġà¤° à¥ģ à¤ĸà¤¬ à¤¾ à¤Ł Ġà¤¸ à¥į à¤¯ à¤¾ à¤ī Ġà¤ĸà¤¸ à¥į à¤¯ à¥ĭ

Tokenizing using Sakonii/deberta-base-nepali
▁म ▁घर ▁जाँदै गर्दा ▁रुख बाट ▁स्याउ ▁खस ्यो

Tokenizing using xlm-roberta-base
▁म ▁घर ▁जा ँदै गर ्दा ▁रुख बाट ▁स्या उ ▁खस ्यो

Tokenizing using bert-base-multilingual-uncased
म घर जा ##द ##गर ##दा र ##ख ##बाट स ##या ##उ ख ##स ##यो



## Tokenization Process

In [13]:
# Pick tokenizers by their repository ID rather than by a bare list position, so
# the selection is self-describing and keeps working if `tokenizer_ids` is
# reordered. `tokenizers` is filled in the same order as `tokenizer_ids`.
mBERT_tokenizer = tokenizers[tokenizer_ids.index(XLM_BERT_BASE)]  # bert-base-multilingual-uncased
bert_tokenizer = tokenizers[tokenizer_ids.index(BERT_RAJAN)]  # Rajan/NepaliBERT

In [14]:
# Example sentence used to walk through the tokenization steps below.
sent1 = (
    'स्वास्थ्य तथा जनसंख्या मन्त्रालयले गत असार ९ गतेदेखि १५ गतेसम्म खोप लगाएका बालबालिकालाई आजदेखि दोस्रो मात्रा लगाउन थालेको हो ।'
)

In [15]:
# Split the example sentence into subword tokens and show how many there are.
tokenized_sent1 = bert_tokenizer.tokenize(sent1)
len(tokenized_sent1)

38

In [16]:
# Show the token strings themselves (`##` marks a continuation piece).
print(tokenized_sent1)

['सवा', '##स', '##थ', '##य', 'तथा', 'जन', '##स', '##ख', '##या', 'मन', '##तर', '##ालय', '##ल', 'गत', 'असार', '[UNK]', 'गत', '##द', '##खि', '[UNK]', 'गत', '##सम', '##म', 'खोप', 'लगाएका', 'बालबालिकालाई', 'आज', '##द', '##खि', 'दोस', '##रो', 'मात', '##रा', 'लगाउन', 'थाल', '##को', 'हो', '।']


In [17]:
# Encode the same sentence straight to token ids and show the id count.
# The recorded count is two more than the token count; the decoded text further
# below shows a leading [CLS] and a trailing [SEP].
encoded_sent1 = bert_tokenizer.encode(sent1)
len(encoded_sent1)

40

In [18]:
# Show the raw token ids.
print(encoded_sent1)

[2, 13050, 326, 356, 320, 736, 829, 326, 336, 738, 1184, 6019, 683, 315, 964, 2910, 1, 964, 335, 29703, 1, 964, 991, 314, 10589, 4711, 5980, 1334, 335, 29703, 18279, 723, 32093, 555, 3531, 1968, 518, 589, 163, 3]


In [19]:
# Turn the ids back into text to see what the tokenizer actually kept.
bert_tokenizer.decode(encoded_sent1)

'[CLS] सवासथय तथा जनसखया मनतरालयल गत असार [UNK] गतदखि [UNK] गतसमम खोप लगाएका बालबालिकालाई आजदखि दोसरो मातरा लगाउन थालको हो । [SEP]'

In [20]:
# Map each id back to its token string, special tokens included.
print(bert_tokenizer.convert_ids_to_tokens(encoded_sent1))

['[CLS]', 'सवा', '##स', '##थ', '##य', 'तथा', 'जन', '##स', '##ख', '##या', 'मन', '##तर', '##ालय', '##ल', 'गत', 'असार', '[UNK]', 'गत', '##द', '##खि', '[UNK]', 'गत', '##सम', '##म', 'खोप', 'लगाएका', 'बालबालिकालाई', 'आज', '##द', '##खि', 'दोस', '##रो', 'मात', '##रा', 'लगाउन', 'थाल', '##को', 'हो', '।', '[SEP]']


In [None]:
# `prepare_for_model` needs a list of token ids as its first argument; calling
# it with no arguments raises a TypeError. Feed it the ids of the tokens
# produced above: with its default settings it adds the special tokens and
# returns the model-ready encoding for this sentence.
bert_tokenizer.prepare_for_model(bert_tokenizer.convert_tokens_to_ids(tokenized_sent1))