In [1]:
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
    processors
)

import json

In [2]:
# Start from an untrained WordPiece model that uses "[UNK]" as its
# unknown-token marker.
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)

# Normalization pipeline: NFD first, then Lowercase.
normalization_steps = [normalizers.NFD(), normalizers.Lowercase()]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

# Pre-tokenize with WhitespaceSplit.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [3]:
# Special tokens handed to the trainer alongside the learned vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[STR]", "[END]"]

# WordPiece trainer targeting a 30,000-entry vocabulary. The continuing-subword
# prefix is explicitly set to the empty string.
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    special_tokens=special_tokens,
    continuing_subword_prefix="",
)

In [4]:
# Base training corpus. The file is assumed to hold a JSON list of sentences
# (it is appended to below) — confirm against the data file.
with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
    sentences = json.load(f)

# Jargon dictionary: each value is expected to carry a 'meanings' list whose
# items may have an 'example' sentence.
with open('data/bgjargon.json', 'r', encoding="utf-8") as f:
    bgjargon = json.load(f)

# Add every non-empty usage example from the jargon dictionary to the corpus.
# `.get` plus a truthiness check skips meanings whose 'example' is missing,
# null, or empty instead of raising KeyError/TypeError on them.
for entry in bgjargon.values():
    for meaning in entry['meanings']:
        example = meaning.get('example')
        if example:
            sentences.append(example)

In [5]:
# Train the WordPiece vocabulary on the in-memory corpus. The list is passed
# directly (no iter() wrapper is needed), and `length` gives the trainer the
# total number of sequences so its progress tracking has a known total.
tokenizer.train_from_iterator(sentences, trainer=trainer, length=len(sentences))

In [6]:
# Look up the vocabulary ids of the start/end markers in the trained tokenizer.
str_token_id, end_token_id = (
    tokenizer.token_to_id(marker) for marker in ("[STR]", "[END]")
)
print(str_token_id, end_token_id)

2 3


In [7]:
# Wrap every single-sequence encoding as "[STR] <tokens> [END]". The ":0"
# suffixes are the template's type-id annotations for each piece.
# The template is a plain string: it contains no placeholders, so no f-prefix.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[STR]:0 $A:0 [END]:0",
    special_tokens=[("[STR]", str_token_id), ("[END]", end_token_id)],
)

In [8]:
# Smoke test: encode a sample sentence and show the resulting tokens,
# including the markers added by the post-processor.
sample_text = "Кво става бе животно"
encoding = tokenizer.encode(sample_text)
print(encoding.tokens)

['[STR]', 'кво', 'става', 'бе', 'животно', '[END]']


In [9]:
# Persist the trained tokenizer (model, normalizer, pre-tokenizer and
# post-processor) as a single JSON file.
tokenizer_path = "data/tokenizer_blitz_bgjargon.json"
tokenizer.save(tokenizer_path)

In [10]:
# Round-trip check: reload the tokenizer from disk and encode a sample word
# to confirm the saved file is usable.
saved_tokenizer_path = "data/tokenizer_blitz_bgjargon.json"
new_tokenizer = Tokenizer.from_file(saved_tokenizer_path)

sample_word = "Глупак"
encoding = new_tokenizer.encode(sample_word)
print(encoding.tokens)

['[STR]', 'глупак', '[END]']
