In [11]:
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
    processors
)

import json

In [12]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[unk]"))
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase()]
)
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [13]:
special_tokens = ["[unk]", "[pad]", "[str]", "[end]"]
trainer = trainers.WordPieceTrainer(vocab_size=30000, special_tokens=special_tokens, continuing_subword_prefix = '')

In [14]:
with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
    sentences = json.load(f)

# with open('data/blitz_articles.json', 'r', encoding="utf-8") as f:
#     sentences += [a['name'] + ' ' + a['content'] for a in json.load(f)]

# with open('data/dnes_bg_articles.json', 'r', encoding="utf-8") as f:
#     sentences += [a['name'] + ' ' + a['content'] for a in json.load(f)]

# with open('data/pik_articles.json', 'r', encoding="utf-8") as f:
#     sentences += [a['name'] + ' ' + a['content'] for a in json.load(f)]

# with open('data/bgjargon.json', 'r', encoding="utf-8") as f:
#     bgjargon = json.load(f)

# for v in bgjargon.values():
#     for meaning in v['meanings']:
#         if len(meaning['example']) > 0:
#             sentences.append(meaning['example'])

In [15]:
tokenizer.train_from_iterator(iter(sentences), trainer=trainer)

In [17]:
tokenizer.save("data/tokenizer_comments.json")

In [16]:
encoding = tokenizer.encode("Кво става бе животно")
print(encoding.tokens)

['кво', 'става', 'бе', 'животно']


In [6]:
str_token_id = tokenizer.token_to_id("[str]")
end_token_id = tokenizer.token_to_id("[end]")
print(str_token_id, end_token_id)

2 3


In [7]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[str]:0 $A:0 [end]:0",
    special_tokens=[("[str]", str_token_id), ("[end]", end_token_id)],
)

In [8]:
encoding = tokenizer.encode("Кво става бе животно")
print(encoding.tokens)

['[STR]', 'кво', 'става', 'бе', 'животно', '[END]']


In [9]:
tokenizer.save("data/tokenizer_comments_bgjargon_articles.json")

In [10]:
new_tokenizer = Tokenizer.from_file("data/tokenizer_comments_end_start.json")
encoding = new_tokenizer.encode("тъоак")
print(encoding.tokens)

['[STR]', 'тъ', 'о', 'ак', '[END]']
