In [1]:
import os

import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [18]:
# 1. Read morphemes from file
# The vocabulary CSV is read as UTF-8; its 'morphemes' column becomes a plain
# Python list of strings.
vocabulary_df = pd.read_csv('../data/vocabulary/knu_vocabulary_with_counts.csv', encoding='utf-8')
# Drop missing cells before the string cast: astype('str') would otherwise turn
# every NaN into the literal text 'nan', which would then be treated as a morpheme.
morphemes_list = vocabulary_df['morphemes'].dropna().astype('str').tolist()

In [19]:
# Sanity check: display how many morphemes were loaded.
len(morphemes_list)

29952

In [20]:
# 2. Init new tokenizer
# Build the BPE model with "[UNK]" as its unknown token, then wrap it in a Tokenizer.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

In [21]:
# 3. Add pre-tokenizer and normalizer
# Pre-tokenization step: the library's Whitespace pre-tokenizer.
tokenizer.pre_tokenizer = Whitespace()
# Normalization step: the library's NFD normalizer.
tokenizer.normalizer = NFD()

In [22]:
# Base symbols, one list entry per character: lowercase Ukrainian letters,
# the digits 0-9, a few punctuation marks and the space character.
base_characters = [*"абвгґдеєжзиіїйклмнопрстуфхцчшщьюя0123456789.,!?\"' "]

In [23]:
# 4. Set up a trainer
# Vocabulary budget: one slot per loaded morpheme plus one per base character.
target_vocab_size = len(base_characters) + len(morphemes_list)
trainer = BpeTrainer(
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    initial_alphabet=[],  # Left empty on purpose: the morphemes themselves are the data
    vocab_size=target_vocab_size,
)

In [24]:
# 5. Train the tokenizer using your morphemes as data
# The training corpus is the base characters followed by every morpheme.
training_corpus = base_characters + morphemes_list
tokenizer.train_from_iterator(training_corpus, trainer=trainer)

In [25]:
# 6. Saving
# Write the trained model files into the output folder. The folder is created
# first so this cell also runs on a fresh checkout where it does not exist yet.
# NOTE(review): only the model files are written here; if the normalizer and
# pre-tokenizer settings must be restorable too, consider also calling
# tokenizer.save(...) - confirm against whatever loads these files.
output_dir = "../tokenizers/knu_30k"
os.makedirs(output_dir, exist_ok=True)
tokenizer.model.save(output_dir)

['../tokenizers/knu_30k\\vocab.json', '../tokenizers/knu_30k\\merges.txt']