In [None]:
!pip install tokenizers

In [12]:
import bz2
import os

from tokenizers import Tokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFC, NFD, Replace, StripAccents, Sequence
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import BertProcessing
from tokenizers.trainers import BpeTrainer

In [14]:
# Print the installed `tokenizers` version so the run can be reproduced later.
import tokenizers
print(tokenizers.__version__)

0.21.0


In [2]:
# === Step 1: Prepare morphemes ===
# Morpheme segmentation of the two words "прочитати" and "виробництво".
morpheme_vocab = [
    'про', 'чит', 'а', 'ти',
    'ви', 'робн', 'иц', 'тв', 'о',
]

# Special tokens handed to the BPE trainer.
special_tokens = [
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]

# bz2-compressed plain-text corpora used for training. Only the court corpus
# is enabled; the remaining corpora are kept here, disabled, for reference.
train_text_list = [
    '../data/text_corpus/ubertext.court.filter_rus_gcld+short.text_only.txt.bz2',
    # '../data/text_corpus/ubertext.fiction.filter_rus_gcld+short.text_only.txt.bz2',
    # '../data/text_corpus/ubertext.social.filter_rus_gcld+short.text_only.txt.bz2',
    # '../data/text_corpus/ubertext.wikipedia.filter_rus_gcld+short.text_only.txt.bz2',
]

In [3]:
# === Step 2: Prepare training data ===
def iter_bz2_lines(file_list: list):
    """Lazily yield the non-blank lines of several bz2-compressed text files.

    Files are read one after another in the order given. Each file is decoded
    as UTF-8 (undecodable bytes are dropped), every line is stripped of
    surrounding whitespace, and lines that end up empty are skipped.

    Args:
        file_list: Paths of the ``.bz2`` text files to read.

    Yields:
        One stripped, non-empty line (``str``) at a time.
    """
    for path in file_list:
        with bz2.open(path, mode="rt", encoding="utf-8", errors="ignore") as handle:
            stripped_lines = (raw.strip() for raw in handle)
            # filter(None, ...) keeps only the truthy, i.e. non-empty, strings.
            yield from filter(None, stripped_lines)

In [4]:
# === Step 3: Initialize Tokenizer ===
# BPE model; "[UNK]" is registered as the unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Characters passed to the trainer as its initial alphabet: Ukrainian Cyrillic
# letters, Latin letters, digits and basic punctuation.
# NOTE(review): the trailing " " is presumably never produced after Whitespace
# pre-tokenization, so it likely only occupies one vocabulary slot — confirm.
alphabet = list("абвгґдеєжзиіїйклмнопрстуфхцчшщьюяabcdefghijklmnopqrstuvwxyz0123456789.,!?\"' ")

# Normalize + pre-tokenize.
# Pipeline: NFD -> drop only the combining acute accent (U+0301, used as a
# stress mark) -> NFC -> lowercase.
# The previous NFD + StripAccents pipeline removed *every* combining mark. Since
# "й" and "ї" decompose under NFD ("и" + U+0306, "і" + U+0308), it rewrote them
# to "и" and "і", corrupting Ukrainian words and making the "й"/"ї" entries of
# `alphabet` unreachable. Recomposing with NFC keeps those letters intact.
tokenizer.normalizer = Sequence([NFD(), Replace("\u0301", ""), NFC(), Lowercase()])
tokenizer.pre_tokenizer = Whitespace()

# === Step 4: Configure the BPE trainer ===
# `initial_alphabet` holds single characters, so it is seeded with `alphabet`
# (not with morphemes); the merges themselves are learned from the corpus.
trainer = BpeTrainer(
    vocab_size=500,
    show_progress=True,
    initial_alphabet=alphabet,
    special_tokens=special_tokens
)

In [5]:
# Fit the BPE model on the lines streamed from the bz2 corpora.
tokenizer.train_from_iterator(iter_bz2_lines(train_text_list), trainer=trainer)

In [6]:
# Snapshot of the trained tokenizer's vocabulary, as returned by get_vocab().
vocab = tokenizer.get_vocab()

In [7]:
# Number of vocabulary entries; compare with the vocab_size passed to the trainer.
len(vocab)

500

In [None]:
# === Step 5: Save tokenizer ===
os.makedirs("morpheme_bpe_tokenizer", exist_ok=True)
# Full pipeline (normalizer, pre-tokenizer, special tokens and model) in one
# file, so the tokenizer can be restored with Tokenizer.from_file(...).
# Saving the model alone would drop everything except the BPE vocabulary/merges.
tokenizer.save(os.path.join("morpheme_bpe_tokenizer", "tokenizer.json"))
# Raw BPE model files, kept for consumers that read them directly.
tokenizer.model.save("morpheme_bpe_tokenizer")

In [None]:
# === Optional: Test it ===
# Look the padding id up in the trained vocabulary instead of hard-coding 0,
# so padding stays correct even if the special-token order changes.
pad_token = "[PAD]"
pad_id = tokenizer.token_to_id(pad_token)
if pad_id is None:
    raise ValueError(f"{pad_token} is missing from the tokenizer vocabulary")
tokenizer.enable_padding(pad_id=pad_id, pad_token=pad_token)

# Encode the two sample words and show the resulting sub-word tokens.
output = tokenizer.encode("прочитати виробництво")
print("Tokens:", output.tokens)