In [1]:
!pip install tokenizers==0.21.0



In [2]:
import os
import re
import unicodedata
from itertools import islice

from tokenizers import Tokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFC, NFD, Lowercase, Sequence, StripAccents
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer, Punctuation, Whitespace
from tokenizers.processors import BertProcessing
from tokenizers.trainers import BpeTrainer

In [3]:
# === Step 1: Define special tokens ===
# Reserved control tokens handed to the BPE trainer via `special_tokens=`.
special_tokens = [
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]

In [4]:
# === Step 2: Prepare training data ===
def batch_line_iterator(file_list, batch_size=32):
    """Yield batches of cleaned, non-empty lines read from text files.

    Each file is read as UTF-8 (undecodable bytes are ignored). Every line is
    stripped, composed to Unicode NFC, and then filtered so that only
    Ukrainian/Russian Cyrillic letters, Latin letters, digits, whitespace and
    the punctuation ``. , ! ? " ' ( ) -`` remain.

    Args:
        file_list: Iterable of paths to UTF-8 text files, read in order.
        batch_size: Maximum number of lines per yielded batch.

    Yields:
        list[str]: Up to ``batch_size`` cleaned lines; the final batch may be
        shorter. Lines that are blank before or after cleaning are skipped.
    """
    # Compiled once instead of going through the `re` cache on every line.
    disallowed = re.compile(r"[^а-яА-ЯіІїЇєЄґҐa-zA-Z0-9\s.,!?\"'()-]")

    def line_iterator():
        for file in file_list:
            with open(file, mode="r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    # Compose before filtering: in decomposed text "й"/"ї" are a
                    # base letter plus a combining mark, and the filter would
                    # drop the mark, silently turning them into "и"/"і".
                    line = unicodedata.normalize("NFC", line.strip())
                    if not line:
                        continue
                    cleaned = disallowed.sub("", line)
                    # A line made only of disallowed characters would otherwise
                    # be emitted as an empty training sample.
                    if cleaned.strip():
                        yield cleaned

    iterator = line_iterator()
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            break
        yield batch

In [5]:
# === Step 3: Initialize Tokenizer ===
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Seed characters that always receive a vocabulary entry, even if they are rare
# in the training corpus. Whitespace is intentionally not listed: the
# BertPreTokenizer below removes it, so a " " token could never be produced.
alphabet = list("абвгґдеєжзиіїйклмнопрстуфхцчшщьюяabcdefghijklmnopqrstuvwxyz0123456789.,!?\"'")

# Normalize + pre-tokenize.
# NFC (not NFD) keeps "й" and "ї" as single code points. NFD would split them
# into a base letter plus a combining mark, so the precomposed "й"/"ї" entries
# in `alphabet` would never match any normalized text.
tokenizer.normalizer = Sequence([NFC(), Lowercase()])
tokenizer.pre_tokenizer = BertPreTokenizer()

# === Step 4: Configure the BPE trainer (single characters as initial alphabet) ===
trainer = BpeTrainer(
    vocab_size=3000,                # target size, special tokens included
    show_progress=True,
    initial_alphabet=alphabet,
    special_tokens=special_tokens,
    max_token_length=9              # upper bound on the length of a learned token
)

In [6]:
# Training corpus: the ubertext_* text files from the Kaggle input directory.
train_text_list = [
    "/kaggle/input/ubettextfiles/ubertext_court.txt",
    "/kaggle/input/ubettextfiles/ubertext_fiction.txt",
    "/kaggle/input/ubettextfiles/ubertext_social.txt",
    "/kaggle/input/ubettextfiles/ubertext_wikipedia.txt",
]

# Feed the corpus to the trainer as batches of 512 lines so the files are
# streamed rather than loaded into memory at once.
tokenizer.train_from_iterator(
    batch_line_iterator(train_text_list, batch_size=512),
    trainer=trainer,
)






In [7]:
# Snapshot the trained tokenizer's vocabulary as returned by get_vocab().
vocab = tokenizer.get_vocab()

In [10]:
# Number of vocabulary entries; the trainer was configured with vocab_size=3000.
len(vocab)

3000

In [9]:
# === Step 5: Save tokenizer ===
OUTPUT_DIR = "morpheme_bpe_3k_tokenizer"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# model.save() below writes only vocab.json and merges.txt. Also serialize the
# complete tokenizer (normalizer, pre-tokenizer, special tokens) so it can be
# restored later with Tokenizer.from_file() without rebuilding the pipeline.
tokenizer.save(os.path.join(OUTPUT_DIR, "tokenizer.json"))

# Kept as the last expression so the cell still displays the written file paths.
tokenizer.model.save(OUTPUT_DIR)

['morpheme_bpe_3k_tokenizer/vocab.json',
 'morpheme_bpe_3k_tokenizer/merges.txt']

In [11]:
# === Optional: Test it ===
# Look the padding id up in the trained vocabulary instead of hard-coding 0, so
# it stays correct if the special-token list or its order ever changes.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")
output = tokenizer.encode("прочитати виробництво")
print("Tokens:", output.tokens)

Tokens: ['про', 'чита', 'ти', 'вироб', 'ництво']
