In [1]:
!pip install tokenizers==0.21.0



In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, Punctuation
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Sequence
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from itertools import islice
from tokenizers.pre_tokenizers import BertPreTokenizer
from multiprocessing import Pool, cpu_count
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from tokenizers.decoders import BPEDecoder
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
import os

import os
import re

import multiprocessing as mp

# Set TOKENIZERS_PARALLELISM=false for this process so the `tokenizers`
# library does not use its internal parallelism.
# NOTE(review): presumably here to avoid the fork-related warning when
# `datasets.map` / DataLoader workers are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## GPT-2 training

In [6]:
# Directory holding the BPE tokenizer files; `vocab.json` and `merges.txt`
# are read from it when the tokenizer is built.
tokenizer_dir = '/kaggle/input/morphemetokenizers/bpe_10k_full'

In [48]:
# Text file handed to `tokenize_dataset`, which loads it with the `text`
# dataset builder to produce the training examples.
train_data_path = '/kaggle/input/ubettextfiles/ubertext_social.txt'

In [49]:
def tokenize_dataset(tokenizer, text_file, block_size=128):
    """Load a plain-text corpus and turn every line into token ids.

    Args:
        tokenizer: Either a callable Hugging Face tokenizer (e.g.
            ``PreTrainedTokenizerFast``) or a raw ``tokenizers.Tokenizer``.
            A raw ``Tokenizer`` is not callable, so it is wrapped in a
            ``PreTrainedTokenizerFast`` before use.
        text_file: Path (or paths) forwarded to
            ``load_dataset('text', data_files=...)``.
        block_size: Maximum number of tokens kept per example; longer
            examples are truncated.

    Returns:
        The ``"train"`` split of the tokenized dataset, with the original
        ``"text"`` column removed.
    """
    if isinstance(tokenizer, Tokenizer):
        # `tokenizers.Tokenizer` exposes encode()/encode_batch() but not
        # __call__, so calling it directly raises TypeError. Restrict the
        # wrapper's outputs to what a GPT-2 style model needs (no
        # token_type_ids).
        tokenizer = PreTrainedTokenizerFast(
            tokenizer_object=tokenizer,
            model_input_names=["input_ids", "attention_mask"],
        )

    dataset = load_dataset('text', data_files=text_file)

    def tokenize(example):
        # `example["text"]` is a list of lines because map() runs batched.
        return tokenizer(example["text"], truncation=True, max_length=block_size)

    tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
    return tokenized["train"]

In [50]:
# Rebuild the BPE model from the vocabulary and merge files in `tokenizer_dir`
# and wrap it in a `Tokenizer`.
bpe_model = BPE.from_file(
    f"{tokenizer_dir}/vocab.json",
    f"{tokenizer_dir}/merges.txt",
)
tokenizer = Tokenizer(bpe_model)

# Input pipeline: BERT-style pre-tokenization, with NFD normalization followed
# by lowercasing applied to the text.
tokenizer.pre_tokenizer = BertPreTokenizer()
tokenizer.normalizer = Sequence([NFD(), Lowercase()])

In [None]:
# Tokenize the training corpus using the function's default block size.
tokenized_dataset = tokenize_dataset(
    tokenizer=tokenizer,
    text_file=train_data_path,
)

In [34]:
# Smoke test: encode one short Ukrainian sentence and show the resulting tokens.
sample_text = "Сьогодні знову обстріли"
output = tokenizer.encode(sample_text)
print(output.tokens)

['сьогодні', 'знову', 'обстріли']


In [39]:
# Hyper-parameters for a compact GPT-2: the vocabulary size is taken from the
# tokenizer, and the 128-position context matches the default `block_size`
# of `tokenize_dataset`.
# NOTE(review): bos_token_id / eos_token_id are not set (an earlier draft had
# them commented out, pointing at tokenizer.cls_token_id / sep_token_id), so
# GPT2Config's defaults apply — confirm they are valid for this vocabulary
# before generating text.
gpt2_settings = dict(
    vocab_size=tokenizer.get_vocab_size(),
    n_positions=128,
    n_ctx=128,
    n_embd=256,
    n_layer=4,
    n_head=4,
)
config = GPT2Config(**gpt2_settings)

# Freshly initialised (untrained) language model built from that config.
model = GPT2LMHeadModel(config=config)

In [None]:
# Where checkpoints, logs and the final model are written.
output_dir = "/kaggle/working/trained_1/"

# One checkpoint per epoch, keeping only the two most recent. No evaluation
# strategy is configured because no eval dataset is passed to the Trainer.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir=f"{output_dir}/logs",
    save_total_limit=2,
)

# The collator and the Trainer need a Hugging Face tokenizer (they call
# `.pad()`, read `pad_token`, and save the tokenizer with checkpoints). A raw
# `tokenizers.Tokenizer` offers none of that, so wrap it; an object that is
# already a Hugging Face tokenizer is used as is.
if isinstance(tokenizer, Tokenizer):
    hf_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        model_input_names=["input_ids", "attention_mask"],
    )
else:
    hf_tokenizer = tokenizer

# Batches of variable-length examples must be padded, which requires a pad
# token. If "[PAD]" is not already in the vocabulary it gets a new id, so the
# embedding matrix is grown to cover it.
if hf_tokenizer.pad_token is None:
    hf_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    if len(hf_tokenizer) > model.config.vocab_size:
        model.resize_token_embeddings(len(hf_tokenizer))

# mlm=False selects causal language modelling: labels are a copy of the
# inputs, with padded positions ignored by the loss.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer, mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=hf_tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run training with the settings configured on `trainer`, then write the final
# model to `output_dir`.
trainer.train()
trainer.save_model(output_dir=output_dir)