In [1]:
from transformers import BertTokenizer, BertForMaskedLM
from datasets import load_dataset, load_from_disk
import re
import torch

model_name = "DeepPavlov/rubert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
raw_datasets = load_dataset("json", data_files="data/processed/data.jsonl")

tokenizer.add_special_tokens({"additional_special_tokens": ["[TGT]", "[/TGT]"]})

2

In [3]:
def preprocess_batch(batch):
    enc = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    batch["input_ids"]      = enc["input_ids"]
    batch["attention_mask"] = enc["attention_mask"]

    labels = []
    for input_ids, target in zip(enc["input_ids"], batch["target"]):
        target_ids = tokenizer(target, add_special_tokens=False)["input_ids"]
        lbl = [-100] * len(input_ids)
        pos = input_ids.index(tokenizer.mask_token_id)
        for i, tid in enumerate(target_ids):
            if pos + i < len(lbl):
                lbl[pos + i] = tid
        labels.append(lbl)
    batch["labels"] = labels
    return batch

tokenized = raw_datasets.map(preprocess_batch, batched=True)

Map:   0%|          | 0/9110 [00:00<?, ? examples/s]

In [4]:
tokenized.save_to_disk("data/processed/tokenized_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/9110 [00:00<?, ? examples/s]

In [5]:
from datasets import load_from_disk

tokenized = load_from_disk("data/processed/tokenized_dataset")

In [6]:
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling

model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [7]:
import os

# Меняем директории кэширования
os.environ["HF_HOME"] = "D:/hf"
os.environ["HF_DATASETS_CACHE"] = "D:/hf/datasets"
os.environ["TRANSFORMERS_CACHE"] = "D:/hf/transformers"


In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./bert-artistic",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,
)

trainer.train()


Step,Training Loss


KeyboardInterrupt: 

In [10]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./bert-artistic",
    num_train_epochs=1,                        # Меньше эпох — быстрее обучение
    per_device_train_batch_size=16,            # Увеличить batch size (если хватает памяти)
    max_steps=500,                             # Ограничить количество шагов (обрежет эпохи)
    logging_steps=50,                          # Чаще логировать для анализа
    save_steps=250,                            # Реже сохранять (меньше I/O)
    save_total_limit=1,                        # Хранить только одну модель
    fp16=True,                                 # Включить 16-битный режим (если поддерживается)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss
50,0.4167
100,0.0368
150,0.0033
200,0.0012
250,0.0057
300,0.0005
350,0.0004
400,0.0004
450,0.0004
500,0.0003


TrainOutput(global_step=500, training_loss=0.04657469656690955, metrics={'train_runtime': 9474.0961, 'train_samples_per_second': 0.844, 'train_steps_per_second': 0.053, 'total_flos': 526956607488000.0, 'train_loss': 0.04657469656690955, 'epoch': 0.8771929824561403})