In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import os

# –û—Ç–∫–ª—é—á–∞–µ–º W&B
os.environ["WANDB_DISABLED"] = "true"

# –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ –∏ –º–æ–¥–µ–ª–∏
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", num_labels=2)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        
        tokenized_inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            is_split_into_words=True
        )
        
        # –§–æ—Ä–º–∞—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –º–µ—Ç–æ–∫
        label_ids = [-100] * len(tokenized_inputs["input_ids"][0])
        for i, label in enumerate(labels[:self.max_length]):
            label_ids[i] = label

        tokenized_inputs["labels"] = torch.tensor(label_ids)
        return {key: val.squeeze() for key, val in tokenized_inputs.items()}

# –ü—Ä–∏–º–µ—Ä –¥–∞–Ω–Ω—ã—Ö
texts = [
    ["–≠—Ç–æ", "–ø–µ—Ä–≤—ã–π", "—Ç–µ–∫—Å—Ç", ".", "–°–ª–µ–¥—É—é—â–∏–π", "—Ç–µ–∫—Å—Ç", "–Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è", "—Å", "—ç—Ç–æ–≥–æ", "—Å–ª–æ–≤–∞", "."],
    ["–ù–∞—á–∞–ª–æ", "–Ω–æ–≤–æ–≥–æ", "—Ç–µ–∫—Å—Ç–∞", ".", "–ò", "–µ—â—ë", "–æ–¥–∏–Ω", "–ø—Ä–∏–º–µ—Ä", "."]
]
labels = [
    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 0, 0, 0]
]

# –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã–µ –∏ –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω—ã–µ
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir="./logs",
    logging_strategy="steps",  # –õ–æ–≥–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –≤—ã–ø–æ–ª–Ω—è–µ—Ç—Å—è —á–µ—Ä–µ–∑ –∫–∞–∂–¥—ã–π logging_steps —à–∞–≥
    evaluation_strategy="epoch",  # –û—Ü–µ–Ω–∫–∞ –≤ –∫–æ–Ω—Ü–µ –∫–∞–∂–¥–æ–π —ç–ø–æ—Ö–∏
    save_steps=100,
    logging_steps=10,
    logging_first_step=True, 
)


# –§—É–Ω–∫—Ü–∏—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è –º–µ—Ç—Ä–∏–∫
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    preds = np.argmax(pred.predictions, axis=2).flatten()

    # –û—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ –≤–∞–ª–∏–¥–Ω—ã–µ —Ç–æ–∫–µ–Ω—ã (–±–µ–∑ -100)
    valid_indices = labels != -100
    labels = labels[valid_indices]
    preds = preds[valid_indices]

    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ Trainer –¥–ª—è –æ–±—É—á–µ–Ω–∏—è —Å –≤–∞–ª–∏–¥–∞—Ü–∏–µ–π
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# –ó–∞–ø—É—Å–∫ –æ–±—É—á–µ–Ω–∏—è
trainer.train()


Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at sberbank-ai/rugpt3small_based_on_gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7274,2.648852,0.75,0.0,0.0,0.0
2,0.7274,1.430057,0.75,0.0,0.0,0.0
3,0.7274,0.98253,0.75,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3, training_loss=0.7238843639691671, metrics={'train_runtime': 17.2573, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.174, 'total_flos': 783890270208.0, 'train_loss': 0.7238843639691671, 'epoch': 3.0})