In [1]:
!pip install transformers datasets torch seqeval accelerate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Co

In [2]:
# 1. Установка библиотек (если не установлены)
# !pip install transformers datasets torch seqeval accelerate

# 2. Импорт библиотек
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    pipeline
)
import numpy as np
from seqeval.metrics import classification_report
import torch

In [6]:
from datasets import Dataset
import os
import re

def load_brat_data(dir_path):
    data = {"tokens": [], "labels": []}

    for filename in os.listdir(dir_path):
        if not filename.endswith(".txt"):
            continue

        # Чтение текста
        txt_path = os.path.join(dir_path, filename)
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Токенизация по пробелам и пунктуации (адаптируйте под ваш случай)
        tokens = re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
        labels = ["O"] * len(tokens)

        # Чтение аннотаций
        ann_path = txt_path.replace(".txt", ".ann")
        if not os.path.exists(ann_path):
            continue

        with open(ann_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.startswith("T"):  # Entity
                    parts = line.strip().split("\t")
                    if len(parts) < 3:
                        continue

                    tag_info = parts[1]
                    entity_text = parts[2]

                    # Парсим информацию о позициях
                    tag_parts = tag_info.split()
                    try:
                        tag = tag_parts[0]
                        start = int(tag_parts[-2])
                        end = int(tag_parts[-1])
                    except (ValueError, IndexError):
                        continue

                    # Назначаем метки токенам
                    token_pos = 0
                    current_pos = 0
                    for i, token in enumerate(tokens):
                        token_len = len(token)
                        if current_pos >= start and current_pos + token_len <= end:
                            labels[i] = f"B-{tag}" if token_pos == 0 else f"I-{tag}"
                            token_pos += 1
                        current_pos += token_len + 1  # +1 для пробела/разделителя

        data["tokens"].append(tokens)
        data["labels"].append(labels)

    return Dataset.from_dict(data)

In [7]:
# Подключение выборки данных файловым способом
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
dataset = load_brat_data("./drive/MyDrive/NEREL-v1.1/train")

In [12]:
dataset

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 746
})

In [16]:
# 3. Загрузка и объединение датасетов
def load_and_preprocess_datasets():
    # Загрузка NEREL

    return DatasetDict({
        "train":load_brat_data("./drive/MyDrive/NEREL-v1.1/train"),
        "validation": load_brat_data("./drive/MyDrive/NEREL-v1.1/dev"),
        "test": load_brat_data("./drive/MyDrive/NEREL-v1.1/test")
    })

dataset = load_and_preprocess_datasets()

In [17]:
# 4. Создание словаря меток
label_list = sorted({tag for example in dataset["train"] for tag in example["labels"]})
label2id = {tag: i for i, tag in enumerate(label_list)}
id2label = {i: tag for i, tag in enumerate(label_list)}

In [20]:
# 5. Загрузка модели и токенизатора
from transformers import AutoTokenizer, AutoModelForTokenClassification
import os

# Проверка доступности модели
model_name = "DeepPavlov/rubert-base-cased"
try:
    # Загрузка токенизатора с явным указанием локального кэша
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=os.path.expanduser("~/.cache/huggingface/hub")
    )

    # Загрузка модели
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        cache_dir=os.path.expanduser("~/.cache/huggingface/hub")
    )

    print("Модель и токенизатор успешно загружены!")

except Exception as e:
    print(f"Ошибка при загрузке модели: {str(e)}")
    print("Попробуйте следующие решения:")
    print("1. Проверьте подключение к интернету")
    print("2. Попробуйте другую модель, например 'DeepPavlov/rubert-tiny'")
    print("3. Убедитесь, что у вас есть права на запись в кэш-директорию")

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Модель и токенизатор успешно загружены!


In [21]:
# 6. Токенизация данных с выравниванием меток
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=256,
        padding="max_length",
    )

    labels = []
    for i, doc_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        doc_labels = [label2id[label] for label in doc_labels]

        label_ids = []
        current_label = -100
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label = doc_labels[word_idx]
                if word_idx != 0 and word_ids[word_idx] == word_ids[word_idx-1]:
                    label_ids.append(-100)  # Только первый подтокен получает метку
                else:
                    label_ids.append(label)
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/746 [00:00<?, ? examples/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Map:   0%|          | 0/93 [00:00<?, ? examples/s]

In [22]:
# 7. Обучение модели
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./ner_results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",
)

def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2label[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return classification_report(true_labels, true_predictions, output_dict=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Age,Award,City,Country,Crime,Date,Disease,District,Event,Facility,Family,Ideology,Language,Law,Location,Money,Nationality,Number,Ordinal,Organization,Penalty,Percent,Person,Product,Profession,Religion,State Or Province,Time,Work Of Art,Micro avg,Macro avg,Weighted avg
1,No log,1.259722,"{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 97}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 23}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 98}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 101}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 49}","{'precision': 0.23595505617977527, 'recall': 0.12804878048780488, 'f1-score': 0.1660079051383399, 'support': 328}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 80}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 426}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 61}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 45}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 23}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 18}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 53}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}","{'precision': 0.07575757575757576, 'recall': 0.01718213058419244, 'f1-score': 0.028011204481792718, 'support': 291}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 34}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.23503325942350334, 'recall': 0.17462932454695224, 'f1-score': 0.20037807183364842, 'support': 607}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12}","{'precision': 0.0916030534351145, 'recall': 0.028985507246376812, 'f1-score': 0.044036697247706424, 'support': 414}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 38}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 26}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 70}","{'precision': 0.19320843091334894, 'recall': 0.055220883534136546, 'f1-score': 0.08589276418532016, 'support': 2988}","{'precision': 0.022012032579171344, 'recall': 0.01202916354708022, 'f1-score': 0.01511840961039612, 'support': 2988}","{'precision': 0.09371739142055714, 'recall': 0.055220883534136546, 'f1-score': 0.0677585460686587, 'support': 2988}"
2,No log,1.020934,"{'precision': 0.4772727272727273, 'recall': 0.21649484536082475, 'f1-score': 0.2978723404255319, 'support': 97}","{'precision': 0.09090909090909091, 'recall': 0.043478260869565216, 'f1-score': 0.0588235294117647, 'support': 23}","{'precision': 0.21428571428571427, 'recall': 0.061224489795918366, 'f1-score': 0.09523809523809523, 'support': 98}","{'precision': 1.0, 'recall': 0.0297029702970297, 'f1-score': 0.05769230769230769, 'support': 101}","{'precision': 0.05263157894736842, 'recall': 0.02040816326530612, 'f1-score': 0.02941176470588235, 'support': 49}","{'precision': 0.21982758620689655, 'recall': 0.15548780487804878, 'f1-score': 0.18214285714285713, 'support': 328}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 80}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}","{'precision': 0.22033898305084745, 'recall': 0.03051643192488263, 'f1-score': 0.0536082474226804, 'support': 426}","{'precision': 0.2222222222222222, 'recall': 0.03278688524590164, 'f1-score': 0.05714285714285715, 'support': 61}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.05714285714285714, 'recall': 0.044444444444444446, 'f1-score': 0.05, 'support': 45}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 23}","{'precision': 0.15, 'recall': 0.16666666666666666, 'f1-score': 0.15789473684210525, 'support': 18}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 53}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}","{'precision': 0.1834319526627219, 'recall': 0.10652920962199312, 'f1-score': 0.13478260869565217, 'support': 291}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 34}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.3081180811808118, 'recall': 0.2751235584843493, 'f1-score': 0.29068755439512617, 'support': 607}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12}","{'precision': 0.24596774193548387, 'recall': 0.1473429951690821, 'f1-score': 0.18429003021148035, 'support': 414}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 38}","{'precision': 0.3333333333333333, 'recall': 0.038461538461538464, 'f1-score': 0.06896551724137931, 'support': 26}","{'precision': 0.02564102564102564, 'recall': 0.014285714285714285, 'f1-score': 0.01834862385321101, 'support': 70}","{'precision': 0.2484641638225256, 'recall': 0.12182061579651941, 'f1-score': 0.1634852908151808, 'support': 2988}","{'precision': 0.13107320326865865, 'recall': 0.04768806823349191, 'f1-score': 0.05989314035934244, 'support': 2988}","{'precision': 0.23777065171075148, 'recall': 0.12182061579651941, 'f1-score': 0.14492919860984368, 'support': 2988}"
3,No log,0.924762,"{'precision': 0.5957446808510638, 'recall': 0.28865979381443296, 'f1-score': 0.38888888888888884, 'support': 97}","{'precision': 0.2, 'recall': 0.13043478260869565, 'f1-score': 0.15789473684210528, 'support': 23}","{'precision': 0.3333333333333333, 'recall': 0.030612244897959183, 'f1-score': 0.056074766355140186, 'support': 98}","{'precision': 0.6, 'recall': 0.1485148514851485, 'f1-score': 0.23809523809523808, 'support': 101}","{'precision': 0.11428571428571428, 'recall': 0.08163265306122448, 'f1-score': 0.09523809523809522, 'support': 49}","{'precision': 0.30687830687830686, 'recall': 0.17682926829268292, 'f1-score': 0.22437137330754353, 'support': 328}","{'precision': 0.125, 'recall': 0.0125, 'f1-score': 0.022727272727272728, 'support': 80}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}","{'precision': 0.24770642201834864, 'recall': 0.06338028169014084, 'f1-score': 0.10093457943925235, 'support': 426}","{'precision': 0.09090909090909091, 'recall': 0.01639344262295082, 'f1-score': 0.02777777777777778, 'support': 61}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 45}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 23}","{'precision': 0.14285714285714285, 'recall': 0.16666666666666666, 'f1-score': 0.15384615384615383, 'support': 18}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 53}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}","{'precision': 0.2087912087912088, 'recall': 0.13058419243986255, 'f1-score': 0.16067653276955604, 'support': 291}","{'precision': 0.25, 'recall': 0.08823529411764706, 'f1-score': 0.13043478260869565, 'support': 34}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.4024390243902439, 'recall': 0.27182866556836904, 'f1-score': 0.32448377581120946, 'support': 607}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12}","{'precision': 0.31601731601731603, 'recall': 0.17632850241545894, 'f1-score': 0.22635658914728685, 'support': 414}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 38}","{'precision': 0.16666666666666666, 'recall': 0.038461538461538464, 'f1-score': 0.0625, 'support': 26}","{'precision': 0.05405405405405406, 'recall': 0.02857142857142857, 'f1-score': 0.037383177570093455, 'support': 70}","{'precision': 0.30954115076474875, 'recall': 0.14223560910307897, 'f1-score': 0.19490942444393486, 'support': 2988}","{'precision': 0.14326492969146518, 'recall': 0.0637804691970416, 'f1-score': 0.08302357725601067, 'support': 2988}","{'precision': 0.2804674764971251, 'recall': 0.14223560910307897, 'f1-score': 0.18224363059017964, 'support': 2988}"
4,No log,0.898847,"{'precision': 0.5076923076923077, 'recall': 0.3402061855670103, 'f1-score': 0.4074074074074074, 'support': 97}","{'precision': 0.2, 'recall': 0.13043478260869565, 'f1-score': 0.15789473684210528, 'support': 23}","{'precision': 0.41935483870967744, 'recall': 0.1326530612244898, 'f1-score': 0.20155038759689925, 'support': 98}","{'precision': 0.5, 'recall': 0.1782178217821782, 'f1-score': 0.26277372262773724, 'support': 101}","{'precision': 0.05, 'recall': 0.04081632653061224, 'f1-score': 0.04494382022471909, 'support': 49}","{'precision': 0.2785388127853881, 'recall': 0.18597560975609756, 'f1-score': 0.22303473491773307, 'support': 328}","{'precision': 0.3333333333333333, 'recall': 0.0375, 'f1-score': 0.06741573033707865, 'support': 80}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}","{'precision': 0.2640449438202247, 'recall': 0.11032863849765258, 'f1-score': 0.15562913907284767, 'support': 426}","{'precision': 0.24242424242424243, 'recall': 0.13114754098360656, 'f1-score': 0.17021276595744678, 'support': 61}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 45}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 23}","{'precision': 0.15, 'recall': 0.16666666666666666, 'f1-score': 0.15789473684210525, 'support': 18}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 53}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}","{'precision': 0.2318840579710145, 'recall': 0.16494845360824742, 'f1-score': 0.19277108433734938, 'support': 291}","{'precision': 0.1875, 'recall': 0.08823529411764706, 'f1-score': 0.12, 'support': 34}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.4012345679012346, 'recall': 0.3212520593080725, 'f1-score': 0.3568161024702653, 'support': 607}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12}","{'precision': 0.29559748427672955, 'recall': 0.22705314009661837, 'f1-score': 0.2568306010928961, 'support': 414}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 38}","{'precision': 0.125, 'recall': 0.038461538461538464, 'f1-score': 0.058823529411764705, 'support': 26}","{'precision': 0.075, 'recall': 0.04285714285714286, 'f1-score': 0.05454545454545454, 'support': 70}","{'precision': 0.3051911009697661, 'recall': 0.17904953145917002, 'f1-score': 0.22569078253533012, 'support': 2988}","{'precision': 0.14695188237635007, 'recall': 0.08057773317469918, 'f1-score': 0.09960496392013136, 'support': 2988}","{'precision': 0.28252053287531836, 'recall': 0.17904953145917002, 'f1-score': 0.2135727575352544, 'support': 2988}"
5,No log,0.885927,"{'precision': 0.515625, 'recall': 0.3402061855670103, 'f1-score': 0.4099378881987578, 'support': 97}","{'precision': 0.1875, 'recall': 0.13043478260869565, 'f1-score': 0.15384615384615383, 'support': 23}","{'precision': 0.3870967741935484, 'recall': 0.12244897959183673, 'f1-score': 0.18604651162790695, 'support': 98}","{'precision': 0.5116279069767442, 'recall': 0.21782178217821782, 'f1-score': 0.3055555555555556, 'support': 101}","{'precision': 0.05714285714285714, 'recall': 0.04081632653061224, 'f1-score': 0.04761904761904761, 'support': 49}","{'precision': 0.276, 'recall': 0.21036585365853658, 'f1-score': 0.2387543252595156, 'support': 328}","{'precision': 0.10526315789473684, 'recall': 0.025, 'f1-score': 0.04040404040404041, 'support': 80}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}","{'precision': 0.26066350710900477, 'recall': 0.12910798122065728, 'f1-score': 0.1726844583987441, 'support': 426}","{'precision': 0.22857142857142856, 'recall': 0.13114754098360656, 'f1-score': 0.16666666666666666, 'support': 61}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.034482758620689655, 'recall': 0.022222222222222223, 'f1-score': 0.027027027027027025, 'support': 45}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 23}","{'precision': 0.16666666666666666, 'recall': 0.16666666666666666, 'f1-score': 0.16666666666666666, 'support': 18}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 53}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}","{'precision': 0.22018348623853212, 'recall': 0.16494845360824742, 'f1-score': 0.18860510805500982, 'support': 291}","{'precision': 0.2, 'recall': 0.08823529411764706, 'f1-score': 0.12244897959183675, 'support': 34}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.40606060606060607, 'recall': 0.3311367380560132, 'f1-score': 0.3647912885662432, 'support': 607}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12}","{'precision': 0.3115264797507788, 'recall': 0.24154589371980675, 'f1-score': 0.27210884353741494, 'support': 414}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 38}","{'precision': 0.125, 'recall': 0.038461538461538464, 'f1-score': 0.058823529411764705, 'support': 26}","{'precision': 0.0625, 'recall': 0.04285714285714286, 'f1-score': 0.05084745762711865, 'support': 70}","{'precision': 0.30413756045137025, 'recall': 0.1894243641231593, 'f1-score': 0.23345019591668387, 'support': 2988}","{'precision': 0.13985898721467563, 'recall': 0.08425597869132613, 'f1-score': 0.1025115016572231, 'support': 2988}","{'precision': 0.2775011581015547, 'recall': 0.1894243641231593, 'f1-score': 0.2216991640859593, 'support': 2988}"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=470, training_loss=1.0249787026263297, metrics={'train_runtime': 12821.396, 'train_samples_per_second': 0.291, 'train_steps_per_second': 0.037, 'total_flos': 487569582842880.0, 'train_loss': 1.0249787026263297, 'epoch': 5.0})

In [31]:
# 8. Сохранение модели
model.save_pretrained("./drive/MyDrive/NEREL-model-test/5epoch")
tokenizer.save_pretrained("./drive/MyDrive/NEREL-model-test/5epoch")

('./drive/MyDrive/NEREL-model-test/5epoch/tokenizer_config.json',
 './drive/MyDrive/NEREL-model-test/5epoch/special_tokens_map.json',
 './drive/MyDrive/NEREL-model-test/5epoch/vocab.txt',
 './drive/MyDrive/NEREL-model-test/5epoch/added_tokens.json',
 './drive/MyDrive/NEREL-model-test/5epoch/tokenizer.json')

In [30]:
import subprocess
subprocess.run(['cp', '-r', 'ner_results', './drive/MyDrive/NEREL-model-test/5epoch'])

CompletedProcess(args=['cp', '-r', 'ner_results', './drive/MyDrive/NEREL-model-test/5epoch'], returncode=1)

In [25]:
# 9. Тестирование
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="average",
    device=0 if torch.cuda.is_available() else -1,
)

text = "Правительство России задумало вернуть налог на движимое имущество, который отменили с 1 января 2019 года, еще при предыдущем составе кабмина. Об этом заявил замминистра финансов Алексей Лавров, пишут «Ведомости». Спикер Совета Федерации Валентина Матвиенко напомнила, что ранее предупреждали о негативных последствиях налогового послабления: «Освободить от налога стоило только новое оборудование, приобретаемое компаниями, но нас не послушали, все освободили». В 2019 году Минфин оценил выпадающие доходы в 183 миллиарда рублей. Компенсировать их предполагалось за счет роста цен на крепкий алкоголь, но добиться этого не удалось."
results = nlp(text)
for entity in results:
    print(f"{entity['word']} -> {entity['entity_group']} (confidence: {entity['score']:.2f})")

Device set to use cpu


Правительство -> ORGANIZATION (confidence: 0.59)
России -> ORGANIZATION (confidence: 0.52)
на движимое -> LAW (confidence: 0.54)
с 1 января 2019 -> DATE (confidence: 0.77)
замминистра -> PROFESSION (confidence: 0.47)
Алексей -> PERSON (confidence: 0.80)
« -> ORGANIZATION (confidence: 0.51)
» -> PROFESSION (confidence: 0.63)
. Спикер Совета -> PROFESSION (confidence: 0.56)
Валентина -> PERSON (confidence: 0.73)
». В -> DATE (confidence: 0.54)
году -> ORGANIZATION (confidence: 0.37)
183 -> MONEY (confidence: 0.31)
