In [None]:
# Environment setup for the notebook (IPython shell escapes).
# Remove the existing torch / torchvision / torchaudio builds (-y skips the
# confirmation prompt) and install them again from the default index.
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio
# Upgrade transformers together with its "torch" extra, then upgrade accelerate.
# NOTE(review): the recorded output of this cell ends in a pip resolver
# traceback — confirm that every install below actually completed.
!pip install transformers[torch] -U
!pip install accelerate -U
# Libraries used by the training cell: data loading, splitting/metrics, plotting.
!pip install pandas
!pip install scikit-learn
!pip install matplotlib


Found existing installation: torch 2.3.0+cu121
Uninstalling torch-2.3.0+cu121:
  Successfully uninstalled torch-2.3.0+cu121
Found existing installation: torchvision 0.18.0+cu121
Uninstalling torchvision-0.18.0+cu121:
  Successfully uninstalled torchvision-0.18.0+cu121
Found existing installation: torchaudio 2.3.0+cu121
Uninstalling torchaudio-2.3.0+cu121:
  Successfully uninstalled torchaudio-2.3.0+cu121
Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision
  Downloading torchvision-0.18.1-cp310-cp310-manylinux1_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading torchaudio-2.3.1-cp310-cp310-manylinux1_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 92, in resolve
    result = self._result = resolver.resolve(
  File "/usr/local/lib/python3.10/dist-packages/pip/_

In [None]:
import os
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import json
import time

# Load the data: ask the user to upload the CSV files through the Colab widget.
from google.colab import files

# NOTE(review): the recorded cell output shows the uploads being stored as
# "texts (1).csv" / "tags (1).csv", while the reads below use the fixed names
# 'texts.csv' / 'tags.csv' — confirm the intended (fresh) files are being read.
uploaded = files.upload()

# Read the texts (no header row; the first column is used as the text).
import pandas as pd
texts_df = pd.read_csv('texts.csv', encoding='utf-8', header=None)

# Read the label rows (no header row; every column becomes one label slot).
tags_df = pd.read_csv('tags.csv', header=None)

# Drop rows that contain missing values in either file.
texts_df.dropna(inplace=True)
tags_df.dropna(inplace=True)

# Sanity-check that both files still describe the same samples.
texts_count = texts_df.shape[0]
tags_count = tags_df.shape[0]

if texts_count != tags_count:
    # Continuing with mismatched counts can only fail later (or train on
    # garbage), so stop here with the explicit message.
    raise ValueError(
        f"Ошибка: количество текстов ({texts_count}) не совпадает с количеством меток ({tags_count})."
    )
if not texts_df.index.equals(tags_df.index):
    # Equal counts are not enough: dropna() ran on each frame independently,
    # so different rows may have been removed, which would silently pair
    # texts with the wrong labels.
    raise ValueError(
        "Ошибка: после удаления пустых строк тексты и метки относятся к разным строкам исходных файлов."
    )
print(f"Количество текстов и меток совпадает: {texts_count} строк.")

# Convert texts (first column) and labels (all columns) to numpy arrays.
texts = np.array(texts_df.iloc[:, 0].tolist())
labels = tags_df.to_numpy()

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Dataset wrapper that tokenizes one text at a time.
class TextDataset(Dataset):
    """Map-style dataset pairing raw texts with their label vectors.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this file).
        max_len: length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors."""
        raw_text = self.texts[idx]
        raw_label = self.labels[idx]

        # Every sample is padded/truncated to max_len so batches stack cleanly.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # flatten() removes the leading batch axis added by return_tensors='pt'.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        # Labels are stored as float tensors.
        # NOTE(review): float label vectors presumably make the model use a
        # multi-label (BCE) loss, while the metrics in this file use argmax —
        # confirm which setup is intended.
        sample['labels'] = torch.tensor(raw_label, dtype=torch.float)
        return sample

# Load the Russian-language tokenizer and model; the classification head gets
# one output per label column.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=labels.shape[1])

# Move the model to the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Build the train / validation datasets (batching is left to the Trainer).
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = TextDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

# NOTE(review): warmup_steps=500 looks larger than the total number of
# optimizer steps for a dataset of a few hundred rows (3 epochs, batch size 8),
# so the learning rate would never finish warming up — consider a smaller
# value or warmup_ratio; confirm against the actual step count.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # `evaluation_strategy` is the deprecated spelling of this argument; the
    # setup cell upgrades transformers to the latest release, where the
    # current name `eval_strategy` is the one to use.
    eval_strategy="epoch",
    save_strategy="epoch"
)

def compute_metrics(p):
    """Compute evaluation metrics from a ``(predictions, labels)`` pair.

    Both arrays are reduced to class indices with argmax along axis 1 before
    scoring, so the metrics describe single-label (top-1) classification.

    Args:
        p: pair ``(predictions, labels)``; each is a 2-D array with one row
            per sample and one column per class.

    Returns:
        dict with ``accuracy``, weighted ``f1`` and macro-averaged
        ``precision`` / ``recall``. The last two are reported because the
        plotting code in this file looks for ``eval_precision`` and
        ``eval_recall`` entries in the training logs.
    """
    # Local import: the file-level import list does not bring these two in.
    from sklearn.metrics import precision_score, recall_score

    scores, one_hot = p
    predicted = np.argmax(scores, axis=1)
    expected = one_hot.argmax(axis=1)
    return {
        "accuracy": accuracy_score(expected, predicted),
        "f1": f1_score(expected, predicted, average='weighted'),
        # zero_division=0 keeps never-predicted classes from emitting warnings.
        "precision": precision_score(expected, predicted, average='macro', zero_division=0),
        "recall": recall_score(expected, predicted, average='macro', zero_division=0),
    }

# Wire the model, training arguments, datasets and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning, taking wall-clock timestamps right before and after it.
start_time = time.time()
trainer.train()
end_time = time.time()

# Report how long training took, in seconds.
training_time = end_time - start_time
print(f"Время обучения модели: {training_time:.2f} секунд")

# Evaluate on the validation dataset and show the resulting metrics.
eval_results = trainer.evaluate()
print(f"Результаты оценки модели: {eval_results}")

# Persist the fine-tuned model and its tokenizer side by side.
model_dir = 'models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

export_path = os.path.join(model_dir, 'bert_standard')
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_path)

# Print the hyperparameters used for this run.
print(f"Гиперпараметры: {training_args}")

# Predict on the held-out data and reduce scores / one-hot rows to class indices.
predictions = trainer.predict(val_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = test_labels.argmax(axis=1)

# Number of classes = number of label columns.
num_classes = labels.shape[1]
print(f"Number of classes: {num_classes}")

# Generate display names for every class.
target_names = [f"Class {i}" for i in range(num_classes)]

# Classes that actually occur in the held-out split.
unique_labels_test = np.unique(true_labels)

# Compute the metrics. `labels` is passed explicitly so that `target_names`
# always lines up with the class list; without it the report raises when the
# held-out split or the predictions do not cover every class.
report = classification_report(
    true_labels,
    predicted_labels,
    labels=list(range(num_classes)),
    target_names=target_names,
    output_dict=True,
    zero_division=0,
)
print(f"Макро-усредненная точность: {report['macro avg']['precision']}")
print(f"Макро-усредненная полнота: {report['macro avg']['recall']}")
print(f"Макро-усредненная F1-мера: {report['macro avg']['f1-score']}")

# Confusion matrix, restricted to the classes present in the held-out split.
cm = confusion_matrix(true_labels, predicted_labels, labels=unique_labels_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[f"Class {i}" for i in unique_labels_test])
disp.plot()
plt.title('Матрица ошибок')
plt.show()

# Plot the per-evaluation metrics recorded during training.
#
# The log is taken from the trainer's in-memory state instead of parsing
# './results/trainer_state.json': that file is a single JSON document (not one
# JSON object per line), and nothing in this script writes it to that path.
training_logs = list(trainer.state.log_history)


def _metric_history(log_entries, metric_key):
    """Return ``(epochs, values)`` for the log entries that contain ``metric_key``."""
    matching = [entry for entry in log_entries if metric_key in entry and 'epoch' in entry]
    return [entry['epoch'] for entry in matching], [entry[metric_key] for entry in matching]


# Each metric carries its own x-axis so that a metric missing from the log
# simply yields an empty panel instead of a length mismatch.
precision_epochs, precision = _metric_history(training_logs, 'eval_precision')
recall_epochs, recall = _metric_history(training_logs, 'eval_recall')
f1_epochs, f1 = _metric_history(training_logs, 'eval_f1')

metric_panels = [
    (precision_epochs, precision, 'Macro-averaged Precision'),
    (recall_epochs, recall, 'Macro-averaged Recall'),
    # The f1 value logged by compute_metrics uses average='weighted', so the
    # panel is titled accordingly.
    (f1_epochs, f1, 'Weighted F1-score'),
]

plt.figure(figsize=(15, 5))
for panel_index, (panel_epochs, panel_values, panel_title) in enumerate(metric_panels, start=1):
    plt.subplot(1, 3, panel_index)
    plt.plot(panel_epochs, panel_values, label=panel_title)
    plt.title(panel_title)
    plt.legend()

plt.show()


Saving tags.csv to tags (1).csv
Saving texts.csv to texts (1).csv
Количество текстов и меток совпадает: 535 строк.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
