In [1]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import numpy as np
import pandas as pd
from fast_bert.utils import MultiLabelDataset, train_epoch_v2, eval_model_v2
from model import MultiTaskRuBERT
import joblib

# Load the training and validation spreadsheets.
# Disabled alternative source:
# df = pd.read_excel('02_Реальные_кейсы.xlsx', sheet_name='Реальные кейсы 800 с ответами')

train_files = ['train_data_text.xlsx', 'paraphrased_train_all_data_2.xlsx']
train_dfs = list(map(pd.read_excel, train_files))
df_train = pd.concat(train_dfs, ignore_index=True)
df_val = pd.read_excel('val_data_text.xlsx')

# Keep only the question text and both classifier columns; drop rows with a NaN in any of them.
used_columns = ['Вопрос пользователя', 'Классификатор 1 уровня', 'Классификатор 2 уровня']
df_train_filtered = df_train[used_columns].dropna()
df_val_filtered = df_val[used_columns].dropna()

# Load the previously fitted LabelEncoder objects.
le_lvl1 = joblib.load('le_lvl1.pkl')
le_lvl2 = joblib.load('le_lvl2.pkl')

# Convert the textual class names into integer labels.
df_train_filtered = df_train_filtered.assign(
    label_lvl1=le_lvl1.transform(df_train_filtered['Классификатор 1 уровня']),
    label_lvl2=le_lvl2.transform(df_train_filtered['Классификатор 2 уровня']),
)
df_val_filtered = df_val_filtered.assign(
    label_lvl1=le_lvl1.transform(df_val_filtered['Классификатор 1 уровня']),
    label_lvl2=le_lvl2.transform(df_val_filtered['Классификатор 2 уровня']),
)

# Split each frame into the text array and the two label arrays.
array_columns = ('Вопрос пользователя', 'label_lvl1', 'label_lvl2')
train_texts, train_labels_lvl1, train_labels_lvl2 = (
    df_train_filtered[column].values for column in array_columns
)
val_texts, val_labels_lvl1, val_labels_lvl2 = (
    df_val_filtered[column].values for column in array_columns
)

# The number of classes per level is taken from the fitted encoders.
n_classes_lvl1, n_classes_lvl2 = (len(encoder.classes_) for encoder in (le_lvl1, le_lvl2))

# Print the class counts as a quick check.
print("Количество классов 1 уровня:", n_classes_lvl1)
print("Количество классов 2 уровня:", n_classes_lvl2)
# Disabled debug output:
# print("Тренировочные тексты:", train_texts[:5])
# print("Валидационные тексты:", val_texts[:5])


Количество классов 1 уровня: 13
Количество классов 2 уровня: 53


In [10]:
# Model, tokenizer and device setup.
model_name = "ai-forever/sbert_large_nlu_ru"
max_len = 128

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = MultiTaskRuBERT(model_name, n_classes_lvl1, n_classes_lvl2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Disabled: resume from a previously saved checkpoint.
# model.load_state_dict(torch.load("best_model.pth"))
model.to(device)

# Wrap the text/label arrays in datasets, then in DataLoaders.
train_dataset = MultiLabelDataset(train_texts, train_labels_lvl1, train_labels_lvl2, tokenizer, max_len)
val_dataset = MultiLabelDataset(val_texts, val_labels_lvl1, val_labels_lvl2, tokenizer, max_len)

# Only the training loader shuffles; the validation loader keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

In [None]:
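# 84%, 80%  -- NOTE(review): presumably the lvl1 / lvl2 validation accuracy of an earlier run; confirm and label.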

In [None]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss built on softmax cross-entropy.

    The per-sample loss is ``alpha * (1 - p_t) ** gamma * CE``, where ``CE`` is
    the sample's cross-entropy and ``p_t = exp(-CE)`` is the probability the
    model assigns to the target class.

    Args:
        alpha: Constant factor that scales every sample's loss.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``.

    Raises:
        ValueError: If ``reduction`` is not one of the supported values.
    """

    _REDUCTIONS = ('mean', 'sum', 'none')

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        # Fail fast on a typo: previously any unknown string silently behaved
        # like 'none' and returned an unreduced tensor.
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: Unnormalised logits, in any form accepted by ``F.cross_entropy``.
            targets: Targets, in any form accepted by ``F.cross_entropy``.

        Returns:
            A scalar tensor for ``'mean'`` / ``'sum'``; the per-sample losses
            for ``'none'``.
        """
        # This is multi-class cross-entropy (not binary), kept per sample.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


# Optimiser and learning-rate schedule.
# NOTE(review): transformers' AdamW is deprecated upstream in favour of torch.optim.AdamW;
# it is kept here because switching would change the optimiser's behaviour.
EPOCHS = 100
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)
# One scheduler step per batch over the whole run, with no warm-up phase.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Disabled alternative loss:
# loss_fn = nn.CrossEntropyLoss(label_smoothing=0.4)
loss_fn = FocalLoss(alpha=0.25, gamma=4)
best_accuracy_lvl1 = best_accuracy_lvl2 = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One pass over the training data.
    train_acc_lvl1, train_acc_lvl2, train_loss = train_epoch_v2(
        model, train_loader, loss_fn, optimizer, device, scheduler, len(train_dataset)
    )
    print(f'Train loss {train_loss} lvl1 accuracy {train_acc_lvl1} lvl2 accuracy {train_acc_lvl2}')

    # Evaluate on the validation data.
    val_acc_lvl1, val_acc_lvl2, val_loss = eval_model_v2(
        model, val_loader, loss_fn, device, len(val_dataset)
    )
    print(f'Validation loss {val_loss} lvl1 accuracy {val_acc_lvl1} lvl2 accuracy {val_acc_lvl2}')

    # Checkpoint only when BOTH accuracies strictly beat their best values
    # and the lvl2 accuracy is above 0.8.
    improved_both = val_acc_lvl1 > best_accuracy_lvl1 and val_acc_lvl2 > best_accuracy_lvl2
    if not (improved_both and val_acc_lvl2 > 0.8):
        continue

    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy_lvl1, best_accuracy_lvl2 = val_acc_lvl1, val_acc_lvl2

In [None]:
# Display the validation text at index 1.
val_texts[1]

In [None]:
def predict(text, model, tokenizer, le_lvl1, le_lvl2, max_len=64):
    """Predict the level-1 and level-2 class names for ``text``.

    Args:
        text: Input text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning a pair ``(logits_lvl1, logits_lvl2)``.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        le_lvl1: Encoder whose ``inverse_transform`` maps level-1 indices to names.
        le_lvl2: Encoder whose ``inverse_transform`` maps level-2 indices to names.
        max_len: Length the input is padded / truncated to.
            NOTE(review): the default (64) differs from the ``max_len = 128``
            used to build the datasets in this notebook; pass it explicitly.

    Returns:
        Tuple ``(predicted_class_lvl1, predicted_class_lvl2)`` as returned by
        the encoders' ``inverse_transform``.
    """
    # Use the device the model lives on instead of relying on a notebook global.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(model_device)
    attention_mask = encoding['attention_mask'].to(model_device)

    # Inference must run in eval mode (dropout off); restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs_lvl1, outputs_lvl2 = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            preds_lvl1 = outputs_lvl1.argmax(dim=1)
            preds_lvl2 = outputs_lvl2.argmax(dim=1)
    finally:
        model.train(was_training)

    predicted_class_lvl1 = le_lvl1.inverse_transform(preds_lvl1.cpu().numpy())
    predicted_class_lvl2 = le_lvl2.inverse_transform(preds_lvl2.cpu().numpy())

    return predicted_class_lvl1, predicted_class_lvl2

# Reload both fitted label encoders from disk, then classify one sample question.
le_lvl1, le_lvl2 = (joblib.load(path) for path in ('le_lvl1.pkl', 'le_lvl2.pkl'))
text = 'хочу себе запись трансляции, чтобы подмонтировать потом, как сделать?'
predicted_lvl1, predicted_lvl2 = predict(
    text,
    model,
    tokenizer,
    le_lvl1,
    le_lvl2,
    max_len,
)
print(predicted_lvl1, predicted_lvl2, sep=", ")

In [11]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss built on softmax cross-entropy.

    The per-sample loss is ``alpha * (1 - p_t) ** gamma * CE``, where ``CE`` is
    the sample's cross-entropy and ``p_t = exp(-CE)`` is the probability the
    model assigns to the target class.

    Args:
        alpha: Constant factor that scales every sample's loss.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``.

    Raises:
        ValueError: If ``reduction`` is not one of the supported values.
    """

    _REDUCTIONS = ('mean', 'sum', 'none')

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        # Fail fast on a typo: previously any unknown string silently behaved
        # like 'none' and returned an unreduced tensor.
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: Unnormalised logits, in any form accepted by ``F.cross_entropy``.
            targets: Targets, in any form accepted by ``F.cross_entropy``.

        Returns:
            A scalar tensor for ``'mean'`` / ``'sum'``; the per-sample losses
            for ``'none'``.
        """
        # This is multi-class cross-entropy (not binary), kept per sample.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

# Restore saved weights. map_location lets a checkpoint written on a GPU load
# on a CPU-only machine (and vice versa) by remapping tensors to `device`.
# NOTE(review): the training loop in this notebook writes 'best_model_state.bin';
# confirm that 'best_model_state_1.bin' is the intended checkpoint.
model.load_state_dict(torch.load("best_model_state_1.bin", map_location=device))
model.eval()

# Evaluate on the validation set with the same loss settings as the training loop.
loss_fn = FocalLoss(alpha=0.25, gamma=4)
val_acc_lvl1, val_acc_lvl2, val_loss = eval_model_v2(
        model,
        val_loader,
        loss_fn,
        device,
        len(val_dataset)
)
print(f"{val_acc_lvl1}, {val_acc_lvl2}")

0.8641975308641975, 0.8395061728395061
