In [8]:
import pandas as pd

train_data = pd.read_csv("finall_data.csv")

labels = train_data['is_relevant'].to_list()
num_classes = len(set(labels))

texts = (train_data['Reviews']).to_list()
print(texts[0])
print(train_data.info())

Лучший курс для подготовки к ЕГЭ по математике с отличным учителем ever Учёба с Тимуром = СОТОЧКА баллов за экзамен + отлично проведённое время
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6448 entries, 0 to 6447
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   6448 non-null   int64 
 1   Reviews      6448 non-null   object
 2   is_relevant  6448 non-null   int64 
 3   is_positive  6448 non-null   int64 
 4   object       6448 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 252.0+ KB
None


In [9]:
import torch
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Загрузка данных
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Функция для тренировки модели
def train_model(model, train_dataloader, val_dataloader, optimizer, epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_dataloader, total=len(train_dataloader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_dataloader)

        print(f'Training Loss: {avg_train_loss}')

        # Валидация
        model.eval()
        val_preds = []
        val_labels = []
        for batch in tqdm(val_dataloader, total=len(val_dataloader), desc=f'Validating Epoch {epoch + 1}/{epochs}', unit='batch'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].numpy()

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
                val_preds.extend(preds)
                val_labels.extend(labels)

        val_f1 = f1_score(val_labels, val_preds, average='macro')
        print(f'Validation F1-Score: {val_f1}')

# Загрузка предобученного RuBERT
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=num_classes)

# Параметры
max_len = 128
batch_size = 32
epochs = 25
learning_rate = 2e-5

# Подготовка данных
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_len)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Инициализация оптимизатора
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Обучение модели
train_model(model, train_dataloader, val_dataloader, optimizer, epochs)


NameError: name '_C' is not defined

In [25]:
test_data = pd.read_csv("train_data.csv")
test_labels = test_data['is_relevant']
test_texts = (test_data['question_2'] + test_data['question_3'] + test_data['question_4'] + test_data['question_5']).to_list()

In [26]:
test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_len)

test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Валидация
model.eval()
val_preds = []
val_labels = []
for batch in tqdm(test_dataloader, total=len(test_dataloader)):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].numpy()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        val_preds.extend(preds)
        val_labels.extend(labels)

val_f1 = f1_score(val_labels, val_preds, average='micro')
print(f'Test F1-Score: {val_f1}')

100%|██████████| 4/4 [00:01<00:00,  2.07it/s]

Test F1-Score: 0.883495145631068





In [27]:
model_path = "models/relevant"

# Сохранение дообученной модели в bin файл
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('relevant/RuBert+Comments/tokenizer_config.json',
 'relevant/RuBert+Comments/special_tokens_map.json',
 'relevant/RuBert+Comments/vocab.txt',
 'relevant/RuBert+Comments/added_tokens.json')

In [28]:
test_texts[0]
test_data.iloc[102]

timestamp                                       27.04.2024:19:00
question_1                                    Java api браузеров
question_2                       Понравился методический вебинар
question_3                             Преподователь сошел с ума
question_4     Я думаю с другим преподователем я получил бы б...
question_5       Хочу перепройти вебинар с другим преподователем
is_relevant                                                    1
object                                                         2
is_positive                                                    0
Name: 102, dtype: object

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
loaded_model = BertForSequenceClassification.from_pretrained(model_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loaded_model.to(device)

# Валидация
model.eval()
val_preds = []
val_labels = []
for batch in tqdm(test_dataloader, total=len(test_dataloader)):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].numpy()

    with torch.no_grad():
        outputs = loaded_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        val_preds.extend(preds)
        val_labels.extend(labels)

val_f1 = f1_score(val_labels, val_preds, average='micro')

accuracy = accuracy_score(val_labels, val_preds)
precision = precision_score(val_labels, val_preds)
recall = recall_score(val_labels, val_preds)
f1 = f1_score(val_labels, val_preds, average='micro')

# Печатаем значения метрик
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Строим матрицу классификации
conf_matrix = confusion_matrix(val_labels, val_preds)

# Создаем DataFrame из матрицы классификации с подписями колонок и строк
conf_matrix_df = pd.DataFrame(conf_matrix, columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

# Печатаем красиво оформленную матрицу классификации
print("Confusion Matrix:")
print(conf_matrix_df)
print(f'Test F1-Score: {val_f1}')

100%|██████████| 4/4 [00:18<00:00,  4.56s/it]

Accuracy: 0.883495145631068
Precision: 0.8823529411764706
Recall: 1.0
F1 Score: 0.883495145631068
Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative                   1                  12
Actual Positive                   0                  90
Test F1-Score: 0.883495145631068





('relevant/model.bin/tokenizer_config.json',
 'relevant/model.bin/special_tokens_map.json',
 'relevant/model.bin/vocab.txt',
 'relevant/model.bin/added_tokens.json')