## Лабораторная работа №3

**Тема:** RNN и Text classification

**Выполнил:** Студент группы БВТ2201 Шамсутдинов Рустам Фаргатевич

**Цель лабораторной работы:** Создать модель с архитектурой RNN для классификации текстов на токсичные и нетоксичные

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer

In [None]:
# Load the labelled comment corpus and report its size and class balance.
df = pd.read_csv("russian_toxic_comments.csv")
print(df.shape)  # (rows, columns)
print(df["toxic"].value_counts())  # number of rows per value of the "toxic" column


(14412, 2)
toxic
0.0    9586
1.0    4826
Name: count, dtype: int64


In [None]:
# Fixed seed so the train/val/test partition (and therefore every metric
# reported below) is reproducible across notebook runs.
SPLIT_SEED = 42

# Hold out 20% of the data as the test set, keeping the class ratio of "toxic".
train_val_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["toxic"], random_state=SPLIT_SEED
)
# Carve 10% of the remainder out as the validation set, again stratified.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1,
    stratify=train_val_df["toxic"],
    random_state=SPLIT_SEED,
)

print(f"Samples — train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}")

Samples — train: 10376, val: 1153, test: 2883


In [None]:
# Pretrained tokenizer; it converts raw comments to token ids, and its
# vocab_size is what the classifier's embedding table is sized with.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
# Every comment is truncated/padded to exactly this many tokens.
MAX_LEN = 128
# Mini-batch size shared by the train, validation and test loaders.
BATCH_SIZE = 32


class ToxicDataset(Dataset):
    """Dataset of (comment, label) pairs that tokenizes lazily, one item per access.

    Args:
        texts: Collection of comments exposing ``.tolist()``.
        labels: Collection of labels exposing ``.tolist()``, aligned with ``texts``.
        tokenizer: Callable tokenizer invoked on a single string.
        max_len: Length every encoded comment is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Keep both columns as plain Python lists for positional indexing.
        self.texts = texts.tolist()
        self.labels = labels.tolist()

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the comment at ``idx`` and return its tensors and label."""
        comment = self.texts[idx]
        target = self.labels[idx]
        encoded = self.tokenizer(
            comment,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading axis for the single input
        # text; squeeze it away so each field is a 1-D tensor of max_len items.
        token_ids = encoded.input_ids.squeeze(0)
        mask = encoded.attention_mask.squeeze(0)
        # Float label (0.0 / 1.0), the dtype BCEWithLogitsLoss expects.
        label_tensor = torch.tensor(target, dtype=torch.float)
        return {
            "input_ids": token_ids,
            "attention_mask": mask,
            "label": label_tensor,
        }

In [None]:
def _build_loader(frame, shuffle):
    """Wrap one split's comments and labels in a batched DataLoader."""
    return DataLoader(
        ToxicDataset(frame["comment"], frame["toxic"], tokenizer),
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=4,
    )


# Only the training split is shuffled; validation and test keep their order.
train_loader = _build_loader(train_df, shuffle=True)
val_loader = _build_loader(val_df, shuffle=False)
test_loader = _build_loader(test_df, shuffle=False)

In [None]:
class ToxicRNN(nn.Module):
    """LSTM-based binary classifier that emits one logit per input sequence.

    Token ids are embedded, passed through an (optionally bidirectional,
    optionally stacked) LSTM, and the final hidden state of the last layer is
    projected to a single logit. Padding positions are excluded from the
    recurrence, so the logit does not depend on how far a sequence was padded.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding dimension.
        rnn_hidden: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout_emb: Dropout probability applied to the embeddings.
        dropout_rnn: Dropout probability between LSTM layers (used only when
            ``num_layers > 1``).
        dropout_fc: Dropout probability applied before the output layer.
    """

    def __init__(
        self,
        vocab_size,
        emb_dim=256,
        rnn_hidden=128,
        num_layers=1,
        bidirectional=True,
        dropout_emb=0.3,
        dropout_rnn=0.3,
        dropout_fc=0.3,
    ):
        super().__init__()

        # Index 0 is the padding token: its embedding stays a zero vector.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.drop_emb = nn.Dropout(dropout_emb)

        # LSTM with dropout between layers (only when num_layers > 1;
        # nn.LSTM warns if dropout is non-zero for a single layer).
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=rnn_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout_rnn if num_layers > 1 else 0.0,
        )

        self.drop_fc = nn.Dropout(dropout_fc)
        # Both directions are concatenated, doubling the feature size.
        self.fc = nn.Linear(rnn_hidden * (2 if bidirectional else 1), 1)

    def forward(self, input_ids, attention_mask=None):
        """Compute one raw (pre-sigmoid) logit per sequence.

        Args:
            input_ids: Integer tensor of token ids, shape ``(batch, seq_len)``.
            attention_mask: Optional tensor of shape ``(batch, seq_len)`` that
                is non-zero on real tokens. Assumes right-padding (real tokens
                first). When omitted, every position whose id differs from the
                embedding's ``padding_idx`` is treated as a real token.

        Returns:
            Tensor of shape ``(batch,)`` with the logits.
        """
        if attention_mask is None:
            attention_mask = input_ids != self.embedding.padding_idx

        # Number of real tokens per row. pack_padded_sequence requires CPU
        # int64 lengths that are all >= 1, hence the clamp for all-pad rows.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(dtype=torch.int64).cpu()

        emb = self.drop_emb(self.embedding(input_ids))

        # Packing makes the LSTM stop at each sequence's last real token.
        # Without it the forward direction's final state would be taken after
        # stepping through all the padding positions.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.rnn(packed)

        # Assemble the final hidden state of the last layer.
        if self.rnn.bidirectional:
            # The last two entries of h_n are the top layer's forward and
            # backward directions, respectively.
            h_final = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            h_final = h_n[-1]

        h_final = self.drop_fc(h_final)
        return self.fc(h_final).squeeze(1)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ToxicRNN(vocab_size=tokenizer.vocab_size).to(device)

# Class balancing: weight the positive (toxic) term of the loss by the
# negative-to-positive ratio of the training split.
pos = train_df["toxic"].sum()  # labels are 0/1, so the sum counts toxic rows
neg = len(train_df) - pos
pos_weight = torch.tensor([neg / pos], device=device)

# The loss works on raw logits (the sigmoid is applied inside the criterion).
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

In [None]:
def evaluate(loader):
    """Score the global ``model`` on every batch of ``loader``.

    Switches the model to eval mode, collects sigmoid probabilities without
    tracking gradients, and thresholds them at 0.5 for the label-based metrics.

    Args:
        loader: Iterable of batches with "input_ids" and "label" tensors.

    Returns:
        Tuple ``(accuracy, f1, roc_auc, pr_auc)``.
    """
    model.eval()
    label_chunks, prob_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            token_ids = batch["input_ids"].to(device)
            label_chunks.append(batch["label"].cpu().numpy())
            batch_logits = model(token_ids)
            prob_chunks.append(torch.sigmoid(batch_logits).cpu().numpy())

    # Flatten the per-batch arrays; an empty loader yields empty arrays.
    y_true = np.concatenate(label_chunks) if label_chunks else np.array([])
    y_prob = np.concatenate(prob_chunks) if prob_chunks else np.array([])
    y_pred = y_prob > 0.5

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    # PR-AUC: area under the precision-recall curve (recall on the x axis).
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall, precision)
    return accuracy, f1, roc_auc, pr_auc

In [None]:
NUM_EPOCHS = 5

for epoch in range(1, NUM_EPOCHS + 1):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # (dropout active) at the start of every epoch.
    model.train()
    epoch_losses = []
    loop = tqdm(train_loader, desc=f"Epoch {epoch}")
    for batch in loop:
        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()

        ids = batch["input_ids"].to(device)
        lbls = batch["label"].to(device)
        logits = model(ids)
        loss = criterion(logits, lbls)
        loss.backward()
        optimizer.step()

        # Show the running mean of this epoch's batch losses in the progress bar.
        epoch_losses.append(loss.item())
        loop.set_postfix(train_loss=np.mean(epoch_losses))

    # Validate once per epoch.
    val_acc, val_f1, val_roc, val_pr = evaluate(val_loader)
    epoch_summary = (
        f"→ Epoch {epoch} | val acc: {val_acc:.4f} | F1: {val_f1:.4f} "
        f"| ROC-AUC: {val_roc:.4f} | PR-AUC: {val_pr:.4f}"
    )
    print(epoch_summary)

Epoch 1: 100%|██████████| 325/325 [01:31<00:00,  3.55it/s, train_loss=0.885]


→ Epoch 1 | val acc: 0.7051 | F1: 0.5923 | ROC-AUC: 0.7420 | PR-AUC: 0.6147


Epoch 2: 100%|██████████| 325/325 [01:30<00:00,  3.60it/s, train_loss=0.697]


→ Epoch 2 | val acc: 0.7823 | F1: 0.6203 | ROC-AUC: 0.8309 | PR-AUC: 0.7442


Epoch 3: 100%|██████████| 325/325 [01:30<00:00,  3.60it/s, train_loss=0.551]


→ Epoch 3 | val acc: 0.8083 | F1: 0.6736 | ROC-AUC: 0.8702 | PR-AUC: 0.7977


Epoch 4: 100%|██████████| 325/325 [01:29<00:00,  3.61it/s, train_loss=0.461]


→ Epoch 4 | val acc: 0.8205 | F1: 0.7251 | ROC-AUC: 0.8754 | PR-AUC: 0.8204


Epoch 5: 100%|██████████| 325/325 [01:29<00:00,  3.63it/s, train_loss=0.385]


→ Epoch 5 | val acc: 0.8413 | F1: 0.7483 | ROC-AUC: 0.8931 | PR-AUC: 0.8450


In [None]:
# Final evaluation on the held-out test split, run once after training.
test_acc, test_f1, test_roc, test_pr = evaluate(test_loader)
print("\n=== FINAL TEST METRICS ===")
print(
    f"Acc: {test_acc:.4f} | F1: {test_f1:.4f} | "
    f"ROC-AUC: {test_roc:.4f} | PR-AUC: {test_pr:.4f}"
)


=== FINAL TEST METRICS ===
Acc: 0.8526 | F1: 0.7653 | ROC-AUC: 0.9051 | PR-AUC: 0.8474


In [None]:
MAX_LEN = 128  # same value as used during training


def predict_one(text: str, *, threshold: float = 0.5, verbose: bool = False):
    """Classify a single comment with the trained global ``model``.

    Args:
        text: Raw comment to classify.
        threshold: Probability above which the comment is labelled toxic.
        verbose: When True, print the encoded token ids (debugging aid; off
            by default so predictions do not dump a MAX_LEN-sized tensor).

    Returns:
        Tuple ``(label, prob)``: ``label`` is 1 (toxic) or 0, and ``prob`` is
        the sigmoid of the model's logit as a Python float.
    """
    # Encode exactly as in training: truncate/pad to MAX_LEN.
    enc = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="pt",
    )

    input_ids = enc["input_ids"].to(device)

    if verbose:
        print(input_ids)

    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(input_ids)
        prob = torch.sigmoid(logits).item()

    label = 1 if prob > threshold else 0
    return label, prob


# Sample comments for a quick sanity check of the trained classifier.
text1 = "Ты дурак!"
text2 = "Спасибо за помощь."
text3 = "да да да ты очень умный!"

# Classify each sample in turn and print the predicted label and probability.
for sample in (text1, text2, text3):
    label, confidence = predict_one(sample)
    print(f"Text: «{sample}»  →  Toxic? {label}  (p={confidence:.3f})")


tensor([[  101,   947, 19142,   177,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  