In [1]:
import torch
from torch import nn
from torchvision import models
import os
from PIL import Image
import numpy as np
from tqdm import tqdm
from numpy.random import shuffle
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torchvision import transforms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import cv2
import torchvision.transforms as T
import random
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertModel
import re
import nltk
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tqdm.pandas()

In [3]:


# Загружаем стоп-слова NLTK (для русского и английского)
nltk.download("stopwords")
stop_words = set(stopwords.words("russian")) | set(stopwords.words("english"))

# Инициализация морфологического анализатора pymorphy3 для лемматизации
morph = MorphAnalyzer()

def preprocess_text(text: str) -> str:
    """
    Предобрабатывает текст для модели CatBoost.
    
    Шаги обработки:
    1. Приведение к нижнему регистру.
    2. Удаление всех символов, кроме букв и пробелов.
    3. Токенизация (разделение на слова).
    4. Удаление стоп-слов.
    5. Лемматизация слов.
    
    :param text: Входной текст.
    :return: Предобработанный текст.
    """
    
    # 1. Приводим текст к нижнему регистру
    text = text.lower()
    
    # 2. Удаляем спецсимволы, цифры и оставляем только буквы и пробелы
    text = re.sub(r"[^a-zа-яё\s]", " ", text)
    
    # 3. Токенизация: разделение текста на слова
    words = text.split()
    
    # 4. Удаление стоп-слов
    words = [word for word in words if word not in stop_words]
    
    # 5. Лемматизация
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    
    # Собираем обратно в строку и возвращаем результат
    processed_text = " ".join(lemmatized_words)
    
    return processed_text


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [4]:
def set_all_seeds(seed=42):
    # python's seeds
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
    # torch's seeds
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_all_seeds()

In [5]:
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [6]:
class Metric:
    def __init__(self, func, **kwargs):
        self.func = func
        self.arg = kwargs
    
    def __call__(self, y_true, y_pred):
        return self.func(y_true, y_pred, **self.arg)
    
    def __name__(self):
        return self.func.__name__

In [7]:
# Загружаем данные
df = pd.read_csv(r'train.csv')

In [None]:
df['comment_text'] = df['comment_text'].progress_apply(preprocess_text)

 23%|██▎       | 36609/159571 [00:14<00:48, 2510.82it/s]

In [None]:
device = 'cuda'

In [None]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,aww matches background colour seemingly stuck ...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really trying edit war guy constantly ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0


In [None]:
# Разделяем данные на тренировочную и тестовую выборки
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['toxic'], random_state=42)
#train_df, _ = train_test_split(train_df, test_size=0.99, stratify=train_df['toxic'], random_state=42)

In [None]:
class ToxicDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Инициализируем токенизатор
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-uncased')

# Создаем DataLoader для тренировочной и тестовой выборок
train_dataset = ToxicDataset(train_df['comment_text'].tolist(), train_df['toxic'].tolist(), tokenizer, max_length=64)
test_dataset = ToxicDataset(test_df['comment_text'].tolist(), test_df['toxic'].tolist(), tokenizer, max_length=64)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


In [None]:
class ToxicClassifier(nn.Module):
    def __init__(self, n_classes):
        super(ToxicClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output.last_hidden_state[:, 0]
        output = self.drop(hidden_state)
        return self.out(output)

# Инициализация модели, оптимизатора и функции потерь
model = ToxicClassifier(n_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_func = nn.CrossEntropyLoss()

In [32]:
for param in model.bert.embeddings.parameters():
    param.requires_grad = False
for layer in model.bert.transformer.layer:  # Замораживаем первые 2 слоя
    for param in layer.parameters():
        param.requires_grad = False

In [33]:
model.bert

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [34]:
# Модифицируем функцию train для работы с BERT
def train(model, optim, loss_func, loader, epochs):
    fin = []
    model = model.to(device)
    model.train()
    for epoch in range(1, epochs + 1):
        print(f'Epoch: {epoch}')
        hist = []
        for batch in tqdm(loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_func(outputs, labels)
            hist.append(loss.item())
            
            loss.backward()
            optim.step()
            optim.zero_grad()
        print(f'Loss: {sum(hist) / len(hist)}')
        fin.append(sum(hist) / len(hist))
    return fin

# Запускаем обучение
train(model, optimizer, loss_func, train_loader, epochs=1)

Epoch: 1


  5%|▍         | 368/7979 [00:08<02:55, 43.43it/s]


KeyboardInterrupt: 

In [35]:
def test_model(model, loader, metrics):
    hist_pred = []
    hist_true = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            hist_pred.extend(preds.cpu().numpy())
            hist_true.extend(labels.cpu().numpy())

    for metric in metrics:
        print(f'{metric.__name__()}: {metric(hist_true, hist_pred)}')

# Определяем метрики и тестируем модель
metrics = [Metric(accuracy_score), Metric(f1_score, average='weighted')]
test_model(model, test_loader, metrics)

100%|██████████| 1995/1995 [00:44<00:00, 44.98it/s]

accuracy_score: 0.904151652827824
f1_score: 0.8586398148458866





In [18]:
test = pd.read_csv(r'test.csv')
test['comment_text'] = test['comment_text'].progress_apply(preprocess_text)

100%|██████████| 153164/153164 [01:01<00:00, 2492.46it/s]


In [19]:
test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule succesful ever whats hating s...
1,0000247867823ef7,rfc title fine imo
2,00013b17ad220c46,sources zawe ashton lapland
3,00017563c3f7919a,look back source information updated correct f...
4,00017695ad8997eb,anonymously edit articles
...,...,...
153159,fffcd0960ee309b5,totally agree stuff nothing long crap
153160,fffd7a9a6eb32c16,throw field home plate get faster throwing cut...
153161,fffda9e8d6fafa9e,okinotorishima categories see changes agree co...
153162,fffe8f1340a79fc2,one founding nations eu germany law return qui...


In [20]:
class ToxicDatasetTest(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        # Токенизация текста
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


In [21]:
test_dataset = ToxicDatasetTest(test['comment_text'].tolist(), tokenizer)

In [22]:
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [23]:
def generate_predictions(model, dataloader, threshold=0.5):
    """
    Функция для генерации предсказаний с использованием модели BERT для мультимаркерной классификации.

    :param model: Обученная модель
    :param dataloader: DataLoader с тестовыми данными
    :param tokenizer: Токенизатор BERT
    :param device: Устройство для вычислений (например, 'cuda' или 'cpu')
    :param threshold: Порог вероятности для бинаризации меток
    :return: Список предсказанных меток для каждого текста
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Извлекаем input_ids и attention_mask из батча
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Получаем выходы модели
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)
            # Сохраняем предсказания
            predictions.extend(preds)

    return predictions


In [24]:
preds = generate_predictions(model, test_dataloader)

100%|██████████| 19146/19146 [13:00<00:00, 24.52it/s]


In [25]:
ans = pd.DataFrame(data=preds)

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [26]:
preds

[tensor(1, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, dev