In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm.notebook import tqdm, trange
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel

In [32]:
# Load the news dataset from the working directory; later cells read its
# 'rubric' and 'text' columns.
data_news = pd.read_csv("GH_news.csv")

In [33]:
# Class-balance check: number of articles per rubric.
data_news['rubric'].value_counts()

Россия               623
Мир                  437
Спорт                300
Бывший СССР          294
Экономика            265
Силовые структуры    159
Интернет и СМИ       142
Культура             142
Наука и техника      137
Из жизни             127
Путешествия          125
Ценности             108
Дом                   77
Нацпроекты            50
69-я параллель        10
Name: rubric, dtype: int64

In [5]:
# Binary target: 1 when the article belongs to the "Россия" rubric, 0 otherwise.
# The guard keeps the cell re-runnable: once 'rubric' has been dropped, running
# the cell again is a no-op instead of a KeyError.
if 'rubric' in data_news.columns:
    data_news['label'] = np.where(data_news['rubric'] == "Россия", 1, 0)
    # Re-bind instead of mutating in place.
    data_news = data_news.drop(columns='rubric')
elif 'label' not in data_news.columns:
    # Neither the source column nor the derived one is present: fail here,
    # close to the cause, rather than later in the pipeline.
    raise KeyError("data_news has neither a 'rubric' nor a 'label' column")

Классифицируем по тому, относится ли новость к рубрике "Россия".

In [6]:
# The tokenizer and the encoder are loaded from the same checkpoint, so the
# name is spelled once and reused for both.
BERT_CHECKPOINT = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_embeder = AutoModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def embed_bert(text, model, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the model's hidden states.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable encoder exposing ``.device``; its output must have a
            ``last_hidden_state`` attribute.
        tokenizer: Callable returning a mapping of tensors (called with
            padding, truncation to 512 tokens and ``return_tensors='pt'``).

    Returns:
        ``last_hidden_state[0]``, i.e. the hidden states of the first sequence
        in the batch (presumably shaped (tokens, hidden) - confirm). Computed
        under ``torch.no_grad()``.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Move every tokenizer output onto the device the model lives on.
    model_inputs = {}
    for field, tensor in encoded.items():
        model_inputs[field] = tensor.to(model.device)
    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state
    return hidden_states[0]

In [8]:
class Bert_embed_data(Dataset):
    """Dataset of precomputed text embeddings paired with integer labels.

    All texts in ``df["text"]`` are embedded once, at construction time, via
    ``embed_bert``; ``__getitem__`` then only does list/frame lookups.

    Args:
        df: DataFrame with a ``"text"`` column and a ``"label"`` column.
        embeder: Encoder model forwarded to ``embed_bert``.
        tokenizer: Tokenizer forwarded to ``embed_bert``.
    """

    def __init__(self, df, embeder, tokenizer):
        super().__init__()
        self.df = df
        self.embeder = embeder
        self.tokenizer = tokenizer
        # One embedding per row, stored in row order.
        self.embeds = [
            embed_bert(text, self.embeder, self.tokenizer)
            for text in tqdm(self.df["text"], leave=False)
        ]

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(embedding, label)`` for the row at position ``index``.

        The label is read positionally (``.iloc``) so it always matches
        ``self.embeds[index]``, whatever index labels ``df`` carries (e.g. the
        shuffled labels left by a train/test split).
        """
        label = np.int64(self.df["label"].iloc[index])
        input_embed = self.embeds[index]
        return input_embed, label

In [9]:
# Shuffled 80/20 split with a fixed seed. Each part is then renumbered 0..n-1;
# reset_index() without drop=True also keeps the old row labels as an 'index' column.
data_train, data_test = (
    part.reset_index()
    for part in train_test_split(data_news, test_size=0.2, shuffle=True, random_state=42)
)

In [10]:
# Slow step: every text is run through RuBERT here, once, to get its embeddings.
train_dataset, test_dataset = (
    Bert_embed_data(frame, bert_embeder, tokenizer)
    for frame in (data_train, data_test)
)

  0%|          | 0/2396 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

In [11]:
# One sample per batch for both splits; only the training split is shuffled.
loaders = dict(
    train=DataLoader(train_dataset, batch_size=1, shuffle=True),
    test=DataLoader(test_dataset, batch_size=1, shuffle=False),
)

In [12]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [13]:
def training(model, criterion, optimizer, num_epochs, loaders, max_grad_norm=2):
    """Train ``model`` and return it with the best-F1 weights restored.

    After each epoch the model is evaluated on ``loaders["test"]``; whenever
    the F1 score improves, the weights are written to ``best_model_dict.pth``
    in the working directory. After the last epoch that checkpoint is loaded
    back into ``model``.

    Args:
        model: ``nn.Module`` already placed on its target device; batches are
            moved to the device of the model's parameters.
        criterion: Loss called as ``criterion(prediction, labels)``.
        optimizer: Optimizer over ``model.parameters()``.
        num_epochs: Number of train/validate passes.
        loaders: Mapping with ``"train"`` and ``"test"`` iterables yielding
            ``(input_embeds, labels)`` batches.
        max_grad_norm: Gradient-norm clipping threshold; ``None`` disables
            clipping.

    Returns:
        The same ``model`` object. If at least one epoch ran, it carries the
        weights of the epoch with the highest validation F1.
    """
    # Follow the model's own device rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; otherwise an all-zero-F1 run would load a missing or stale file.
    best_f1 = -1.0
    for _ in trange(num_epochs, leave=False):
        model.train()
        for input_embeds, labels in tqdm(loaders["train"], leave=False):
            optimizer.zero_grad()
            input_embeds = input_embeds.to(model_device)
            labels = labels.to(model_device)
            loss = criterion(model(input_embeds), labels)
            loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

        model.eval()
        valid_loss = 0.0
        correct = 0
        num_objs = 0
        num_iter = 0
        all_preds = []
        all_trues = []
        with torch.no_grad():
            for input_embeds, labels in tqdm(loaders["test"], leave=False):
                input_embeds = input_embeds.to(model_device)
                labels = labels.to(model_device)
                prediction = model(input_embeds)
                predicted = prediction.argmax(-1)
                # Accumulate plain Python numbers, not device tensors.
                valid_loss += criterion(prediction, labels).item()
                correct += (predicted == labels).sum().item()
                all_preds.extend(predicted.tolist())
                all_trues.extend(labels.tolist())
                num_objs += len(labels)
                num_iter += 1

        # scikit-learn's signature is f1_score(y_true, y_pred); computed once per epoch.
        epoch_f1 = f1_score(all_trues, all_preds)
        if epoch_f1 > best_f1:
            best_f1 = epoch_f1
            torch.save(model.state_dict(), 'best_model_dict.pth')
        # max(..., 1) keeps the report from dividing by zero on an empty test loader.
        print(f"Valid Loss: {valid_loss / max(num_iter, 1)}, accuracy: {correct / max(num_objs, 1)}, f1: {epoch_f1}")

    # Only restore when this call actually wrote a checkpoint (num_epochs > 0).
    if best_f1 >= 0:
        best_model_dict = torch.load('best_model_dict.pth')
        model.load_state_dict(best_model_dict)
    return model

In [25]:
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by max-over-time pooling and a linear head.

    Args:
        embed_size: Feature size of each input step.
        hidden_size: LSTM hidden size (also the linear layer's input size).
        batch_size: Stored on the instance; not read anywhere in this class.
        num_classes: Number of output logits.
    """

    def __init__(self, embed_size, hidden_size, batch_size=1, num_classes=2):
        super().__init__()
        self.batch_size = batch_size
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.ln1 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map a batch-first sequence tensor ``x`` to per-sample class logits."""
        # Only the per-step outputs are used; the final (h, c) state is discarded.
        per_step, _final_state = self.lstm(x)
        # Element-wise maximum across the sequence dimension (dim=1).
        pooled = per_step.max(dim=1).values
        return self.ln1(pooled)

In [26]:
# Training setup. 768 is the per-token input size handed to the LSTM
# (presumably the RuBERT hidden size - confirm); 300 is the LSTM hidden size.
num_epochs = 5

model = LSTMModel(embed_size=768, hidden_size=300)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [27]:
# Run the train/validate loop; training() returns the same model object after
# loading the saved best-F1 checkpoint back into it.
best_model = training(model, criterion, optimizer, num_epochs, loaders)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2396 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

Valid Loss: 0.18474195897579193, accuracy: 0.92166668176651, f1: 0.7965367965367965


  0%|          | 0/2396 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

Valid Loss: 0.19066227972507477, accuracy: 0.9283333420753479, f1: 0.825910931174089


  0%|          | 0/2396 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

Valid Loss: 0.1922176033258438, accuracy: 0.9300000071525574, f1: 0.8432835820895523


  0%|          | 0/2396 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

Valid Loss: 0.270796000957489, accuracy: 0.9316666722297668, f1: 0.8464419475655431


  0%|          | 0/2396 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

Valid Loss: 0.26996418833732605, accuracy: 0.9316666722297668, f1: 0.8416988416988418


In [28]:
# Persist the trained classifier.
# NOTE(review): this pickles the whole nn.Module, so loading the file requires
# the LSTMModel class to be importable under the same name; saving
# best_model.state_dict() is the more portable form - confirm what the
# consumer of this file expects before changing it.
torch.save(best_model, "Social_Project_model.pkl")