## Sentiment analysis of IMDB ratings via RNN

In [1]:
!pip install datasets



In [2]:
import numpy as np
import random
import os
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchtext import datasets
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# from torchtext.datasets import IMDB
from datasets import load_dataset

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
seed = 42
def seed_torch(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

### Preparing Data

In [4]:
train_dataset = load_dataset('imdb', split='train').shuffle(seed=42).train_test_split(test_size=0.3)
train_dataset, valid_dataset = train_dataset['train'], train_dataset['test']
test_dataset = load_dataset('imdb', split='test')

In [5]:
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

train_texts = train_dataset['text']
vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [6]:
def pad_to_max_len(texts):
    max_len = max(map(len, texts))
    for i, text in enumerate(texts):
        if len(text) < max_len:
            texts[i] = [0] * (max_len - len(text)) + text
    return texts

text_pipeline = lambda text: vocab(tokenizer(text))
label_pipeline = lambda x: [int(x)]

#### Data preprocessor (1 балл)
Обработка данных. Создайте data collator, который обработает исходный батч и выдаст батч лейблов и текстов, переведенных в токены и приведенных к одной длине.

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    # YOUR CODE HERE
    label_list, text_list = [], []
    for i in batch:
        label_list.append(label_pipeline(i['label']))

        processed_text = text_pipeline(i['text'])
        text_list.append(processed_text)

    text_list = pad_to_max_len(text_list)
    text_list = torch.tensor(text_list, device=device, dtype=torch.int64)
    label_list = torch.tensor(label_list, device=device, dtype=torch.int64)
    return label_list, text_list

In [38]:
seed_torch(seed)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)

In [279]:
labels, input_ids = next(iter(train_dataloader))
labels.shape, input_ids.shape

(torch.Size([64, 1]), torch.Size([64, 946]))

In [280]:
input_ids

tensor([[   0,    0,    0,  ...,  195,   88,    3],
        [   0,    0,    0,  ...,   37, 2148,    3],
        [   0,    0,    0,  ..., 3920,   24,    3],
        ...,
        [   0,    0,    0,  ...,   14,   35,    3],
        [   0,    0,    0,  ...,   19,   26,    3],
        [   0,    0,    0,  ...,   45,   14,    3]], device='cuda:0')

In [281]:
labels[:10]

tensor([[0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1]], device='cuda:0')

### Define the RNN-based text classification model (3 балла)

Создайте класс RNN для классификации текста по приведенной схеме.

![img](https://d2l.ai/_images/rnn.svg)


$$\mathbf{H}_t = \phi(\mathbf{X}_t \mathbf{W}_{xh} + \mathbf{H}_{t-1} \mathbf{W}_{hh}  + \mathbf{b}_h).$$

$$\mathbf{O}_t = \mathbf{H}_t \mathbf{W}_{hq} + \mathbf{b}_q.$$

In [33]:
class RNNClassifier(nn.Module):
    def __init__(self, num_inputs, num_hiddens, out_dim, sigma=0.01):
        # YOUR CODE HERE
        super().__init__()
        self.W_xh = nn.Parameter(
            torch.randn(num_inputs, num_hiddens) * sigma
        )
        self.W_hh = nn.Parameter(
            torch.randn(num_hiddens, num_hiddens) * sigma
        )
        self.b_h = nn.Parameter(torch.zeros(num_hiddens))
        self.embedding = nn.Embedding(len(vocab), embedding_dim=100)
        self.cls = nn.Parameter(torch.randn(num_hiddens, out_dim) * sigma)
        self.cls_bias = nn.Parameter(torch.zeros(out_dim))

        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.out_dim = out_dim

    def forward(self, inputs, state=None):
        # YOUR CODE HERE
        if state is None:
            state = torch.zeros(
                (inputs.shape[1], self.num_hiddens), device=inputs.device
            )
        else:
            state = state
        outputs = []
        embedded = self.embedding(inputs)
        for X in embedded:
            state = torch.tanh(
                torch.matmul(X, self.W_xh) + torch.matmul(state, self.W_hh) + self.b_h
            )
            output = torch.matmul(state, self.cls) + self.cls_bias
            outputs.append(output)

        return outputs, state

In [34]:
seed_torch(seed)
rnn = RNNClassifier(100, 50, 1)

# check the shapes of parameters
for n, p in rnn.named_parameters():
    print(n, p.shape)

W_xh torch.Size([100, 50])
W_hh torch.Size([50, 50])
b_h torch.Size([50])
cls torch.Size([50, 1])
cls_bias torch.Size([1])
embedding.weight torch.Size([84058, 100])


Должно получиться следующее:

W_xh torch.Size([100, 50])

W_hh torch.Size([50, 50])

b_h torch.Size([50])

cls torch.Size([50, 1])

cls_bias torch.Size([1])

embedding.weight torch.Size([83969, 100])

If you're using a GPU, remember to call model.cuda() to move your model to the GPU.

In [35]:
rnn.to(device)

RNNClassifier(
  (embedding): Embedding(84058, 100)
)

### Implement the training procedure (1 балл)
Обучите модель.

In [36]:
seed_torch(seed)
opt = torch.optim.Adam(rnn.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [15]:
epochs = 2

Training loop

In [16]:
train_size = len(train_dataset) // 64
valid_size = len(valid_dataset) // 64

In [39]:
%%time
for epoch in range(1, epochs + 1):
    # YOUR CODE HERE
    running_loss = 0.
    rnn.train()
    for label, text in tqdm(train_dataloader):
        opt.zero_grad()
        outputs, state = rnn(text.T)
        preds = outputs[-1]
        loss = loss_func(preds.squeeze(1), label.float().squeeze(1))

        loss.backward()
        opt.step()
        running_loss += loss.item()

    epoch_loss = running_loss / train_size

    val_loss = 0.
    rnn.eval()

    correct = 0
    total = 0
    for label, text in tqdm(valid_dataloader):
        outputs, state = rnn(text.T)
        preds = outputs[-1]
        loss = loss_func(preds.squeeze(1), label.float().squeeze(1))

        val_loss += loss.item()

    valid_loss = val_loss / valid_size

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, valid_loss))

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.669413760686532, Validation Loss: 0.6477718215722305


  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.6139193183773166, Validation Loss: 0.6058791228848645
CPU times: user 3min 33s, sys: 2.3 s, total: 3min 36s
Wall time: 3min 34s


### Evaluate the trained model performance (2 балла)
Выполните подсчет метрик.

In [199]:
# YOUR CODE HERE
predicted = []
trues = []
texts = []
states = []
for label, text in tqdm(valid_dataloader):
    with torch.inference_mode():
        outputs, state = rnn(text.T)
        preds = outputs[-1]
        preds = torch.round(torch.sigmoid(preds.squeeze(1)))
    trues += list(label.cpu().detach().numpy())
    predicted += list(preds.cpu().detach().numpy())
    texts += list(text.cpu().detach().numpy())
    states += list(state.cpu().detach().numpy())

  0%|          | 0/118 [00:00<?, ?it/s]

In [200]:
predicted = np.array(predicted)
trues = np.array(trues)

val_accuracy = accuracy_score(trues, predicted)
val_precision = precision_score(trues, predicted)
val_recall = recall_score(trues, predicted)
val_f1 = f1_score(trues, predicted)

print('Accuracy:', val_accuracy, '\nPrecision:', val_precision, '\nRecall:', val_recall, '\nF1:', val_f1)

Accuracy: 0.6826666666666666 
Precision: 0.6789158196507689 
Recall: 0.6941113775646149 
F1: 0.6864295125164691


Метрики и их примерные значения:

Accuracy:  0.6993333333333334

Precision:  0.6777971592324944

Recall:  0.7387289516567083

F1:  0.7069525666016894

### Experiments (5 баллов)

Выполните не менее двух из следующих задач:

1. Попробуйте улучшить качество модели, например, изменив число и размер слоев или параметры обучения.

2. Улучшите предобработку данных. Например, можно использовать [предобученный токенизатор](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Tokenizer)
 от модели gpt2.
3. Проведите анализ ошибок: где модель ошибается и почему? Привести примеры/статистики в данных.

4. Проведите анализ предсказаний модели: где модель меняет свое предсказание с negative -> positive и наоборот. Есть ли логика в этих случаях?

5. Возможно, использвание предсказания с последнего временного шага не оптимально. Существует ли другая функция от выходов модели, которая улучшит качество?

#Небольшой анализ предсказаний модели

In [201]:
len(texts), len(predicted), len(trues), len(states)

(7500, 7500, 7500, 7500)

In [88]:
texts[:5]

[array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [89]:
trues[:5]

array([[1],
       [0],
       [1],
       [0],
       [1]])

In [275]:
for i in range(5):
    print(np.mean(states[i]))

0.077255346
-0.08712442
-0.09309502
-0.07449494
-0.09451513


In [90]:
predicted[:5]

array([0., 1., 1., 1., 1.], dtype=float32)

Первые пять эмбеддингов очень разреженные и модель, предположительно, скатывается в угадывание. Ни о какой логике речи здесь не идёт.

In [94]:
texts[105:110]

[array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [95]:
trues[105:110]

array([[1],
       [1],
       [1],
       [0],
       [1]])

In [245]:
for i in range(105, 110):
    print(np.mean(states[i]))

0.034171477
-0.088657476
-0.12417705
0.02216717
0.0026065994


In [96]:
predicted[105:110]

array([0., 1., 1., 0., 0.], dtype=float32)

Здесь предсказания как-будто инверсия по знаку средних значений со скрытых состояний: где у средних знак "+" там negative, иначе - positive.

In [185]:
texts[7000:7005]

[array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [183]:
trues[7000:7005]

array([[1],
       [0],
       [1],
       [0],
       [1]])

In [246]:
for i in range(7000, 7005):
    print(np.mean(states[i]))

-0.017730491
0.029160481
0.074722864
0.0043238355
-0.12124055


In [184]:
predicted[7000:7005]

array([1., 0., 0., 0., 1.], dtype=float32)

Последовательности менее разреженные, однако логика, по которой модель выдаёт предсказания подтверждается.

In [247]:
texts[600:605]

[array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [248]:
trues[600:605]

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [249]:
for i in range(600, 605):
    print(np.mean(states[i]))

0.04753111
-0.075865306
0.06624485
-0.09952092
0.0751025


In [250]:
predicted[600:605]

array([0., 1., 0., 1., 0.], dtype=float32)

Предположение снова подтверждается, однако смотря на истинные значения мы убеждаемся, что зависимость более хитрая и модель ошибается в некоторых случаях.

In [251]:
texts[670:675]

[array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [252]:
trues[670:675]

array([[1],
       [1],
       [0],
       [1],
       [0]])

In [253]:
for i in range(670, 675):
    print(np.mean(states[i]))

-0.061429564
-0.10100319
-0.10846983
-0.039346937
-0.1146792


In [254]:
predicted[670:675]

array([1., 1., 1., 1., 1.], dtype=float32)

Снова подтверждается. Наверное, уже можно принять некоторую гипотезу об инверсии знаков средних для предсказаний модели. Но отметим, что модель ошибается в данном предположении о зависимости в данных.  

In [276]:
cnt = 0
for i in range(len(predicted)):
    if states[i].mean() < 0 and predicted[i] == 1.:
        cnt += 1
    elif states[i].mean() > 0 and predicted[i] == 0.:
        cnt += 1
cnt, len(predicted)

(7231, 7500)

In [277]:
c = 0
for i in range(len(predicted)):
    if predicted[i] == trues[i]:
        c += 1
c

5120

Действительно, в подавляющем большинстве случаев модель как-будто исходит из средних значений скрытых состояний, однако, как можно убедиться истинные значения не совсем точно описываются данной функцией.

#Max pooling

Оговорка, это условно макс пулинг. Будем брать такую последовательность, которая характеризуется наиболее максимальными значениями. Теперь поэкспериментируем с различными вариантами обучения, но в экспериментах не будут затронуты параметры: размеры эмбеддингов и размерность скрытого состояния - поскольку почти очевидно, что при повышении данных параметров (до определенного момента) будет расти качество модели.

In [64]:
class RNNClassifier(nn.Module):
    def __init__(self, num_inputs, num_hiddens, out_dim, pool='max', num_layers=2):
        # YOUR CODE HERE
        super().__init__()
        self.rnn = nn.RNN(num_hiddens, num_hiddens, num_layers=num_layers, batch_first=True)
        self.embedding = nn.Embedding(len(vocab), embedding_dim=num_hiddens)
        self.linear = nn.Linear(num_hiddens, num_hiddens)
        self.projection = nn.Linear(num_hiddens, out_dim)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.out_dim = out_dim
        self.pool = pool

    def forward(self, inputs, state=None):
        # YOUR CODE HERE
        embedded = self.embedding(inputs)
        output, _ = self.rnn(embedded)
        if self.pool == 'max':
            output = output.max(dim=1)[0]
        elif self.pool == 'mean':
            output = output.mean(dim=1)

        output = self.dropout(self.linear(self.non_lin(output)))  # [batch_size, hidden_dim]
        prediction = self.projection(output)  # [batch_size, num_classes]

        return prediction

In [67]:
seed_torch(seed)
rnn_2 = RNNClassifier(100, 50, 1, pool='max').to(device)

In [68]:
seed_torch(seed)
opt = torch.optim.Adam(rnn_2.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [69]:
%%time
for epoch in range(1, epochs + 1):
    # YOUR CODE HERE
    running_loss = 0.
    rnn_2.train()
    for label, text in tqdm(train_dataloader):
        opt.zero_grad()
        preds = rnn_2(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        loss.backward()
        opt.step()
        running_loss += loss.item()

    epoch_loss = running_loss / train_size

    val_loss = 0.
    rnn_2.eval()

    correct = 0
    total = 0
    for label, text in tqdm(valid_dataloader):
        preds = rnn_2(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        val_loss += loss.item()

    valid_loss = val_loss / valid_size

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, valid_loss))

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6282232020582471, Validation Loss: 0.5143649944892297


  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.4401131067083869, Validation Loss: 0.42461070698550624
CPU times: user 24.5 s, sys: 332 ms, total: 24.8 s
Wall time: 26.1 s


In [71]:
predicted = []
trues = []
for label, text in tqdm(valid_dataloader):
    with torch.inference_mode():
        preds = rnn_2(text)
        preds = torch.round(torch.sigmoid(preds.squeeze(1)))
    trues += list(label.cpu().detach().numpy())
    predicted += list(preds.cpu().detach().numpy())

  0%|          | 0/118 [00:00<?, ?it/s]

In [72]:
predicted = np.array(predicted)
trues = np.array(trues)

val_accuracy = accuracy_score(trues, predicted)
val_precision = precision_score(trues, predicted)
val_recall = recall_score(trues, predicted)
val_f1 = f1_score(trues, predicted)

print('Accuracy:', val_accuracy, '\nPrecision:', val_precision, '\nRecall:', val_recall, '\nF1:', val_f1)

Accuracy: 0.8057333333333333 
Precision: 0.7635043123014071 
Recall: 0.8901825879862397 
F1: 0.8219914477703115


#Mean pooling

Оговорка, это условно пулинг по осреднённым значениям

In [73]:
seed_torch(seed)
rnn_2 = RNNClassifier(100, 50, 1, pool='mean').to(device)

In [74]:
seed_torch(seed)
opt = torch.optim.Adam(rnn_2.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [75]:
%%time
for epoch in range(1, epochs + 1):
    # YOUR CODE HERE
    running_loss = 0.
    rnn_2.train()
    for label, text in tqdm(train_dataloader):
        opt.zero_grad()
        preds = rnn_2(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        loss.backward()
        opt.step()
        running_loss += loss.item()

    epoch_loss = running_loss / train_size

    val_loss = 0.
    rnn_2.eval()

    correct = 0
    total = 0
    for label, text in tqdm(valid_dataloader):
        preds = rnn_2(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        val_loss += loss.item()

    valid_loss = val_loss / valid_size

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, valid_loss))

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6963682034950116, Validation Loss: 0.6991270322066087


  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.6770550575885144, Validation Loss: 0.6331408976489662
CPU times: user 25.2 s, sys: 221 ms, total: 25.4 s
Wall time: 30.5 s


In [76]:
predicted = []
trues = []
for label, text in tqdm(valid_dataloader):
    with torch.inference_mode():
        preds = rnn_2(text)
        preds = torch.round(torch.sigmoid(preds.squeeze(1)))
    trues += list(label.cpu().detach().numpy())
    predicted += list(preds.cpu().detach().numpy())

  0%|          | 0/118 [00:00<?, ?it/s]

In [77]:
predicted = np.array(predicted)
trues = np.array(trues)

val_accuracy = accuracy_score(trues, predicted)
val_precision = precision_score(trues, predicted)
val_recall = recall_score(trues, predicted)
val_f1 = f1_score(trues, predicted)

print('Accuracy:', val_accuracy, '\nPrecision:', val_precision, '\nRecall:', val_recall, '\nF1:', val_f1)

Accuracy: 0.6138666666666667 
Precision: 0.5700237906423473 
Recall: 0.951045250066155 
F1: 0.7128123760412534


Макс пулинг ("жадный") пулинг показал себя гораздо лучше, чем по средним значениям с выходов реккурентного слоя.

#GRU

In [78]:
class GRUClassifier(nn.Module):
    def __init__(self, num_inputs, num_hiddens, out_dim, pool = 'max',
                 num_layers = 2):
        # YOUR CODE HERE
        super().__init__()
        self.rnn = nn.GRU(num_hiddens, num_hiddens, num_layers=num_layers, batch_first=True)
        self.embedding = nn.Embedding(len(vocab), embedding_dim=num_hiddens)
        self.linear = nn.Linear(num_hiddens, num_hiddens)
        self.projection = nn.Linear(num_hiddens, out_dim)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.out_dim = out_dim
        self.pool = pool

    def forward(self, inputs, state=None):
        # YOUR CODE HERE
        embedded = self.embedding(inputs)
        output, _ = self.rnn(embedded)
        if self.pool == 'max':
            output = output.max(dim=1)[0]
        elif self.pool == 'mean':
            output = output.mean(dim=1)

        output = self.dropout(self.linear(self.non_lin(output)))  # [batch_size, hidden_dim]
        prediction = self.projection(output)  # [batch_size, num_classes]

        return prediction

In [79]:
seed_torch(seed)
gru = GRUClassifier(100, 50, 1).to(device)

In [80]:
seed_torch(seed)
opt = torch.optim.Adam(gru.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [81]:
%%time
for epoch in range(1, epochs + 1):
    # YOUR CODE HERE
    running_loss = 0.
    gru.train()
    for label, text in tqdm(train_dataloader):
        opt.zero_grad()
        preds = gru(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        loss.backward()
        opt.step()
        running_loss += loss.item()

    epoch_loss = running_loss / train_size

    val_loss = 0.
    gru.eval()

    correct = 0
    total = 0
    for label, text in tqdm(valid_dataloader):
        preds = gru(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        val_loss += loss.item()

    valid_loss = val_loss / valid_size

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, valid_loss))

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6309196176109734, Validation Loss: 0.5121626950736738


  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.4216734612490231, Validation Loss: 0.4142100821193467
CPU times: user 26.2 s, sys: 219 ms, total: 26.4 s
Wall time: 27.2 s


In [82]:
predicted = []
trues = []
for label, text in tqdm(valid_dataloader):
    with torch.inference_mode():
        preds = gru(text)
        preds = torch.round(torch.sigmoid(preds.squeeze(1)))
    trues += list(label.cpu().detach().numpy())
    predicted += list(preds.cpu().detach().numpy())

  0%|          | 0/118 [00:00<?, ?it/s]

In [83]:
predicted = np.array(predicted)
trues = np.array(trues)

val_accuracy = accuracy_score(trues, predicted)
val_precision = precision_score(trues, predicted)
val_recall = recall_score(trues, predicted)
val_f1 = f1_score(trues, predicted)

print('Accuracy:', val_accuracy, '\nPrecision:', val_precision, '\nRecall:', val_recall, '\nF1:', val_f1)

Accuracy: 0.8181333333333334 
Precision: 0.7722660653889515 
Recall: 0.906324424450913 
F1: 0.8339420501582664


#LSTM

In [84]:
class LSTMClassifier(nn.Module):
    def __init__(self, num_inputs, num_hiddens, out_dim, pool = 'max',
                 num_layers = 2):
        # YOUR CODE HERE
        super().__init__()
        self.rnn = nn.LSTM(num_hiddens, num_hiddens, num_layers=num_layers, batch_first=True)
        self.embedding = nn.Embedding(len(vocab), embedding_dim=num_hiddens)
        self.linear = nn.Linear(num_hiddens, num_hiddens)
        self.projection = nn.Linear(num_hiddens, out_dim)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.out_dim = out_dim
        self.pool = pool

    def forward(self, inputs, state=None):
        # YOUR CODE HERE
        embedded = self.embedding(inputs)
        output, _ = self.rnn(embedded)
        if self.pool == 'max':
            output = output.max(dim=1)[0]
        elif self.pool == 'mean':
            output = output.mean(dim=1)

        output = self.dropout(self.linear(self.non_lin(output)))  # [batch_size, hidden_dim]
        prediction = self.projection(output)  # [batch_size, num_classes]

        return prediction

In [85]:
seed_torch(seed)
lstm = LSTMClassifier(100, 50, 1).to(device)

In [86]:
seed_torch(seed)
opt = torch.optim.Adam(lstm.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [87]:
%%time
for epoch in range(1, epochs + 1):
    # YOUR CODE HERE
    running_loss = 0.
    lstm.train()
    for label, text in tqdm(train_dataloader):
        opt.zero_grad()
        preds = lstm(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        loss.backward()
        opt.step()
        running_loss += loss.item()

    epoch_loss = running_loss / train_size

    val_loss = 0.
    lstm.eval()

    correct = 0
    total = 0
    for label, text in tqdm(valid_dataloader):
        preds = lstm(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        val_loss += loss.item()

    valid_loss = val_loss / valid_size

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, valid_loss))

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6163826939605531, Validation Loss: 0.510685950008213


  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.4188801827994022, Validation Loss: 0.40659958442561644
CPU times: user 25.4 s, sys: 232 ms, total: 25.6 s
Wall time: 26.2 s


In [88]:
predicted = []
trues = []
for label, text in tqdm(valid_dataloader):
    with torch.inference_mode():
        preds = lstm(text)
        preds = torch.round(torch.sigmoid(preds.squeeze(1)))
    trues += list(label.cpu().detach().numpy())
    predicted += list(preds.cpu().detach().numpy())

  0%|          | 0/118 [00:00<?, ?it/s]

In [89]:
predicted = np.array(predicted)
trues = np.array(trues)

val_accuracy = accuracy_score(trues, predicted)
val_precision = precision_score(trues, predicted)
val_recall = recall_score(trues, predicted)
val_f1 = f1_score(trues, predicted)

print('Accuracy:', val_accuracy, '\nPrecision:', val_precision, '\nRecall:', val_recall, '\nF1:', val_f1)

Accuracy: 0.816 
Precision: 0.7898043005556897 
Recall: 0.8650436623445356 
F1: 0.8257135640313209


По метрикам LSTM чуть хуже, поэтому далее будем использовать GRU.

#GPT2 Tokenizer

In [90]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Insta

In [8]:
from transformers import GPT2Tokenizer

tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")

def collate_batch_gpt2(batch):
    tokenized_texts = []
    processed_labels = []
    for element in batch:
      tokenized_texts.append(tokenizer_gpt2(element["text"])["input_ids"])
      processed_labels.append(label_pipeline(element["label"]))

    processed_texts = pad_to_max_len(tokenized_texts)
    label_tensor = torch.tensor(processed_labels, device=device, dtype=torch.int64)
    text_tensor = torch.tensor(processed_texts, device=device, dtype=torch.int64)
    return label_tensor, text_tensor

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [26]:
seed_torch(seed)
train_dataloader_gpt2 = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch_gpt2)
valid_dataloader_gpt2 = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch_gpt2)
test_dataloader_gpt2 = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch_gpt2)

In [27]:
class GRUClassifier(nn.Module):
    def __init__(self, num_inputs, num_hiddens, out_dim, vocab_size, pool = 'max',
                 num_layers = 2):
        # YOUR CODE HERE
        super().__init__()
        self.vocab_size = vocab_size
        self.rnn = nn.GRU(num_hiddens, num_hiddens, num_layers=num_layers, batch_first=True)
        self.embedding = nn.Embedding(self.vocab_size, embedding_dim=num_hiddens)
        self.linear = nn.Linear(num_hiddens, num_hiddens)
        self.projection = nn.Linear(num_hiddens, out_dim)

        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.out_dim = out_dim
        self.pool = pool


    def forward(self, inputs, state=None):
        # YOUR CODE HERE
        embedded = self.embedding(inputs)
        output, _ = self.rnn(embedded)
        if self.pool == 'max':
            output = output.max(dim=1)[0]
        elif self.pool == 'mean':
            output = output.mean(dim=1)

        output = self.dropout(self.linear(self.non_lin(output)))  # [batch_size, hidden_dim]
        prediction = self.projection(output)  # [batch_size, num_classes]

        return prediction

In [28]:
seed_torch(seed)
gru_gpt2 = GRUClassifier(100, 50, 1, vocab_size=tokenizer_gpt2.vocab_size)
gru_gpt2.to(device)

GRUClassifier(
  (rnn): GRU(50, 50, num_layers=2, batch_first=True)
  (embedding): Embedding(50257, 50)
  (linear): Linear(in_features=50, out_features=50, bias=True)
  (projection): Linear(in_features=50, out_features=1, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [29]:
seed_torch(seed)
opt = torch.optim.Adam(gru_gpt2.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [30]:
%%time
for epoch in range(1, epochs + 1):
    # YOUR CODE HERE
    running_loss = 0.
    gru_gpt2.train()
    for label, text in tqdm(train_dataloader_gpt2):
        opt.zero_grad()
        preds = gru_gpt2(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        loss.backward()
        opt.step()
        running_loss += loss.item()

    epoch_loss = running_loss / train_size

    val_loss = 0.
    gru_gpt2.eval()

    correct = 0
    total = 0
    for label, text in tqdm(valid_dataloader_gpt2):
        preds = gru_gpt2(text)
        loss = loss_func(preds.to(device), label.float().to(device))

        val_loss += loss.item()

    valid_loss = val_loss / valid_size

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, valid_loss))

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6394343853215159, Validation Loss: 0.5217435489862393


  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.446251032771645, Validation Loss: 0.41757418864812607
CPU times: user 1min 57s, sys: 807 ms, total: 1min 57s
Wall time: 2min 1s


In [31]:
predicted = []
trues = []
for label, text in tqdm(valid_dataloader_gpt2):
    with torch.inference_mode():
        preds = gru_gpt2(text)
        preds = torch.round(torch.sigmoid(preds.squeeze(1)))
    trues += list(label.cpu().detach().numpy())
    predicted += list(preds.cpu().detach().numpy())

  0%|          | 0/118 [00:00<?, ?it/s]

In [32]:
predicted = np.array(predicted)
trues = np.array(trues)

val_accuracy = accuracy_score(trues, predicted)
val_precision = precision_score(trues, predicted)
val_recall = recall_score(trues, predicted)
val_f1 = f1_score(trues, predicted)

print('Accuracy:', val_accuracy, '\nPrecision:', val_precision, '\nRecall:', val_recall, '\nF1:', val_f1)

Accuracy: 0.8101333333333334 
Precision: 0.7726527745258721 
Recall: 0.8792965627498002 
F1: 0.8225324027916251


С GPT2 токенизатором результаты немного хуже, чем у LSTM, но лучше чем у RNN.

#Выводы

1. В результате экспериментов были получены относительно неплохие результаты и положительное приращение относительно стандартной RNN. Максимальное значение F1-меры составляет 0.8339420501582664.

2. Улучшив предобработку данных с использованием предобученного токенизатора из GPT2, получили результаты ненамного превышающие максимальные значения F1-меры для RNN.

3. (3-й и 4-й пункты) В результате анализа предсказаний модели было выявлено, что в подавляющем большинстве случаев модель как-будто исходит из средних значений скрытых состояний, однако, как можно убедиться истинные значения (наши таргеты) не совсем точно описываются данной функцией от скрытых состояний (медиана тоже не подойдёт).

5. Действительно, использвание предсказания с последнего временного шага не совсем оптимально. Более оптимально использовать аналог макс пулинга, взяв такую последовательность, которая будет характеризоваться максимальными значениями. С использованием данной функции удалось добиться положительного приращения по F1-мере более 0.1.