# Задание 3

## Классификация текстов

В этом задании вам предстоит попробовать несколько методов, используемых в задаче классификации, а также понять насколько хорошо модель понимает смысл слов и какие слова в примере влияют на результат.

In [1]:
import pandas as pd
import numpy as np
import torch

from torchtext import datasets

from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator

from torchtext.vocab import Vectors, GloVe

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.autonotebook import tqdm
from sklearn.metrics import f1_score
from statistics import mean 

В этом задании мы будем использовать библиотеку torchtext. Она довольна проста в использовании и поможет нам сконцентрироваться на задаче, а не на написании Dataloader-а.

In [24]:
TEXT = Field(sequential=True, lower=True, include_lengths=True)  # Поле текста
LABEL = LabelField(dtype=torch.float)  # Поле метки

In [25]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Датасет на котором мы будем проводить эксперементы это комментарии к фильмам из сайта IMDB.

In [26]:
train, test = datasets.IMDB.splits(TEXT, LABEL)  # загрузим датасет
train, valid = train.split(random_state=random.seed(SEED))  # разобьем на части

In [27]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test), 
    batch_size = 64,
    sort_within_batch = True,
    device = device)

## RNN

Для начала попробуем использовать рекурентные нейронные сети. На семинаре вы познакомились с GRU, вы можете также попробовать LSTM. Можно использовать для классификации как hidden_state, так и output последнего токена.

In [29]:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn =  nn.LSTM(embedding_dim,hidden_dim,n_layers)  # YOUR CODE GOES HERE
        
        self.fc = nn.Linear(hidden_dim,output_dim)  # YOUR CODE GOES HERE

        self.dropout = nn.Dropout(p=dropout)
        
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        #torch.Size([233, 64, 100]) embeded shape
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        # cell arg for LSTM, remove for GRU
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)  
        #torch.Size([64, 64, 256]) output shape #output.shape
        #torch.Size([64])                       #length

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #torch.Size([2, 64, 256]) hidden_dim

        #cell = [num layers * num directions, batch size, hid dim]
        #torch.Size([2, 64, 256]) cell shape
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        #back_hidden = torch.squeeze(hidden,0)
        
        #hidden = torch.cat((hidden[-2,:,:],hidden[-1,:,:]),0)  # YOUR CODE GOES HERE
        hidden = hidden[-2,:,:] + hidden[-1,:,:]
        hidden = self.dropout(hidden)


        #hidden = [batch size, hid dim * num directions] or [batch_size, hid dim * num directions]
            
        return self.fc(hidden)

Поиграйтесь с гиперпараметрами

In [30]:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3

In [31]:
model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=PAD_IDX
)

In [32]:
model

RNNBaseline(
  (embedding): Embedding(201944, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [33]:
model = model.to(device)

In [34]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

max_epochs = 10

Обучите сетку! Используйте любые вам удобные инструменты, Catalyst, PyTorch Lightning или свои велосипеды.

In [36]:
import numpy as np

min_loss = np.inf

cur_patience = 0
fscore_list = []

for epoch in range(1, max_epochs + 1):
    f1_score_num = 0
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
      input_embeds = batch.text[0]    #YOUR CODE GOES HERE 64 - text length.
      labels = torch.unsqueeze(batch.label,1).to(device)
      opt.zero_grad()
      preds = model(input_embeds,batch.text[1])
      loss = loss_func(preds,labels)
      train_loss += loss_func(preds, labels)
      loss.backward()
      pbar.update(labels.size(0))

      opt.step()

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
      #print(batch.text)
      input_embeds = batch.text[0].to(device)
      labels = torch.unsqueeze(batch.label,1)
      prediction = model(input_embeds,batch.text[1])
      val_loss += loss_func(prediction, labels)

        # YOUR CODE GOES HERE

    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 1, Training Loss: 0.6900084614753723, Validation Loss: 0.6847250461578369


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 2, Training Loss: 0.6442990899085999, Validation Loss: 0.6690751314163208


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 3, Training Loss: 0.520786464214325, Validation Loss: 0.47417518496513367


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 4, Training Loss: 0.31816908717155457, Validation Loss: 0.4298434257507324


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 5, Training Loss: 0.1735929399728775, Validation Loss: 0.41004735231399536


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 6, Training Loss: 0.08446842432022095, Validation Loss: 0.49921125173568726


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 7, Training Loss: 0.03633188083767891, Validation Loss: 0.6067810654640198


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

<All keys matched successfully>

Посчитайте f1-score вашего классификатора на тестовом датасете.

**Ответ**: 0.78

In [38]:
#F1 score for test dataset.
min_loss = np.inf

cur_patience = 0
f1_rnn_test = []
max_epochs = 1

for epoch in range(1, max_epochs + 1):
    f1_score_num = 0
    train_loss = 0.0
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(test_iter), total=len(test_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
      #print(batch.text)
      opt.zero_grad()
      input_embeds = batch.text[0]
      labels = torch.unsqueeze(batch.label,1)
      prediction = model(input_embeds,batch.text[1])

      pred_cpu = prediction.cpu().detach().numpy()
      labels_cpu = labels.cpu().detach().numpy()

      gmoid = nn.Sigmoid()
      preds_gmoid = gmoid(prediction)
      preds_tensor = torch.ones_like(preds_gmoid)
      preds_tensor = preds_tensor.cpu().detach().numpy()
      for i,elem in enumerate(preds_gmoid):
        if elem.item() <= 0.5:
          preds_tensor[i] = 0.0

      #print(preds_tensor)
      f1_rnn_test.append(f1_score(preds_tensor, labels_cpu))

    
    print('Epoch: {}'.format(epoch))


HBox(children=(FloatProgress(value=0.0, max=391.0), HTML(value='')))

Epoch: 1


In [39]:
print(mean(f1_rnn_test ))

0.7787394574972962


## CNN

![](https://www.researchgate.net/publication/333752473/figure/fig1/AS:769346934673412@1560438011375/Standard-CNN-on-text-classification.png)

Для классификации текстов также часто используют сверточные нейронные сети. Идея в том, что как правило сентимент содержат словосочетания из двух-трех слов, например "очент хороший фильм" или "невероятноя скука". Проходясь сверткой по этим словам мы получим какой-то большой скор и выхватим его с помощью MaxPool. Далее идет обычная полносвязная сетка. Важный момент: свертки применяются не последовательно, а параллельно. Давайте попробуем!

In [44]:
TEXT = Field(sequential=True, lower=True, batch_first=True)  # batch_first тк мы используем conv  
LABEL = LabelField(batch_first=True, dtype=torch.float)
SEED = 1234
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

TEXT.build_vocab(trn)
LABEL.build_vocab(trn)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [45]:
train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

Вы можете использовать Conv2d с `in_channels=1, kernel_size=(kernel_sizes[0], emb_dim))` или Conv1d c `in_channels=emb_dim, kernel_size=kernel_size[0]`. Но хорошенько подумайте над shape в обоих случаях.

In [46]:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout=0.5,
    ):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)

        self.conv_0 = nn.Conv2d(1,out_channels,kernel_size=(kernel_sizes[0],emb_dim))  # YOUR CODE GOES HERE
        self.conv_1 = nn.Conv2d(1,out_channels,kernel_size=(kernel_sizes[1],emb_dim))  # YOUR CODE GOES HERE
        self.conv_2 = nn.Conv2d(1,out_channels,kernel_size=(kernel_sizes[2],emb_dim))  # YOUR CODE GOES HERE

        self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.embedding(text)
        
        embedded = embedded.unsqueeze(0)
        embedded = embedded.permute(1,0,2,3)  # may be reshape here
        
        conved_0 = F.relu(self.conv_0(embedded)).squeeze(-1)  # may be reshape here
        conved_1 = F.relu(self.conv_1(embedded)).squeeze(-1)  # may be reshape here
        conved_2 = F.relu(self.conv_2(embedded)).squeeze(-1)  # may be reshape here
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
        #cat2 = torch.reshape(cat,(-1,len(kernel_sizes) * out_channels))


            
        return self.fc(cat)

In [47]:
kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
out_channels=64
dropout = 0.5
dim = 300
patience=3

model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
            kernel_sizes=kernel_sizes, dropout=dropout)

In [48]:
#torch.cuda.memory_allocated()

In [49]:
!nvidia-smi

Thu Oct 22 13:38:40 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    28W /  70W |  10531MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [50]:
torch.cuda.empty_cache()
model.to(device)

CNN(
  (embedding): Embedding(201944, 300)
  (conv_0): Conv2d(1, 64, kernel_size=(3, 300), stride=(1, 1))
  (conv_1): Conv2d(1, 64, kernel_size=(4, 300), stride=(1, 1))
  (conv_2): Conv2d(1, 64, kernel_size=(5, 300), stride=(1, 1))
  (fc): Linear(in_features=192, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [51]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [52]:
max_epochs = 10
torch.cuda.empty_cache()

Обучите!

In [53]:
import numpy as np

min_loss = np.inf

cur_patience = 0
fscore_cnn_list = []

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
      opt.zero_grad()
      labels = torch.unsqueeze(batch.label,1)
      inputs = batch.text

      preds = model(inputs)
      train_loss += loss_func(preds, labels)
      loss = loss_func(preds,labels)
      loss.backward()
      pbar.update(labels.size(0))
      opt.step()

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(val_iter), total=len(val_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    #!nvidia-smi
    
    for it, batch in pbar:
      with torch.no_grad():
        input_embeds2 = batch.text
        labels = torch.unsqueeze(batch.label,1)
        prediction = model(input_embeds2)
        val_loss += loss_func(prediction, labels)

        # YOUR CODE GOES HERE
    val_loss /= len(val_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
      
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 1, Training Loss: 0.6576471328735352, Validation Loss: 0.4961569607257843


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 2, Training Loss: 0.5075519680976868, Validation Loss: 0.4373265504837036


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 3, Training Loss: 0.43834590911865234, Validation Loss: 0.39829501509666443


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 4, Training Loss: 0.3826713562011719, Validation Loss: 0.37273311614990234


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 5, Training Loss: 0.316584050655365, Validation Loss: 0.344610333442688


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 6, Training Loss: 0.24532747268676758, Validation Loss: 0.3411558270454407


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 7, Training Loss: 0.19341625273227692, Validation Loss: 0.3387373685836792


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 8, Training Loss: 0.13649533689022064, Validation Loss: 0.3512106239795685


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 9, Training Loss: 0.09257607161998749, Validation Loss: 0.3723130226135254


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

<All keys matched successfully>

Посчитайте f1-score вашего классификатора.

**Ответ**: 0.95 & 0.47




In [20]:
#F1 score for test dataset.
min_loss = np.inf
f1_cnn_test = []
cur_patience = 0
max_epochs = 1

for epoch in range(1, max_epochs + 1):
    f1_score_num = 0
    train_loss = 0.0
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(test_iter), total=len(test_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    #!nvidia-smi
    print(f1_cnn_test)
    for it, batch in pbar:
      with torch.no_grad():
        #print(batch.label)
        input_temp = batch.text
        labels = torch.unsqueeze(batch.label,1)
        prediction = model(input_temp)
        pred_cpu = prediction.cpu().detach().numpy()
        labels_cpu = labels.cpu().detach().numpy()
        preds_tensor = torch.ones_like(labels)
        preds_tensor = preds_tensor.cpu().detach().numpy()
        for i,elem in enumerate(pred_cpu):
          if elem.item() < 0.0:
            preds_tensor[i] = 0.0
        score = f1_score(preds_tensor, labels_cpu)
        # if score == 0.0:
        #   print(preds_tensor,'preds_tensor')
        #   print(labels_cpu,'label_cpu')
        #   print(batch.label,'batch')
        #   print(batch.text,'text')
        f1_cnn_test.append(score)
    print('Epoch: {}'.format(epoch))


HBox(children=(FloatProgress(value=0.0, max=98.0), HTML(value='')))

[]
Epoch: 1


In [23]:
print(mean(f1_cnn_test))

0.4752971545955045


Такое низкое значение получается из за не очень хорошей разбивки test_iter. В этот датасет попали батчи у которых все лейблы равны 0 и поэтому F1 получается таким низким, все портят 0.0 в графе F1. Удалим эти нули и посчитаем что у нас получилось без них.

In [22]:
correct_F1_list = [i for i in f1_cnn_test if i != 0.0]
print(mean(correct_F1_list))

0.950594309191009


## Интерпретируемость

Посмотрим, куда смотрит наша модель. Достаточно запустить код ниже.

In [54]:
!pip install -q captum

In [57]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)

In [58]:
def forward_with_softmax(inp):
    logits = model(inp)
    return torch.softmax(logits, 0)[0][1]

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))


# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 7, label = 0):
    model.eval()
    text = [tok for tok in TEXT.tokenize(sentence)]
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices).item()
    pred_ind = round(pred)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                          n_steps=5000, return_convergence_delta=True)

    print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))
    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [59]:
interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 1.00 ) , delta:  tensor([0.0005], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.40 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.67 ) , delta:  tensor([8.7897e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.04 ) , delta:  tensor([1.9569e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.14 ) , delta:  tensor([5.8101e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.13 ) , delta:  tensor([1.2639e-05], device='cuda:0', dtype=torch.float64)


Попробуйте добавить свои примеры!

In [60]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (1.00),pos,1.2,It was a fantastic performance ! pad
,,,,
pos,neg (0.40),pos,0.54,Best film ever pad pad pad pad
,,,,
pos,pos (0.67),pos,0.43,Such a great show! pad pad pad
,,,,
neg,neg (0.04),pos,-0.7,It was a horrible movie pad pad
,,,,
neg,neg (0.14),pos,-0.5,I've never watched something as bad pad
,,,,


## Эмбэдинги слов

Вы ведь не забыли, как мы можем применить знания о word2vec и GloVe. Давайте попробуем!

In [61]:
#from torchtext.vocab import Vectors, GloVe

TEXT.build_vocab(trn, vectors='glove.6B.200d')# YOUR CODE GOES HERE
# подсказка: один из импортов пока не использовался, быть может он нужен в строке выше :)
LABEL.build_vocab(trn)

word_embeddings = TEXT.vocab.vectors

kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                           
100%|█████████▉| 399837/400000 [00:33<00:00, 12892.94it/s]

In [65]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

In [63]:
model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=64,
            kernel_sizes=kernel_sizes, dropout=dropout)

word_embeddings = TEXT.vocab.vectors

prev_shape = model.embedding.weight.shape

model.embedding.weight = nn.init.uniform_(model.embedding.weight,-1.0,1.0) # инициализируйте эмбэдинги

assert prev_shape == model.embedding.weight.shape
model.to(device)

opt = torch.optim.Adam(model.parameters())

Вы знаете, что делать.

In [64]:
import numpy as np

min_loss = np.inf

cur_patience = 0
fscore_cnn_emb_list = []

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 

        #YOUR CODE GOES HERE
      input_embeds2 = batch.text.to(device)
      labels = torch.unsqueeze(batch.label,1).to(device)
      opt.zero_grad()
      preds = model(input_embeds2)
      train_loss += loss_func(preds, labels)
      loss = loss_func(preds,labels)
      loss.backward()
      pbar.update(labels.size(0))
      opt.step()


    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:

        # YOUR CODE GOES HERE
      input_temp = batch.text[0]
      input_embeds = input_temp.permute(1,0)
      input_embeds2 = input_embeds.to(device)

      labels = torch.unsqueeze(batch.label,1).to(device)
      prediction = model(input_embeds2)
      val_loss += loss_func(prediction, labels)
      # pred_cpu = prediction.cpu().detach().numpy()
      # labels_cpu = labels.cpu().detach().numpy()

      # gmoid = nn.Sigmoid()
      # preds_gmoid = gmoid(prediction)
      # preds_tensor = torch.ones_like(preds_gmoid)
      # preds_tensor = preds_tensor.cpu().detach().numpy()
      # for i,elem in enumerate(preds_gmoid):
      #   if elem.item() <= 0.5:
      #     preds_tensor[i] = 0.0
      # fscore_cnn_emb_list.append(f1_score(preds_tensor, labels_cpu))


    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

100%|█████████▉| 399837/400000 [00:50<00:00, 12892.94it/s]

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 1, Training Loss: 0.6170217394828796, Validation Loss: 0.9777328968048096


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 2, Training Loss: 0.4822031855583191, Validation Loss: 0.8403798341751099


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 3, Training Loss: 0.4005446434020996, Validation Loss: 0.7413960099220276


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 4, Training Loss: 0.3227633535861969, Validation Loss: 0.6850684881210327


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 5, Training Loss: 0.23553982377052307, Validation Loss: 0.6545608639717102


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 6, Training Loss: 0.15008018910884857, Validation Loss: 0.6818179488182068


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 7, Training Loss: 0.08747316896915436, Validation Loss: 0.7957537770271301


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

<All keys matched successfully>

In [67]:
#F1 score for test dataset.
min_loss = np.inf
max_epochs = 1

cur_patience = 0
f1_cnn_emb_list = []

for epoch in range(1, max_epochs + 1):
    model.eval()
    pbar = tqdm(enumerate(test_iter), total=len(test_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:

        # YOUR CODE GOES HERE
      with torch.no_grad():
        input_temp = batch.text
        input_embeds = input_temp
        input_embeds2 = input_embeds.to(device)

        labels = torch.unsqueeze(batch.label,1).to(device)
        prediction = model(input_embeds2)


        val_loss += loss_func(prediction, labels)
        pred_cpu = prediction.cpu().detach().numpy()
        labels_cpu = labels.cpu().detach().numpy()

        gmoid = nn.Sigmoid()
        preds_gmoid = gmoid(prediction)
        preds_tensor = torch.ones_like(preds_gmoid)
        preds_tensor = preds_tensor.cpu().detach().numpy()
        for i,elem in enumerate(preds_gmoid):
          if elem.item() <= 0.5:
            preds_tensor[i] = 0.0
        f1_cnn_emb_list.append(f1_score(preds_tensor, labels_cpu))


    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {},Validation Loss: {}'.format(epoch, val_loss))


HBox(children=(FloatProgress(value=0.0, max=98.0), HTML(value='')))

Epoch: 1,Validation Loss: 0.37838006019592285


Посчитайте f1-score вашего классификатора.

**Ответ**:0.48 & 0.96

In [68]:
print(mean(f1_cnn_emb_list))

0.4820215216573279


In [69]:
correct_F1_embed_list = [i for i in f1_cnn_emb_list if i != 0.0]
print(mean(correct_F1_embed_list ))

0.9640430433146558


Проверим насколько все хорошо!

In [70]:
PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)
vis_data_records_ig = []

interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 0.99 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.62 ) , delta:  tensor([7.8851e-05], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.88 ) , delta:  tensor([6.6449e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.11 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.38 ) , delta:  tensor([5.1318e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.18 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)


In [71]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.99),pos,1.58,It was a fantastic performance ! pad
,,,,
pos,pos (0.62),pos,1.51,Best film ever pad pad pad pad
,,,,
pos,pos (0.88),pos,1.35,Such a great show! pad pad pad
,,,,
neg,neg (0.11),pos,-1.05,It was a horrible movie pad pad
,,,,
neg,neg (0.38),pos,-0.39,I've never watched something as bad pad
,,,,
