## Gradient Clipping

In [None]:
optimizer.zero_grad()        
loss, hidden = model(data, hidden, targets)
loss.backward()

clipping_value = 1 
torch.nn.utils.clip_grad_norm_(model.parameters(), clipping_value)
optimizer.step()

## Классификация текстов

В этом задании вам предстоит попробовать несколько методов, используемых в задаче классификации, а также понять насколько хорошо модель понимает смысл слов и какие слова в примере влияют на результат.

In [None]:
import pandas as pd
import numpy as np
import torch

from torchtext.legacy import datasets
from torchtext.legacy.data import Field, LabelField
from torchtext.legacy.data import BucketIterator

from torchtext.vocab import Vectors, GloVe

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.autonotebook import tqdm

In [None]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
TEXT = Field(sequential=True, lower=True, include_lengths=True)
LABEL = LabelField(dtype=torch.float)

train, test = datasets.IMDB.splits(TEXT, LABEL)
train, valid = train.split(random_state=random.seed(SEED))

TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train, valid, test),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

## RNN

Попробуем применить рекурентные нейронные сети (GRU или LSTM). 

Для задач классификации используют как hidden_state, так и output последнего токена.

In [None]:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        # text = [sent_len, batch_size]
        embedded = self.embedding(text)
        
        # embedded = [sent_len, batch_size, emb_dim]
        # pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), enforce_sorted=False)
        
        # LSTM
        packed_output, (hidden, cell) = self.lstm(packed_embedded)

        # OUTPUT
        # output = [sent_len, batch_size, hid_dim * num_directions] 
        # unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        # get last token from output  
        # output = [batch_size, hid_dim * num_directions] 
        output = output[-1]
       
        # HIDDEN
        # hidden = [num_layers * num_directions, batch_size, hid_dim]
        # cell = [num_layers * num_directions, batch_size, hid_dim]
        # get hidden from two last layers
        # hidden = [batch_size, hid_dim * num_layers] 
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
            
        out_drop = self.dropout(hidden)
        out = self.fc(out_drop)
    
        return out

In [None]:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3
max_epochs = 20

In [None]:
model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=PAD_IDX
)

model = model.to(device)

In [None]:
import numpy as np

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_func = nn.BCEWithLogitsLoss()
min_loss = np.inf

cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
        input_embeds = batch.text[0].to(device)
        l = batch.text[1].to(device)
        labels = batch.label.to(device)
        optimizer.zero_grad()
        prediction = model(input_embeds, l)
        loss = loss_func(prediction.squeeze(), labels)
        loss.backward()
        pbar.update(labels.size(0))
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    val_loss = 0
    valid_acc = 0
    for it, batch in pbar:
        with torch.no_grad():
            input_embeds = batch.text[0].to(device)
            l = batch.text[1].to(device)
            labels = batch.label.to(device)

            prediction = model(input_embeds, l)
            val_loss += loss_func(prediction.squeeze(), labels)
            valid_acc += (labels == torch.max(prediction, axis=1)[1]).float().mean()
    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6271619996885314, Validation Loss: 0.5718786716461182


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.48079926863203953, Validation Loss: 0.4841543436050415


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.3636184473977472, Validation Loss: 0.528517484664917


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.34798430015135856, Validation Loss: 0.4930211007595062


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.19927864973127407, Validation Loss: 0.4535788595676422


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

<All keys matched successfully>

Посчитаем f1-score нашего классификатора на тестовом датасете.

In [None]:
from sklearn.metrics import f1_score

prediction=[]
label=[]
with torch.no_grad():
    for batch in test_iter:
        input_embeds = batch.text[0].to(device)
        l = batch.text[1].to(device)
        labels = batch.label.to(device)
        pr = model(input_embeds, l).cpu().detach().numpy().squeeze()
        prediction.extend(np.where(pr > 0.5, 1, 0))
           
        label.extend(labels.cpu())
        
f1_score(label, prediction)        

0.7930487091653438

## CNN

![](https://www.researchgate.net/publication/333752473/figure/fig1/AS:769346934673412@1560438011375/Standard-CNN-on-text-classification.png)

In [None]:
TEXT = Field(sequential=True, lower=True, batch_first=True) 
LABEL = LabelField(batch_first=True, dtype=torch.float)

train, test = datasets.IMDB.splits(TEXT, LABEL)
train, valid = train.split(random_state=random.seed(SEED))

TEXT.build_vocab(train)
LABEL.build_vocab(train)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train, valid, test),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

Вы можете использовать Conv2d с `in_channels=1, kernel_size=(kernel_sizes[0], emb_dim))` или Conv1d c `in_channels=emb_dim, kernel_size=kernel_size[0]`.

In [None]:
class CNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, out_channels, kernel_sizes, dropout=0.5,):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[0])
        self.conv_1 = nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[1])
        self.conv_2 = nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[2])
        self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.embedding(text)
  
        conved_0 = F.relu(self.conv_0(embedded.permute(0,2,1)))
        conved_1 = F.relu(self.conv_1(embedded.permute(0,2,1)))
        conved_2 = F.relu(self.conv_2(embedded.permute(0,2,1)))
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
       
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
       
        return self.fc(cat)

In [None]:
kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
out_channels=64
dropout = 0.2
dim = 300
max_epochs = 20

In [None]:
model = CNN(
    vocab_size=vocab_size, 
    emb_dim=dim, 
    out_channels=out_channels, 
    kernel_sizes=kernel_sizes, 
    dropout=dropout
    )

model = model.to(device)

In [None]:
import numpy as np

min_loss = np.inf
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_func = nn.BCEWithLogitsLoss()
cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
        #YOUR CODE GOES HERE
        input_embeds = batch.text.to(device)
        labels = batch.label.to(device)
        optimizer.zero_grad()
        prediction = model(input_embeds)
        loss = loss_func(prediction.squeeze(), labels) 
        loss.backward()
        pbar.update(labels.size(0))
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        with torch.no_grad():
            input_embeds = batch.text.to(device)
            l = batch.text[1].to(device)
            labels = batch.label.to(device)
            prediction = model(input_embeds)
            val_loss += loss_func(prediction.squeeze(), labels)
            
    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.5632805580640361, Validation Loss: 0.4466485381126404


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.39418128981207406, Validation Loss: 0.38123005628585815


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.29523872709187277, Validation Loss: 0.3497648239135742


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.2146153299912919, Validation Loss: 0.3522116541862488


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.14707592314612256, Validation Loss: 0.3280031383037567


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 6, Training Loss: 0.09724387899041176, Validation Loss: 0.3667355179786682


  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

<All keys matched successfully>

Посчитаем f1-score нашего классификатора на тестовом датасете.

In [None]:
prediction=[]
label=[]
with torch.no_grad():
    for batch in test_iter:
        input_embeds = batch.text.to(device)
        labels = batch.label.to(device)
        pr = model(input_embeds).cpu().detach().numpy().squeeze()
        prediction.extend(np.where(pr > 0.5, 1, 0))
           
        label.extend(labels.cpu())
        
f1_score(label, prediction) 

0.8600754594506875

## Интерпретируемость

In [None]:
!pip install -q captum

In [None]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)

In [None]:
def forward_with_softmax(inp):
    logits = model(inp)
    return torch.softmax(logits, 0)[0][1]

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))


# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 7, label = 0):
    model.eval()
    text = [tok for tok in TEXT.tokenize(sentence)]
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices).item()
    pred_ind = round(pred)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                           n_steps=5000, return_convergence_delta=True)

    print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [None]:
interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 0.97 ) , delta:  tensor([4.5022e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.09 ) , delta:  tensor([6.0123e-06], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.98 ) , delta:  tensor([4.0423e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([0.0001], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.25 ) , delta:  tensor([1.0883e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.25 ) , delta:  tensor([3.7685e-05], device='cuda:0', dtype=torch.float64)


In [None]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.97),pos,0.81,It was a fantastic performance ! pad
,,,,
pos,neg (0.09),pos,-0.95,Best film ever pad pad pad pad
,,,,
pos,pos (0.98),pos,0.94,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.86,It was a horrible movie pad pad
,,,,
neg,neg (0.25),pos,-0.16,I've never watched something as bad pad
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.97),pos,0.81,It was a fantastic performance ! pad
,,,,
pos,neg (0.09),pos,-0.95,Best film ever pad pad pad pad
,,,,
pos,pos (0.98),pos,0.94,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.86,It was a horrible movie pad pad
,,,,
neg,neg (0.25),pos,-0.16,I've never watched something as bad pad
,,,,
