# RNN

Привет! Это семинарский ноутбук для курса DL Basic для Тинькофф. В этом ноутбуке мы рассмотрим рекуррентные нейронные сети (RNN) и их разновидности. Мы будем использовать библиотеку PyTorch.

Для начала установим все нужные библиотеки. Если вы используете Google Colab, то просто запустите следующую ячейку. Если вы используете свой компьютер, то установите все библиотеки, перечисленные в следующей ячейке.

In [1]:
import subprocess
import sys


IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    subprocess.run("pip install datasets nltk gensim", shell=True)
    subprocess.run("python -m nltk.downloader punkt", shell=True)

In [2]:
import gensim
import nltk
import torch

from datasets import load_dataset, load_metric
import gensim.downloader as api

## `datasets` lib

Познакомимся с библиотекой datasets. Эта библиотека содержит наборы данных, которые можно использовать для обучения моделей. В этом ноутбуке мы будем использовать набор данных [SST-2](https://nlp.stanford.edu/sentiment/index.html). Это набор данных, который содержит отзывы на фильмы и их оценки (положительные или отрицательные). Давайте загрузим этот набор данных и посмотрим на него.

In [3]:
sst2_dataset = load_dataset("sst2")
sst2_dataset

Downloading builder script:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.06k [00:00<?, ?B/s]

Downloading and preparing dataset sst2/default to /root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset sst2 downloaded and prepared to /root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [None]:
sst2_dataset["train"]

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 67349
})

In [None]:
sst2_dataset["train"]['sentence'][0]

## Эмбеддинги и токенизация

Для токенизации мы будем использовать библиотеку [NLTK](https://www.nltk.org/). Для работы с эмбеддингами мы будем использовать библиотеку [Gensim](https://radimrehurek.com/gensim/).

In [4]:
tokenizer = nltk.tokenize.WordPunctTokenizer()

In [5]:
lemmatizer = nltk.stem.WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
wv = api.load('word2vec-google-news-300')
wv_vec = torch.FloatTensor(wv.vectors)
wv_map = {w.lower():i for i,w in enumerate(list(wv.vocab.keys()))}



In [7]:
def map_idx(input_ids):
    ii_map =[]
    for id in input_ids:
        if id in wv_map:
            ii_map.append(wv_map[id])
    return torch.LongTensor(ii_map)

In [8]:
HIDDEN_SIZE = 300
MAX_TEXT_LENGTH = 32

In [9]:
class SST2Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, dataset):
        self.tokenizer = tokenizer
        
        def tokenizer_sentece(example):
            return {"tokens": self.tokenizer(example["sentence"])}

        self.dataset = dataset.map(tokenizer_sentece)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        example = self.dataset[index]
        return example["tokens"], example["label"]

In [10]:
def tokenize_pipeline(sentence):
    tokens = tokenizer.tokenize(sentence)
    return [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]

In [11]:
train_dataset = SST2Dataset(tokenize_pipeline, sst2_dataset['train'])
valid_dataset = SST2Dataset(tokenize_pipeline, sst2_dataset['validation'])
test_dataset = SST2Dataset(tokenize_pipeline, sst2_dataset['test'])

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

## BoW

In [18]:
class BoW(torch.nn.Module):
    def __init__(self, wv, embedding_dim=HIDDEN_SIZE):
        super().__init__()

        self.wv = torch.nn.Embedding.from_pretrained(wv)
        self.wv.weight.requires_grad = False


        self.embedding_dim = embedding_dim
        self.linear_cls = torch.nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        embs = self.wv(input_ids)
        
        if len(embs) > 0:
            return torch.sigmoid(self.linear_cls(torch.tensor(sum(embs),)))
        else:
            return torch.sigmoid(self.linear_cls(torch.zeros((self.embedding_dim,))))

In [38]:
model = BoW(wv_vec)
model = model.to('cuda')
optimizer = torch.optim.SGD(model.parameters(),lr=1e-3, momentum=0.9)
criterion = torch.nn.BCELoss()


In [45]:
for epoch in range(20):
    losses = []
    for example_idx in range(len(train_dataset)):
        
        optimizer.zero_grad()
        example, y_target = train_dataset[example_idx]
        example, y_target = map_idx(example).cuda(), torch.tensor(y_target).to(torch.float32).cuda()

        if len(example)==0: continue
        y_pred = model(example)

        loss = criterion(y_pred[0], y_target)
        
        if example_idx % 1000 == 0: 
            losses.append(loss.item())

        loss.backward()
        optimizer.step()
    print(f'loss:{sum(losses)/len(losses)}')


  return torch.sigmoid(self.linear_cls(torch.tensor(sum(embs),)))


loss:0.597006291818208


KeyboardInterrupt: ignored

In [46]:
with torch.no_grad():
  valid_y_pred = [1 if model(map_idx(example).to('cuda'))[0] > 0.5 else 0 for example,_ in valid_dataset]
  y = [valid_dataset[i][1] for i in range(len(valid_dataset))]
  ans = [valid_y_pred[i]==y[i] for i in range(len(y))]

  print(sum(ans)/len(ans))

  return torch.sigmoid(self.linear_cls(torch.tensor(sum(embs),)))


0.7270642201834863


In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

## Simple RNN

$$
h_t = \tanh(W_{hh}h_{t-1} + W_{xh}x_t + b_h)
$$

In [None]:
class RNNCell(torch.nn.Module):
    def __init__(self, hidden_dim = HIDDEN_SIZE):
        super().__init__()

        
        self.input_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.hidden_linear = torch.nn.Linear(hidden_dim, hidden_dim)

        self.hidden_dim = hidden_dim
    def forward(self, input_vectors, hidden):
        if hidden == None:
            hidden = torch.zeros((self.hidden_dim,)).to('cuda')


        for input in input_vectors:
            hidden = torch.tanh(
                self.hidden_linear(hidden) + self.input_linear(input)
            )
            return hidden

In [None]:
class RNN(torch.nn.Module):
    def __init__(self, wv, hidden_dim=HIDDEN_SIZE, output_dim=1, n_layers=1, dropout=0.0):
        super().__init__()

        self.wv = torch.nn.Embedding.from_pretrained(wv)
        self.wv.weight.requires_grad = False

        self.embedding_dim = hidden_dim
        self.rnn_cell = RNNCell(hidden_dim)
        self.linear_cls = torch.nn.Linear(hidden_dim, 1)

    def forward(self, input_ids):
        embs = self.wv(input_ids)
        if len(embs) == 0:
            embs = torch.zeros((self.embedding_dim,)).unsqueeze(0).to('cuda')
            

        output_state = self.rnn_cell(embs, None)
        return torch.sigmoid(self.linear_cls(output_state))

In [None]:
model = RNN(wv_vec)
model.cuda()
optimizer = torch.optim.SGD(model.parameters(),lr=1e-3, momentum=0.9)
criterion = torch.nn.BCELoss()

In [None]:
for example_idx in range(len(train_dataset)):
    optimizer.zero_grad()
    example, y_target = train_dataset[example_idx]
    example, y_target = map_idx(example).cuda(), torch.tensor(y_target).to(torch.float32).cuda()

    y_pred = model(example)
    loss = criterion(y_pred[0], y_target)
    
    if example_idx % 20000 == 0: 
        print(f'{epoch=}, {loss.item()=}')

    loss.backward()
    optimizer.step()

In [None]:
with torch.no_grad():
  valid_y_pred = [1 if model(map_idx(example).to('cuda'))[0] > 0.5 else 0 for example,_ in valid_dataset]
  y = [valid_dataset[i][1] for i in range(len(valid_dataset))]
  ans = [valid_y_pred[i]==y[i] for i in range(len(y))]

  print(sum(ans)/len(ans))

0.5561926605504587


LSTM



In [None]:
class BidirectionalLSTM(torch.nn.Module):
    def __init__(self, wv, hidden_dim=HIDDEN_SIZE, output_dim=1, n_layers=1, dropout=0.0):
        super().__init__()

        self.wv = torch.nn.Embedding.from_pretrained(wv)
        self.wv.weight.requires_grad = False

        self.forward_cell = torch.nn.LSTMCell(hidden_dim, hidden_dim)
        self.backward_cell = torch.nn.LSTMCell(hidden_dim, hidden_dim)
        
        self.embedding_dim = hidden_dim
        self.linear_cls = torch.nn.Linear(hidden_dim, 1)

    def forward(self, input_ids):
        forward_hidden = torch.zeros((self.embedding_dim,)).to('cuda')
        backward_hidden = torch.zeros((self.embedding_dim,)).to('cuda')
        context = torch.zeros((self.embedding_dim,)).to('cuda')

        embs = self.wv(input_ids)
        if len(embs) == 0:
            embs = torch.zeros((self.embedding_dim,)).unsqueeze(0).to('cuda')

        # forward

        for token in embs:
            forward_hidden, context = self.forward_cell(token, (forward_hidden, context))

        # backward

        context = torch.zeros((self.embedding_dim,)).to('cuda')

        for token in torch.flip(embs, [0, 1]):
            backward_hidden, context = self.backward_cell(token, (backward_hidden, context))
        
        return torch.sigmoid(self.linear_cls(forward_hidden + backward_hidden))

In [None]:
model = BidirectionalLSTM(wv_vec)
model.cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = torch.nn.BCELoss()

In [None]:
for example_idx in range(len(train_dataset)):
    optimizer.zero_grad()
    example, y_target = train_dataset[example_idx]
    example, y_target = map_idx(example).cuda(), torch.tensor(y_target).to(torch.float32).cuda()
    y_pred = model(example)
    loss = criterion(y_pred[0], y_target)
 
    loss.backward()
    optimizer.step()

KeyboardInterrupt: ignored

In [None]:
with torch.no_grad():
  valid_y_pred = [1 if model(map_idx(example).to('cuda'))[0] > 0.5 else 0 for example,_ in valid_dataset]
  y = [valid_dataset[i][1] for i in range(len(valid_dataset))]
  ans = [valid_y_pred[i]==y[i] for i in range(len(y))]

  print(sum(ans)/len(ans))