# RNN Classifier

This is a copy of the neural network classifier 2. Let's implement an RNN!

Changes:
- Replace the mean pooling with a simple RNN which we will write from scratch.
- To support padding, make [PAD] = token 0, and [UNK] = token 1.

In [2]:
from collections import Counter
import random

import torch
from torch.utils.data import Dataset, DataLoader

from tokenizers import Tokenizer
from tqdm.notebook import tqdm
import evaluate

import torch.nn.functional as F  # shorthand so we can do F.softmax and other functions

In [3]:
def read_data():
    tokenizer = Tokenizer.from_pretrained("bert-base-cased")

    train = []
    with open("SH-TTC/train.tsv", encoding="UTF-8") as fin:
        for line in fin:
            label, text = line.strip().split("\t")
            tokens = tokenizer.encode(text).tokens
            train.append((label, tokens))

    dev = []
    with open("SH-TTC/dev.tsv", encoding="UTF-8") as fin:
        for line in fin:
            label, text = line.strip().split("\t")
            tokens = tokenizer.encode(text).tokens
            dev.append((label, tokens))
    
    return train, dev

train_data_raw, dev_data_raw = read_data()

In [5]:
class Vocab:
    def __init__(self, tokens):
        self.vocab = [tok for tok, count in Counter(tokens).most_common()]
        self.tok2idx = {tok: idx + 2 for idx, tok in enumerate(self.vocab)}
        self.tok2idx[0] = "[PAD]"
        self.tok2idx[1] = "[UNK]"
        self.idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}
    
    def __len__(self):
        return len(self.tok2idx)
    
    def to_id(self, tok):
        return self.tok2idx.get(tok, 0)

    def to_tok(self, id):
        return self.idx2tok.get(id, "[UNK]")

In [6]:
vocab = Vocab([word
               for y, x in train_data_raw
               for word in x])

In [5]:
len(vocab)

9672

In [6]:
# example of how to pad a tensor
F.pad(torch.Tensor([1,2,3]), (0, 5), value=vocab.to_id("[PAD]"))

tensor([1., 2., 3., 0., 0., 0., 0., 0.])

In [None]:
class ShttcDataset(Dataset):
    def __init__(self, tokenized_data):
        self.data = []
        for y, x in tokenized_data:
            # LongTensor is for integer ids
            x = torch.LongTensor([vocab.to_id(tok) for tok in x])``
            x = F.pad(x, (0, 100 - x.size(0)), value=vocab.to_id("[PAD]"))
            
            if y == "SH":
                y = torch.Tensor([1, 0])
            else:
                y = torch.Tensor([0, 1])
                
            self.data.append((x, y))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

In [8]:
train_data = ShttcDataset(train_data_raw)
dev_data = ShttcDataset(dev_data_raw)

In [9]:
len(train_data), len(dev_data)

(10381, 1298)

In [10]:
train_data[0]

(tensor([   3,   16,  364, 3376,    5,  173,    2,  284,    2,   17, 1255,    2,
           30,   26,  561,    2,    5, 2399, 2975,   12, 5504,   13,    5, 1074,
          270,   28,   36, 1820,    2,   12,   17,   78,   32,  225,  102,   20,
         1485,   29,  324,   59,    6,    4,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 tensor([1., 0.]))

In [11]:
train_data[0][0].shape

torch.Size([100])

In [12]:
train_data[0][1].shape

torch.Size([2])

## Model!

We want the following structure:

Tokens -> Embed -> RNN -> Hidden -> Output

After padding, the input tokens are all the same length. After we get the embeddings, we will run them one by one through the RNN, which will automatically update its hidden state.

Let's break down each component of the model:

In [13]:
x = train_data[0][0]
y = train_data[0][1]

In [14]:
# somewhat following https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
class SimpleRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()

        self.hidden_size = hidden_size
        
        self.i2h = torch.nn.Linear(input_size, hidden_size)
        self.h2h = torch.nn.Linear(hidden_size, hidden_size)
        self.h2o = torch.nn.Linear(hidden_size, input_size)

    def forward(self, input, hidden):
        hidden = F.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        return output, hidden
    
    def initial_hidden(self):
        return torch.zeros(self.hidden_size)

In [15]:
emb = torch.nn.Embedding(len(vocab), 50)   # input = |V|, output = 50 (emb size)
rnn = SimpleRNN(50, 50)
linear1 = torch.nn.Linear(50, 2)  # emb size -> output size

In [16]:
x.shape

torch.Size([100])

In [17]:
e = emb(x)
e

tensor([[-0.8952, -1.0249,  0.4444,  ..., -0.1904,  0.6466,  0.4470],
        [-0.5850, -0.9442, -1.0992,  ...,  1.1571,  0.8177, -1.0776],
        [ 0.3221,  0.1908,  0.8119,  ..., -0.8874, -0.5927, -1.1309],
        ...,
        [-0.0978,  0.7601,  0.1314,  ..., -0.1516,  0.2974,  0.0963],
        [-0.0978,  0.7601,  0.1314,  ..., -0.1516,  0.2974,  0.0963],
        [-0.0978,  0.7601,  0.1314,  ..., -0.1516,  0.2974,  0.0963]],
       grad_fn=<EmbeddingBackward0>)

In [18]:
e.shape  # (n tokens, 50)

torch.Size([100, 50])

In [19]:
out, hidden = rnn(e, rnn.initial_hidden())

In [20]:
out

tensor([[-0.1062, -0.0826,  0.1073,  ...,  0.3799,  0.3248, -0.0334],
        [-0.3336,  0.3620,  0.1479,  ...,  0.6544, -0.2964,  0.3555],
        [ 0.1304, -0.2753,  0.2213,  ...,  0.5530,  0.3009,  0.1784],
        ...,
        [-0.3503, -0.0023, -0.0591,  ...,  0.4398, -0.0648,  0.4496],
        [-0.3503, -0.0023, -0.0591,  ...,  0.4398, -0.0648,  0.4496],
        [-0.3503, -0.0023, -0.0591,  ...,  0.4398, -0.0648,  0.4496]],
       grad_fn=<AddmmBackward0>)

In [21]:
out.shape  # (n tokens, 50), one vector for each timestep!

torch.Size([100, 50])

In [22]:
hidden

tensor([[ 0.0352, -0.4843, -0.0590,  ...,  0.4745,  0.2591,  0.4735],
        [-0.8966, -0.0250,  0.3315,  ...,  0.2996, -0.3051, -0.3490],
        [ 0.5499,  0.0852,  0.6340,  ...,  0.0252,  0.8690, -0.2278],
        ...,
        [-0.2615,  0.5056,  0.2398,  ...,  0.6183, -0.1203,  0.5294],
        [-0.2615,  0.5056,  0.2398,  ...,  0.6183, -0.1203,  0.5294],
        [-0.2615,  0.5056,  0.2398,  ...,  0.6183, -0.1203,  0.5294]],
       grad_fn=<TanhBackward0>)

In [23]:
hidden.shape  # (n tokens, 50)

torch.Size([100, 50])

In [24]:
# go through each timestep
hidden = rnn.initial_hidden()
for i in range(len(e)):
    output, hidden = rnn(e[i], hidden)

In [25]:
hidden  # now has encoded the entire sentence!

tensor([-0.3279,  0.6016,  0.1281, -0.0426, -0.6983,  0.4003, -0.5681, -0.6937,
         0.4413,  0.2617,  0.0911, -0.4249,  0.1860,  0.1093, -0.4143,  0.2682,
        -0.0604, -0.3182, -0.5093, -0.3150,  0.4222,  0.5239,  0.3401,  0.4358,
        -0.2037,  0.2823, -0.1884, -0.6619,  0.1288, -0.5169,  0.3293, -0.0506,
        -0.1999,  0.4386,  0.4022,  0.6700,  0.3085, -0.3685,  0.2775, -0.8203,
        -0.2625, -0.6087,  0.2349, -0.2632, -0.7485, -0.2940, -0.1349,  0.6692,
        -0.4595,  0.6520], grad_fn=<TanhBackward0>)

In [26]:
yhat = linear1(hidden)
yhat

tensor([-0.0323,  0.0198], grad_fn=<ViewBackward0>)

In [27]:
F.cross_entropy(yhat, y)  # automatically does the softmax and log

tensor(0.7195, grad_fn=<DivBackward1>)

In [28]:
class RNNClassifier(torch.nn.Module):
    def __init__(self, voc_size, emb_size, hidden_size):
        super().__init__()
        self.emb = torch.nn.Embedding(voc_size, emb_size)
        self.rnn = SimpleRNN(emb_size, hidden_size)
        self.linear = torch.nn.Linear(emb_size, 2)

    def forward(self, x):
        e = self.emb(x)
        
        hidden = self.rnn.initial_hidden()
        for i in range(len(e)):
            _out, hidden = self.rnn(e[i], hidden)

        y = self.linear(hidden)
        return y  # don't need sigmoid because cross_entropy computes sigmoid

In [29]:
model = RNNClassifier(len(vocab), 50, 50)

In [30]:
def count_parameters(model):
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        print(name, "\t", params)
        total_params += params
    print(f"Total Trainable Params: {total_params}")
    
    
count_parameters(model)

emb.weight 	 483600
rnn.i2h.weight 	 2500
rnn.i2h.bias 	 50
rnn.h2h.weight 	 2500
rnn.h2h.bias 	 50
rnn.h2o.weight 	 2500
rnn.h2o.bias 	 50
linear.weight 	 100
linear.bias 	 2
Total Trainable Params: 491352


In [31]:
# get prediction for a single data point
# no_grad means we don't need to calculate gradients
# (do this when testing the model)
with torch.no_grad():
    x = train_data[0][0]
    # x = torch.LongTensor([x.tolist()])
    print(model.rnn(model.emb(x), model.rnn.initial_hidden()))
    print(model(x))

# Note: once you batch and change your torch.mean line,
# your model will not accept a single datapoint anymore,
# so you need to uncomment the above line

(tensor([[ 1.1002e-01, -3.7635e-04, -8.6351e-01,  ..., -3.5153e-02,
          2.9881e-01,  3.6698e-01],
        [ 2.7310e-02,  3.2312e-01,  9.3862e-02,  ...,  3.2303e-01,
          1.1172e-01,  2.0826e-01],
        [ 1.4793e-01,  1.9386e-01, -2.6678e-01,  ..., -2.4321e-01,
         -2.1960e-02, -6.3066e-01],
        ...,
        [-2.1318e-01,  1.0358e-01,  2.8842e-01,  ..., -1.5286e-01,
          4.8213e-02, -7.1092e-02],
        [-2.1318e-01,  1.0358e-01,  2.8842e-01,  ..., -1.5286e-01,
          4.8213e-02, -7.1092e-02],
        [-2.1318e-01,  1.0358e-01,  2.8842e-01,  ..., -1.5286e-01,
          4.8213e-02, -7.1092e-02]]), tensor([[-0.3097, -0.8766, -0.0166,  ..., -0.7016, -0.3213,  0.2376],
        [-0.0409, -0.2567, -0.3584,  ...,  0.4818, -0.1989,  0.1717],
        [-0.8485,  0.8749, -0.1641,  ..., -0.1630, -0.8509,  0.1757],
        ...,
        [-0.4566,  0.2630, -0.1004,  ...,  0.1121,  0.0827,  0.1520],
        [-0.4566,  0.2630, -0.1004,  ...,  0.1121,  0.0827,  0.1520],
   

In [32]:
# setup the training
loss_func = F.cross_entropy  # same as torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [33]:
# train!
for epoch in range(10):
    print("Epoch", epoch)

    random.shuffle(train_data.data)  # TODO: not really good style, remove this when using dataloader
    
    for x, y in tqdm(train_data):
        model.zero_grad()  # do this before running

        pred = model(x)
        loss = loss_func(pred, y)
        loss.backward()  # calculate gradients
        optimizer.step()  # updates thetas

    # after each epoch, check how we're doing
    # compute avg loss over train and dev sets
    with torch.no_grad():
        total_loss = 0
        for x, y in tqdm(train_data):
            pred = model(x)
            loss = loss_func(pred, y)
            total_loss += loss
        print("train loss:", total_loss / len(train_data))

        total_loss = 0
        for x, y in tqdm(dev_data):
            pred = model(x)
            loss = loss_func(pred, y)
            total_loss += loss
        print("dev loss:", total_loss / len(dev_data))

Epoch 0


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6897)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6914)
Epoch 1


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6904)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6935)
Epoch 2


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6903)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6955)
Epoch 3


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6893)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6994)
Epoch 4


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6978)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.7023)
Epoch 5


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6954)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.7018)
Epoch 6


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6878)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6937)
Epoch 7


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6882)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6971)
Epoch 8


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6862)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6939)
Epoch 9


  0%|          | 0/10381 [00:00<?, ?it/s]

  0%|          | 0/10381 [00:00<?, ?it/s]

train loss: tensor(0.6868)


  0%|          | 0/1298 [00:00<?, ?it/s]

dev loss: tensor(0.6956)


In [39]:
def run_model_on_dev_data():
    preds = []
    with torch.no_grad():
        for x, y in dev_data:
            pred = model(x)  # pred is something like [0.6, 0.4]
            # TODO: when using batched inputs, your output will also be batched
            # so you need to split them before appending to preds
            preds.append(pred)
    return preds

def sample_predictions(preds):
    for _ in range(5):
        idx = random.randint(0, len(dev_data))
        
        # argmax gives the index with the highest value
        pred_label = "SH" if torch.argmax(preds[idx]) == 0 else "TTC"

        print("Input:", " ".join(dev_data_raw[idx][1]))
        print("Gold: ", dev_data_raw[idx][0])

        # preds are not normalized, so for better viewing, run it through softmax
        print("Pred: ", pred_label, F.softmax(preds[idx], dim=0)) 
        print()

In [None]:
preds = run_model_on_dev_data()
sample_predictions(preds)

In [46]:
precision = evaluate.load("precision")
recall = evaluate.load("recall")
accuracy = evaluate.load("accuracy")

In [None]:
# evaluate functions require numeric data, so convert labels to 0 and 1
refs = []
for label, text in dev_data_raw:
    if label == "SH":
        refs.append(0)
    else:
        refs.append(1)

preds_binary = []
for pred in preds:
    preds_binary.append(torch.argmax(pred))

print(precision.compute(references=refs, predictions=preds_binary))
print(recall.compute(references=refs, predictions=preds_binary))
print(accuracy.compute(references=refs, predictions=preds_binary))

## Your Tasks

- Use a DataLoader, which supports automatic batching and shuffling
    - You will need to pad your inputs to a constant length using `torch.nn.functional.pad` so that the DataLoader can batch properly. Use `pad` to add zeros to the right of the sequence so the length is 100. (You did not need to do this in nn-classifier1 because the input lengths were already equal)
    - Alternatively, use `torch.nn.utils.rnn.pack_padded_sequence`
    - Pack the sequence using `pack_padded_sequence`, run it through your RNN, then use `pad_packed_sequence`.
    - See [this link](https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html) for an example.
    - RNNs in PyTorch have the batch dim 1. To make it 0 (recommended), set batch_first=True
- Use torch.nn.RNN/LSTM/GRU instead of writing your own.