# Neural Network Classifier 2

This notebook is a copy of Neural Network Classifier 1 with a few changes, so if you haven't gone through that notebook yet, do that first!

Changes:
- A `Dataset` class handles the data
- Embedding layer will compute a vector representation of each token
- Instead of a counts vector of length $V$, the input is now a sequence of token ids
- The token embeddings are averaged (mean pooling) into a single vector, which is the input to the hidden layer

In [4]:
from collections import Counter
import random

import torch
from torch.utils.data import Dataset, DataLoader

from tokenizers import Tokenizer
from tqdm.notebook import tqdm
import evaluate

import torch.nn.functional as F  # shorthand so we can do F.softmax and other functions

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
def _read_split(path, tokenizer):
    """Read one tab-separated split file into a list of (label, tokens) pairs."""
    examples = []
    with open(path, encoding="UTF-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline at EOF)
            # maxsplit=1 keeps any further tabs inside the text instead of
            # failing the two-way unpack
            label, text = line.split("\t", 1)
            tokens = tokenizer.encode(text).tokens
            examples.append((label, tokens))
    return examples


def read_data(data_dir="SH-TTC", tokenizer_name="bert-base-cased"):
    """Load and tokenize the train and dev splits.

    Args:
        data_dir: directory containing ``train.tsv`` and ``dev.tsv``; each
            line is ``label<TAB>text``.
        tokenizer_name: name passed to ``Tokenizer.from_pretrained``.

    Returns:
        A ``(train, dev)`` tuple; each element is a list of
        ``(label, tokens)`` pairs, where ``tokens`` is the list of token
        strings produced by the tokenizer.
    """
    tokenizer = Tokenizer.from_pretrained(tokenizer_name)

    train = _read_split(f"{data_dir}/train.tsv", tokenizer)
    dev = _read_split(f"{data_dir}/dev.tsv", tokenizer)

    return train, dev

# Load both splits once; every entry is a (label, tokens) pair.
train_data_raw, dev_data_raw = read_data()
# Peek at the first 20 dev examples.
print(dev_data_raw[:20])


[('SH', ['[CLS]', '‘', 'What', 'have', 'I', 'to', 'do', 'with', 'sun', '##dial', '##s', 'and', 'papers', '?', '[SEP]']), ('SH', ['[CLS]', 'Let', 'me', 'have', 'the', 'date', 'of', 'the', 'reception', 'by', 'your', 'uncle', 'of', 'the', 'letter', ',', 'and', 'the', 'date', 'of', 'his', 'supposed', 'suicide', '.', '”', '[SEP]']), ('SH', ['[CLS]', '“', 'That', 'is', 'a', 'detail', 'which', 'I', 'shall', 'speed', '##ily', 'supply', '.', '”', '[SEP]']), ('SH', ['[CLS]', 'And', 'on', 'the', 'morning', 'of', 'the', 'wedding', '?', '”', '“', 'She', 'was', 'as', 'bright', 'as', 'possible', '—', 'at', 'least', 'until', 'after', 'the', 'ceremony', '.', '”', '[SEP]']), ('SH', ['[CLS]', '“', 'I', 'am', 'sorry', 'to', 'knock', 'you', 'up', 'so', 'early', ',', '[', 'N', '##AM', '##E', ']', ',', '”', 'said', 'he', ',', '“', 'but', 'I', 'have', 'had', 'a', 'very', 'serious', 'accident', 'during', 'the', 'night', '.', '[SEP]']), ('SH', ['[CLS]', 'I', 'shall', 'look', 'forward', 'to', 'seeing', 'you', 'a

In [19]:
class Vocab:
    """Bidirectional mapping between token strings and integer ids.

    Id 0 is reserved for the unknown token "[UNK]". All other tokens receive
    ids 1, 2, ... in order of decreasing frequency, so every id is strictly
    smaller than ``len(vocab)``.
    """

    # Token string / id used for anything that is not in the vocabulary.
    UNK = "[UNK]"
    UNK_ID = 0

    def __init__(self, tokens):
        """Build the vocabulary from an iterable of token strings."""
        self.vocab = [tok for tok, _count in Counter(tokens).most_common()]
        self.tok2idx = {self.UNK: self.UNK_ID}
        for tok in self.vocab:
            # setdefault keeps "[UNK]" at id 0 even if it occurs in the data,
            # and hands out consecutive ids to everything else.
            self.tok2idx.setdefault(tok, len(self.tok2idx))
        self.idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}

    def __len__(self):
        """Number of distinct ids, including the reserved unknown id."""
        return len(self.tok2idx)

    def to_id(self, tok):
        """Return the id of ``tok``, or 0 if the token is unknown."""
        return self.tok2idx.get(tok, self.UNK_ID)

    def to_tok(self, id):
        """Return the token for ``id``, or "[UNK]" if the id is unknown."""
        return self.idx2tok.get(id, self.UNK)

In [20]:
# Build the vocabulary from every token of every training example.
vocab = Vocab([
    tok
    for _label, toks in train_data_raw
    for tok in toks
])

In [21]:
# Vocabulary size; this count includes the extra entry added for unknown tokens.
len(vocab)

9671

In [23]:
class ShttcDataset(Dataset):
    """In-memory dataset of (token-id tensor, one-hot label tensor) pairs.

    Args:
        tokenized_data: iterable of ``(label, tokens)`` pairs. The label "SH"
            maps to ``[1, 0]``; any other label maps to ``[0, 1]``.
        token_vocab: object with a ``to_id(token)`` method used to turn token
            strings into integer ids. When omitted, the notebook-level
            ``vocab`` is used.
    """

    def __init__(self, tokenized_data, token_vocab=None):
        if token_vocab is None:
            token_vocab = vocab  # fall back to the notebook-level vocabulary

        self.data = []
        for label, tokens in tokenized_data:
            # integer (long) ids are what the embedding layer expects
            token_ids = torch.tensor(
                [token_vocab.to_id(tok) for tok in tokens], dtype=torch.long
            )
            # float one-hot target: index 0 = "SH", index 1 = everything else
            target = torch.tensor([1.0, 0.0] if label == "SH" else [0.0, 1.0])
            self.data.append((token_ids, target))

    def __len__(self):
        """Number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(token_ids, target)`` pair at position ``idx``."""
        return self.data[idx]

In [24]:
# Wrap both raw splits so each example becomes a pair of tensors.
train_data, dev_data = (
    ShttcDataset(split) for split in (train_data_raw, dev_data_raw)
)

In [25]:
# Number of examples in each split: (train, dev).
len(train_data), len(dev_data)

(10381, 1298)

In [26]:
# First training example: (token-id tensor, one-hot label tensor).
train_data[0]

(tensor([   2,   15,  363, 3375,    4,  172,    1,  283,    1,   16, 1254,    1,
           29,   25,  560,    1,    4, 2398, 2974,   11, 5503,   12,    4, 1073,
          269,   27,   35, 1819,    1,   11,   16,   77,   31,  224,  101,   19,
         1484,   28,  323,   58,    5,    3]),
 tensor([1., 0.]))

In [27]:
# The token-id tensor has one entry per token of the sentence.
train_data[0][0].shape

torch.Size([42])

In [28]:
# The label is a 2-element one-hot vector.
train_data[0][1].shape

torch.Size([2])

## Model!

We want the following structure:

Tokens -> Embed -> Hidden -> Output

The problem is that the input can be variable length. So after we get the embeddings, we will use mean pooling, so that the input to the hidden layer has a fixed size.

Let's break down each component of the model:

In [29]:
# Take the first training example apart: token ids and one-hot label.
x, y = train_data[0]

In [30]:
# One embedding row per vocabulary id, 50 dimensions each.
emb = torch.nn.Embedding(num_embeddings=len(vocab), embedding_dim=50)
# emb size -> hidden size
linear1 = torch.nn.Linear(in_features=50, out_features=50)
# hidden size -> output size (one score per class)
linear2 = torch.nn.Linear(in_features=50, out_features=2)

In [31]:
# Look up one embedding vector for every token id in x.
e = emb(x)
e

tensor([[ 0.8790,  0.0843, -0.9360,  ...,  1.3626,  0.1041, -0.1704],
        [ 0.7654,  0.1776, -0.9183,  ..., -1.0696,  0.1173, -1.4512],
        [-0.3904,  0.3017,  1.1361,  ..., -0.8859, -0.6963,  0.7852],
        ...,
        [ 0.5199,  0.0571,  1.4610,  ...,  0.7754,  1.2393,  0.4563],
        [-1.1866,  0.6333,  0.8945,  ..., -0.5109, -1.5369, -0.1395],
        [-1.2452, -2.0386,  3.0467,  ..., -0.6121, -1.0770, -1.3523]],
       grad_fn=<EmbeddingBackward0>)

In [32]:
# One 50-dimensional row per token in x.
e.shape  # (n tokens, 50)

torch.Size([42, 50])

In [33]:
# Mean pooling: average over the token axis (dim 0), leaving one value per
# embedding dimension, so the result has shape (50).
e_mean = e.mean(dim=0)
e_mean.shape

torch.Size([50])

In [39]:
# The pooled sentence vector that feeds the hidden layer.
e_mean

tensor([ 0.0538,  0.0361, -0.1072, -0.3866, -0.0068,  0.2832,  0.0442,  0.3247,
         0.0450,  0.1217, -0.0498,  0.1295,  0.1963, -0.2979, -0.0727, -0.0354,
         0.0273, -0.0094,  0.1431, -0.0670, -0.0644, -0.0363,  0.1676, -0.3237,
         0.2153,  0.1263, -0.0789,  0.2627, -0.1378,  0.0343,  0.0732,  0.1453,
        -0.1379,  0.1150,  0.2551,  0.4546,  0.0691, -0.0193, -0.2975, -0.0842,
         0.1738, -0.1802, -0.2405,  0.0381, -0.1912, -0.1590, -0.0357, -0.0809,
         0.2163,  0.0191], grad_fn=<MeanBackward1>)

In [40]:
# Hidden layer: affine transform followed by ReLU (negative values become 0).
h1 = F.relu(linear1(e_mean))
h1

tensor([0.1862, 0.0869, 0.0885, 0.0000, 0.0000, 0.0000, 0.2752, 0.0000, 0.0000,
        0.0000, 0.1284, 0.0038, 0.0000, 0.0000, 0.0613, 0.0000, 0.0000, 0.0477,
        0.0000, 0.0000, 0.0000, 0.0000, 0.1256, 0.0000, 0.0000, 0.0000, 0.2537,
        0.0000, 0.1305, 0.0000, 0.0000, 0.0000, 0.1843, 0.0000, 0.0129, 0.0000,
        0.0661, 0.0000, 0.0000, 0.0596, 0.0000, 0.0000, 0.0777, 0.0000, 0.0239,
        0.0992, 0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<ReluBackward0>)

In [36]:
# The hidden activation has one value per hidden unit.
h1.shape

torch.Size([50])

In [41]:
# Output layer: two unnormalized scores (logits), one per class.
# NOTE: this rebinds `y`, which previously held the gold label of this example.
y = linear2(h1)
y

tensor([ 0.1439, -0.1760], grad_fn=<ViewBackward0>)

In [42]:
# `y` holds the logits computed by linear2; the target [0, 1] is a hand-written
# one-hot vector for the second class, not the gold label stored in the dataset.
F.cross_entropy(y, torch.Tensor([0, 1]))  # automatically does the softmax and log

tensor(0.8658, grad_fn=<DivBackward1>)

In [43]:
class NNClassifier2(torch.nn.Module):
    """Tokens -> Embed -> mean pool -> Hidden (ReLU) -> 2-way output.

    Args:
        voc_size: number of rows in the embedding table (vocabulary size).
        emb_size: dimensionality of each token embedding.
        hidden_size: number of units in the hidden layer.
    """

    def __init__(self, voc_size, emb_size, hidden_size):
        super().__init__()
        self.emb = torch.nn.Embedding(voc_size, emb_size)
        self.linear1 = torch.nn.Linear(emb_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, 2)

    def forward(self, x):
        """Return unnormalized class scores (logits) for ``x``.

        ``x`` is a tensor of token ids, either a single sequence of shape
        (n tokens) or a batch of shape (batch, n tokens). The output has
        shape (2) or (batch, 2) respectively.

        NOTE: every position in ``x`` is averaged, so if batches are padded
        to equal length the padding ids contribute to the mean as well.
        """
        e = self.emb(x)  # (..., n tokens, emb_size)
        # Pool over the token axis. dim=-2 is the token axis for both the
        # single-sequence and the batched case.
        e_mean = torch.mean(e, dim=-2)  # (..., emb_size)
        h = F.relu(self.linear1(e_mean))
        # No softmax here: cross_entropy applies (log-)softmax itself.
        return self.linear2(h)

In [44]:
# Same sizes as in the step-by-step walkthrough above.
model = NNClassifier2(voc_size=len(vocab), emb_size=50, hidden_size=50)

In [45]:
def count_parameters(model):
    """Print the element count of each trainable parameter and the grand total.

    Parameters with ``requires_grad == False`` are left out of both the
    listing and the total. Nothing is returned.
    """
    trainable = (
        (param_name, tensor)
        for param_name, tensor in model.named_parameters()
        if tensor.requires_grad
    )
    running_total = 0
    for param_name, tensor in trainable:
        n_elements = tensor.numel()
        print(param_name, "\t", n_elements)
        running_total += n_elements
    print(f"Total Trainable Params: {running_total}")
    
    
# List the trainable parameters of the model together with their sizes.
count_parameters(model)

emb.weight 	 483550
linear1.weight 	 2500
linear1.bias 	 50
linear2.weight 	 100
linear2.bias 	 2
Total Trainable Params: 486202


In [47]:
# Run the (still untrained) model on one example.
# Gradients are only needed for training, so the forward pass is wrapped in
# no_grad; do the same whenever the model is being tested.
x = train_data[0][0]
with torch.no_grad():
    print(model(x))

tensor([-0.0601, -0.0164])


In [48]:
# Training setup: the loss to minimise and the optimizer that updates the
# model's parameters.
loss_func = F.cross_entropy  # same as torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [49]:
# train!
def _mean_loss(dataset):
    """Return the average loss of `model` over `dataset`, without tracking gradients."""
    with torch.no_grad():
        total_loss = 0
        for x, y in dataset:
            total_loss += loss_func(model(x), y)
    return total_loss / len(dataset)


for epoch in range(10):
    print("Epoch", epoch)

    # In-place shuffle of the dataset's internal list.
    random.shuffle(train_data.data)  # not really good style, remove this when using dataloader

    # One update per training example.
    for x, y in train_data:
        model.zero_grad()  # clear gradients left over from the previous step

        pred = model(x)
        loss = loss_func(pred, y)
        loss.backward()  # calculate gradients
        optimizer.step()  # updates thetas

    # after each epoch, check how we're doing:
    # average loss over the train and dev sets
    print("train loss:", _mean_loss(train_data))
    print("dev loss:", _mean_loss(dev_data))

Epoch 0
train loss: tensor(0.4263)
dev loss: tensor(0.4776)
Epoch 1
train loss: tensor(0.3174)
dev loss: tensor(0.4287)
Epoch 2
train loss: tensor(0.2542)
dev loss: tensor(0.4432)
Epoch 3


KeyboardInterrupt: 

In [50]:
def run_model_on_dev_data():
    """Run the model on every dev example and return the raw outputs.

    Returns a list with one output tensor per dev example, in dataset order.
    The values are unnormalized scores; gradients are not tracked.
    """
    with torch.no_grad():
        # the gold label of each example is not needed here
        return [model(tokens) for tokens, _gold in dev_data]

def sample_predictions(preds):
    """Print five randomly chosen dev examples with gold label and prediction.

    Args:
        preds: list of model outputs, one per dev example and in the same
            order as ``dev_data`` / ``dev_data_raw``.
    """
    for _ in range(5):
        # randrange excludes the upper bound. randint(0, len(dev_data)) is
        # inclusive and could return len(dev_data), raising IndexError.
        idx = random.randrange(len(dev_data))

        # argmax gives the index with the highest value
        pred_label = "SH" if torch.argmax(preds[idx]) == 0 else "TTC"

        print("Input:", " ".join(dev_data_raw[idx][1]))
        print("Gold: ", dev_data_raw[idx][0])

        # preds are not normalized, so for better viewing, run it through softmax
        print("Pred: ", pred_label, F.softmax(preds[idx], dim=0))
        print()

In [51]:
# Collect the model outputs for the whole dev set, then print a few random examples.
preds = run_model_on_dev_data()
sample_predictions(preds)

Input: [CLS] I have watched the fellow more than once before ever I thought of making his professional acquaintance , and I have been surprised at the harvest which he has re ##ap ##ed in a short time . [SEP]
Gold:  SH
Pred:  SH tensor([0.9988, 0.0012])

Input: [CLS] The butcher and the pork ##man painted up , only the lean ##est s ##c ##rag ##s of meat ; the b ##aker , the coarse ##st of me ##ag ##re lo ##aves . [SEP]
Gold:  TTC
Pred:  TTC tensor([2.9366e-05, 9.9997e-01])

Input: [CLS] “ Have you ever observed that his ears are pierced for ear ##rings ? ” [SEP]
Gold:  SH
Pred:  TTC tensor([0.1602, 0.8398])

Input: [CLS] “ Yes , at the mines . ” [SEP]
Gold:  SH
Pred:  SH tensor([0.6725, 0.3275])

Input: [CLS] I lounge ##d up the side aisle like any other idle ##r who has dropped into a church . [SEP]
Gold:  SH
Pred:  SH tensor([0.9500, 0.0500])



In [52]:
# Load the three metrics in one go (same order as the names on the left).
precision, recall, accuracy = (
    evaluate.load(metric_name)
    for metric_name in ("precision", "recall", "accuracy")
)

Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 3.77MB/s]
Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<?, ?B/s]
Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]


In [53]:
# evaluate functions require numeric data, so convert labels to 0 and 1
# (0 = "SH", 1 = everything else, matching the one-hot order used in the dataset)
refs = [0 if label == "SH" else 1 for label, _tokens in dev_data_raw]

# argmax picks the higher-scoring class; .item() turns the 0-dim tensor into a
# plain Python int so the metrics receive ordinary numbers
preds_binary = [torch.argmax(pred).item() for pred in preds]

# NOTE(review): precision and recall are presumably computed with label 1 as
# the positive class - confirm against the metric documentation.
print(precision.compute(references=refs, predictions=preds_binary))
print(recall.compute(references=refs, predictions=preds_binary))
print(accuracy.compute(references=refs, predictions=preds_binary))

{'precision': 0.7723684210526316}
{'recall': 0.8670605612998523}
{'accuracy': 0.7973805855161787}


## Your Tasks

- Use a dataloader, which supports automatic batching and shuffling
- Make tweaks to your model and compare the performance of the resulting variants. This is called hyperparameter tuning, and it is really more of an art than a science. Some things you can try:
    - Change the batch size
        - larger = faster to train, but model may be less generalizable
    - Change the embedding size and hidden layer size
        - larger = slower to train, but usually better results. Try to see how small you can get it without severely degrading results!
    - Try different activation functions (full list at https://pytorch.org/docs/stable/nn.functional.html#non-linear-activation-functions)
        - Most popular now is ReLU and its variants. There is no one best activation function for every task, so people usually start with ReLU since it is simple