# PyTorch POS Tagging

## Requirements
- torch == 1.2.0
- torchtext == 0.4.0
- tqdm

In [1]:
import os
import random

from tqdm import tqdm_notebook as tqdm
import torch
import torchtext
from torchtext.vocab import GloVe

print("Torch Version: ", torch.__version__)
print("Torchtext Version: ", torchtext.__version__)

Torch Version:  1.2.0
Torchtext Version:  0.4.0


## Some global settings

In [2]:
EMB_CACHE = os.path.expanduser("~/arbeitsdaten/embeddings/glove/")
DATASET_CACHE = os.path.expanduser("./")
BATCH_SIZE = 8
DEVICE = torch.device('cpu')

## The dataset is adapted from the UDPOS where the format has been slightly changed

In [3]:
class POSTaggingDataset(torchtext.data.TabularDataset):

    # Universal Dependencies English Web Treebank by Universal Dependencies contributors
    # Modified by Maximilian Schmidt for use at the IMS, University of Stuttgart
    # License: http://creativecommons.org/licenses/by-sa/4.0/
    urls = ['file:./udpos/en-ud-v2']
    dirname = 'en-ud-v2'
    name = 'udpos'

    @classmethod
    def splits(cls, text_field, label1_field, label2_field, id_field, root=".data", train="train.jsonl",
               validation="dev.jsonl",
               test="test.jsonl", **kwargs):
        """Downloads and loads the Universal Dependencies Version 2 POS Tagged
        data.
        """

        fields = {'text': text_field}
        if label1_field is not None:
            fields.update(label1=label1_field)
        if label2_field is not None:
            fields.update(label2=label2_field)
        if id_field is not None:
            fields.update(id=id_field)

        return super(POSTaggingDataset, cls).splits(
            fields=fields, root=root, train=train, validation=validation,
            format='json', test=test, **kwargs)
            

## Our neural network consists of one fully connected linear layer (and a softmax - see the loss)

Embedding layer
- maps from indices to vectors
- is not trained (freezed)

In [4]:
class Net(torch.nn.Module):
    # this resembles a really simple neural network: an embedding layer followed by a fully
    # connected linear layer such that predictions are computed for each token in the sequence
    # and batch independently
    def __init__(self, embedding_vectors, num_classes):
        super().__init__()
        # Pytorch's embedding layer maps from indices to embeddings, freeze will tell Pytorch to
        # not train this layer, i.e. not modifying any weight
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_vectors, freeze=True)
        # a fully connected linear layer mapping the embedded vector to a vector of fixed size
        # (num_classes in this case)
        self.fc = torch.nn.Linear(embedding_vectors.size(1), num_classes)

    def forward(self, inputs):
        # simple forwarding through our model
        # Pytorch takes care of keeping track of the operations for the backward pass
        emmedded_inputs = self.embedding(inputs)
        outputs = self.fc(emmedded_inputs)
        return outputs

## Set up our fields as placeholder for the actual data

- text (input)
- label (gold label / ground truth)

## Split into training, validation & test dataset and build vocabulary for *training* dataset (only)

In [5]:
# set up fields
TEXT = torchtext.data.Field(sequential=True, lower=True, include_lengths=True, batch_first=True, tokenize='spacy')
LABEL = torchtext.data.Field(sequential=True, use_vocab=True, batch_first=True, unk_token=None)

# make splits for data
train, val, test = POSTaggingDataset.splits(root=DATASET_CACHE, text_field=('Text',TEXT), label1_field=None, label2_field=('Label',LABEL), id_field=None)

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300, cache=EMB_CACHE))
LABEL.build_vocab(train)

## Create iterator such that each iteration returns a batch from shuffled data

In [6]:
# make iterator for splits
train_iter, val_iter, test_iter = torchtext.data.Iterator.splits((train, val, test), batch_size=BATCH_SIZE, device=DEVICE, sort=False)

vocab = TEXT.vocab
classes = LABEL.vocab.itos
print(f"Available classes: {len(classes)}\n{classes}")

Available classes: 51
['<pad>', 'NN', 'IN', 'DT', 'NNP', 'PRP', 'JJ', 'RB', '.', 'VB', 'NNS', ',', 'CC', 'VBP', 'VBD', 'VBZ', 'CD', 'VBN', 'VBG', 'MD', 'TO', 'PRP$', '-RRB-', '-LRB-', 'WDT', 'WRB', ':', 'WP', 'UH', '``', "''", 'RP', 'HYPH', 'POS', 'NNPS', 'JJR', 'JJS', 'NFP', 'EX', 'ADD', 'GW', 'RBR', '$', 'PDT', 'RBS', 'SYM', 'FW', 'LS', 'AFX', 'WP$', 'XX']


## Set up model and optimizer
- Cross Entropy is Softmax + Negative Log Likelihood
- Optimizer is Stochastic Gradient Descent (with momentum)

(run this only once as Jupyter keeps the model (including the weights) and the optimizer in memory)

In [7]:
# set up model and optimizer
model = Net(vocab.vectors, len(classes))
criterion = torch.nn.CrossEntropyLoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
metric_dict = {'loss': '------', 'accuracy': '------'}

## Evaluation function comparing prediction with gold label

In [8]:
def evaluate(data_iter, net):
    correct_count = 0
    total_count = 0
    for i, batch in enumerate(data_iter):
        # extract input and labels
        (inputs, inputs_lengths), labels = batch.Text, batch.Label

        # predict only
        with torch.no_grad():
            outputs = net(inputs).cpu()
        outputs_classes = outputs.argmax(dim=2)

        # compute amount of correct predictions
        # sequence lengths within the batch might be different, so we need to take care of that

        total_count += inputs_lengths.sum()
        # option 1: iterate over each sample of the batch
        batch_size = outputs_classes.size(0)
        for i in range(batch_size):
            for j in range(inputs_lengths[i]):
                correct_count += int(outputs_classes[i][j] == labels[i][j])
    return correct_count/total_count.float().item()

## The actual training loop

- runs several epochs
- in each epoch
 - forward the batch
 - computes the loss for the output of the whole batch
 - reduces (e.g. average, sum) the loss
 - computes derivatives of weights by backpropagation
 - optimizer updates weights
 - evaluate on validation/development dataset

In [9]:
NUM_EPOCHS = 10

# a nice progress bar to make the waiting time much better
pbar = tqdm(total=NUM_EPOCHS*len(train), postfix=metric_dict)

# run for NUM_EPOCHS epochs
for epoch in range(NUM_EPOCHS):
    # run for every data (in batches) of our iterator
    running_loss = 0.0
    
    pbar.set_description(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    for i, batch in enumerate(train_iter):
        # extract input and labels
        (inputs, inputs_lengths), labels = batch.Text, batch.Label

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        
        # 2D loss function expects input as (batch, prediction, sequence) and target as (batch, sequence) (containing the class INDEX)
        # loss = criterion(outputs.permute(0,2,1), labels)
        # otherwise use view function to get rid of sequence dimension by effectively concatenating all sequence items
        loss = criterion(outputs.view(-1, len(classes)), labels.view(-1))

        loss.backward()
        optimizer.step()

        # print statistics
        pbar.update(labels.size(0))
        metric_dict.update({'loss': f'{loss.item():6.3f}'})
        pbar.set_postfix(metric_dict)
        
    # evaluate on validation set after each epoch
    metric_dict.update({'accuracy': f'{100*evaluate(val_iter, model):6.2f}%'})
    pbar.set_postfix(metric_dict)

HBox(children=(IntProgress(value=0, max=116360), HTML(value='')))

In [10]:
def tokens_to_index(tokens: list, vocabulary: dict):
    return [vocabulary[token] for token in tokens]

def indices_to_class(indices: list, classes: dict):
    return [classes[value] for value in indices]

In [11]:
# get any tokenizer
tokenizer = torchtext.data.get_tokenizer('spacy', language='en')

## Interactive prediction

In [12]:
text = input("Please enter your text: ")

# map tokens to index using vocabulary
tokens = tokenizer(text)
tokens_indexed = tokens_to_index(tokens, vocab)
# build input vector and add batch dimension
tensor = torch.tensor(tokens_indexed).unsqueeze(dim=0)

# forward / predict
with torch.no_grad():
    # get rid of batch dimension (is set to 1)
    outputs = model(tensor).squeeze(dim=0)

print("Prediction: ", indices_to_class(outputs.argmax(dim=1), classes))



Please enter your text: How is the weather tomorrow?
Prediction:  ['<pad>', 'VBZ', 'DT', 'NN', 'NN', '.']


## Randomly predict sample from test set

In [13]:
sample_idx = random.randint(1, len(test))
sample = test[sample_idx]
# map tokens to index using vocabulary
sample_tokens_indexed = tokens_to_index(sample.Text, vocab)
# build input vector and add batch dimension
sample_tensor = torch.tensor(sample_tokens_indexed).unsqueeze(dim=0)

# forward / predict
with torch.no_grad():
    # get rid of batch dimension (is set to 1)
    outputs = model(sample_tensor).squeeze(dim=0)

print("Input:\t\t    ", ' '.join(sample.Text))
print("Prediction:\t    ", indices_to_class(outputs.argmax(dim=1), classes))
print("Expected prediction:", sample.Label)

Input:		     the staff is also just so pleasant to deal with .
Prediction:	     ['DT', 'NN', 'VBZ', 'IN', 'RB', 'RB', 'JJ', 'TO', 'NN', 'IN', '.']
Expected prediction: ['DT', 'NN', 'VBZ', 'RB', 'RB', 'RB', 'JJ', 'TO', 'VB', 'IN', '.']
