## Dataset

HF: [batterydata/pos_tagging](https://huggingface.co/datasets/batterydata/pos_tagging)

In [1]:
from datasets import load_dataset

dataset_name = "batterydata/pos_tagging"
dataset = load_dataset(dataset_name)

Downloading readme:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/601k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

### Integer mappings

Taking a pre-transformer era NLP approach with a fixed vocab from the training dataset with OOV and PAD tokens.

In [2]:
# a dict containing word -> idx mapping
def create_word_indices(dataset):
    unique_words = set()
    word_to_idx = dict()
    # add an out of vocab token
    oov_token = "<OOV>"
    pad_token = "<PAD>"
    word_to_idx[oov_token] = 0
    word_to_idx[pad_token] = 1
    
    # find unique words
    for data in dataset:
        words = data["words"]
        for w in words:
            unique_words.add(w)
            
    # add index to them
    for idx, uw in enumerate(list(unique_words)):
        word_to_idx[uw] = idx + 2 # since oov is at 0 and pad at 1
        
    
    return word_to_idx


# ===============
word_to_idx = create_word_indices(dataset['train'])

word_to_idx maps each word in the vocabulary to an integer label. Ultimately, we need integer ids for each token.

In [3]:
def create_label_to_idx(dataset):
    unique_labels = set()
    label_to_idx = dict()
    # add an out of vocab token
    oov_token = "<OOV>"
    pad_token = "<PAD>"
    label_to_idx[oov_token] = 0
    label_to_idx[pad_token] = 1
    
    # find the labels
    for data in dataset:
        labels = data["labels"]
        for l in labels:
            unique_labels.add(l)
            
    # index
    for idx, label in enumerate(list(unique_labels)):
        label_to_idx[label] = idx + 2
        
    return label_to_idx
    
label_to_idx = create_label_to_idx(dataset['train'])

parts of speech tags are categorical data. We can one hot encode them or just use integer labels. Pytorch loss functions can work with both. 


### Encoding

Given a single data instance, which is a sentence and corresponding pos tags, this function will encode the words and the tags using the word / tag to integer mappings we created earlier.



In [4]:
# for a single instance
def encode_data_instance(data, word_to_idx, label_to_idx):
    words = [
        word_to_idx.get(word, word_to_idx["<OOV>"]) for word in data["words"]
    ]
    
    labels = [
        label_to_idx[label] for label in data["labels"]
    ]
    
    return {
        "words": words,
        "labels": labels
    }
    



### Splits

The problem with this dataset is that it doesn't come with a validation set. You should always have a validation set to check your training.

Also setting a random seed helps reproducing the results.

In [5]:
def make_train_validation_splits(dataset_split,
                                 validation_split=0.2,
                                 seed=42):
    # make a copy of the data
    dataset_split = dataset_split.shuffle(seed=seed)
    # using the train test split method to create validation set
    dataset_split = dataset_split.train_test_split(test_size=validation_split,
                                                   shuffle=True,
                                                   seed=seed)
    return dataset_split["train"], dataset_split["test"]



def prepare_splits(dataset,
                   validation_size=0.2,
                   seed=42):
    # train and test split
    train_split = dataset["train"]
    test_split = dataset["test"]

    train_split, validation_split = make_train_validation_splits(train_split, validation_size, seed)
    return train_split, validation_split, test_split



train_split, validation_split, test_split = prepare_splits(dataset)

### Torch Dataset

Padding index is 1 (as set in the vocab before). 

In [6]:
import torch
from torch.utils.data import Dataset
import numpy as np

# set seeds
np.random.seed(2023)
torch.manual_seed(2023)



class TagDataset(Dataset):
    def __init__(self, dataset_split,
                 pad_token_idx,
                 word_to_idx,
                 label_to_idx) -> None:
        self.dataset = dataset_split
        self.pad_token_idx = pad_token_idx
        self.word_to_idx = word_to_idx
        self.label_to_idx = label_to_idx

    def __len__(self):
        return len(self.dataset)

    # use word_to_idx and label_to_idx to convert
    # the string sequences to int sequences
    def __encode(self, data_instance):
        words = data_instance["words"]
        labels = data_instance["labels"]

        # convert to int sequences
        words = [self.word_to_idx.get(w, 0) for w in words]
        labels = [self.label_to_idx.get(l) for l in labels]

        words = torch.tensor(words)
        labels = torch.tensor(labels)

        return words, labels

    def __getitem__(self, index):
        data = self.dataset[index]
        words, labels = self.__encode(data)

        return words, labels

In [7]:
dataset_config = {
    "pad_token_idx": 1,
    "word_to_idx": word_to_idx,
    "label_to_idx": label_to_idx
}

train_set = TagDataset(train_split, **dataset_config)
val_set = TagDataset(validation_split, **dataset_config)
test_set = TagDataset(test_split, **dataset_config)

### Padding

To stack all the variable length sequences the dataloader will expect them to be padded. One quick way to do this is to use pad_sequence from pytorch. This way, you don't have to mention a fixed padding size. Also, it's memory efficient since you're not always passing large sparse sequences per batch.

In [8]:
from torch.nn.utils.rnn import pad_sequence


def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    y_lens = [len(y) for y in yy]

    # pad token is 1
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=1)
    yy_pad = pad_sequence(yy, batch_first=True, padding_value=1)

    return xx_pad, yy_pad, x_lens, y_lens

### Dataloader

In [9]:
from torch.utils.data import DataLoader




train_loader = DataLoader(
   train_set, batch_size=128, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(
    val_set, batch_size=128, shuffle=False, collate_fn=pad_collate)
test_loader = DataLoader(
    test_set, batch_size=128, shuffle=False, collate_fn=pad_collate)

In [10]:
# =========== test a dataloader ==========
for batch in train_loader:
    w, l, ws, ls = batch
    print(w.size())
    print(l.size())
    break

torch.Size([128, 67])
torch.Size([128, 67])


## Model

The model here consists of a single LSTM layer with embedding. The embedding is trained from the corpus, can also be replaced with word2vec embeddings from gensim.

LR default 1e-3 for Adam. 

In [11]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [12]:
# for newer nvidia gpus

torch.set_float32_matmul_precision('high')

Pytorch lightning to make things easier. xD

In [13]:
from typing import Any
from lightning.pytorch.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
from tqdm.auto import trange, tqdm
from einops import rearrange
import lightning.pytorch as L
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTMTagger(L.LightningModule):
    def __init__(self, vocab_size, embedding_dimension, projection_dims, n_labels, pad_idx) -> None:
        super().__init__()
        
        # hparams
        self.vocab_size = vocab_size
        self.embedding_dimension = embedding_dimension
        self.projection_dims = projection_dims
        self.n_labels = n_labels
        self.pad_idx = pad_idx
        self.save_hyperparameters()
        
        # modules
        self.embedding = nn.Embedding(self.vocab_size, 
                                      self.embedding_dimension, 
                                      padding_idx=self.pad_idx)
        self.lstm = nn.LSTM(self.embedding_dimension, self.projection_dims, batch_first=True)        
        self.fc = nn.Linear(self.projection_dims, self.n_labels)
        
        self.dropout = nn.Dropout(0.2)
                
    def forward(self, x, xlen):
        out = self.embedding(x) 
        # out = self.dropout(out)
        
        # this reshaping changed things. :3 Well hell pytorch
        # out = rearrange(out, "batch L embed -> batch embed L")
        
        # pack padded sequence
        out = pack_padded_sequence(out, xlen, batch_first=True, enforce_sorted=False)        
        out, _ = self.lstm(out)
        
        # back to padded 
        out, _ = pad_packed_sequence(out, batch_first=True, padding_value=1)
        out = self.dropout(out)
        
        out = self.fc(out)
        out = F.leaky_relu(out)
        
        
        
        return out
    
    def compute_loss(self, batch):
        words, labels, word_len, _ = batch
        
        logits = self(words, word_len)
        # reshape logits
        logits = rearrange(logits, "batch seq log -> batch log seq")
        
        loss = F.cross_entropy(logits, labels, ignore_index=self.pad_idx)
        return loss
    
    def configure_optimizers(self) -> OptimizerLRScheduler:
        return optim.AdamW(self.parameters())
    
    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self.compute_loss(batch)
        
        self.log("Loss/Train", loss, prog_bar=True, 
                 batch_size=batch[0].size(0))
        
        return {
            "loss": loss,
            "log": {
                "Loss/Train": loss
            }
        }
        
    
    def validation_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self.compute_loss(batch)

        self.log("Loss/Validation", loss, prog_bar=True,
                 batch_size=batch[0].size(0))

        return {
            "val_loss": loss,
            "log": {
                "Loss/Validation": loss
            }
        }

In [14]:
model = LSTMTagger(len(word_to_idx), 300, 300, len(label_to_idx), 1)

In [15]:
# sample forward pass

with torch.no_grad():
     for batch in train_loader:
         loss = model.compute_loss(batch)    
         print(loss)
         break

tensor(3.9149)


### Logging and training

In [16]:
# create a tensorboard logger
from lightning.pytorch import loggers as pl_loggers
tb_logger = pl_loggers.TensorBoardLogger(save_dir="tb_logs/")


In [17]:
# create the lightning trainer

trainer = L.Trainer(logger=tb_logger,
                    max_epochs=10,
                    accelerator="gpu",
                    devices=1,
                    precision="bf16-mixed",
                    log_every_n_steps=50)

Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
# train

trainer.fit(model, train_loader, val_loader)

Missing logger folder: tb_logs/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 7.5 M 
1 | lstm      | LSTM      | 722 K 
2 | fc        | Linear    | 15.1 K
3 | dropout   | Dropout   | 0     
----------------------------------------
8.2 M     Trainable params
0         Non-trainable params
8.2 M     Total params
32.769    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/envs/from-scratch/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/opt/conda/envs/from-scratch/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


## Evaluation

Categorical accuracy ignores the padded indexes and checks for matches in the non padded ones. 

In [19]:
# numpy, torch eq is weird
def categorical_accuracy(preds, actual):
    non_pad = np.nonzero(actual != 1)
    matches = np.equal(preds[non_pad], actual[non_pad]).sum()
    return matches / actual[non_pad].shape[0]

In [20]:
def evaluate(dataloader):
    scores = list()

    for batch in tqdm(dataloader):
        words, labels, wlen, _ = batch
        words = words
        
        with torch.no_grad():
            logits = model(words, wlen)
            
        probas = logits.log_softmax(dim=-1)
        
        preds = probas.argmax(dim=-1)
        
        acc = categorical_accuracy(preds.numpy(), labels.numpy())
        scores.append(acc)
        
        
    print(torch.tensor(scores).mean(dim=-1))
    

# ================
evaluate(train_loader)
evaluate(val_loader)
evaluate(test_loader)

  0%|          | 0/82 [00:00<?, ?it/s]

tensor(0.9849, dtype=torch.float64)


  0%|          | 0/21 [00:00<?, ?it/s]

tensor(0.9307, dtype=torch.float64)


  0%|          | 0/12 [00:00<?, ?it/s]

tensor(0.9271, dtype=torch.float64)
