In [1]:
from datasets import load_dataset

dataset_name = "batterydata/pos_tagging"
training_dataset = load_dataset(dataset_name, split="train")
test_dataset = load_dataset(dataset_name, split="test")

In [2]:
# a dict containing word -> idx mapping
def create_word_indices(dataset):
    unique_words = set()
    word_to_idx = dict()
    # add an out of vocab token
    oov_token = "<OOV>"
    pad_token = "<PAD>"
    word_to_idx[oov_token] = 0
    word_to_idx[pad_token] = 1
    
    # find unique words
    for data in dataset:
        words = data["words"]
        for w in words:
            unique_words.add(w)
            
    # add index to them
    for idx, uw in enumerate(list(unique_words)):
        word_to_idx[uw] = idx + 2 # since oov is at 0 and pad at 1
        
    
    return word_to_idx


# ===============
word_to_idx = create_word_indices(training_dataset)

In [3]:
def create_label_to_idx(dataset):
    unique_labels = set()
    label_to_idx = dict()
    # add an out of vocab token
    oov_token = "<OOV>"
    pad_token = "<PAD>"
    label_to_idx[oov_token] = 0
    label_to_idx[pad_token] = 1
    
    # find the labels
    for data in dataset:
        labels = data["labels"]
        for l in labels:
            unique_labels.add(l)
            
    # index
    for idx, label in enumerate(list(unique_labels)):
        label_to_idx[label] = idx + 2
        
    return label_to_idx
    
label_to_idx = create_label_to_idx(training_dataset)

In [4]:


# for a single instance
def encode_data_instance(data, word_to_idx, label_to_idx):
    words = [
        word_to_idx.get(word, word_to_idx["<OOV>"]) for word in data["words"]
    ]
    
    labels = [
        label_to_idx[label] for label in data["labels"]
    ]
    
    return {
        "words": words,
        "labels": labels
    }
    

In [5]:
trainset = map(lambda data: encode_data_instance(data, word_to_idx, label_to_idx), training_dataset)
trainset = list(trainset)



testset = map(lambda data: encode_data_instance(
    data, word_to_idx, label_to_idx), test_dataset)
testset = list(testset)



In [6]:
# now to create the validation set
import numpy as np

def create_train_validation_splits(trainset, validation_ratio):
    validation_set_size = int(len(trainset) * validation_ratio)
    validation_indices = np.random.choice(len(trainset), replace=False, size=validation_set_size).tolist()
    
    # now to separate trainset indices
    trainset_indices = [i for i in range(len(trainset)) if i not in validation_indices]
    
    return trainset_indices, validation_indices


trainset_indices, validation_indices = create_train_validation_splits(trainset, 0.3)

print(len(trainset_indices))
print(len(validation_indices))


assert len(trainset_indices) + len(validation_indices) == len(trainset)

9138
3916


In [7]:
import torch
from torch.utils.data import Dataset

torch.manual_seed(2023)


class TagDataset(Dataset):
    def __init__(self, indices, dataset) -> None:
        self.indices = indices
        self.dataset = dataset
        
        
    def __len__(self):
        if self.indices is None:
            # this is for the test case
            return len(self.dataset)
        else:
            return len(self.indices)
        
    def __getitem__(self, index):
        if self.indices is None:
            idx = index
        else:
            idx = self.indices[index]
            
        data = self.dataset[idx]
        
        words = data["words"]
        labels = data["labels"]
        
        # padding to 300
        # pad token idx is 1
        # words = np.ones((300, ), dtype=np.int32)
        # words[:len(data["words"])] = data["words"] 
    
        
        # labels = np.ones((300, ), dtype=np.int32)
        # labels[:len(data["labels"])] = data["labels"]
        
        
        return torch.tensor(words).long(), torch.tensor(labels).long()

In [8]:
from torch.nn.utils.rnn import pad_sequence


def pad_collate(batch):
  (xx, yy) = zip(*batch)
  x_lens = [len(x) for x in xx]
  y_lens = [len(y) for y in yy]

  xx_pad = pad_sequence(xx, batch_first=True, padding_value=1)
  yy_pad = pad_sequence(yy, batch_first=True, padding_value=1)

  return xx_pad, yy_pad, x_lens, y_lens

In [9]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    TagDataset(trainset_indices, trainset), batch_size=128, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(
    TagDataset(validation_indices, trainset), batch_size=128, shuffle=False, collate_fn=pad_collate)
test_loader = DataLoader(
    TagDataset(None, testset), batch_size=128, shuffle=False, collate_fn=pad_collate)

In [10]:
# =========== test a dataloader ==========
for batch in train_loader:
    w, l, ws, ls = batch
    print(w.size())
    print(l.size())
    break

torch.Size([128, 56])
torch.Size([128, 56])


In [11]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [12]:
torch.set_float32_matmul_precision('high')

In [13]:
from lightning.pytorch import loggers as pl_loggers
from typing import Any
from lightning.pytorch.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
from tqdm.auto import trange, tqdm
from einops import rearrange
import lightning.pytorch as L
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTMTagger(L.LightningModule):
    def __init__(self, vocab_size, embedding_dimension, projection_dims, n_labels, pad_idx) -> None:
        super().__init__()
        
        # hparams
        self.vocab_size = vocab_size
        self.embedding_dimension = embedding_dimension
        self.projection_dims = projection_dims
        self.n_labels = n_labels
        self.pad_idx = pad_idx
        self.save_hyperparameters()
        
        # modules
        self.embedding = nn.Embedding(self.vocab_size, 
                                      self.embedding_dimension, 
                                      padding_idx=self.pad_idx)
        self.lstm = nn.LSTM(self.embedding_dimension, self.projection_dims, batch_first=True)        
        self.fc = nn.Linear(self.projection_dims, self.n_labels)
        
        # normal init
        self.__custom_init()
        self.embedding.weight.data[self.pad_idx] = torch.zeros(self.embedding_dimension, )
        
        self.dropout = nn.Dropout(0.2)
        
        
    def __custom_init(self):
        for p in self.parameters():
            nn.init.normal_(p.data, mean=0, std=0.1)
                
    def forward(self, x, xlen):
        out = self.embedding(x) 
        out = self.dropout(out)
        
        # this reshaping changed things. :3 Well hell pytorch
        # out = rearrange(out, "batch L embed -> batch embed L")
        
        # pack padded sequence
        out = pack_padded_sequence(out, xlen, batch_first=True, enforce_sorted=False)        
        out, _ = self.lstm(out)
        
        # back to padded 
        out, _ = pad_packed_sequence(out, batch_first=True, padding_value=1)
        
        out = self.fc(out)
        out = F.leaky_relu(out)
        
        
        return out
    
    def compute_loss(self, batch):
        words, labels, word_len, _ = batch
        
        logits = self(words, word_len)
        # reshape logits
        logits = rearrange(logits, "batch seq log -> batch log seq")
        
        loss = F.cross_entropy(logits, labels, ignore_index=self.pad_idx)
        return loss
    
    def configure_optimizers(self) -> OptimizerLRScheduler:
        return optim.AdamW(self.parameters())
    
    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self.compute_loss(batch)
        
        self.log("Loss/Train", loss, prog_bar=True, 
                 batch_size=batch[0].size(0))
        
        return {
            "loss": loss,
            "log": {
                "Loss/Train": loss
            }
        }
        
    
    def validation_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self.compute_loss(batch)

        self.log("Loss/Validation", loss, prog_bar=True,
                 batch_size=batch[0].size(0))

        return {
            "val_loss": loss,
            "log": {
                "Loss/Validation": loss
            }
        }
        
        

model = LSTMTagger(len(word_to_idx), 300, 300, len(label_to_idx), 1)
# with torch.no_grad():
#     for batch in train_loader:
#         loss = model.compute_loss(batch)    
#         print(loss)
#         break


# create a tensorboard logger

tb_logger = pl_loggers.TensorBoardLogger(save_dir="tb_logs/")
trainer = L.Trainer(logger=tb_logger,
                    max_epochs=10,
                    accelerator="gpu",
                    devices=1,
                    precision="bf16-mixed",
                    log_every_n_steps=50)

trainer.fit(model, train_loader, val_loader)

Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 7.5 M 
1 | lstm      | LSTM      | 722 K 
2 | fc        | Linear    | 15.1 K
3 | dropout   | Dropout   | 0     
----------------------------------------
8.2 M     Trainable params
0         Non-trainable params
8.2 M     Total params
32.769    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/shawon/miniconda3/envs/exp/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/home/shawon/miniconda3/envs/exp/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [14]:
# numpy, torch eq is weird
def categorical_accuracy(preds, actual):
    non_pad = np.nonzero(actual != 1)
    matches = np.equal(preds[non_pad], actual[non_pad]).sum()
    return matches / actual[non_pad].shape[0]

In [16]:
def evaluate(dataloader):
    scores = list()

    for batch in tqdm(dataloader):
        words, labels, wlen, _ = batch
        words = words
        
        with torch.no_grad():
            logits = model(words, wlen)
            
        probas = logits.log_softmax(dim=-1)
        
        preds = probas.argmax(dim=-1)
        
        acc = categorical_accuracy(preds.numpy(), labels.numpy())
        scores.append(acc)
        
        
    print(torch.tensor(scores).mean(dim=-1))
    

# ================
evaluate(train_loader)
evaluate(val_loader)
evaluate(test_loader)

  0%|          | 0/72 [00:00<?, ?it/s]

tensor(0.9942, dtype=torch.float64)


  0%|          | 0/31 [00:00<?, ?it/s]

tensor(0.9441, dtype=torch.float64)


  0%|          | 0/12 [00:00<?, ?it/s]

tensor(0.9343, dtype=torch.float64)
