In [1]:
from sys import path
from os.path import dirname, abspath
path.append(dirname(dirname(abspath("__file__"))))

In [2]:
from torchcrf import CRF
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from scripts.utils import *
from scripts.metrics import f1score
from scripts.transformer_elements import PositionalEncoding, EncoderLayer, EncoderBlock
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from multiprocessing import cpu_count
from platform import system

pl.seed_everything(seed=42)

Global seed set to 42


42

In [3]:
LEARNING_RATE = 5e-1
BATCH_SIZE = 256
WEIGHT_DECAY = 1e-1
EPOCHS = 25
MAX_LEN = None
N_JOBS = cpu_count() if system() != "Windows" else 0
MODEL_NAME = "transformer-encoder-ner"

TAG2IDX = {'B': 0, 'I': 1, 'O': 2, 'E': 3, 'S': 4, '<': 5, '>': 6, '$': 7}

In [4]:
class TRANSFORMER(pl.LightningModule):
    def __init__(self, 
                 input_dim,
                 d_model=512,
                 n_heads=16,
                 n_layers=6,
                 dropout=0.5,
                 use_scheduler=True,
                 num_tags=len(TAG2IDX),
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):
        
        super().__init__()
        self.crf = CRF(num_tags=num_tags, batch_first=True)
        self.fc = nn.Linear(d_model, num_tags)
        self.use_scheduler = use_scheduler
        
        self.embedding = nn.Embedding(num_embeddings=input_dim, 
                                      embedding_dim=d_model, 
                                      padding_idx=0)
        
        self.pos_encoding = PositionalEncoding(d_model=d_model)
        
        self.encoder_layer = EncoderLayer(d_model=d_model, 
                                          d_fc=d_model*4, 
                                          n_heads=n_heads,
                                          dropout=dropout)
        
        self.encoder_block = EncoderBlock(encoder_layer=self.encoder_layer,
                                          n_layers=n_layers)
        
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        if self.use_scheduler: 
            self.total_steps = len(train_dataset) // self.batch_size


    # create the dataloaders
    # add shuffle only for train_dataloader
    # make sure num_workers is set appropriately and drop_last is set to False
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=True,
                          drop_last=False)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)


    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)
    

    def forward(self, input_ids, masks):
        out = self.embedding(input_ids)
        out = self.pos_encoding(out)
        out = self.encoder_block(out, mask=~masks)
        out = self.fc(out)
        return out

    
    def _shared_evaluation_step(self, batch, batch_idx):
        ids, masks, lbls = batch
        emissions = self(ids, masks)
        loss = -self.crf(emissions, lbls, mask=masks)
        pred = self.crf.decode(emissions, mask=masks)
        r, p, f1 = f1score(lbls, pred)
        return loss, r, p, f1


    def training_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)
        return loss


    def validation_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)

    
    def test_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)


    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        ids, masks, _ = batch 
        return self.crf.decode(self(ids, masks), mask=masks)
    
    
    def configure_optimizers(self):           
        optimizer = AdamW(self.parameters(), 
                          lr=self.learning_rate,
                          weight_decay=self.weight_decay,
                          amsgrad=True)

        if self.use_scheduler:
            scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                        num_warmup_steps=1,
                                                        num_training_steps=self.total_steps)
            lr_scheduler = {
                'scheduler': scheduler,
                'interval': 'epoch', 
                'frequency': 1
            }
            return [optimizer], [lr_scheduler]
        else:
            return [optimizer]

In [5]:
with open("../../data/full_vocab_ner.txt", mode="r", encoding="utf-8") as f:
    vocab = [s.strip() for s in f.readlines()]
    VOCAB2IDX = {v:k for (k, v) in enumerate(vocab)}

In [6]:
encoded_input, masks, extended_labels = get_encoded_input("../../data/train_290818.txt", 
                                                          tag2idx=TAG2IDX,
                                                          vocab2idx=VOCAB2IDX,
                                                          maxlen=MAX_LEN)

L = len(extended_labels)

dataset = TensorDataset(torch.LongTensor(encoded_input),
                        torch.BoolTensor(masks),
                        torch.LongTensor(extended_labels))

train_sz, val_sz = L-int(0.1*L), int(0.1*L)
train_dataset, val_dataset = random_split(dataset, (train_sz, val_sz))                                                                                                                  

In [7]:
encoded_input, masks, extended_labels = get_encoded_input("../../data/test_290818.txt", 
                                                          tag2idx=TAG2IDX,
                                                          vocab2idx=VOCAB2IDX,
                                                          maxlen=MAX_LEN)

test_dataset = TensorDataset(torch.LongTensor(encoded_input),
                             torch.BoolTensor(masks),
                             torch.LongTensor(extended_labels))

In [8]:
model = TRANSFORMER(input_dim=len(VOCAB2IDX),
                    train_dataset=train_dataset,
                    val_dataset=val_dataset,
                    test_dataset=test_dataset,
                    use_scheduler=True)

earlystopping_callback = EarlyStopping(monitor="val_f1score", 
                                       min_delta=1e-4, 
                                       patience=EPOCHS,
                                       mode="max")

checkpoint_callback = ModelCheckpoint(dirpath="../saved_weights",
                                      filename=MODEL_NAME,
                                      save_top_k=1, 
                                      mode="max",
                                      monitor="val_f1score",
                                      save_weights_only=True)

logger = TensorBoardLogger("../../tb_logs", name=MODEL_NAME)

trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=16,
                     logger=logger,
                     log_every_n_steps=1,
                     callbacks=[earlystopping_callback, 
                                checkpoint_callback])

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [9]:
trainer.fit(model)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type               | Params
-----------------------------------------------------
0 | crf           | CRF                | 80    
1 | fc            | Linear             | 4.1 K 
2 | embedding     | Embedding          | 2.5 M 
3 | pos_encoding  | PositionalEncoding | 0     
4 | encoder_layer | EncoderLayer       | 3.2 M 
5 | encoder_block | EncoderBlock       | 3.2 M 
-----------------------------------------------------
5.6 M     Trainable params
0         Non-trainable params
5.6 M     Total params
11.214    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1174.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [10]:
model.load_state_dict(torch.load(f"../saved_weights/{MODEL_NAME}.ckpt")["state_dict"])
trainer.test(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_f1score                  0.0
        test_loss            2281.76416015625
     test_precision                 0.0
       test_recall                  0.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 2281.76416015625,
  'test_recall': 0.0,
  'test_precision': 0.0,
  'test_f1score': 0.0}]