In [1]:
from sys import path
from os.path import dirname, abspath

# Make the directory ABOVE the notebook's working directory importable.
# "__file__" is a plain string on purpose: abspath() resolves it against the
# current working directory, so the first dirname() yields the working
# directory and the second one yields its parent.
PROJECT_ROOT = dirname(dirname(abspath("__file__")))
path.append(PROJECT_ROOT)

In [2]:
# import statements
from transformers import AutoModel, get_cosine_schedule_with_warmup
from torch_optimizer import Ranger
from torchcrf import CRF
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from scripts.utils_bert import *
from scripts.metrics import f1score
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from multiprocessing import cpu_count
from os import environ
from platform import system

# Switch the tokenizers parallelism flag OFF (the value is "false").
# NOTE(review): presumably this avoids fork-related warnings once DataLoader
# worker processes are started — confirm.
environ["TOKENIZERS_PARALLELISM"] = "false"
# Fix the global seed; Lightning reports "Global seed set to 101" below.
pl.seed_everything(seed=101)

Global seed set to 101


101

In [3]:
# ---- optimisation hyperparameters ----
LEARNING_RATE = 5e-4
BATCH_SIZE = 10
WEIGHT_DECAY = 1e-1
EPOCHS = 25
# DataLoader worker count: every CPU core, except on Windows where 0 is used
# (data is then loaded in the main process).
N_JOBS = 0 if system() == "Windows" else cpu_count()

# ---- model / tagging configuration ----
BERT_TYPE = "facebook/bart-base"
# NOTE: the name contains "/", so the logger and checkpoint outputs end up in
# a nested "facebook/" directory (see the logger folder message further down).
MODEL_NAME = f"{BERT_TYPE}-ner"
# Tag -> index mapping; indices follow the order of the tuple below.
TAG2IDX = {tag: idx for idx, tag in enumerate(('B', 'I', 'O', 'E', 'S', '<', ">", "$"))}

In [4]:
class BERT_NER(pl.LightningModule):
    """Sequence tagger: pretrained transformer -> linear projection -> CRF.

    The backbone loaded from ``bert_type`` yields one hidden vector per token,
    ``fc`` maps each vector to ``num_tags`` emission scores, and the CRF layer
    turns the emissions into a loss (training) or a decoded tag sequence
    (evaluation / prediction).

    Args:
        bert_type: identifier handed to ``AutoModel.from_pretrained``.
        use_scheduler: when True, a cosine schedule with warmup is attached
            to the optimizer.
        num_tags: number of tags scored by the projection and the CRF.
        total_steps: number of scheduler (optimizer) steps to plan for when
            it cannot be derived from ``train_dataset`` (``train_dataset`` is
            None or the scheduler is disabled).
        train_dataset: dataset served by ``train_dataloader``.
        val_dataset: dataset served by ``val_dataloader``.
        test_dataset: dataset served by ``test_dataloader``.

    Every batch is unpacked as ``(input_ids, attention_mask, labels)``.
    """

    def __init__(self, 
                 bert_type=BERT_TYPE,
                 use_scheduler=True,
                 num_tags=len(TAG2IDX),
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):
        
        super().__init__()
        # NOTE: layers are created in the original order (backbone, CRF,
        # projection) so seeded parameter initialisation is unchanged.
        self.bert = AutoModel.from_pretrained(bert_type)
        self.crf = CRF(num_tags=num_tags, batch_first=True)
        # Take the projection width from the backbone config instead of
        # hard-coding 768; fall back to 768 if the config does not expose it.
        hidden_size = getattr(self.bert.config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, num_tags)
        self.use_scheduler = use_scheduler
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        # The scheduler is stepped once per optimizer step (see
        # configure_optimizers), so it has to span ALL optimizer steps of the
        # run: batches per epoch (rounded up, because drop_last=False keeps
        # the final partial batch) times the number of epochs.
        # Without a training set the caller-supplied `total_steps` is kept.
        if self.use_scheduler and train_dataset is not None:
            steps_per_epoch = (len(train_dataset) + self.batch_size - 1) // self.batch_size
            self.total_steps = max(1, steps_per_epoch * EPOCHS)


    def _make_dataloader(self, dataset, shuffle):
        """Wrap `dataset` in a DataLoader using the shared batch/worker settings."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=shuffle,
                          drop_last=False)


    # create the dataloaders
    # shuffling is enabled only for the training loader;
    # drop_last=False keeps the final partial batch in every loader
    def train_dataloader(self):
        """Return the shuffled loader over the training dataset."""
        return self._make_dataloader(self.train_dataset, shuffle=True)


    def val_dataloader(self):
        """Return the (unshuffled) loader over the validation dataset."""
        return self._make_dataloader(self.val_dataset, shuffle=False)


    def test_dataloader(self):
        """Return the (unshuffled) loader over the test dataset."""
        return self._make_dataloader(self.test_dataset, shuffle=False)
    

    def forward(self, input_ids, attention_masks):
        """Return per-token emission scores for the CRF.

        Runs the backbone, takes its ``last_hidden_state`` and projects it
        with ``fc``.
        # assumes the result is (batch, seq_len, num_tags) since the CRF is
        # built with batch_first=True — TODO confirm
        """
        hidden = self.bert(input_ids, attention_masks).last_hidden_state
        return self.fc(hidden)

    
    def _shared_evaluation_step(self, batch, batch_idx):
        """Compute the loss and tagging metrics for one batch.

        Returns:
            ``(loss, r, p, f1)`` where ``loss`` is the negated CRF score of
            the gold labels and ``r, p, f1`` are whatever ``f1score`` returns
            for the decoded tags (logged as recall / precision / f1score).
        """
        ids, masks, lbls = batch
        emissions = self(ids, masks)
        # the CRF call returns a log-likelihood; negate it to get a loss
        loss = -self.crf(emissions, lbls, mask=masks)
        pred = self.crf.decode(emissions, mask=masks)
        r, p, f1 = f1score(lbls, pred, model="transformer")
        return loss, r, p, f1


    def _log_metrics(self, stage, loss, r, p, f1):
        """Log the four metrics under ``<stage>_*`` keys, aggregated per epoch."""
        for name, value in (("loss", loss),
                            ("recall", r),
                            ("precision", p),
                            ("f1score", f1)):
            self.log(f"{stage}_{name}", value, on_step=False, on_epoch=True, prog_bar=True)


    def training_step(self, batch, batch_idx):
        """Log the training metrics and return the loss to optimise."""
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self._log_metrics("train", loss, r, p, f1)
        return loss


    def validation_step(self, batch, batch_idx):
        """Log the validation metrics (``val_f1score`` is what the callbacks monitor)."""
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self._log_metrics("val", loss, r, p, f1)

    
    def test_step(self, batch, batch_idx):
        """Log the test metrics."""
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self._log_metrics("test", loss, r, p, f1)


    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the CRF-decoded tags for a batch; the labels entry is ignored."""
        ids, masks, _ = batch 
        return self.crf.decode(self(ids, masks), mask=masks)
    
    
    def configure_optimizers(self):      
        """Build the Ranger optimizer and, optionally, the cosine LR schedule.

        Returns:
            ``[optimizer]`` when the scheduler is disabled, otherwise
            ``([optimizer], [lr_scheduler_config])``.
        """
        optimizer = Ranger(self.parameters(), 
                           lr=self.learning_rate,
                           weight_decay=self.weight_decay)

        if not self.use_scheduler:
            return [optimizer]

        scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                    num_warmup_steps=1,
                                                    num_training_steps=self.total_steps)
        # Step the schedule per optimizer step. `total_steps` is counted in
        # batches; stepping it per EPOCH (as before) left the learning rate
        # at 0 for the whole first epoch (one warmup "step") and then almost
        # flat, because only a handful of scheduler steps ever happened.
        lr_scheduler = {
            'scheduler': scheduler, 
            'interval': 'step', 
            'frequency': 1
        }
        return [optimizer], [lr_scheduler]

In [5]:
# Encode the training file: token ids / attention masks plus the extended labels.
# Per the helper's contract, the extended labels also carry START, END and PAD
# entries, and a word split by the tokenizer passes its label to every piece.
encoded_input, extended_labels = get_encoded_input(
    "../../data/train_290818.txt",
    tag2idx=TAG2IDX,
    tokenizer_name=BERT_TYPE,
)

L = len(extended_labels)

# Bundle ids, masks and labels into one dataset so that DataLoaders can
# serve them in aligned batches.
dataset = TensorDataset(
    torch.LongTensor(encoded_input["input_ids"]),
    torch.BoolTensor(encoded_input["attention_mask"]),
    torch.LongTensor(extended_labels),
)

# Hold out a random 10% for validation; the training part takes the rest so
# that the two sizes always add up to L.
val_sz = int(0.1*L)
train_sz = L - val_sz
train_dataset, val_dataset = random_split(dataset, (train_sz, val_sz))

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

In [6]:
# Encode the test file the same way as the training file and wrap the
# resulting ids, masks and labels in a dataset (no split is needed here).
encoded_input, extended_labels = get_encoded_input(
    "../../data/test_290818.txt",
    tag2idx=TAG2IDX,
    tokenizer_name=BERT_TYPE,
)

test_dataset = TensorDataset(
    torch.LongTensor(encoded_input["input_ids"]),
    torch.BoolTensor(encoded_input["attention_mask"]),
    torch.LongTensor(extended_labels),
)

In [7]:
# Build the model; it owns the three datasets and serves its own dataloaders.
model = BERT_NER(
    bert_type=BERT_TYPE,
    use_scheduler=True,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
)

# Stop once val_f1score has failed to improve by at least 1e-4 for 5 checks.
earlystopping_callback = EarlyStopping(
    monitor="val_f1score",
    mode="max",
    min_delta=1e-4,
    patience=5,
)

# Keep only the weights of the single best model, ranked by val_f1score.
checkpoint_callback = ModelCheckpoint(
    dirpath="../saved_weights",
    filename=MODEL_NAME,
    monitor="val_f1score",
    mode="max",
    save_top_k=1,
    save_weights_only=True,
)

logger = TensorBoardLogger("../../tb_logs", name=MODEL_NAME)

# precision=16 turns on 16-bit automatic mixed precision (see the trainer
# banner below), which is faster and needs less memory.
# STRONGLY RECOMMENDED TO RUN THIS CODE ON A GOOD GPU
trainer = pl.Trainer(
    accelerator="gpu",
    precision=16,
    max_epochs=EPOCHS,
    logger=logger,
    log_every_n_steps=1,
    callbacks=[earlystopping_callback, checkpoint_callback],
)

Downloading pytorch_model.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the model. No dataloaders are passed here: the trainer takes them
# from the model's own train_dataloader() / val_dataloader() methods.
trainer.fit(model)

Missing logger folder: ../../tb_logs/facebook/bart-base-ner
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type      | Params
-----------------------------------
0 | bert | BartModel | 139 M 
1 | crf  | CRF       | 80    
2 | fc   | Linear    | 6.2 K 
-----------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
278.853   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1174.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# Re-load the best weights and evaluate on the test set.
# Ask the checkpoint callback where the best checkpoint of THIS run was
# written: when a file with the configured name already exists (e.g. from an
# earlier run) Lightning saves under a versioned name ("-v1", "-v2", ...), so
# the fixed path could silently load stale weights. The fixed path is kept as
# a fallback for the case where the callback has not recorded anything.
best_ckpt_path = checkpoint_callback.best_model_path or f"../saved_weights/{MODEL_NAME}.ckpt"
model.load_state_dict(torch.load(best_ckpt_path)["state_dict"])
trainer.test(model)