In [1]:
from torchcrf import CRF
import torch
import torch.nn as nn
from torch_optimizer import SGDW, Lookahead
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from utils import get_encoded_input
from metrics import f1score
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from multiprocessing import cpu_count
from platform import system
from os import environ

environ["TOKENIZERS_PARALLELISM"] = "false"
pl.seed_everything(seed=42)

Global seed set to 42


42

In [2]:
LEARNING_RATE = 1e-3
BATCH_SIZE = 64
WEIGHT_DECAY = 5e-7
EPOCHS = 25
MAX_LEN = 128
N_JOBS = cpu_count() if system() != "Windows" else 0

tag2idx = {'B': 0, 'I': 1, 'O': 2, 'E': 3, 'S': 4, 'X': 5}

In [3]:
class LSTM4NER(pl.LightningModule):
    def __init__(self, 
                 input_dim, 
                 bidirectional=False, 
                 num_lstm_layers=3,
                 embed_dim=128, 
                 dropout=0.1, 
                 lstm_dim=128,
                 num_tags=len(tag2idx),
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):

        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embed_dim,
                                      padding_idx=0)
        

        c = (2 if bidirectional else 1)
        self.lstm = nn.LSTM(input_size=embed_dim, 
                            hidden_size=lstm_dim, 
                            dropout=dropout,
                            num_layers=num_lstm_layers, 
                            bidirectional=bidirectional)

        
        self.fc = nn.Linear(lstm_dim*c, num_tags)
        self.crf = CRF(num_tags=num_tags, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset

    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=N_JOBS,
                          drop_last=False)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          drop_last=False)


    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          drop_last=False)


    def forward(self, input_ids):
        out = self.embedding(input_ids)
        out, _ = self.lstm(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out


    def _shared_evaluation_step(self, batch, batch_idx):
        ids, masks, lbls = batch
        emissions = self(ids)
        loss = -self.crf(emissions, lbls, mask=masks)
        pred = self.crf.decode(emissions, mask=masks)
        r, p, f1 = f1score(lbls, pred)
        return loss, r, p, f1
     
        
    def training_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)
        return loss


    def validation_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)

    
    def test_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)


    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        ids, masks, _ = batch
        return self.crf.decode(self(ids), mask=masks)


    def configure_optimizers(self):   
        return Lookahead(SGDW(self.parameters(),
                              lr=self.learning_rate,
                              momentum=0.9,
                              nesterov=True,
                              weight_decay=self.weight_decay))

In [4]:
with open("../../data/full_vocab_290818_tree_bank_tokenier.txt", "r") as f:
    vocab = [s.strip() for s in f.readlines()]

In [5]:
encoded_input, attn_masks, extended_labels = get_encoded_input("../../data/train_290818.txt", 
                                                               tag2idx=tag2idx,
                                                               vocab=vocab,
                                                               visualize=True,
                                                               max_len=MAX_LEN)

input_ids_train, input_ids_val, attn_masks_train, attn_masks_val, extended_labels_train, extended_labels_val = train_test_split(encoded_input,
                                                                                                                                attn_masks,
                                                                                                                                extended_labels, 
                                                                                                                                test_size=0.1, 
                                                                                                                                shuffle=True) 

input_ids_train = torch.LongTensor(input_ids_train)
attn_masks_train = torch.BoolTensor(attn_masks_train)
extended_labels_train = torch.LongTensor(extended_labels_train)

input_ids_val = torch.LongTensor(input_ids_val)
attn_masks_val = torch.BoolTensor(attn_masks_val)
extended_labels_val = torch.LongTensor(extended_labels_val)

train_dataset = TensorDataset(input_ids_train, attn_masks_train, extended_labels_train)
val_dataset = TensorDataset(input_ids_val, attn_masks_val, extended_labels_val)                                                                                                                        

count     2187.000000
mean        18.227252
std         14.683643
min          4.000000
25%         10.000000
50%         14.000000
75%         21.000000
90%         32.000000
95%         40.700000
99%         82.420000
99.9%      152.512000
99.99%     200.758400
max        213.000000
Name: seq_len, dtype: float64


In [6]:
encoded_input, attn_masks, extended_labels = get_encoded_input("../../data/test_290818.txt", 
                                                               tag2idx=tag2idx,
                                                               vocab=vocab,
                                                               visualize=True,
                                                               max_len=MAX_LEN)

input_ids_test = torch.LongTensor(encoded_input)
attn_masks_test = torch.BoolTensor(attn_masks)
extended_labels_test = torch.LongTensor(extended_labels) 

test_dataset = TensorDataset(input_ids_test, attn_masks_test, extended_labels_test)

count     547.00000
mean       17.08958
std        12.16015
min         4.00000
25%         9.00000
50%        13.00000
75%        20.50000
90%        30.40000
95%        39.00000
99%        65.70000
99.9%      95.08600
99.99%     99.50860
max       100.00000
Name: seq_len, dtype: float64


In [7]:
model = LSTM4NER(input_dim=len(vocab),
                 bidirectional=False,
                 train_dataset=train_dataset,
                 val_dataset=val_dataset,
                 test_dataset=test_dataset)

earlystopping_callback = EarlyStopping(monitor="val_f1score", 
                                       min_delta=1e-4, 
                                       patience=5, 
                                       mode="max")

checkpoint_callback = ModelCheckpoint(dirpath="./",
                                      filename=f"lstm-ner-val-f1score",
                                      save_top_k=1, 
                                      mode="max",
                                      monitor="val_f1score",
                                      save_weights_only=True)

trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=16,
                     log_every_n_steps=1,
                     callbacks=[earlystopping_callback, checkpoint_callback])

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
trainer.fit(model)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 612 K 
1 | lstm      | LSTM      | 396 K 
2 | fc        | Linear    | 774   
3 | crf       | CRF       | 48    
4 | dropout   | Dropout   | 0     
----------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
2.019     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

	add(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1174.)
  d_p = d_p.add(momentum, buf)


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [9]:
model.load_state_dict(torch.load(f"./lstm-ner-val-f1score.ckpt")["state_dict"])
trainer.test(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 221.8532257080078,
  'test_recall': 0.25784752535587896,
  'test_precision': 0.3982073471248358,
  'test_f1score': 0.31143996928350354}]