In [1]:
from transformers import AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torchcrf import CRF
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from utils_bert import get_encoded_input
from metrics import f1score
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from multiprocessing import cpu_count
from os import environ
from platform import system

environ["TOKENIZERS_PARALLELISM"] = "false"
pl.seed_everything(seed=101)

  _CPU_DEVICES = ("cpu", torch.device("cpu"))
Global seed set to 101


101

In [2]:
LEARNING_RATE = 2.75e-5
BATCH_SIZE = 20
WEIGHT_DECAY = 1e-2
EPOCHS = 25
N_JOBS = cpu_count() if system() != "Windows" else 0

BERT_TYPE = "roberta-base"
# tag2idx = {'B': 0, 'I': 1, 'O': 2, 'E': 3, 'S': 4, 'X': 5}
tag2idx = {'B': 0, 'I': 1, 'O': 2, 'E': 3, 'S': 4, '<': 5, ">":6, "$": 7}

In [3]:
class BERT4NER(pl.LightningModule):
    def __init__(self, 
                 bert_type=BERT_TYPE, 
                 num_tags=len(tag2idx), 
                 warmup_steps=0, 
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):
        
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_type)
        self.crf = CRF(num_tags=num_tags, batch_first=True)
        self.fc = nn.Linear(768, num_tags)
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset


    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=N_JOBS,
                          drop_last=False)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          drop_last=False)


    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          drop_last=False)
        

    def forward(self, input_ids, attention_masks):
        out = self.bert(input_ids, attention_masks).last_hidden_state
        out = self.fc(out)
        return out

    
    def _shared_evaluation_step(self, batch, batch_idx):
        ids, masks, lbls = batch
        emissions = self(ids, masks)
        loss = -self.crf(emissions, lbls, mask=masks)
        pred = self.crf.decode(emissions, mask=masks)
        r, p, f1 = f1score(lbls, pred)
        return loss, r, p, f1


    def training_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)
        return loss


    def validation_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)

    
    def test_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)


    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        ids, masks = batch 
        return self.crf.decode(self(ids, masks), mask=masks)
    
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), 
                          lr=self.learning_rate,
                          amsgrad=True,
                          weight_decay=self.weight_decay)

        scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                    num_warmup_steps=self.warmup_steps,
                                                    num_training_steps=self.total_steps)

        lr_scheduler = {
            'scheduler': scheduler, 
            'interval': 'epoch', 
            'frequency': 1
        }
        
        return [optimizer], [lr_scheduler]

In [4]:
encoded_input, extended_labels = get_encoded_input("../data/train_290818.txt", tag2idx=tag2idx, tokenizer_name=BERT_TYPE)

L = len(extended_labels)

dataset = TensorDataset(torch.LongTensor(encoded_input["input_ids"]),
                        torch.BoolTensor(encoded_input["attention_mask"]),
                        torch.LongTensor(extended_labels))

train_sz, val_sz = L-int(0.1*L), int(0.1*L)
train_dataset, val_dataset = random_split(dataset, (train_sz, val_sz))

In [5]:
encoded_input, extended_labels = get_encoded_input("../data/test_290818.txt", tag2idx=tag2idx, tokenizer_name=BERT_TYPE)

test_dataset = TensorDataset(torch.LongTensor(encoded_input["input_ids"]),
                             torch.BoolTensor(encoded_input["attention_mask"]),
                             torch.LongTensor(extended_labels))

In [6]:
WARMUP_RATIO = 0.05
TOTAL_STEPS = len(train_dataset) // BATCH_SIZE
WARMUP_STEPS = int(WARMUP_RATIO * TOTAL_STEPS)

In [7]:
model = BERT4NER(bert_type=BERT_TYPE,
                 warmup_steps=WARMUP_STEPS,
                 total_steps=TOTAL_STEPS,
                 train_dataset=train_dataset,
                 val_dataset=val_dataset,
                 test_dataset=test_dataset)


earlystopping_callback = EarlyStopping(monitor="val_f1score", 
                                       min_delta=1e-4, 
                                       patience=5, 
                                       mode="max")

checkpoint_callback = ModelCheckpoint(dirpath="./",
                                      filename=f"{BERT_TYPE}-ner-val-f1score",
                                      save_top_k=1, 
                                      mode="max",
                                      monitor="val_f1score",
                                      save_weights_only=True)

trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=16,
                     log_every_n_steps=1,
                     callbacks=[earlystopping_callback, checkpoint_callback])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
trainer.fit(model)

Missing logger folder: d:\Suggestion-Mining-from-Noisy-Data\src_feat\lightning_logs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type         | Params
--------------------------------------
0 | bert | RobertaModel | 124 M 
1 | crf  | CRF          | 80    
2 | fc   | Linear       | 6.2 K 
--------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
249.304   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [9]:
model.load_state_dict(torch.load(f"./{BERT_TYPE}-ner-val-f1score.ckpt")["state_dict"])
trainer.test(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 144.6875,
  'test_recall': 0.6255844831466675,
  'test_precision': 0.4905465841293335,
  'test_f1score': 0.5408464670181274}]