In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# further down all live under this mount point (Colab-only cell).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Fri Aug 27 12:02:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning

[K     |████████████████████████████████| 2.6 MB 7.4 MB/s 
[K     |████████████████████████████████| 636 kB 89.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 54.5 MB/s 
[K     |████████████████████████████████| 895 kB 68.2 MB/s 
[K     |████████████████████████████████| 918 kB 7.8 MB/s 
[K     |████████████████████████████████| 829 kB 40.2 MB/s 
[K     |████████████████████████████████| 118 kB 83.8 MB/s 
[K     |████████████████████████████████| 272 kB 81.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 50.3 MB/s 
[K     |████████████████████████████████| 142 kB 74.5 MB/s 
[K     |████████████████████████████████| 294 kB 80.4 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

# Render matplotlib figures inline, at retina resolution.
%matplotlib inline  
%config InlineBackend.figure_format='retina'

# Seed handed to pl.seed_everything at the end of this cell.
RANDOM_SEED = 42

# Global plot styling: seaborn theme, a custom six-colour palette and a 12x8 default figure size.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Seed the random number generators through Lightning's helper (the recorded output echoes the seed).
pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

In [None]:
# import dataset
# Absolute path (same root as the validation-set cell) so this cell keeps working
# after the notebook later changes its working directory to the checkpoint folder.
train_df = pd.read_csv('/content/drive/MyDrive/Biocreative/Biocreative/datasets/TRAIN.csv')
# Shuffle the rows once; the explicit random_state makes a re-run of this cell
# produce the same order instead of depending on the global RNG state.
train_df = train_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
train_df.head(), train_df.shape

(       pmid  ... Treatment
 0  33953243  ...         0
 1  33734063  ...         0
 2  32304745  ...         0
 3  34006330  ...         1
 4  32496253  ...         0
 
 [5 rows x 12 columns], (30738, 12))

In [None]:
# Every column from the sixth one onward is treated as a label column.
LABEL_COLUMNS = list(train_df.columns[5:])

In [None]:
# Sanity check: list the label names selected above.
print(LABEL_COLUMNS)

['Case Report', 'Diagnosis', 'Epidemic Forecasting', 'Mechanism', 'Prevention', 'Transmission', 'Treatment']


In [None]:
# Checkpoint used for both the tokenizer and the encoders.
# It is fixed here instead of being typed into an input() prompt so that
# "Restart & Run All" is non-interactive and reproducible; the value is the one
# that was entered in the recorded run.
# Other checkpoints noted by the author:
#   dmis-lab/biobert-base-cased-v1.1
#   lordtt13/COVID-SciBERT
BERT_MODEL_NAME = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

Enter Bert Model Name: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext


Downloading:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
# Column-wise sum of each label column, shown in ascending order.
train_df.loc[:, LABEL_COLUMNS].sum(axis=0).sort_values(ascending=True)

Epidemic Forecasting     1528
Case Report              2291
Transmission             3165
Mechanism                5414
Diagnosis                7456
Treatment               10091
Prevention              13998
dtype: int64

In [None]:
# Token length limit passed as `max_token_len` to the dataset / data module below.
MAX_TOKEN_COUNT = 512

In [None]:
class TopicAnnotationDataset(Dataset):
  """Torch dataset that tokenizes the abstract and the title of each row.

  Each item is a dict holding the raw texts, the padded/truncated token ids and
  attention masks for the abstract and for the title, and the label vector read
  from the module-level `LABEL_COLUMNS`.

  Args:
    data: dataframe with `abstract`, `title` and the label columns.
    tokenizer: tokenizer exposing `encode_plus`.
    max_token_len: fixed length of the abstract encoding.
    title_max_token_len: fixed length of the title encoding.
  """

  def __init__(
    self,
    data: pd.DataFrame,
    tokenizer: AutoTokenizer,
    max_token_len: int = 512,
    title_max_token_len: int = 128
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len
    self.title_max_token_len = title_max_token_len

  def __len__(self):
    return len(self.data)

  def _encode(self, text, max_length):
    """Tokenize `text`, padding/truncating to exactly `max_length` tokens."""
    return self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

  # `index: int` is an annotation; the original `index=int` made the *type* a default value.
  def __getitem__(self, index: int):

    data_row = self.data.iloc[index]

    abstract_text = data_row.abstract
    title_text = data_row.title
    labels = data_row[LABEL_COLUMNS]

    encoding = self._encode(abstract_text, self.max_token_len)
    title_encoding = self._encode(title_text, self.title_max_token_len)

    return dict(
        abstract_text=abstract_text,
        title_text=title_text,
        # return_tensors='pt' adds a leading batch axis; flatten drops it again.
        input_ids=encoding["input_ids"].flatten(),
        attention_mask=encoding["attention_mask"].flatten(),
        title_input_ids=title_encoding["input_ids"].flatten(),
        title_attention_mask=title_encoding["attention_mask"].flatten(),
        # Convert explicitly to float32 instead of handing a pandas Series
        # straight to the tensor constructor.
        labels=torch.tensor(labels.to_numpy(dtype="float32"))
    )

In [None]:
# Stand-alone dataset over the training frame, using the notebook-wide token limit.
train_dataset = TopicAnnotationDataset(
    data=train_df,
    tokenizer=tokenizer,
    max_token_len=MAX_TOKEN_COUNT,
)

In [None]:
class TopicAnnotationDataModule(pl.LightningDataModule):
  """Lightning data module built from a training frame and an evaluation frame.

  `val_dataloader` and `test_dataloader` both serve the dataset built from
  `test_df`; only the training loader shuffles.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    # Both datasets share the tokenizer and the abstract length limit.
    def make_dataset(frame):
      return TopicAnnotationDataset(frame, self.tokenizer, self.max_token_len)

    self.train_dataset = make_dataset(self.train_df)
    self.test_dataset = make_dataset(self.test_df)

  def _eval_loader(self):
    """Unshuffled loader over the evaluation dataset."""
    return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2)

  def val_dataloader(self):
    return self._eval_loader()

  def test_dataloader(self):
    return self._eval_loader()

In [None]:
  # Show the current working directory.
  %pwd

'/content'

In [None]:
# Read the validation split, shuffle its rows and renumber the index.
val_df = (
    pd.read_csv("/content/drive/MyDrive/Biocreative/Biocreative/datasets/VAL.csv")
    .sample(frac=1)
    .reset_index(drop=True)
)
val_df.shape

(3423, 12)

In [None]:
# Training hyperparameters.
N_EPOCHS = 10
BATCH_SIZE = 8

# The validation frame fills the data module's `test_df` slot, which backs
# both its validation and test loaders.
data_module = TopicAnnotationDataModule(
    train_df=train_df,
    test_df=val_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_COUNT,
)

In [None]:
# Per-label weight = (count of the most frequent label) / (count of this label),
# so the most frequent label gets weight 1.0 and rarer labels get larger weights.
Label_count = train_df[LABEL_COLUMNS].sum().to_dict()
count = [Label_count[label] for label in Label_count]
max_val = max(count)
class_weight = list(map(lambda val: max_val / val, count))
print(class_weight)

[6.109995635093846, 1.8774141630901287, 9.160994764397905, 2.5855190247506465, 1.0, 4.4227488151658765, 1.3871766921018729]


In [None]:
# Show each label next to its weight.
{label: weight for label, weight in zip(LABEL_COLUMNS, class_weight)}

{'Case Report': 6.109995635093846,
 'Diagnosis': 1.8774141630901287,
 'Epidemic Forecasting': 9.160994764397905,
 'Mechanism': 2.5855190247506465,
 'Prevention': 1.0,
 'Transmission': 4.4227488151658765,
 'Treatment': 1.3871766921018729}

In [None]:
# Turn the per-label weights into a float32 tensor, on the GPU when one is
# available and on the CPU otherwise (the original hard-coded "cuda").
# `torch.as_tensor` keeps the cell re-runnable: if `class_weight` is already a
# tensor (cell executed twice) it is reused rather than copy-constructed.
_class_weight_device = "cuda" if torch.cuda.is_available() else "cpu"
class_weight = torch.as_tensor(class_weight, dtype=torch.float32, device=_class_weight_device)

In [None]:
class FFN(nn.Module):
  """Two-layer feed-forward block: Linear -> Dropout -> ReLU -> Linear.

  The hidden layer keeps the input width (`in_feat`); only the second
  projection changes the size, to `out_feat`.
  """

  def __init__(self, in_feat, out_feat, dropout):
    super().__init__()
    # Same creation order as before, so seeded weight initialisation is unchanged.
    self.in2hid = nn.Linear(in_features=in_feat, out_features=in_feat)
    self.hid2out = nn.Linear(in_features=in_feat, out_features=out_feat)

    self.activation = nn.ReLU()
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, input):
    projected = self.in2hid(input)
    # Dropout acts on the pre-activation values, then ReLU is applied.
    hidden = self.activation(self.dropout(projected))
    return self.hid2out(hidden)

In [None]:
class TopicAnnotationTagger(pl.LightningModule):
  """Multi-label topic classifier that fuses an abstract encoding with a title encoding.

  Two encoders loaded from `BERT_MODEL_NAME` produce pooled vectors for the
  abstract and the title. The vectors are concatenated, passed through a gate
  layer and a feed-forward block, and projected to `n_classes` sigmoid outputs.
  The loss is a weighted binary cross-entropy using the module-level `class_weight`.

  Args:
    n_classes: number of output labels.
    n_training_steps: total optimizer steps, for the linear LR schedule.
    n_warmup_steps: warm-up steps of the LR schedule.
    ffn_dim: output width of the gate layer / input width of the FFN.
    out_feat: output width of the FFN / input width of the classifier.
    dropout: dropout probability for the gate output and inside the FFN.
  """

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None, ffn_dim: int = 512, out_feat: int = 768, dropout=0.2):

    super().__init__()
    self.bert = AutoModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.bert_title = AutoModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss(weight=class_weight)
    self.sigmoid = nn.Sigmoid()

    # angry-bert layers
    self.ffn_dim = ffn_dim
    self.out_feat = out_feat
    self.dropout = nn.Dropout(p=dropout)
    self.activation = nn.ReLU()

    # Input is the concatenation of the two pooled encoder outputs.
    self.gate = nn.Linear(
        (self.bert.config.hidden_size + self.bert_title.config.hidden_size),
        self.ffn_dim
    )

    # Honour the `dropout` argument (previously fixed at 0.2, which is also the default).
    self.ffn = FFN(self.ffn_dim, out_feat, dropout=dropout)
    self.classifier = nn.Linear(self.out_feat, n_classes)

  def forward(self, input_ids, attention_mask, title_input_ids, title_attention_mask, labels=None):
    """Return `(loss, probabilities)`; `loss` is 0 when `labels` is None."""

    bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
    # pooler_output (batch_size x hidden_size)
    encoded_output_bert = bert_outputs.pooler_output

    # Bug fix: the title goes through its own encoder and its own pooled output
    # is used. Previously the title tokens were fed to `self.bert` and the
    # abstract's pooled output was concatenated with itself, so the title never
    # influenced the prediction. Checkpoints trained before this fix were
    # trained on that duplicated-abstract input.
    bert_title_outputs = self.bert_title(title_input_ids, attention_mask=title_attention_mask)
    # pooler_output (batch_size x hidden_size)
    encoded_output_bert_title = bert_title_outputs.pooler_output

    # gate-fusion
    gatein = torch.cat((encoded_output_bert, encoded_output_bert_title), dim=-1)
    chosen = self.activation(self.dropout(self.gate(gatein)))

    res = self.ffn(chosen)

    # classification
    output = self.classifier(res)
    output = torch.sigmoid(output)

    loss = 0
    if labels is not None:
      loss = self.criterion(output, labels)
    return loss, output

  def _shared_step(self, batch):
    """Run the forward pass on one batch and return `(loss, outputs, labels)`."""
    labels = batch["labels"]
    loss, outputs = self(
        batch["input_ids"],
        batch["attention_mask"],
        batch["title_input_ids"],
        batch["title_attention_mask"],
        labels,
    )
    return loss, outputs, labels

  def training_step(self, batch, batch_idx):

    loss, outputs, labels = self._shared_step(batch)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    # Predictions are only consumed for epoch-level metrics, so hand them on
    # without their autograd graph.
    return {"loss": loss, "predictions": outputs.detach(), "labels": labels}

  def validation_step(self, batch, batch_idx):

    loss, _, _ = self._shared_step(batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):

    loss, _, _ = self._shared_step(batch)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    """Log one training ROC-AUC scalar per label for the finished epoch."""

    # Concatenate the per-batch tensors along the batch axis.
    labels = torch.cat([step["labels"].detach().cpu() for step in outputs]).int()
    predictions = torch.cat([step["predictions"].detach().cpu() for step in outputs])

    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

  def configure_optimizers(self):
    """AdamW (lr 2e-5) with a linear warm-up/decay schedule stepped every batch."""

    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
        optimizer=optimizer,
        lr_scheduler=dict(
            scheduler=scheduler,
            interval='step'
            )
        )

In [None]:
# The training DataLoader keeps the final partial batch, so an epoch has
# ceil(len(train_df) / BATCH_SIZE) optimizer steps. Floor division came out one
# step short per epoch, which made the LR schedule end before training did.
steps_per_epoch = (len(train_df) + BATCH_SIZE - 1) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

In [None]:
# Warm up over the first fifth of the scheduled training steps.
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(7684, 38420)

In [None]:
# One output per label column; the step counts feed the LR schedule.
model = TopicAnnotationTagger(
    n_classes=len(LABEL_COLUMNS),
    n_training_steps=total_training_steps,
    n_warmup_steps=warmup_steps,
)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-unc

In [None]:
# Show the current working directory.
%pwd

'/content/drive/My Drive/Biocreative/Biocreative/checkpoints_and_logs'

In [None]:
cd '/content/drive/MyDrive/Biocreative/Biocreative/checkpoints_and_logs/'

/content/drive/MyDrive/Biocreative/Biocreative/checkpoints_and_logs


In [None]:
# Keep the three checkpoints with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=3,
    dirpath="ANGRYBERT-lwan-checkpoints-upd",
    filename="ANGRYBERT-lwan-best-checkpoint-upd",
    verbose=True,
)

In [None]:
# TensorBoard logs are written under this directory (relative to the working directory).
logger = TensorBoardLogger(
    save_dir="ANGRYBERT-lwan-lightning_logs-upd",
    name="topic-annotations",
)

In [None]:
# Early stopping watches the same `val_loss` metric the checkpoint callback monitors, with a patience of 5.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Single-GPU trainer wired to the logger and the two callbacks defined above.
trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    gpus=1,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback],
    checkpoint_callback=True,
    progress_bar_refresh_rate=30,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# Train the model; the data module supplies the training and validation loaders.
trainer.fit(model, data_module)

  f"DataModule.{name} has already been called, so it will not be called again. "
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 109 M 
1 | bert_title | BertModel | 109 M 
2 | criterion  | BCELoss   | 0     
3 | sigmoid    | Sigmoid   | 0     
4 | dropout    | Dropout   | 0     
5 | activation | ReLU      | 0     
6 | gate       | Linear    | 1.2 M 
7 | ffn        | FFN       | 1.4 M 
8 | classifier | Linear    | 7.2 K 
-----------------------------------------
221 M     Trainable params
0         Non-trainable params
221 M     Total params
886.120   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 3842: val_loss reached 0.38273 (best 0.38273), saving model to "/content/drive/My Drive/Biocreative/Biocreative/checkpoints_and_logs/ANGRYBERT-lwan-checkpoints-upd/ANGRYBERT-lwan-best-checkpoint-upd.ckpt" as top 3


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 7685: val_loss reached 0.35202 (best 0.35202), saving model to "/content/drive/My Drive/Biocreative/Biocreative/checkpoints_and_logs/ANGRYBERT-lwan-checkpoints-upd/ANGRYBERT-lwan-best-checkpoint-upd-v1.ckpt" as top 3


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 11528: val_loss reached 0.31994 (best 0.31994), saving model to "/content/drive/My Drive/Biocreative/Biocreative/checkpoints_and_logs/ANGRYBERT-lwan-checkpoints-upd/ANGRYBERT-lwan-best-checkpoint-upd-v2.ckpt" as top 3


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 15371: val_loss reached 0.35958 (best 0.31994), saving model to "/content/drive/My Drive/Biocreative/Biocreative/checkpoints_and_logs/ANGRYBERT-lwan-checkpoints-upd/ANGRYBERT-lwan-best-checkpoint-upd.ckpt" as top 3


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 19214: val_loss reached 0.35413 (best 0.31994), saving model to "/content/drive/My Drive/Biocreative/Biocreative/checkpoints_and_logs/ANGRYBERT-lwan-checkpoints-upd/ANGRYBERT-lwan-best-checkpoint-upd.ckpt" as top 3


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 23057: val_loss was not in top 3


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 26900: val_loss was not in top 3
