In [1]:
from transformers import AutoModel, get_cosine_schedule_with_warmup, AutoTokenizer
from torch_optimizer import Ranger
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from multiprocessing import cpu_count
from os import environ
import pandas as pd
from platform import system
from sklearn.metrics import f1_score

environ["TOKENIZERS_PARALLELISM"] = "false"
pl.seed_everything(seed=101)

Global seed set to 101


101

In [2]:
LEARNING_RATE = 5e-5
BATCH_SIZE = 8
WEIGHT_DECAY = 1e-1
EPOCHS = 10
MAX_LEN = 256
N_JOBS = cpu_count() if system() != "Windows" else 0

BERT_TYPE = "roberta-base"
MODEL_NAME = f"{BERT_TYPE}-sug"
tokenizer = AutoTokenizer.from_pretrained(BERT_TYPE, use_fast=True)

In [3]:
class BERT_SUG(pl.LightningModule):
    def __init__(self, 
                 bert_type=BERT_TYPE,
                 dropout=0.5,
                 use_scheduler=True,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):

        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_type)
        self.fc = nn.Linear(768, 1)
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.layer_norm = nn.LayerNorm(768)
        self.dropout = nn.Dropout(p=dropout)
        self.use_scheduler = use_scheduler
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        if self.use_scheduler: 
            self.total_steps = len(train_dataset) // self.batch_size


    def _f1score(self, logits, lbls):
        lbls = torch.flatten(lbls)
        preds = torch.flatten(torch.round(torch.sigmoid(logits)))
        return f1_score(lbls.tolist(), preds.tolist(), zero_division=0)


    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          drop_last=False)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=True,
                          drop_last=False)


    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)


    def forward(self, input_ids, attention_masks):
        out = self.bert(input_ids, attention_masks).pooler_output
        out = self.layer_norm(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out
    
    
    def _shared_evaluation_step(self, batch, batch_idx):
        ids, masks, lbls = batch
        logits = self(ids, masks)
        loss = self.loss_fn(logits, lbls.float())
        f1 = self._f1score(logits, lbls)
        return loss, f1


    def training_step(self, batch, batch_idx):
        loss, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    
    def validation_step(self, batch, batch_idx):
        loss, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)

    
    def test_step(self, batch, batch_idx):
        loss, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)

        
    def configure_optimizers(self):
        optimizer = Ranger(self.parameters(),
                           lr=self.learning_rate,
                           weight_decay=self.weight_decay)

        if self.use_scheduler:
            scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                        num_warmup_steps=1,
                                                        num_training_steps=self.total_steps)
            lr_scheduler = {
                'scheduler': scheduler, 
                'interval': 'epoch', 
                'frequency': 1
            }
            return [optimizer], [lr_scheduler]
        else:
            return [optimizer]

In [4]:
df = pd.read_csv("../data/App_Training.csv", names=["sno", "id", "text", "lbl"])
labels, text, L = df["lbl"].tolist(), df["text"].tolist(), len(df)

encoded_input = tokenizer(text, 
                          padding="max_length",
                          max_length=MAX_LEN,
                          truncation=True,
                          return_attention_mask=True)

dataset = TensorDataset(torch.LongTensor(encoded_input["input_ids"]),
                        torch.BoolTensor(encoded_input["attention_mask"]),
                        torch.LongTensor(labels).unsqueeze(1))

train_sz, val_sz = L-int(0.1*L), int(0.1*L)
train_dataset, val_dataset = random_split(dataset, (train_sz, val_sz))                                                                                                    

In [5]:
df = pd.read_csv("../data/App_Test_Labeled.csv", names=["sno", "id", "text", "lbl"])
text, labels = df["text"].tolist(), df["lbl"].tolist()

encoded_input = tokenizer(text, 
                          padding="max_length",
                          max_length=MAX_LEN,
                          truncation=True,
                          return_attention_mask=True)

test_dataset = TensorDataset(torch.LongTensor(encoded_input["input_ids"]),
                             torch.BoolTensor(encoded_input["attention_mask"]),
                             torch.LongTensor(labels).unsqueeze(1))

In [6]:
model = BERT_SUG(bert_type=BERT_TYPE,
                 dropout=0.25,
                 train_dataset=train_dataset,
                 val_dataset=val_dataset,
                 test_dataset=test_dataset,
                 use_scheduler=True)

earlystopping_callback = EarlyStopping(monitor="val_f1score", 
                                       min_delta=1e-4, 
                                       patience=5, 
                                       mode="max")

checkpoint_callback = ModelCheckpoint(dirpath="./saved_weights",
                                      filename=MODEL_NAME,
                                      save_top_k=1,
                                      mode="max",
                                      monitor="val_f1score",
                                      save_weights_only=True)

logger = TensorBoardLogger("../tb_logs", name=MODEL_NAME)

trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=16,
                     logger=logger,
                     log_every_n_steps=1,
                     callbacks=[earlystopping_callback, 
                                checkpoint_callback])

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:
trainer.fit(model)

Missing logger folder: ../tb_logs/facebook/bart-base-sug
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BartModel         | 139 M 
1 | fc         | Linear            | 769   
2 | loss_fn    | BCEWithLogitsLoss | 0     
3 | layer_norm | LayerNorm         | 1.5 K 
4 | dropout    | Dropout           | 0     
-------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
278.845   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


AttributeError: 'Seq2SeqModelOutput' object has no attribute 'pooler_output'

In [None]:
model.load_state_dict(torch.load(f"./saved_weights/{MODEL_NAME}.ckpt")["state_dict"])
trainer.test(model)