<a href="https://colab.research.google.com/github/TheBlueHawk/CS4NLP_Project2022/blob/main/mctaco_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers[sentencepiece]
!pip install sentencepiece # necessary for DeBERTa-v3
!pip install pytorch-lightning==1.5.10
!pip install wandb
!pip install rich
!pip install torchmetrics
!pip install smart-pytorch 

In [1]:
# Login to Wandb for logging
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mschneider[0m ([33mfrl[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModelForSequenceClassification
pl.seed_everything(42)

params = {
    'pretrained_model_name': 'roberta-base', # 'microsoft/deberta-v3-base', 'roberta-base', 'microsoft/mdeberta-v3-base', 'bert-base-uncased'
    'batch_size': 32,
    'sequence_length': 128,
    'max_epochs': 20,
    'smart_loss_weight': 1.0
}

tokenizer = AutoTokenizer.from_pretrained(params['pretrained_model_name'])
architecture = AutoModelForSequenceClassification.from_pretrained(params['pretrained_model_name'])

Global seed set to 42
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.den

In [3]:
import torch 
from torch.utils.data import Dataset
from datasets import load_dataset
from transformers import BertTokenizer

class MCTACODataset(Dataset):

    def __init__(self, split: str, tokenizer, sequence_length: int):
        self.dataset = load_dataset("mc_taco")[split]
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.dataset)

    def truncate_pair(self, tokens_a, tokens_b, max_length):
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def __getitem__(self, idx): 
        item = self.dataset[idx] 
        tokenize = self.tokenizer.tokenize
        sequence = tokenize(item['sentence'] + " " + item['question'])
        answer = tokenize(item['answer']) 
        label = item['label']
        # Truncate excess tokens 
        if answer: 
            self.truncate_pair(sequence, answer, self.sequence_length - 3)
        else: 
            if len(sequence) > self.sequence_length - 2:
                sequence = sequence[0:(self.sequence_length - 2)]
        # Compute tokens, ids, mask 
        tokens = ['<s>'] + sequence + ['</s></s>'] + answer + ['</s>']
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        # Pad with 0 
        while len(input_ids) < self.sequence_length:
            input_ids.append(0)
            input_mask.append(0)
        return torch.tensor(input_ids), torch.tensor(input_mask), torch.tensor(label)
        
dataset = MCTACODataset(split='validation', tokenizer=tokenizer, sequence_length=params['sequence_length'])
print(dataset[10])

Reusing dataset mc_taco (/root/.cache/huggingface/datasets/mc_taco/plain_text/1.1.0/fca37fbe424ae58845baa626b2794046ca67f3b8e85749cdf169a983584f7614)


  0%|          | 0/2 [00:00<?, ?it/s]

(tensor([    0, 23239,   423,  4373,    25,     5,  1647,  6825,   148,     5,
        11505,     9, 37625,  2178,     6,   600,    10,  1233,  2412,  5688,
         2442,     4,  1336,   251,    34,    10,  1233,  2412,  5688,  2442,
          116,     3,   102,   325,   722,     2,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]

In [4]:
from torch.utils.data import DataLoader

class MCTACODatamodule(pl.LightningDataModule):
    def __init__(
        self,
        tokenizer,
        batch_size: int,
        sequence_length: int 
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.dataset_train = None
        self.dataset_valid = None

    def setup(self, stage = None):
        self.dataset_train = MCTACODataset(
            split='validation', 
            tokenizer=self.tokenizer, 
            sequence_length=self.sequence_length
        )
        self.dataset_valid = MCTACODataset(
            split='test', 
            tokenizer=self.tokenizer, 
            sequence_length=self.sequence_length
        )

    def train_dataloader(self) -> DataLoader:
        return DataLoader(
            dataset=self.dataset_train,
            batch_size=self.batch_size,
            shuffle=True,
        )

    def val_dataloader(self) -> DataLoader:
        return DataLoader(
            dataset=self.dataset_valid,
            batch_size=self.batch_size,
            shuffle=False,
        )

datamodule = MCTACODatamodule(tokenizer, batch_size = params['batch_size'], sequence_length = params['sequence_length']) 
datamodule.setup()

Reusing dataset mc_taco (/root/.cache/huggingface/datasets/mc_taco/plain_text/1.1.0/fca37fbe424ae58845baa626b2794046ca67f3b8e85749cdf169a983584f7614)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset mc_taco (/root/.cache/huggingface/datasets/mc_taco/plain_text/1.1.0/fca37fbe424ae58845baa626b2794046ca67f3b8e85749cdf169a983584f7614)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from smart_pytorch import SMARTLoss
import torch.nn as nn
import torch.nn.functional as F

def kl_loss(s_p, s):
    # s_p: perturbed state, s: initial state 
    s_p = F.log_softmax(s_p, dim=1) # (b, n)
    s = F.log_softmax(s, dim=1) # (b, n)
    l0 = F.kl_div(s_p, s, reduction = 'sum', log_target=True)
    l1 = F.kl_div(s, s_p, reduction = 'sum', log_target=True)
    return l0 + l1

class SMARTClassificationModel(nn.Module):
    # b: batch_size, s: sequence_length, d: hidden_size , n: num_labels

    def __init__(self, model, weight):
        super().__init__()
        self.model = model 
        self.weight = weight

    def forward(self, input_ids, attention_mask, labels):
        # input_ids: (b, s), attention_mask: (b, s), labels: (b,)

        embed = self.model.roberta.embeddings(input_ids) # (b, s, d)

        def eval(embed):
            outputs = self.model.roberta(inputs_embeds=embed, attention_mask=attention_mask) # (b, s, d)
            pooled = outputs[0] # (b, d)
            logits = self.model.classifier(pooled) # (b, n)
            return logits 

        smart_loss_fn = SMARTLoss(eval_fn = eval, loss_fn = kl_loss)
        state = eval(embed)
        loss = F.cross_entropy(state.view(-1, 2), labels.view(-1))
        smart_loss = torch.tensor(0)
        if embed.requires_grad:
            smart_loss = smart_loss_fn(embed, state)
            loss += self.weight * smart_loss
        #print(loss, smart_loss)
        return state, loss
           
input_ids, input_mask, labels = next(iter(datamodule.train_dataloader()))    
smart_architecture = SMARTClassificationModel(architecture, weight=params['smart_loss_weight'])
#output, loss = smart_architecture(input_ids, input_mask, labels)

In [11]:
import torch.nn as nn 
from transformers import Adafactor
from torchmetrics import MetricCollection, Accuracy, F1Score

class TextClassificationModel(pl.LightningModule):
    def __init__(
        self,
        model: nn.Module
    ):
        super().__init__()
        self.model = model
        metrics = MetricCollection([ Accuracy(), F1Score() ])
        self.train_metrics = metrics.clone(prefix='train_')
        self.valid_metrics = metrics.clone(prefix='val_')

    def configure_optimizers(self):
        optimizer = Adafactor(self.model.parameters(), warmup_init=True)
        return optimizer

    def training_step(self, batch, batch_idx):
        input_ids, attention_masks, labels = batch
        # Compute output 
        outputs, loss = self.model(input_ids = input_ids, attention_mask = attention_masks, labels = labels)
        labels_pred = torch.argmax(outputs, dim=1)
        # Compute metrics
        metrics = self.train_metrics(labels, labels_pred)
        # Log loss and metrics
        self.log("train_loss", loss, on_step=True)
        self.log_dict(metrics, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_masks, labels = batch
        # Compute output 
        outputs, loss = self.model(input_ids = input_ids, attention_mask = attention_masks, labels = labels)
        labels_pred = torch.argmax(outputs, dim=1)
        # Compute metrics
        metrics = self.valid_metrics(labels, labels_pred)
        # Log loss and metrics
        self.log("valid_loss", loss, on_step=True)
        self.log_dict(metrics, on_step=True, on_epoch=True)
        return loss

model = TextClassificationModel(smart_architecture)

In [None]:
# Wandb Logger
logger = pl.loggers.wandb.WandbLogger(project = 'cs4nlp', entity='nextmachina')
# Callbacks 
cb_progress_bar = pl.callbacks.RichProgressBar()
cb_model_summary = pl.callbacks.RichModelSummary()
# Train 
trainer = pl.Trainer(logger=logger, callbacks=[cb_progress_bar, cb_model_summary], max_epochs=params['max_epochs'], gpus=1)
trainer.logger.log_hyperparams(params)
trainer.fit(model=model, datamodule=datamodule)
wandb.finish() 

Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
[34m[1mwandb[0m: Currently logged in as: [33mschneider[0m ([33mnextmachina[0m). Use [1m`wandb login --relogin`[0m to force relogin


  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Global seed set to 42
