In [1]:
!pip install pytorch-lightning > /dev/null
!pip install torchmetrics > /dev/null
!pip install comet-ml > /dev/null
!pip install torch-ema > /dev/null
!pip install transformers > /dev/null
!pip install datasets > /dev/null
!pip install pynvml > /dev/null

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from torch import optim, Tensor
import pytorch_lightning as pl
import torch.nn.functional as F
from torchmetrics import MetricCollection
from torch_ema import ExponentialMovingAverage
import math
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch
from typing import List

In [4]:
from datasets import load_dataset

dataset = load_dataset("MonoHime/ru_sentiment_dataset")

In [5]:
def accuracy(output, target, topk=(1, )):
    # Computes the precision@k for the specified values of k

    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0/batch_size))

    return res

class AAMSoftmaxLoss(nn.Module):
    # Additive angular margin softmax loss

    def __init__(self, nOut, nClasses,
                 margin=0.4, scale=20,
                 easy_margin=False, **kwargs):
        super(AAMSoftmaxLoss, self).__init__()

        self.test_normalize = True

        self.m = margin
        self.s = scale
        self.in_feats = nOut
        self.weight = torch.nn.Parameter(torch.FloatTensor(nClasses, nOut), requires_grad=True)
        self.ce = nn.CrossEntropyLoss()

        nn.init.xavier_normal_(self.weight, gain=1)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)

        # Make the function cos(theta+m) monotonic decreasing while theta in [0°, 180°]
        self.th = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m)*self.m

        print('Initialised AAM softmax margin %.3f scale %.3f.'%(self.m,self.s))

    def forward(self, x, label=None, return_unswer=False):

        assert x.size()[0] == label.size()[0]
        assert x.size()[1] == self.in_feats

        # cos(theta)
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        if return_unswer:
          return cosine

        # cos(theta + m)
        sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
        phi = cosine*self.cos_m - sine*self.sin_m

        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)

        else:
            phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)

        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1), 1)
        output = (one_hot*phi) + ((1.0 - one_hot)*cosine)
        output = output*self.s

        loss   = self.ce(output, label)
        prec1  = accuracy(cosine.detach(), label.detach(), topk=(1, ))[0]

        return loss, prec1

In [6]:
class Model(pl.LightningModule):
    def __init__(
        self,
        lr,
        steps,
        batch_size_train=32,
        batch_size_val=1,

    ):
        super().__init__()
        self.save_hyperparameters()

        self.model = AutoModelForSequenceClassification.from_pretrained("philschmid/RoBERTa-Banking77")
        self.model.classifier.out_proj = nn.Identity()
        self.head = AAMSoftmaxLoss(768, 3)

    def training_step(self, batch, batch_idx):
        feats = self.model(torch.stack(batch["input_ids"]).T,
                           torch.stack(batch["attention_mask"]).T)["logits"]
        loss, accuracy = self.head(feats, batch["sentiment"])
        self.log("Train_accuracy", accuracy, on_epoch=True,
                 on_step=True, prog_bar=True,)
        self.log(
              "Train_loss",
              loss,
              on_epoch=True,
              on_step=True,
              prog_bar=True,
              batch_size=self.hparams.batch_size_train,
          )
        return loss


    def validation_step(self, batch, batch_idx):
        feats = self.model(torch.stack(batch["input_ids"]).T,
                           torch.stack(batch["attention_mask"]).T)["logits"]
        loss, accuracy = self.head(feats, batch["sentiment"])
        self.log("Validation_accuracy", accuracy, on_epoch=True,
                 on_step=True, prog_bar=True,)
        self.log(
              "Validation_loss",
              loss,
              on_epoch=True,
              on_step=True,
              prog_bar=True,
              batch_size=self.hparams.batch_size_train,
          )
        return loss

    def configure_optimizers(self):
        opt = torch.optim.AdamW(list(self.model.parameters()) + list(self.head.parameters()),
                                  lr=self.hparams.lr)

        sch = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr=self.hparams.lr,
                                                   total_steps=self.hparams.steps,
                                                  pct_start=0.05,)

        self.ema = ExponentialMovingAverage(list(self.model.parameters()) + list(self.head.parameters()), 0.97)
        return ([opt,],
                [{"scheduler": sch, "interval": "step"},])

    def optimizer_step(self, *args, **kwargs):
      super().optimizer_step(*args, **kwargs)
      self.ema.update(list(self.model.parameters()) + list(self.head.parameters()))

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("philschmid/RoBERTa-Banking77")

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [10]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/21098 [00:00<?, ? examples/s]

In [11]:
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["validation"].shuffle(seed=42)

In [59]:
train_dataloader = DataLoader(train_dataset, batch_size=32, drop_last=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=16)

In [66]:
from pynvml import *
import gc
# try:
# del trainer
# del model
# del ckpt
# except:
#   ...
gc.collect()
torch.cuda.empty_cache()
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total    : {info.total}')
print(f'free     : {info.free}')
print(f'used     : {info.used}')

total    : 16106127360
free     : 13380681728
used     : 2725445632


In [67]:
import comet_ml
import random
import numpy as np
import torch
from pytorch_lightning.loggers import CometLogger
from pytorch_lightning.callbacks import LearningRateMonitor

comet_logger = CometLogger(
    api_key="WaBfOytSxqH1MBchWm2iqt220",
    project_name="sentiment",
    workspace="nikkitoss",
    experiment_name="Roberta-Banking",  # Optional
)

lr_monitor = LearningRateMonitor(logging_interval='step')

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True


epoches = 1

print("iteration in epoch : ", len(train_dataloader))
steps = math.ceil(len(train_dataloader) * epoches / 16)

model = Model(lr=0.01, steps=steps,
            batch_size_train=16, batch_size_val=16)
ckpt = torch.load("/content/drive/MyDrive/Result/sentiment/abfc461c58f847e69d04305b4998f3c0/checkpoints/epoch=0-step=186.ckpt")
model.load_state_dict(ckpt["state_dict"])
del ckpt
gc.collect()
torch.cuda.empty_cache()

trainer = pl.Trainer(max_epochs=epoches, accelerator="gpu", logger=comet_logger,
                     gradient_clip_val=0.5, callbacks=[lr_monitor],
                     default_root_dir="/content/drive/MyDrive/Result_v2/",
                     accumulate_grad_batches=16)

INFO:pytorch_lightning.loggers.comet:CometLogger will be initialized in online mode


iteration in epoch :  5934
Initialised AAM softmax margin 0.400 scale 20.000.


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [68]:
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total    : {info.total}')
print(f'free     : {info.free}')
print(f'used     : {info.used}')

total    : 16106127360
free     : 13745586176
used     : 2360541184


In [69]:
trainer.fit(model=model,
            train_dataloaders=train_dataloader,
            val_dataloaders=eval_dataloader)

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/nikkitoss/sentiment/c958849c96f84db5b67c23c2cdca2913

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                             | Params
-----------------------------------------------------------
0 | model | RobertaForSequenceClassification | 82.1 M
1 | head  | AAMSoftmaxLoss                   | 2.3 K 
-----------------------------------------------------------
82.1 M    Trainable params
0         Non-trainable params
82.1 M    Total params
328.483   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 10. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/nikkitoss/sentiment/c958849c96f84db5b67c23c2cdca2913
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     Train_accuracy_epoch            : 47.642822265625
[1;38;5;

In [27]:
model.model(torch.stack(next(iter(train_dataloader))["input_ids"]).T[:2])

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0000, -0.0000,  0.5131,  ..., -0.0985,  0.0038,  0.2299],
        [ 0.5042, -0.4418,  0.1180,  ..., -0.4738,  0.2181,  0.3334]],
       grad_fn=<MulBackward0>), hidden_states=None, attentions=None)