In [1]:
!pip install transformers datasets

[0m

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset

In [3]:
# Load Dataset

dataset = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base', truncation=True)

def tokenization(example):
    return tokenizer(example["text"], truncation=True, padding=True)

dataset = dataset.map(tokenization, batched=True)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [5]:
class ImdbDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        
    def __getitem__(self, index):
        return self.dataset[index]
    
    def __len__(self):
        return self.dataset.num_rows

In [6]:
dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])

In [7]:
batch_size = 16

ds_train = ImdbDataset(dataset["train"])
ds_test = ImdbDataset(dataset["test"])

dl_train = DataLoader(ds_train, batch_size=batch_size, drop_last=True, shuffle=True)
dl_test = DataLoader(ds_test, batch_size=batch_size, drop_last=True, shuffle=False)

# Load Model

In [8]:
# set os env needed for model
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [9]:
model = AutoModelForSequenceClassification.from_pretrained("aychang/roberta-base-imdb")

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [10]:
import pytorch_lightning as pl
from typing import Any
from torchmetrics.functional.classification import binary_accuracy
from torchmetrics.classification import BinaryAccuracy

class HuggingFaceWrapper(pl.LightningModule):

    def __init__(self, 
        loss: callable, 
        lr: float,
        model: torch.nn.Module,
    ) -> None:
        super().__init__()

        self.loss = loss
        self.lr = lr
        
        self.model = model
        self.test_acc = BinaryAccuracy()      

    def forward(self, input_ids, a_mask) -> torch.Tensor:
        out = self.model(input_ids, attention_mask=a_mask).logits
        return out

    def _step(self, batch) -> torch.Tensor:
        a_mask = batch["attention_mask"]
        input_ids = batch["input_ids"]
        y = batch["label"]
        logits  = self.forward(input_ids, a_mask)
        pred = torch.topk(logits, 1).indices.squeeze()
        y = torch.nn.functional.one_hot(y.long(), num_classes=2).float()
        loss = self.loss(logits, y)
        return pred, loss

    def training_step(self, batch, batch_idx) -> torch.Tensor:
        pred, loss = self._step(batch)
        self.log("train/loss", loss)
        acc = binary_accuracy(pred, batch["label"])
        self.log("train/acc", acc)
        return loss
    
    def _eval_step(self, batch, acc):
        pred, loss = self._step(batch)
        y = batch["label"]
        acc.update(pred, y)

        return loss

    def _eval_epoch_end(self, eval_type, acc):
        acc_val = acc.compute()
        print(f"{eval_type} Accuracy: {acc_val.data}")
        
    def test_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
        loss = self._eval_step(batch, self.test_acc)

    def test_epoch_end(self, outputs) -> None:
        self._eval_epoch_end("Test", self.test_acc)
                
    def configure_optimizers(self) -> Any:
        optim = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optim

In [11]:
# unfreeze model
for parm in model.parameters():
    parm.requires_grad = True

In [12]:
loss = torch.nn.CrossEntropyLoss()

log_intervall = 10
epochs = 1
lr=1e-3

lm = HuggingFaceWrapper(loss, lr, model)

In [13]:
# Display memory availability on the graphics card. I hat a little bit of trouble with it initaly
torch.cuda.empty_cache()

t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved

from pynvml import *
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total    : {info.total}')
print(f'free     : {info.free}')
print(f'used     : {info.used}')

torch.cuda.mem_get_info()


total    : 17071734784
free     : 17069506560
used     : 2228224


(16393175040, 17071734784)

In [14]:
trainer = pl.Trainer(max_epochs=epochs, accelerator="gpu", devices=1, log_every_n_steps=log_intervall)
trainer.fit(lm, dl_train)

Training: 0it [00:00, ?it/s]

In [15]:
trainer.test(lm, dl_test)

Testing: 0it [00:00, ?it/s]

Test Accuracy: 0.4998399615287781


[{}]