In [1]:
from data_utils import DPMDataModule

from ast import literal_eval

import torch
import torch.nn as nn
import torch.optim as optim

from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader, Dataset

from torchmetrics.functional import accuracy

from transformers import  AutoModelForSequenceClassification

train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
  print('CUDA is not available. Training on CPU ...')
else:
  print('CUDA is available. Training on GPU ...')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA is available. Training on GPU ...


In [2]:
class DPMDataset_extended(Dataset):
    def __init__(self, path=None) -> None:
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(
            "Hate-speech-CNERG/bert-base-uncased-hatexplain"
        )
        self.max_len = 100
        self.data = pd.read_csv(path)

    def __getitem__(self, index):
        text = self.data.loc[index, 'text']
        labels = self.data.loc[index, 'label_x']
        labels = literal_eval(labels.replace(" ", ","))

        text = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        return {
            "ids": torch.tensor(text["input_ids"], dtype=torch.long),
            "mask": torch.tensor(text["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.float),
        }

    def __len__(self):
        return len(self.data)


class DPMDataModule(LightningDataModule):
    def __init__(self, num_workers=8, batch_size=32, shuffle=True):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        self.data_dir = "../dataset/task2_merged_datsets/"
        self.path_train = osp.join(self.data_dir, "train_task2.csv")
        self.path_val = osp.join(self.data_dir, "val_task2.csv")

    def setup(self, stage: Optional[str]):
        self.dpm_train = DPMDataset_extended(path=self.path_train)
        self.dpm_val = DPMDataset_extended(path=self.path_val)

    def train_dataloader(self):
        return DataLoader(
            self.dpm_train,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=self.shuffle,
        )

    def val_dataloader(self):
        return DataLoader(
            self.dpm_val,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=self.shuffle,
        )

NameError: name 'Dataset' is not defined

In [2]:
class DontPatronizeMe(LightningModule):
    def __init__(self):
        super(DontPatronizeMe,self).__init__()

        self.n_classes = 7
        self.lr = 0.001

        self.bert = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain").bert
        self.dropout = nn.Dropout(0.5)
        self.linear1 = nn.Linear(768, 768)
        self.linear2 = nn.Linear(768, self.n_classes)
        self.relu = nn.ReLU()

        for param in self.bert.parameters():
            param.require_grad = False

        self.criterion = nn.BCEWithLogitsLoss()


    def forward(self, input_id, mask):
        x = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        x = x[0]
        x = x[:,0]
        x = self.dropout(x)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        return x

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        return {
            "optimizer": optimizer,
        }

    def training_step(self, batch, _):
        ids, mask, labels = batch["ids"], batch["mask"], batch["labels"]
        output = self(ids, mask)
        loss = self.criterion(output, labels)
        self.log("train/loss", loss, prog_bar=True, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, _):
        ids, mask, labels = batch["ids"], batch["mask"], batch["labels"]
        output = self(ids, mask)
        loss   = self.criterion(output, labels)
        acc    = accuracy(output, labels.int(), multiclass=True)
        return {"loss": loss, "acc": acc}

    def validation_epoch_end(self, out):
        loss = torch.stack([x["loss"] for x in out]).mean()
        self.log("val/val_loss", loss, on_epoch=True, on_step=False)
        acc = torch.stack([x["acc"] for x in out]).mean()
        self.log("val/val_acc", acc, on_epoch=True, on_step=False)

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

BATCH_SIZE = 2
EPOCHS = 10000

data = DPMDataModule(num_workers=16, batch_size=4, shuffle=False)
model_name = "bert"
model = DontPatronizeMe()
logger = TensorBoardLogger("tb_logs", name=f"{model_name}")

trainer = Trainer(
    detect_anomaly=True,
    gpus=1,
    enable_model_summary=True,
    logger=logger,
    log_every_n_steps=BATCH_SIZE,
    max_epochs=EPOCHS,

    callbacks=[
        ModelCheckpoint(
            monitor="val/val_loss",
            mode="min",
            dirpath=f"models/{model_name}",
            filename="radar-epoch{epoch:02d}-val_loss{val/val_loss:.2f}",
            auto_insert_metric_name=False,
        ),
        EarlyStopping(monitor="val/val_loss", patience=6),
    ],
)
trainer.fit(model, data)


Downloading: 100%|██████████| 418M/418M [00:08<00:00, 52.4MB/s] 
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Downloading: 100%|██████████| 40.0/40.0 [00:00<00:00, 19.2kB/s]
Downloading: 100%|██████████| 226k/226k [00:02<00:00, 115kB/s]  
Downloading: 100%|██████████| 112/112 [00:00<00:00, 116kB/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: tb_logs/bert

  | Name      | Type              | Params
------------------------------------------------
0 | bert      | BertModel         | 109 M 
1 | dropout   | Dropout           | 0     
2 | linear1   | Linear            | 590 K 
3 | linear2   | Linear            | 5.4 K 
4 | relu      | ReLU              | 0     
5 | criterion | BCEWithLogitsLoss | 0     
------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
440.313   Total estimated model params size (MB)


Epoch 6: 100%|██████████| 249/249 [00:26<00:00,  9.28it/s, loss=0.693, v_num=0, train/loss=0.693]
