In [7]:

import pytorch_lightning as pl
import torch
# from custom_dataset_for_ear import get_dataset_by_name, TokenizerDataModule
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,

)
from ear_with_gab import *
from torch.utils.data import Dataset, DataLoader, ConcatDataset


In [12]:

# --- Run configuration -------------------------------------------------------
# Directory the tokenizer is loaded from and that is scanned for checkpoints.
# NOTE(review): `os` is not imported explicitly in this notebook; it is
# presumably provided by `from ear_with_gab import *` -- confirm.
src_model_path = './ear_bert/entropybert-gab25k-0-0.01/'

# os.listdir() returns entries in arbitrary order, so sort the matches to make
# the "first checkpoint" pick reproducible across machines and runs.
checkpoint_names = sorted(
    x for x in os.listdir(src_model_path) if x.startswith('PL-epoch')
)
if not checkpoint_names:
    # Fail with context instead of a bare IndexError from `[...][0]`.
    raise FileNotFoundError(
        f"No checkpoint starting with 'PL-epoch' found in {src_model_path!r}"
    )
src_model = checkpoint_names[0]

# Training hyper-parameters; not referenced by the cells visible in this notebook.
warmup_train_perc = 0.1  # presumably the warm-up share of training steps -- TODO confirm
max_epochs = 10
balanced_loss = True

In [13]:
# Load the tokenizer from the source-model directory configured in `src_model_path`.
tokenizer = AutoTokenizer.from_pretrained(src_model_path)

In [10]:
# Location of the Dynamically Generated Hate Speech dataset (v0.2.3) CSV.
data_path = (
    './data/Dynamically-Generated-Hate-Speech-Dataset/'
    'Dynamically Generated Hate Dataset v0.2.3.csv'
)
# The first CSV column is used as the DataFrame index.
# NOTE(review): `pd` is not imported explicitly in this notebook; it is
# presumably provided by `from ear_with_gab import *` -- confirm.
data = pd.read_csv(data_path, index_col=0)

In [14]:
class DynaDataset(Dataset):
    """Map-style dataset over a DataFrame with ``text`` and ``label`` columns.

    All texts are tokenized once, eagerly, in ``__init__`` (truncated and
    padded to 128 tokens, returned as PyTorch tensors). Labels are binarized:
    ``1`` where the ``label`` column equals ``'hate'``, ``0`` otherwise.

    Args:
        data: frame providing the ``text`` and ``label`` columns.
        tokenizer: callable invoked as ``tokenizer(texts, truncation=...,
            padding=..., max_length=..., return_tensors=...)``; its result must
            expose ``.items()`` yielding index-able values.
    """

    def __init__(self, data: pd.DataFrame, tokenizer):
        self.texts = data["text"].tolist()

        # Binary target: 1 for 'hate', 0 for every other label value.
        hate_flags = (data['label'] == 'hate').astype(int)
        self.labels = torch.LongTensor(hate_flags.to_list())

        # Tokenize the whole corpus up front so __getitem__ is a plain lookup.
        self.encodings = tokenizer(
            self.texts,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields of example ``idx`` plus its ``labels`` entry."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[idx]
        item["labels"] = self.labels[idx]
        return item

    def get_texts(self):
        """Return the raw texts as a list."""
        return self.texts

    def get_labels(self):
        """Return the binarized labels as a ``LongTensor``."""
        return self.labels

In [15]:
# Build one [train, dev, test] DataLoader triple per value of `round.base`
# in 1..4. Only the training loader is shuffled.
dataloader_rounds_split = []
# `round_id` instead of `round`: in a notebook the loop variable stays in the
# kernel namespace and would shadow the builtin round() for all later cells.
for round_id in range(1, 5):
    # Compute the per-round row filter once and reuse it for all three splits.
    in_round = data['round.base'] == round_id
    train = DynaDataset(data[(data['split'] == 'train') & in_round], tokenizer)
    dev = DynaDataset(data[(data['split'] == 'dev') & in_round], tokenizer)
    test = DynaDataset(data[(data['split'] == 'test') & in_round], tokenizer)
    dataloader_rounds_split.append(
        [
            DataLoader(train, batch_size=32, shuffle=True),
            DataLoader(dev, batch_size=32, shuffle=False),
            DataLoader(test, batch_size=32, shuffle=False),
        ]
    )

In [4]:
# Restore the LMForSequenceClassification model from a fixed checkpoint file.
# NOTE(review): the path is hard-coded (under the `R1/` sub-directory) rather
# than derived from `src_model_path` / `src_model` -- confirm this is intended.
model = LMForSequenceClassification.load_from_checkpoint(
    './ear_bert/entropybert-gab25k-0-0.01/R1/'
    'PL-epoch=0-val_loss=-0.116-train_loss=-0.276-Rround=0-v1.ckpt'
)

In [17]:
# Lightning trainer pinned to the GPU accelerator; in the visible cells it is
# only used to run `trainer.test`.
trainer = pl.Trainer(accelerator='gpu')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
# Evaluation loaders: the dev loader of the first round, followed by the test
# loader of every round (index 1 = dev, index 2 = test within each triple).
all_tests = [dataloader_rounds_split[0][1]]
all_tests.extend(round_loaders[2] for round_loaders in dataloader_rounds_split)

# Only the first loader (first-round dev split) is evaluated in this cell.
test_result = trainer.test(model=model, dataloaders=all_tests[0])

You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_F1            0.9516616463661194
        test_acc            0.9413382411003113
        test_loss          -0.13308857381343842
        test_prec           0.9752321839332581
        test_rec            0.9292035102844238
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
