In [1]:
from ast import literal_eval

import torch
import torch.nn as nn
import torch.optim as optim

import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from pytorch_lightning import LightningDataModule
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers.models.auto.tokenization_auto import AutoTokenizer

from torchmetrics.functional import accuracy

from transformers import  AutoModelForSequenceClassification

train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
  print('CUDA is not available. Training on CPU ...')
else:
  print('CUDA is available. Training on GPU ...')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import os.path as osp
import pandas as pd



CUDA is available. Training on GPU ...


In [2]:
class DPMDataset_extended(Dataset):
    def __init__(self, path=None) -> None:
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(
            "Hate-speech-CNERG/bert-base-uncased-hatexplain"
        )
        self.max_len = 100
        self.data = pd.read_csv(path)
        self.stop_words = set(stopwords.words('english'))
        self.model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)


    def __getitem__(self, index):
        text = self.data.loc[index, 'text']
        labels = self.data.loc[index, 'label_x']
        labels = literal_eval(labels.replace(" ", ","))
        tokens = word_tokenize(text)
        text = [word.lower() for word in tokens if word.isalpha()]
        text = [word for word in text if not word in self.stop_words]
        text = [self.model.get_index(word) for word in text if word in self.model]
        text = torch.tensor(text, dtype=torch.float).unsqueeze(0)
        text = F.pad(input=text, pad=(0, 300-text.shape[1], 0, 0), mode='constant', value=0)
        
        return {
            "ids": text,
            "labels": torch.tensor(labels, dtype=torch.float),
        }

    def __len__(self):
        return len(self.data)


class DPMDataModule(LightningDataModule):
    def __init__(self, num_workers=0, batch_size=4, shuffle=True):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        self.data_dir = "../dataset/task2_merged_datsets/"
        self.path_train = osp.join(self.data_dir, "train_task2.csv")
        self.path_val = osp.join(self.data_dir, "val_task2.csv")

    def setup(self, stage=None):
        self.dpm_train = DPMDataset_extended(path=self.path_train)
        self.dpm_val = DPMDataset_extended(path=self.path_val)

    def train_dataloader(self):
        return DataLoader(
            self.dpm_train,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=self.shuffle,
        )

    def val_dataloader(self):
        return DataLoader(
            self.dpm_val,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=self.shuffle,
        )


In [3]:
class DontPatronizeMe(LightningModule):
    def __init__(self, embedding_dim, hidden_dim, n_layers, drop=0.5):
        super(DontPatronizeMe,self).__init__()

        model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)
        self.weights = torch.FloatTensor(model.vectors)
        del model
        
        self.lr = 0.001
        self.n_classes = 7

        self.embedding = nn.Embedding.from_pretrained(self.weights)
        self.lstm      = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop, batch_first=True)
        self.dropout   = nn.Dropout(0.3)
        self.linear    = nn.Linear(hidden_dim, self.n_classes)
        self.sigmoid   = nn.Sigmoid()

        self.embedding.requires_grad = False

        self.criterion = nn.BCEWithLogitsLoss()


    def forward(self, x):
        bs = x.size(0)
        x_e = self.embedding(x)
        print(x_e.shape)
        x_lstm, _ = self.lstm(x_e.view(300,-1, 1))
        x_lstm = x_lstm.contiguous().view(-1, self.hidden_dim)
        x = self.dropout(x_lstm)
        x = self.sigmoind(self.linear(x))

        x = x.view(bs, -1)
        x = x[:, -1]
        return x

    # def init_hidden(self, batch_size):
    #     weight = next(self.parameters()).data
        
    #     if (train_on_gpu):
    #         hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
    #               weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
    #     else:
    #         hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
    #                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
    #     return hidden

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        return {
            "optimizer": optimizer,
        }

    def training_step(self, batch, _):
        # h = self.init_hidden(batch_size)
        ids, labels = batch["ids"], batch["labels"]
        output = self(ids.int().cuda())
        loss = self.criterion(output, labels)
        self.log("train/loss", loss, prog_bar=True, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, _):
        # h = self.init_hidden(batch_size)
        ids, labels = batch["ids"], batch["labels"]
        output = self(ids.int().cuda())
        loss   = self.criterion(output, labels)
        acc    = accuracy(output, labels.int(), multiclass=True)
        return {"loss": loss, "acc": acc}

    def validation_epoch_end(self, out):
        loss = torch.stack([x["loss"] for x in out]).mean()
        self.log("val/val_loss", loss, on_epoch=True, on_step=False)
        acc = torch.stack([x["acc"] for x in out]).mean()
        self.log("val/val_acc", acc, on_epoch=True, on_step=False)

In [4]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

BATCH_SIZE = 4
EPOCHS = 10000

embedding_dim = 300
hidden_dim = 256
n_layers = 4

data = DPMDataModule(num_workers=16, batch_size=BATCH_SIZE, shuffle=False)
model_name = "word2vec"
model = DontPatronizeMe(embedding_dim, hidden_dim, n_layers)
model = model.cuda()
logger = TensorBoardLogger("tb_logs", name=f"{model_name}")

trainer = Trainer(
    detect_anomaly=True,
    gpus=1,
    enable_model_summary=True,
    logger=logger,
    log_every_n_steps=BATCH_SIZE,
    max_epochs=EPOCHS,
    fast_dev_run=True,

    callbacks=[
        ModelCheckpoint(
            monitor="val/val_loss",
            mode="min",
            dirpath=f"models/{model_name}",
            filename="radar-epoch{epoch:02d}-val_loss{val/val_loss:.2f}",
            auto_insert_metric_name=False,
        ),
        EarlyStopping(monitor="val/val_loss", patience=6),
    ],
)
trainer.fit(model, data)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params
------------------------------------------------
0 | embedding | Embedding         | 900 M 
1 | lstm      | LSTM              | 2.2 M 
2 | dropout   | Dropout           | 0     
3 | linear    | Linear            | 1.8 K 
4 | sigmoid   | Sigmoid           | 0     
5 | criterion | BCEWithLogitsLoss | 0     
------------------------------------------------
2.2 M     Trainable params
900 M     Non-trainable params
902 M     Total params
3,608.609 Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s] torch.Size([4, 1, 300, 300])


RuntimeError: input.size(-1) must be equal to input_size. Expected 300, got 1