In [49]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import DataLoader, SubsetRandomSampler
from pytorch_lightning import Trainer, LightningModule, LightningDataModule, seed_everything
from pytorch_lightning.callbacks import EarlyStopping
from torchmetrics import Accuracy, Precision, Recall
import os
import pandas as pd
import re
import numpy as np
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER
import torchtext.functional as F

In [17]:
GLOBAL_SEED = 0
seed_everything(GLOBAL_SEED, workers=True)
CUDA_DEVICE_COUNT = torch.cuda.device_count()
DATA_PATH = "../../../data-davidson/data/labeled_data.csv"
NUM_WORKERS = 0
MAX_LENGTH = 50
DIMS = 50

INFO:pytorch_lightning.utilities.seed:Global seed set to 0


In [64]:
class DataModule(LightningDataModule):
    def __init__(self, path, fold_index = None, num_folds = None, batch_size = 32):
        super().__init__()
        self.val_sampler = None
        self.train_sampler = None
        self.val_fold = None
        self.train_fold = None
        self.splits = None
        self.val_dataset = None
        self.train_dataset = None

        self.fold_index = fold_index
        self.num_folds = num_folds
        self.path = path
        self.batch_size = batch_size

        self.transform = XLMR_BASE_ENCODER.transform()


    def read_csv_to_numpy(self, path):
        data = pd.read_csv(path)
        data = data.drop(["Unnamed: 0", "count", "hate_speech", "offensive_language", "neither"], axis=1)
        data = data.reindex(columns=["tweet", "class"])

        df = data.drop(
            data[data["class"] == 1]
            .sample(frac=0.92)
            .index
        )

        df = df.drop(
            df[df["class"] == 2]
            .sample(frac=0.65)
            .index
        )

        return df.to_numpy()

    def preprocess_data(self, batch):
        x_arr, y_arr = [], []
        for tweet, label in batch:

            # space_pattern = '\s+'
            # giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            #     '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
            # mention_regex = '@[\w\-]+'
            # tweet = re.sub(space_pattern, ' ', tweet)
            # tweet = re.sub(giant_url_regex, 'URLHERE', tweet)
            # tweet = re.sub(mention_regex, 'MENTIONHERE', tweet)

            transformed = self.transform(tweet)
            x_arr.append(transformed)
            y_arr.append(label)

        x = F.to_tensor(x_arr, padding_value=1)
        y = torch.tensor(y_arr)
        

        return x, y

    def setup(self, stage = None):
        self.dataset = self.read_csv_to_numpy(self.path)
        x_indices = list(range(len(self.dataset)))
        
        train_i, val_i = train_test_split(x_indices, test_size=0.2, stratify=self.dataset[:, 1], random_state=GLOBAL_SEED)

        self.train_sampler = SubsetRandomSampler(train_i)
        self.val_sampler = SubsetRandomSampler(val_i)

    def train_dataloader(self):
        return DataLoader(self.dataset, batch_size=self.batch_size, sampler=self.train_sampler, num_workers=NUM_WORKERS, collate_fn=self.preprocess_data)

    def val_dataloader(self):
        return DataLoader(self.dataset, batch_size=self.batch_size, sampler=self.val_sampler, num_workers=NUM_WORKERS, collate_fn=self.preprocess_data)


In [75]:
class Net(LightningModule):
    def __init__(self, learning_rate=1e-3):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)
        self.loss = nn.CrossEntropyLoss()

        self.accuracy = Accuracy(num_classes=3)
        self.precision_metric = Precision(num_classes=3)
        self.recall = Recall(num_classes=3)

        self.learning_rate = learning_rate
        
        head = RobertaClassificationHead(num_classes=3, input_dim=768)
        self.model = XLMR_BASE_ENCODER.get_model(head=head, freeze_encoder=True)

    def forward(self, x):
        return self.model(x)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        
        loss = self.loss(logits, y)

        preds = self.softmax(logits)
        acc = self.accuracy(preds, y)

        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)

        loss = self.loss(logits, y)

        preds = self.softmax(logits)
        acc = self.accuracy(preds, y)
        pre = self.precision_metric(preds, y)
        rec = self.recall(preds, y)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_step=False, on_epoch=True, prog_bar=True)

        self.log('val_pre', pre, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_rec', rec, on_step=False, on_epoch=True, prog_bar=True)
        # self.log('val_spe', spe, on_step=False, on_epoch=True, prog_bar=True)
        # self.log('val_rocauc', rocauc, on_step=False, on_epoch=True, prog_bar=True)


    def predict_sentence(self, sentence):
        transformed = XLMR_BASE_ENCODER.transform()(sentence)
        transformed = F.to_tensor(transformed).unsqueeze(0)
        logits = self(transformed).detach()
        return {
            "probs": self.softmax(logits),
            "class": torch.argmax(logits)
        }

In [None]:
%%time
seed_everything(GLOBAL_SEED, workers=True)

model = Net(learning_rate=1e-3)
data = DataModule(path=DATA_PATH, batch_size=128)

early = EarlyStopping(monitor="val_loss", patience=10)
trainer = Trainer(max_epochs=70, log_every_n_steps=20, gpus=CUDA_DEVICE_COUNT, callbacks=[early], deterministic=True)
torch.use_deterministic_algorithms(True, warn_only=True)
trainer.fit(model=model, datamodule=data)

In [72]:
model.predict_sentence("you fucking piece of cunt")

tensor([[0.0509, 0.9388, 0.0103]], grad_fn=<SoftmaxBackward0>)

In [77]:
model.predict_sentence("hello")

tensor([[0.1819, 0.5572, 0.2609]], grad_fn=<SoftmaxBackward0>)

In [66]:
model.predict_sentence("you fucking piece of cunt")

(tensor([[0.3514, 0.3726, 0.2761]]), tensor(1))

In [67]:
model.predict_sentence("it's a shame hitler didn't finish what he started")

(tensor([[0.3513, 0.4330, 0.2158]]), tensor(1))

In [68]:
model.predict_sentence("nigga")

(tensor([[0.3301, 0.3230, 0.3469]]), tensor(2))

In [69]:
model.predict_sentence("i love my beautiful dogs")

(tensor([[0.3160, 0.4213, 0.2627]]), tensor(1))