In [46]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
import torch
from torch import nn
from torch.utils.data import Subset, DataLoader, SubsetRandomSampler
from pytorch_lightning import Trainer, LightningModule, LightningDataModule, seed_everything, Callback
from torchmetrics import Accuracy, Precision, Recall, Specificity, AUROC
import os
import pandas as pd
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
import re

In [3]:
GLOBAL_SEED = 0
seed_everything(GLOBAL_SEED, workers=True)
CUDA_DEVICE_COUNT = torch.cuda.device_count()
DATA_PATH = "../../../data-davidson/data/labeled_data.csv"
NUM_WORKERS = 0
MAX_LENGTH = 50
DIMS = 50

Global seed set to 0


In [47]:
class Preprocess:
    def __init__(self):
        self.tokenizer = get_tokenizer("basic_english")
        self.glove = GloVe(name="6B", dim=DIMS)

    def collate_preprocess_data(self, batch):
        x_arr, y_arr = [], []
        for tweet, label in batch:
            tokenized = self.tokenizer(tweet)
            tweet = self.glove.get_vecs_by_tokens(tokenized)[:MAX_LENGTH, :]
            padding = torch.zeros(MAX_LENGTH-tweet.shape[0], DIMS)
            tweet = torch.cat((tweet, padding), dim=0)
            x_arr.append(tweet.unsqueeze(0))
            y_arr.append(label)

        x = torch.cat(x_arr)
        y = torch.tensor(y_arr)
        
        return x, y

    def inference_preprocess_data(self, x):
        space_pattern = '\s+'
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = '@[\w\-]+'
        x = re.sub(space_pattern, ' ', x)
        x = re.sub(giant_url_regex, 'URLHERE', x)
        x = re.sub(mention_regex, 'MENTIONHERE', x)

        tokenized = self.tokenizer(x)
        tweet = self.glove.get_vecs_by_tokens(tokenized)[:MAX_LENGTH, :]
        padding = torch.zeros(MAX_LENGTH-tweet.shape[0], DIMS)
        tweet = torch.cat((tweet, padding), dim=0)
        tweet = tweet.unsqueeze(0)
        return tweet

In [48]:
class DataModule(LightningDataModule):
    def __init__(self, path, fold_index = None, num_folds = None, batch_size = 32):
        super().__init__()
        self.val_sampler = None
        self.train_sampler = None
        self.val_fold = None
        self.train_fold = None
        self.splits = None
        self.val_dataset = None
        self.train_dataset = None

        self.fold_index = fold_index
        self.num_folds = num_folds
        self.path = path
        self.batch_size = batch_size

        # self.preprocessor = preprocessor

        
        self.tokenizer = get_tokenizer("basic_english")
        self.glove = GloVe(name="6B", dim=DIMS)


    def read_csv_to_numpy(self, path):
        data = pd.read_csv(path)
        data = data.drop(["Unnamed: 0", "count", "hate_speech", "offensive_language", "neither"], axis=1)
        data = data.reindex(columns=["tweet", "class"])

        df = data.drop(
            data[data["class"] == 1]
            .sample(frac=0.92)
            .index
        )

        df = df.drop(
            df[df["class"] == 2]
            .sample(frac=0.65)
            .index
        )

        return df.to_numpy()

    def preprocess_data(self, batch):
        x_arr, y_arr = [], []
        for tweet, label in batch:

            space_pattern = '\s+'
            giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
            mention_regex = '@[\w\-]+'
            tweet = re.sub(space_pattern, ' ', tweet)
            tweet = re.sub(giant_url_regex, 'URLHERE', tweet)
            tweet = re.sub(mention_regex, 'MENTIONHERE', tweet)



            tokenized = self.tokenizer(tweet)
            tweet = self.glove.get_vecs_by_tokens(tokenized)[:MAX_LENGTH, :]
            padding = torch.zeros(MAX_LENGTH-tweet.shape[0], DIMS)
            tweet = torch.cat((tweet, padding), dim=0)
            x_arr.append(tweet.unsqueeze(0))
            y_arr.append(label)

        x = torch.cat(x_arr)
        y = torch.tensor(y_arr)
        
        return x, y 

    def setup(self, stage = None):
        # self.train_dataset = self.read_csv_to_numpy(self.path)
        # self.val_dataset = self.read_csv_to_numpy(self.path)
        self.dataset = self.read_csv_to_numpy(self.path)
        x_indices = list(range(len(self.dataset)))

        # if self.fold_index is not None and self.num_folds is not None:
        #     self.splits = list(
        #         StratifiedKFold(self.num_folds, shuffle=True, random_state=GLOBAL_SEED)
        #             .split(X=x_indices, y=self.train_dataset.targets)
        #     )
        #     train_i, val_i = self.splits[self.fold_index]
        # else:
        #     train_i, val_i = train_test_split(x_indices, test_size=0.2, stratify=self.train_dataset.targets, random_state=GLOBAL_SEED)
        
        train_i, val_i = train_test_split(x_indices, test_size=0.2, stratify=self.dataset[:, 1], random_state=GLOBAL_SEED)

        self.train_sampler = SubsetRandomSampler(train_i)
        self.val_sampler = SubsetRandomSampler(val_i)

    def train_dataloader(self):
        return DataLoader(self.dataset, batch_size=self.batch_size, sampler=self.train_sampler, num_workers=NUM_WORKERS, collate_fn=self.preprocess_data)

    def val_dataloader(self):
        return DataLoader(self.dataset, batch_size=self.batch_size, sampler=self.val_sampler, num_workers=NUM_WORKERS, collate_fn=self.preprocess_data)

    # def test_dataloader(self):
    #     return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4)


In [64]:
class Net(LightningModule):
    def __init__(self, preprocessor, learning_rate=1e-3):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)
        # self.argmax = nn.Arg
        self.loss = nn.CrossEntropyLoss()


        self.accuracy = Accuracy(num_classes=3)
        self.precision_metric = Precision()
        self.recall = Recall()
        self.rocauc = AUROC(pos_label=1)
        self.specificity = Specificity()

        # self.backbone = backbone

        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()
        self.conv = nn.Conv2d(1, 32, 3)
        self.fc1 = nn.Linear(DIMS*MAX_LENGTH, 128)
        # self.fc1 = nn.Linear(32*48*48, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 3)

        self.learning_rate = learning_rate
        
        self.preprocessor = preprocessor

    def forward(self, x):
        # x = self.relu(self.conv(x))
        # x = self.flatten(x)
        x = self.relu(self.fc1(x))
        # x = self.relu(self.fc2(x))
        x = self.fc3(x)
        # x = self.relu(self.fc3(x))
        return x

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = torch.flatten(x, start_dim=1)
        # x = x.unsqueeze(1)
        # y = y.unsqueeze(1).float()
        logits = self(x)
        
        loss = self.loss(logits, y)

        preds = self.softmax(logits)
        acc = self.accuracy(preds, y)

        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=False, on_epoch=True, prog_bar=True)

        # self.log("losses", {"train_loss": loss}, on_step=False, on_epoch=True)
        # self.log("accuracies", {"train_acc": acc}, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = torch.flatten(x, start_dim=1)
        # x = x.unsqueeze(1)
        # y = y.unsqueeze(1).float()
        logits = self(x)

        loss = self.loss(logits, y)

        preds = self.softmax(logits)
        acc = self.accuracy(preds, y)
        pre = self.precision_metric(preds, y)
        rec = self.recall(preds, y)
        # spe = self.specificity(preds, y_int)
        # rocauc = self.rocauc(preds, y_int)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_step=False, on_epoch=True, prog_bar=True)

        self.log('val_pre', pre, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_rec', rec, on_step=False, on_epoch=True, prog_bar=True)
        # self.log('val_spe', spe, on_step=False, on_epoch=True, prog_bar=False)
        # self.log('val_rocauc', rocauc, on_step=False, on_epoch=True, prog_bar=False)

        # self.log("losses", {"val_loss": loss}, on_step=False, on_epoch=True)
        # self.log("accuracies", {"val_acc": acc}, on_step=False, on_epoch=True)

    def predict_sentence(self, sentence):
        transformed = self.preprocessor.inference_preprocess_data(sentence)
        transformed = torch.flatten(transformed, start_dim=1)
        # [1, 50, 50]
        logits = self(transformed).detach()
        return (self.softmax(logits), torch.argmax(logits))

In [70]:
%%time
seed_everything(GLOBAL_SEED, workers=True)

pp = Preprocess()
model = Net(pp, learning_rate=1e-3)
data = DataModule(path=DATA_PATH, batch_size=128)
# data.setup()

# d = next(iter(data.train_dataloader()))[0].unsqueeze(1)
# model(d).shape
trainer = Trainer(max_epochs=30, log_every_n_steps=20, gpus=CUDA_DEVICE_COUNT, deterministic=True)
#os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
#CUBLAS_WORKSPACE_CONFIG=:4096:8 or CUBLAS_WORKSPACE_CONFIG=:16:8
torch.use_deterministic_algorithms(True, warn_only=True)
trainer.fit(model=model, datamodule=data)

Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

   | Name             | Type             | Params
-------------------------------------------------------
0  | softmax          | Softmax          | 0     
1  | loss             | CrossEntropyLoss | 0     
2  | accuracy         | Accuracy         | 0     
3  | precision_metric | Precision        | 0     
4  | recall           | Recall           | 0     
5  | rocauc           | AUROC            | 0     
6  | specificity      | Specificity      | 0     
7  | relu             | ReLU             | 0     
8  | flatten          | Flatten          | 0     
9  | conv             | Conv2d           | 320   
10 | fc1              | Linear           | 320 K 
11 | fc2              | Linear           | 65.7 K
12 | fc3              | Linear           | 387   
-------------------------------------------------------
386 K     Trainable

                                                                           

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Epoch 29: 100%|██████████| 35/35 [00:27<00:00,  1.27it/s, loss=0.0165, v_num=28, val_loss=2.040, val_acc=0.557, val_pre=0.557, val_rec=0.557, train_loss=0.0158, train_acc=0.998]
CPU times: user 28.4 s, sys: 2.78 s, total: 31.2 s
Wall time: 28.6 s
