In [32]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer, AdamW,   get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional.classification import auroc
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [33]:
# Load the training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../input/train.csv")

In [34]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [35]:
# Hold out 5% of the rows for validation. A fixed random_state makes the split
# reproducible, so the row counts and samples displayed further down stay the
# same after a Restart & Run All instead of changing on every execution.
train_df, val_df = train_test_split(df, test_size=0.05, random_state=42)

In [36]:
# Every column after the first two is treated as a label column.
LABEL_COLUMNS = list(df.columns[2:])

In [37]:
# Compare the sizes of the training and validation splits.
train_df.shape, val_df.shape

((151592, 8), (7979, 8))

In [38]:
# Sum of every label value across all label columns of the training split.
train_df[LABEL_COLUMNS].sum().sum()

33214

In [39]:
# Per-row sum across the label columns, shown for the first few rows.
train_df[LABEL_COLUMNS].sum(axis=1).head()

3873      0
105730    0
20338     0
157397    0
78552     0
dtype: int64

In [40]:
# Training rows whose label columns sum to more than zero.
train_toxic = train_df.loc[train_df[LABEL_COLUMNS].sum(axis=1).gt(0)]

In [41]:
# Number of rows (and columns) in the subset with at least one non-zero label sum.
train_toxic.shape

(15378, 8)

In [42]:
# Training rows whose label columns sum to exactly zero.
train_clean = train_df.loc[train_df[LABEL_COLUMNS].sum(axis=1).eq(0)]

In [43]:
# Number of rows (and columns) in the subset whose label sum is zero.
train_clean.shape

(136214, 8)

In [44]:
# Rebalance the training set: keep every row of train_toxic and add a random
# sample of 15,000 rows from train_clean. The fixed random_state keeps the
# sampled rows (and everything derived from them) stable across re-runs.
train_df = pd.concat([
    train_toxic,
    train_clean.sample(15_000, random_state=42)
])

In [45]:
# Size of the training frame after the concat above.
train_df.shape

(30378, 8)

In [46]:
# Column-wise sum of each label column in the training frame.
train_df[LABEL_COLUMNS].sum()

toxic            14500
severe_toxic      1512
obscene           7987
threat             443
insult            7434
identity_hate     1338
dtype: int64

In [47]:
# Name of the pretrained checkpoint; the tokenizer is loaded from this same
# name so both stay in sync wherever BERT_MODEL is used.
BERT_MODEL = "bert-base-cased"
TOKENIZER = BertTokenizer.from_pretrained(BERT_MODEL)


In [48]:
# Take one validation row (position 20) as a worked example and display its
# comment text together with its label values as a dict.
sample_row = val_df.iloc[20]
sample_text = sample_row.comment_text
sample_text_values = sample_row[LABEL_COLUMNS]
sample_text, sample_text_values.to_dict()


 {'identity_hate': 0,
  'insult': 0,
  'obscene': 0,
  'severe_toxic': 0,
  'threat': 0,
  'toxic': 0})

In [49]:
# Encode the sample comment: special tokens are added, the sequence is
# padded/truncated to a length of 512, the attention mask is returned, token
# type ids are omitted, and the result comes back as PyTorch tensors ("pt").
encoding = TOKENIZER.encode_plus(
    sample_text,
    add_special_tokens = True,
    max_length = 512,
    padding="max_length",
    return_token_type_ids=False,
    return_attention_mask = True,
    return_tensors = "pt",
    truncation=True
)

In [50]:
# Show the first 20 token ids and the first 100 attention-mask entries.
encoding.input_ids.squeeze()[:20], encoding.attention_mask.squeeze()[:100]

(tensor([  101,   107,  3605,  6919,  1863,  1113,  1624,   112,   188, 24645,
          1118,   146,  1125,  1106,  2367,  1240,  1494,  1113,  1142,  1164]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]))

In [51]:
# First 20 token ids again, this time obtained via flatten().
encoding.input_ids.flatten()[:20]

tensor([  101,   107,  3605,  6919,  1863,  1113,  1624,   112,   188, 24645,
         1118,   146,  1125,  1106,  2367,  1240,  1494,  1113,  1142,  1164])

In [52]:
# Map the first 20 ids back to their token strings.
TOKENIZER.convert_ids_to_tokens(encoding.input_ids.squeeze()[:20])

['[CLS]',
 '"',
 'Van',
 '##dal',
 '##ism',
 'on',
 'King',
 "'",
 's',
 'Daughters',
 'by',
 'I',
 'had',
 'to',
 'ask',
 'your',
 'help',
 'on',
 'this',
 'about']

In [53]:
class ToxicCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        data: DataFrame with a ``comment_text`` column plus the label columns.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_token_len: Length every encoded sequence is padded/truncated to.
        label_columns: Names of the label columns to read from each row.
            When omitted, the notebook-level ``LABEL_COLUMNS`` is used.

    Each item is a dict with keys ``comment_text``, ``input_ids``,
    ``attention_mask`` and ``labels``.
    """

    def __init__(self, data, tokenizer, max_token_len, label_columns=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_token_len
        self.label_columns = label_columns

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        # Resolved lazily so the module-level constant is only required when no
        # explicit column list was supplied.
        label_columns = LABEL_COLUMNS if self.label_columns is None else self.label_columns

        data_row = self.data.iloc[index]
        comment_text = data_row.comment_text
        labels = data_row[label_columns]

        encoding = self.tokenizer.encode_plus(
            comment_text,
            # Honour the length given to the constructor instead of a fixed 128.
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        return dict(
            comment_text=comment_text,
            # return_tensors="pt" yields a leading batch axis of size 1; flatten drops it.
            input_ids=encoding.input_ids.flatten(),
            attention_mask=encoding.attention_mask.flatten(),
            # A row that mixes text and numeric columns is object-dtype, so
            # convert explicitly to float32 before building the tensor.
            labels=torch.tensor(labels.to_numpy(dtype="float32"))
        )


In [54]:
# Build a dataset over train_df; 128 is passed as max_token_len.
train_dataset = ToxicCommentsDataset(train_df, TOKENIZER, 128)

In [55]:
# Inspect the first encoded example.
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'comment_text': 'resuu u are all niggers \n  lt',
 'input_ids': tensor([  101,  1231,  6385,  1358,   190,  1132,  1155, 11437,  9146,  1116,
           181,  1204,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
    

In [56]:
class ToxicCommentsDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/validation comment frames.

    Args:
        train_df: DataFrame used to build the training dataset.
        val_Df: DataFrame used to build the validation/test dataset. The
            parameter keeps its original spelling so existing keyword callers
            are unaffected.
        tokenizer: Tokenizer handed to each ``ToxicCommentsDataset``.
        max_length: Token length handed to each ``ToxicCommentsDataset``.
        batch_size: Batch size of the training loader.
    """

    def __init__(self, train_df, val_Df, tokenizer, max_length=128, batch_size=8):
        super().__init__()
        self.train_df = train_df
        # Store the argument itself; reading a same-named global here would
        # silently ignore whatever the caller passed in.
        self.val_df = val_Df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Create the datasets.

        ``stage`` is accepted (and ignored) because the Lightning trainer
        passes it when it invokes this hook; calling ``setup()`` by hand with
        no argument keeps working.
        """
        self.train_dataset = ToxicCommentsDataset(
            self.train_df,
            self.tokenizer,
            self.max_len
        )

        # Validation must come from the held-out frame, not the training frame,
        # otherwise the validation loss is measured on data the model trained on.
        self.val_dataset = ToxicCommentsDataset(
            self.val_df,
            self.tokenizer,
            self.max_len
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        """Ordered, one-example-per-batch loader over the validation dataset."""
        return DataLoader(
            self.val_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=4
        )

    def test_dataloader(self):
        """Ordered, one-example-per-batch loader; reuses the validation dataset."""
        return DataLoader(
            self.val_dataset,
            batch_size=1,
            # Evaluation loaders gain nothing from shuffling; a fixed order
            # keeps per-example results comparable between runs.
            shuffle=False,
            num_workers=4
        )

In [57]:
# Training hyperparameters shared by the data module and the model.
BATCH_SIZE=32
N_EPOCHS = 30

# Build the data module and call setup() by hand so its datasets exist right away.
datamodule=ToxicCommentsDataModule(train_df, val_df, TOKENIZER, max_length=128, batch_size=BATCH_SIZE)
datamodule.setup()

In [58]:
class ToxicCommentClassifier(pl.LightningModule):
    """BERT encoder with a linear head producing one sigmoid output per label.

    Args:
        n_classes: Number of output units of the classification head.
        steps_per_epoch: Optimizer steps per epoch; sizes the LR schedule.
        n_epochs: Number of training epochs; sizes the LR schedule.
    """

    def __init__(self, n_classes, steps_per_epoch, n_epochs):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL)
        # Must be a real layer: forward() calls it, and only registered
        # sub-modules contribute parameters to self.parameters().
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``output`` alone, or ``(loss, output)`` when ``labels`` is given.

        ``output`` holds the sigmoid of the classifier head applied to BERT's
        pooled output.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        output = torch.sigmoid(self.classifier(encoded.pooler_output))
        if labels is None:
            return output
        return self.criterion(output, labels), output

    def _shared_step(self, batch, log_name):
        """Run one batch through the model, log its loss under ``log_name``,
        and return ``(loss, output, labels)``."""
        labels = batch["labels"]
        loss, output = self(batch["input_ids"], batch["attention_mask"], labels)
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss, output, labels

    def training_step(self, batch, batch_idx):
        loss, output, labels = self._shared_step(batch, "train_loss")
        # Predictions are only needed for epoch-end metrics; detaching them
        # avoids keeping every batch's autograd graph alive for the whole epoch.
        return {"loss": loss, "predictions": output.detach(), "labels": labels}

    def validation_step(self, batch, batch_idx):
        loss, _, _ = self._shared_step(batch, "val_loss")
        return loss

    def test_step(self, batch, batch_idx):
        loss, _, _ = self._shared_step(batch, "test_loss")
        return loss

    def training_epoch_end(self, outputs):
        """Log one ROC-AUC scalar per label column over the epoch's training batches."""
        # Concatenating the per-batch tensors along the batch axis gives the
        # same (n_examples, n_labels) layout as stacking their rows one by one.
        labels = torch.cat([step["labels"].detach().cpu() for step in outputs])
        predictions = torch.cat([step["predictions"].detach().cpu() for step in outputs])

        for i, name in enumerate(LABEL_COLUMNS):
            roc_score = auroc(predictions[:, i], labels[:, i])
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", roc_score, self.current_epoch)

    def configure_optimizers(self):
        """AdamW with a linear warm-up / linear decay schedule stepped per batch."""
        optimizer = AdamW(self.parameters(), lr=2e-5)
        warmup_steps = self.steps_per_epoch // 3
        # num_training_steps is the full schedule length, warm-up included;
        # subtracting the warm-up would drive the LR to zero before training ends.
        total_steps = self.steps_per_epoch * self.n_epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        # The schedule is expressed in optimizer steps, so Lightning has to
        # advance it every batch rather than at its default once-per-epoch.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
            
    


In [59]:
# Derive the number of outputs from LABEL_COLUMNS rather than hard-coding it,
# and round steps_per_epoch up: the training DataLoader does not drop the
# final partial batch, so floor division would under-count the steps per epoch.
model = ToxicCommentClassifier(
    n_classes=len(LABEL_COLUMNS),
    steps_per_epoch=(len(train_df) + BATCH_SIZE - 1) // BATCH_SIZE,
    n_epochs=N_EPOCHS
)