<a href="https://colab.research.google.com/github/Potisimus/AlgorithmToolBox/blob/master/Toxic_Comment_Classification_Kaggle_Mohammad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sat Feb  6 00:27:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Installing Transformers

In [2]:
!pip install transformers==4.0.1



### Installing the PyTorch Lightning library

In [3]:
!pip install pytorch-lightning==1.1.0



### Importing General Libraries

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional.classification import auroc

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc


In [5]:
%matplotlib inline
%config InlineBackend.figure_format= 'retina'

RANDOM_SEED = 42

sns.set(style= 'whitegrid', palette= 'muted', font_scale =1.2)
HAPPY_COLORS_PALETTE = ['#01BEFE', '#FFDD00', '#FF7D00', '#FF006D', '#ADFF02', '#8F00FF']
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams["figure.figsize"] = 12,8

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f382924bc30>

In [6]:
# The data was saved to Google Drive; mount the drive at /content/drive before running the next cell.

In [7]:
# Load the training CSV of the Jigsaw toxic-comment challenge into a DataFrame.
# NOTE(review): the path lives under /content/drive/MyDrive, so Google Drive has
# to be mounted before this cell runs — no mount step is visible in this notebook.
df = pd.read_csv("/content/drive/MyDrive/Kaggle/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv")

In [8]:
df.head(n=20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [9]:
# Hold out 5% of the rows for validation. Passing random_state pins the split,
# so re-running this cell always yields the same partition instead of depending
# on how much of the global NumPy random stream has already been consumed.
train_df, val_df = train_test_split(df, test_size=0.05, random_state=RANDOM_SEED)

In [10]:
# Checking the shapes of the data frame
train_df.shape, val_df.shape

((151592, 8), (7979, 8))

In [11]:
# Names of the six binary target columns of the dataset.
LABEL_COLUMNS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [12]:
# Let's look at the distribution of the data
train_df[LABEL_COLUMNS].sum()

toxic            14546
severe_toxic      1515
obscene           8028
threat             465
insult            7467
identity_hate     1334
dtype: int64

In [13]:
train_df[LABEL_COLUMNS].sum().sum()

33355

In [14]:
# Rows whose label columns sum to more than zero, i.e. at least one label is set.
has_any_label = train_df[LABEL_COLUMNS].sum(axis=1) > 0
train_toxic = train_df[has_any_label]

In [15]:
train_toxic.shape

(15427, 8)

In [16]:
# Rows whose label columns sum to exactly zero.
has_no_label = train_df[LABEL_COLUMNS].sum(axis=1) == 0
train_clean = train_df[has_no_label]

In [17]:
train_toxic.shape, train_clean.shape

((15427, 8), (136165, 8))

In [18]:
# Rebalance the training set: keep every labelled row and down-sample the clean
# rows to at most 15,000. A fixed random_state makes the sample repeatable when
# the cell is re-run, and the min() guard avoids the ValueError pandas raises
# when asked for more rows than the frame holds.
train_df = pd.concat([
    train_toxic,
    train_clean.sample(n=min(15_000, len(train_clean)), random_state=RANDOM_SEED)
])

In [19]:
train_df[LABEL_COLUMNS].sum()

toxic            14546
severe_toxic      1515
obscene           8028
threat             465
insult            7467
identity_hate     1334
dtype: int64

In [20]:
# BERT model: name of the pretrained checkpoint and the tokenizer loaded from it.

BERT_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [21]:
class ToxicCommentsDataset(Dataset):
  """Map-style dataset that yields one tokenized comment with its label vector.

  Args:
    data: Frame holding a ``comment_text`` column plus one column for each
      name in ``LABEL_COLUMNS``.
    tokenizer: BERT tokenizer used to encode every comment.
    max_token_len: Length each encoded comment is padded or truncated to.
  """

  def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_len = 128):
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Encode the row at positional ``index``.

    Returns:
      dict with ``comment_text`` (the raw text), ``input_ids`` and
      ``attention_mask`` (flattened 1-D tensors padded / truncated to
      ``max_token_len``) and ``labels`` (float32 tensor with one entry per
      name in ``LABEL_COLUMNS``).
    """
    data_row = self.data.iloc[index]

    comment_text = data_row.comment_text
    # A single row of a mixed-type frame is an object-dtype Series, which
    # torch cannot reliably convert on its own; cast it to a plain float32
    # array before building the tensor.
    labels = data_row[LABEL_COLUMNS].to_numpy(dtype=np.float32)

    encoding = self.tokenizer.encode_plus(
        comment_text,
        add_special_tokens=True,
        max_length= self.max_token_len,
        return_token_type_ids = False,
        padding = "max_length",
        truncation = True,
        return_attention_mask = True,
        return_tensors  ='pt'
    )

    return dict(
        comment_text = comment_text,
        # return_tensors='pt' adds a leading batch axis of size 1; flatten
        # drops it so that the DataLoader can stack samples itself.
        input_ids = encoding["input_ids"].flatten(),
        attention_mask = encoding["attention_mask"].flatten(),
        labels = torch.tensor(labels, dtype=torch.float32)
    )

In [22]:
train_dataset = ToxicCommentsDataset(train_df, tokenizer)

In [23]:
sample_item = train_dataset[0]

In [24]:
sample_item.keys()

dict_keys(['comment_text', 'input_ids', 'attention_mask', 'labels'])

In [25]:
sample_item["comment_text"]

'Hi, ya fucking idiot. ^_^'

In [26]:
sample_item["labels"]

tensor([1., 0., 1., 0., 1., 0.])

In [27]:
sample_item["input_ids"].shape

torch.Size([128])

In [28]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict = True)

In [29]:
# Sanity-check forward pass on a single sample. unsqueeze adds the batch axis;
# no_grad avoids building an autograd graph that is never used here.
with torch.no_grad():
  prediction = bert_model(
      input_ids=sample_item["input_ids"].unsqueeze(dim=0),
      attention_mask=sample_item["attention_mask"].unsqueeze(dim=0)
  )

In [30]:
prediction.last_hidden_state.shape,prediction.pooler_output.shape

(torch.Size([1, 128, 768]), torch.Size([1, 768]))

In [31]:
class ToxicCommentDataModule(pl.LightningDataModule):
  """Lightning data module wrapping the train and evaluation comment frames.

  Args:
    train_df: Frame used to build the training dataset.
    test_df: Frame used to build the dataset served by both the validation
      and the test dataloader.
    tokenizer: Tokenizer handed to every ``ToxicCommentsDataset``.
    batch_size: Batch size of the training dataloader.
    max_token_len: Token length handed to every ``ToxicCommentsDataset``.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size = 8, max_token_len =128):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def setup(self, stage = None):
    """Build the train and evaluation datasets.

    Args:
      stage: Stage name supplied by Lightning when it invokes this hook;
        unused, both datasets are always built. Defaults to ``None`` so a
        manual ``setup()`` call keeps working.
    """
    self.train_dataset = ToxicCommentsDataset(
        self.train_df,
        self.tokenizer,
        self.max_token_len
    )

    self.test_dataset = ToxicCommentsDataset(
        self.test_df,
        self.tokenizer,
        self.max_token_len
    )

  # The three dataloader hooks must be methods of the class. Defined inside
  # setup() they would only be unused local functions, leaving the data module
  # without any dataloaders.
  def train_dataloader(self):
    """Return the shuffled training dataloader."""
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True,
        num_workers =4
    )

  def val_dataloader(self):
    """Return the validation dataloader (one sample per batch, no shuffling)."""
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers =4
    )

  def test_dataloader(self):
    """Return the test dataloader (one sample per batch, no shuffling)."""
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers =4
    )

In [33]:
# Training hyper-parameters.
N_EPOCHS = 10
BATCH_SIZE = 32

# Wrap the training and validation frames in a data module, then build its
# datasets right away.
data_module = ToxicCommentDataModule(
    train_df=train_df,
    test_df=val_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)
data_module.setup()