# BERT only end-to-end Baseline for Jigsaw Toxic Classification

Hello, I'm a new Kaggler and this is the very first Kaggle competition I have participated in.  
I saw many great Kagglers use sklearn-based approaches such as TF-IDF with Ridge Regression, which are really awesome and perform well!!  
So I wrote a simple baseline notebook for newcomers like me! (BERT-architecture-only version)  
I'd appreciate any comments or advice, except for TOXIC ones LOL🤗  

References :
- https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter
- https://www.kaggle.com/chryzal/jigsaw-ensemble-0-864
- https://www.kaggle.com/readoc/toxic-linear-model-pseudo-labelling-lb-0-864  

Please upvote the notebooks linked above, which are much better than mine :)


### Approach Summary
1. Train: fine-tune a BERT model on pseudo-labeled "jigsaw-toxic-comment-classification-challenge" data
2. Valid: after every training epoch, measure accuracy on our "jigsaw-toxic-severity-rating" validation data
3. Test: make the submission with the model that achieved the best validation accuracy

In [None]:
import os
import numpy as np
import pandas as pd
import sys
import gc
import re
import time
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer, logging
# Ignore model init warning
logging.set_verbosity_error()

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"

from IPython.display import display
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")
# Ignore tokenizers warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Original Dataset

In [None]:
# Load the competition's validation file; later cells read its
# "less_toxic" and "more_toxic" text columns.
valid_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# Preview the first rows.
valid_df.head()

In [None]:
# Sanity check: show the Python type of the object returned by pd.read_csv.
type(valid_df)

In [None]:
def clean(data):
    """Normalize one raw comment string before tokenization.

    The substitutions below are applied one after another, in the order
    listed; each one operates on the result of the previous one.

    Args:
        data: The comment text (a ``str``).

    Returns:
        The cleaned text.
    """
    substitutions = (
        # Newlines become plain spaces.
        ('\n', ' '),
        # Drop IP-like tokens: three or more dot-separated digit groups.
        (r'(([0-9]+\.){2,}[0-9]+)', ' '),
        # Put spaces around a single / ! ? . squeezed between two words.
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Four or more repeats of * ! ? ' are shortened to exactly three.
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # A letter repeated 3+ times right before a word boundary -> two.
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # A letter repeated 4+ times followed by another word char -> three.
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        # Collapse runs of spaces into a single space.
        (' +', ' '),
        # Ex) I didn ' t -> I didn't
        (" ' ", "'"),
    )

    for pattern, replacement in substitutions:
        data = re.sub(pattern, replacement, data)

    return data

In [None]:
# Run the same text normalization over both sides of every validation pair.
for pair_col in ("less_toxic", "more_toxic"):
    valid_df[pair_col] = valid_df[pair_col].apply(clean)
valid_df.head()

In [None]:
# Comments to be scored for the final submission (their "text" column is
# tokenized later) and the sample submission used as the output template.
comments_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
submission_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
# Preview both frames.
display(comments_df.head())
display(submission_df.head())

# Jigsaw Toxic Comment Classification Data

In [None]:
# Training data from the earlier toxic-comment classification challenge; later
# cells use its "comment_text" column and the six per-category label columns.
jigsaw_train_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
# Preview the first rows.
jigsaw_train_df.head()

In [None]:
# Label columns of the classification data that are inspected below.
features = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# For each label column, show five random comments flagged with that label.
banner = "*" * 20
for feature in features:
    print(banner, feature.upper(), banner)
    flagged = jigsaw_train_df[feature] == 1
    display(jigsaw_train_df[flagged][["comment_text", feature]].sample(5))

In [None]:
# Work on a copy so the raw training frame stays untouched.
jigsaw_label = deepcopy(jigsaw_train_df)

# Per-category weights used to fold the six binary labels into one score.
FEATURE_WTS = {
    'severe_toxic': 3,
    'identity_hate': 1.5,
    'threat': 1.5,
    'insult': 0.64,
    'toxic': 2,
    'obscene': 0.16,
}

FEATURES = list(FEATURE_WTS.keys())

# Pseudo-label = weighted sum of the category flags (accumulated from 0 in
# dict order), then divided by the largest value so the maximum becomes 1.
jigsaw_label['label'] = sum(
    wt * jigsaw_label[feat] for feat, wt in FEATURE_WTS.items()
)
jigsaw_label['label'] = jigsaw_label['label'] / jigsaw_label['label'].max()

# Balance the set: keep every row with a positive score and draw the same
# number of zero-score rows with a fixed seed.
has_score = jigsaw_label['label'] > 0
pos = jigsaw_label[has_score]
neg = jigsaw_label[jigsaw_label['label'] == 0].sample(len(pos), random_state=201)

# Only the text and its pseudo-label are needed from here on.
jigsaw_label = pd.concat([pos, neg])[["comment_text", "label"]]
jigsaw_label

In [None]:
# Apply the same text normalization to the training comments that was applied
# to the validation pairs.
jigsaw_label["comment_text"] = jigsaw_label["comment_text"].apply(clean)
# Display the cleaned frame.
jigsaw_label

## Datasets & DataLoaders

In [None]:
# Dataset that tokenizes every row once, at construction time
class JigsawDataset(Dataset):
    """Pre-tokenized text dataset for training or inference.

    All rows of ``df[text_col]`` are encoded in ``__init__`` (truncated and
    padded to ``max_length``) and kept in memory as tensors, so
    ``__getitem__`` only does list lookups.

    Args:
        df: Frame holding the text column and, unless ``is_test`` is true,
            a ``'label'`` column.
        tokenizer: Object exposing ``encode_plus`` whose result provides
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sequence is truncated/padded to.
        text_col: Name of the column containing the text.
        is_test: When true, no labels are read and items carry no
            ``"label"`` key.
    """

    def __init__(self, df, tokenizer, max_length, text_col="comment_text", is_test=False):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.is_test = is_test

        # Labels exist only for labelled (non-test) data.
        if not self.is_test:
            self.labels = [
                torch.tensor(value, dtype=torch.float)
                for value in tqdm(df['label'].values)
            ]

        # Encode each text to a fixed-length sequence.
        encoded = []
        for raw_text in tqdm(df[text_col].values):
            encoded.append(
                self.tokenizer.encode_plus(
                    raw_text,
                    truncation=True,
                    add_special_tokens=True,
                    max_length=self.max_len,
                    padding='max_length',
                )
            )
        self.texts = encoded

        # Convert the encodings to tensors up front.
        self.input_ids = [
            torch.tensor(enc["input_ids"], dtype=torch.long)
            for enc in tqdm(self.texts)
        ]
        self.attention_masks = [
            torch.tensor(enc["attention_mask"], dtype=torch.long)
            for enc in tqdm(self.texts)
        ]

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors for row ``index`` (plus ``"label"`` unless in test mode)."""
        sample = {
            "text_ids": self.input_ids[index],
            "text_mask": self.attention_masks[index],
        }
        if not self.is_test:
            sample["label"] = self.labels[index]
        return sample


In [None]:
# Location of the pretrained checkpoint. The last path component
# ("roberta-base") is also used later as the lookup key into
# tokenizer.max_model_input_sizes, so renaming the directory affects that lookup.
model_name = "../input/roberta-base"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Every split is encoded to the same fixed length, looked up once from the
# tokenizer's size table using the last path component of `model_name`.
max_len = tokenizer.max_model_input_sizes[model_name.split("/")[-1]]

# Labelled training data (pseudo-labels from the classification challenge).
train_dataset = JigsawDataset(df=jigsaw_label, tokenizer=tokenizer, max_length=max_len)

# The two sides of each validation pair are encoded as separate unlabelled sets.
valid_less_dataset = JigsawDataset(
    df=valid_df, tokenizer=tokenizer, max_length=max_len,
    text_col="less_toxic", is_test=True,
)

valid_more_dataset = JigsawDataset(
    df=valid_df, tokenizer=tokenizer, max_length=max_len,
    text_col="more_toxic", is_test=True,
)

# Comments to score for the submission.
test_dataset = JigsawDataset(
    df=comments_df, tokenizer=tokenizer, max_length=max_len,
    text_col="text", is_test=True,
)

In [None]:
# Settings shared by every loader.
loader_kwargs = dict(batch_size=16, num_workers=2, pin_memory=True)

# Only the training loader shuffles; the others keep row order so that
# predictions line up with the rows of their source frames.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

valid_less_dataloader = DataLoader(valid_less_dataset, shuffle=False, **loader_kwargs)

valid_more_dataloader = DataLoader(valid_more_dataset, shuffle=False, **loader_kwargs)

test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

## Define Model Class

In [None]:
class JigsawModel(nn.Module):
    """Pretrained transformer encoder with a single-output regression head.

    Args:
        model_name: Name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from the backbone's own config instead of assuming a
        # 768-wide encoder, so larger/smaller checkpoints work unchanged.
        # 768 (the previous hard-coded value) remains the fallback.
        hidden_size = getattr(self.model.config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, ids, mask):
        """Score a batch of token sequences.

        Args:
            ids: Token-id tensor fed to the backbone as ``input_ids``.
            mask: Attention-mask tensor fed as ``attention_mask``.

        Returns:
            The output of the linear head, one value per input row
            (last dimension of size 1).
        """
        out = self.model(input_ids=ids,
                         attention_mask=mask,
                         output_hidden_states=False)
        # Element 1 of the backbone output is used as the sentence vector.
        # NOTE(review): for BERT/RoBERTa-style models this is the pooled
        # output; confirm before switching to a backbone without a pooler.
        out = self.drop(out[1])
        outputs = self.fc(out)
        return outputs

## Set-up for Training

In [None]:
# Optimisation hyper-parameters.
learning_rate = 1e-5
epochs = 2

# Model on the selected device, absolute-error loss, and Adam optimizer.
model = JigsawModel(model_name)
model = model.to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# One-cycle schedule stepped once per batch: the peak LR equals
# `learning_rate`, and 5% of the steps are spent ramping up to it.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=learning_rate,
    epochs=epochs,
    steps_per_epoch=len(train_dataloader),
    pct_start=0.05,
)

## Functions for Training

In [None]:
def train(model, optimizer, scheduler, dataloader, device):
    """Train ``model`` for one pass over ``dataloader``.

    The loss function ``criterion`` and the current ``epoch`` number (shown
    in the progress bar only) are read from module scope, so both must be
    defined before this function is called.

    Args:
        model: Module called as ``model(text_ids, text_mask)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, or ``None``.
        dataloader: Yields dicts with ``'text_ids'``, ``'text_mask'`` and
            ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        The sample-weighted mean loss over all batches seen, or ``0.0`` when
        the dataloader yields no batches.
    """
    model.train()
    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns 0.0 instead of raising.
    epoch_loss = 0.0

    bar = tqdm(dataloader, total=len(dataloader))
    for data in bar:
        text_ids = data['text_ids'].to(device, dtype=torch.long)
        text_mask = data['text_mask'].to(device, dtype=torch.long)
        targets = data['label'].to(device, dtype=torch.float)

        batch_size = text_ids.size(0)

        # Flatten predictions and targets to the same 1-D shape. Without this
        # a (B, 1) prediction and a (B,) target broadcast to (B, B) inside the
        # loss, which silently trains on every prediction/target pair.
        outputs = model(text_ids, text_mask).view(-1)
        targets = targets.view(-1)

        loss = criterion(outputs, targets)

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        if scheduler is not None:
            scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        # Running mean so far; after the last batch this is the epoch mean.
        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])

    return epoch_loss
            
        

In [None]:
@torch.no_grad()
def valid(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode and collect its outputs.

    Args:
        model: Module called as ``model(text_ids, text_mask)``.
        loader: Yields dicts with ``'text_ids'`` and ``'text_mask'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A 1-D NumPy array holding the flattened model outputs for all
        batches, in loader order.
    """
    model.eval()
    scores = []

    for batch in tqdm(loader):
        ids = batch['text_ids'].to(device, dtype=torch.long)
        mask = batch['text_mask'].to(device, dtype=torch.long)

        # Flatten each batch's output, then move it to host memory.
        batch_scores = model(ids, mask).view(-1)
        scores.extend(list(batch_scores.detach().cpu().numpy()))

    return np.array(scores)

## Main Training Part

In [None]:
# Training Part
best_acc = 0
all_preds = []
checkpoint_path = "checkpoint.pth"

# NOTE: the loop variable must stay named `epoch`; train() reads it from
# module scope for its progress bar.
for epoch in tqdm(range(epochs)):

    epoch_loss = train(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        dataloader=train_dataloader,
        device=device,
    )

    # Score both sides of every validation pair; a pair counts as correct
    # when the "more toxic" text gets the strictly higher score.
    less_labels = valid(model, valid_less_dataloader, device)
    more_labels = valid(model, valid_more_dataloader, device)
    preds = more_labels - less_labels
    correct_pairs = preds[preds > 0]
    accuracy = len(correct_pairs) / len(preds)

    # Nothing to do unless this epoch beats the best accuracy so far.
    if accuracy <= best_acc:
        continue

    print(f"Best Accuracy Updated: {accuracy:.4f}")
    print(f"Outdated Best Acc.: {best_acc:.4f}")
    best_acc = accuracy

    # Keep the test predictions of the best epoch and replace the checkpoint.
    best_submission = valid(model, test_dataloader, device)
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)
    torch.save(model, checkpoint_path)

## Submit Your Best Validated Result

In [None]:
# Fill the template's 'score' column with the test predictions kept from the
# best-validated epoch, then write the submission file without the index.
# NOTE(review): best_submission is only assigned inside the training loop when
# an epoch improves on best_acc — confirm it exists before running this cell.
submission_df['score'] = best_submission
submission_df.to_csv("submission.csv", index=False)