# Using Toxic Comment Classification

Here I have used the multi-label Toxic Comment Classification Dataset (by Jigsaw).

I have used Roberta-base to predict 6 different outputs each belonging to one label (toxic, severe_toxic, etc...)

I have not passed the output logits through any sigmoid function. Just used the outputs for these 6 labels and taken a linear average of them.

That's what I have used as the score for comparison.

#### I have used:

* BCEWithLogitsLoss as the loss function : It already applies a sigmoid function on the output before calculating the loss
* Early stopping with a patience of 1 : You can modify according to your need
* Used CosineAnnealingLR scheduler : It changes the LR per step following a cosine function.



## The final layer of the model looks like this:

 ![NN](https://user-images.githubusercontent.com/74188336/141213710-3a1b7473-8436-4683-841e-64d87789f47e.png)

It has 6 output heads giving outputs to the 6 different labels to determine.

These 6 different heads are attached on top of roberta-base (for now); I will implement roberta-large too.

### Please <span style="color:red">UPVOTE</span> If it Helps :)

Model used : RoBERTa-base (RoBERTa-large is way too big and takes a whole lot of time training O.o)

#### The Inference Kernel : [INFER | ToxicComments | 🌟](https://www.kaggle.com/kishalmandal/inference-comments/edit)

**I have trained only 2 folds till now and got 0.798 on the LB (made public)**

In [None]:
!pip install tez -q

In [None]:
import gc
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import transformers
import tez
from transformers import AdamW, AutoTokenizer, AutoModel
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import optuna

# Config

In [None]:
class Config:
    """Hyper-parameters shared by the model, data loaders and training loop."""

    # --- Backbone / tokenisation -------------------------------------------
    # Checkpoint name handed to AutoModel / AutoTokenizer.from_pretrained.
    model_name = 'roberta-base'
    # Every comment is truncated / padded to this many tokens.
    max_length = 128

    # --- Optimisation ------------------------------------------------------
    lr = 1e-4
    weight_decay = 0.01
    # Name looked up by get_scheduler().
    scheduler = 'CosineAnnealingLR'

    # --- Loop control ------------------------------------------------------
    # Training batch size (the validation loader uses twice this value).
    batch_size = 96
    # Upper bound on epochs; early stopping may end training sooner.
    epochs = 20
    # Early-stopping patience, in epochs without validation improvement.
    early_stopping_epochs = 1

# Dataset

In [None]:
class ToxicDataset:
    """Map-style dataset yielding a tokenised comment plus its six labels.

    Args:
        data: DataFrame with a ``comment_text`` column and one column per
            label listed in ``_LABELS``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_len: Length every comment is truncated / padded to.
    """

    # Label columns, in the order they are emitted by ``__getitem__``.
    _LABELS = (
        'toxic', 'severe_toxic', 'obscene',
        'threat', 'insult', 'identity_hate',
    )

    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.comments = data['comment_text'].values
        self.targets = data[list(self._LABELS)].values

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Return a dict of tensors for the comment at position ``idx``.

        ``input_ids`` and ``attention_mask`` are ``torch.long``; each label
        entry is a scalar ``torch.float`` tensor keyed by its column name.
        """
        encoding = self.tokenizer.encode_plus(
            self.comments[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
        )

        sample = {
            field: torch.tensor(encoding[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        for label, value in zip(self._LABELS, self.targets[idx]):
            sample[label] = torch.tensor(value, dtype=torch.float)
        return sample

# The Model

In [None]:
class ToxicModel(nn.Module):
    """Transformer backbone with six independent single-logit heads.

    Args:
        args: Config-like object; ``args.model_name`` selects the checkpoint
            loaded through ``AutoModel.from_pretrained``.
        model_name: Accepted for interface compatibility. The checkpoint that
            is actually loaded is ``args.model_name``.
    """

    def __init__(self, args, model_name):
        super(ToxicModel, self).__init__()
        self.args = args
        self.model = AutoModel.from_pretrained(self.args.model_name)

        # Size the heads from the backbone's own config instead of assuming
        # 768, so larger checkpoints (e.g. roberta-large) work unchanged.
        hidden_size = self.model.config.hidden_size

        self.dropout = nn.Dropout(p=0.2)
        # Attribute names are kept as-is: they are the state_dict keys of
        # any checkpoint saved from this class.
        self.toxic = nn.Linear(hidden_size, 1)
        self.stoxic = nn.Linear(hidden_size, 1)
        self.obs = nn.Linear(hidden_size, 1)
        self.threat = nn.Linear(hidden_size, 1)
        self.insult = nn.Linear(hidden_size, 1)
        self.id_hate = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return a list of six raw-logit tensors, one per label.

        The order is toxic, severe_toxic, obscene, threat, insult,
        identity_hate. No sigmoid is applied here.
        """
        out = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False
        )

        # Element 1 of the backbone output is used as the sentence
        # representation (presumably the pooled output — confirm for
        # backbones other than RoBERTa).
        out = self.dropout(out[1])

        toxic = self.toxic(out)
        stoxic = self.stoxic(out)
        obs = self.obs(out)
        threat = self.threat(out)
        insult = self.insult(out)
        id_hate = self.id_hate(out)

        return [toxic, stoxic, obs, threat, insult, id_hate]
        

# Loss Function

In [None]:
def loss_fn(outputs, targets):
    """Average the per-label BCE-with-logits losses over the six heads.

    Args:
        outputs: Iterable of exactly six logit tensors.
        targets: Iterable of exactly six target tensors; each is reshaped to
            a column (``view(-1, 1)``) before being compared to its logits.

    Returns:
        Scalar tensor: mean of the six per-label losses.
    """
    # Tuple unpacking doubles as an arity check: anything other than six
    # heads / six targets raises ValueError.
    h1, h2, h3, h4, h5, h6 = outputs
    y1, y2, y3, y4, y5, y6 = targets

    criterion = nn.BCEWithLogitsLoss()
    pairs = zip((h1, h2, h3, h4, h5, h6), (y1, y2, y3, y4, y5, y6))
    per_label = [criterion(logits, labels.view(-1, 1)) for logits, labels in pairs]

    # Accumulate left to right so the summation order is preserved.
    total = per_label[0]
    for extra in per_label[1:]:
        total = total + extra

    return total / 6

In [None]:
def metrics(outputs, targets):
    """Return the mean ROC-AUC across label heads.

    Args:
        outputs: Iterable of score tensors, one per label.
        targets: Iterable of ground-truth tensors, aligned with ``outputs``.

    Returns:
        Mean of the per-label AUCs. Labels whose targets contain a single
        class are skipped, because ROC-AUC is undefined for them; if every
        label is skipped the result is ``nan``.
    """
    auc_scores = []
    for o, t in zip(outputs, targets):
        y_score = o.cpu().detach().numpy().reshape(-1)
        y_true = t.cpu().detach().numpy().reshape(-1)

        # roc_auc_score raises when only one class is present; rare labels
        # can easily be all-negative within a batch.
        if np.unique(y_true).size < 2:
            continue

        # Argument order is (y_true, y_score).
        auc_scores.append(roc_auc_score(y_true, y_score))

    if not auc_scores:
        return float('nan')
    return np.mean(auc_scores)

# Training epoch

In [None]:
def train_epoch(args, dataloader, model, optimizer, scheduler, epoch):
    """Run one training pass over ``dataloader`` and return the mean loss.

    Args:
        args: Config-like object (kept for interface compatibility; the
            batch size is now read from each batch).
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and
            the six label tensors.
        model: Model called as ``model(input_ids, attention_mask)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, or ``None``.
        epoch: Epoch index, used only for the progress-bar display.

    Returns:
        Sample-weighted mean training loss over the epoch (``0.0`` when the
        loader is empty).
    """
    model.train()
    epoch_loss = 0.0
    running_loss = 0.0
    dataset_size = 0
    label_keys = ('toxic', 'severe_toxic', 'obscene',
                  'threat', 'insult', 'identity_hate')

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        optimizer.zero_grad()

        input_ids = data['input_ids'].cuda()
        attention_mask = data['attention_mask'].cuda()
        targets = tuple(data[key].cuda() for key in label_keys)

        # Use the real number of samples: the last batch may be smaller than
        # the configured size, and weighting by the nominal size would
        # over-weight it in the running mean.
        batch_size = input_ids.size(0)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    return epoch_loss

# Validation Epoch

In [None]:
def validation(args, dataloader, model):
    """Evaluate ``model`` on ``dataloader`` and return the mean loss.

    Args:
        args: Config-like object (kept for interface compatibility; the
            batch size is now read from each batch).
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and
            the six label tensors.
        model: Model called as ``model(input_ids, attention_mask)``.

    Returns:
        Sample-weighted mean validation loss (``0.0`` when the loader is
        empty).
    """
    model.eval()
    epoch_loss = 0.0
    running_loss = 0.0
    dataset_size = 0
    label_keys = ('toxic', 'severe_toxic', 'obscene',
                  'threat', 'insult', 'identity_hate')

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    with torch.no_grad():
        for step, data in bar:
            input_ids = data['input_ids'].cuda()
            attention_mask = data['attention_mask'].cuda()
            targets = tuple(data[key].cuda() for key in label_keys)

            # Weight by the real batch size. The validation loader is built
            # with a different batch size than args.batch_size, and its last
            # batch may be short, so the nominal value would mis-weight the
            # average.
            batch_size = input_ids.size(0)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, targets)

            running_loss += (loss.item() * batch_size)
            dataset_size += batch_size
            epoch_loss = running_loss / dataset_size

            bar.set_postfix(Valid_Loss=epoch_loss,
                            Stage='Validation')
    return epoch_loss

# Optimizer

In [None]:
def get_optimizer(args, params):
    """Build the AdamW optimizer for ``params``.

    The learning rate and weight decay are read from ``args.lr`` and
    ``args.weight_decay``.
    """
    # NOTE(review): this is the AdamW imported from `transformers`; newer
    # releases may prefer torch.optim.AdamW — confirm against the installed
    # version before switching, as the defaults may differ.
    return AdamW(
        params,
        lr=args.lr,
        weight_decay=args.weight_decay,
    )

# Scheduler

In [None]:
def get_scheduler(args, optimizer):
    """Return the LR scheduler named by ``args.scheduler``, or ``None``.

    Args:
        args: Config-like object with a ``scheduler`` attribute.
        optimizer: Optimizer the scheduler will drive.

    Returns:
        A ``CosineAnnealingLR`` (``T_max=500``, ``eta_min=1e-6``) when
        ``args.scheduler == 'CosineAnnealingLR'``; otherwise ``None``, which
        the training loop treats as "no scheduling".
    """
    if args.scheduler == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=500,
                                              eta_min=1e-6)
    # Previously a misspelled assignment left the return variable unbound
    # here, so any other scheduler name raised instead of disabling it.
    return None

# Training and validation Loop

In [None]:
def run(data, fold, args=None, save_model=False):
    """Train and validate a model on one cross-validation fold.

    Args:
        data: DataFrame with a ``kfold`` column plus the columns read by
            ``ToxicDataset``. Rows whose ``kfold`` equals ``fold`` form the
            validation set; all other rows are used for training.
        fold: Fold id held out for validation.
        args: Config-like object; a fresh ``Config()`` is used when ``None``.
        save_model: When true, the weights are written to
            ``model_fold_{fold}.bin`` every time the validation loss improves.

    Returns:
        The best (lowest) validation loss observed.
    """
    print('-'*50)
    print(f'Fold : {fold}')
    print('-'*50)

    if args is None:
        args = Config()

    start = time.time()
    model = ToxicModel(args, args.model_name)
    model = model.cuda()
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    optimizer = get_optimizer(args, model.parameters())
    scheduler = get_scheduler(args, optimizer)

    train = data[data['kfold'] != fold]
    valid = data[data['kfold'] == fold]

    train_dataset = ToxicDataset(train, tokenizer, args.max_length)
    valid_dataset = ToxicDataset(valid, tokenizer, args.max_length)

    # Shuffle the training data each epoch so batches are not tied to the
    # row order of the input frame; validation order does not matter.
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=2*args.batch_size)

    best_val_loss = np.inf
    patience_counter = 0

    for epoch in range(args.epochs):
        train_loss = train_epoch(args, train_loader, model, optimizer, scheduler, epoch)

        valid_loss = validation(args, valid_loader, model)

        if valid_loss <= best_val_loss:
            print(f"Validation Loss Improved ({best_val_loss} ---> {valid_loss})")
            best_val_loss = valid_loss
            # Patience counts *consecutive* epochs without improvement, so an
            # improvement starts the count over.
            patience_counter = 0

            if save_model:
                PATH = f"model_fold_{fold}.bin"
                torch.save(model.state_dict(), PATH)
                print("----------Model Saved----------")

        else:
            patience_counter += 1
            print(f'Early stopping counter {patience_counter} of {args.early_stopping_epochs}')
            # ">=" also stops cleanly if the patience is configured as 0.
            if patience_counter >= args.early_stopping_epochs:
                print('*************** Early Stopping ***************')
                break

    end = time.time()
    time_elapsed = end-start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_val_loss))

    # Drop references to the model and loaders before the next fold is run.
    del model, train_loader, valid_loader
    gc.collect()
    return best_val_loss

# Read the data

I have used 5 folds that I have cleaned using this notebook: [Multi-Label Stratified K-fold | Toxic Comments](https://www.kaggle.com/kishalmandal/multi-label-stratified-k-fold-toxic-comments)

The data is the same data used in the toxicity classification challenge by Jigsaw

In [None]:
# Load the pre-split training data; run() expects a `kfold` column in this frame.
df = pd.read_csv('../input/multi-label-stratified-k-fold-toxic-comments/5folds.csv')

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Sanity check: per-column count of remaining nulls (shown as the cell output).
df.isnull().sum()

# Optuna

In [None]:
# df_trial = df[:100]

In [None]:
# def objective(trial):
#     args = Config()
#     args.epochs=1
#     args.lr = trial.suggest_uniform('lr',1e-6, 1e-3)
#     all_losses = []
#     for fold in range(5):
#         temp_loss = run(df_trial, fold, args=args)
#         all_losses.append(temp_loss)
    
#     return np.mean(all_losses)

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=10)

# print('Best Trial:')
# trial_ = study.best_trial
# print(trial_.values)

In [None]:
# trial_.params['lr']

# Run training

In [None]:
# Train fold 0 only, saving the best weights to model_fold_0.bin.
# The learning rate overrides the Config default (presumably the value found
# by the Optuna search above — TODO confirm).
args=Config()
args.lr = 0.0005149849355804644
run(df, fold=0, save_model=True, args=args)