## Import libraries

In [None]:
import os; os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer

## Test and Validation Dataset

In [None]:
class Dataset:
    """
    Single-comment dataset for comments_to_score.csv (the submission file).

    Every item is one comment, passed through the tokenizer with
    ``padding="max_length"`` and ``truncation=True`` and returned as a dict
    of ``input_ids`` / ``attention_mask`` long tensors.
    """

    def __init__(self, text, tokenizer, max_len):
        self.text, self.tokenizer, self.max_len = text, tokenizer, max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # str() guards against non-string entries in the text container.
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }


class ValidationDataset:
    """
    Paired dataset for validation_data.csv.

    Each row of ``df`` must provide a ``less_toxic`` and a ``more_toxic``
    column. Indexing returns both comments tokenized (padded/truncated to
    ``max_len``) as long tensors under the keys ``less_input_ids``,
    ``less_attention_mask``, ``more_input_ids`` and ``more_attention_mask``.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def tokenize(self, text):
        """Run the tokenizer on ``text`` with fixed-length padding/truncation."""
        return self.tokenizer(text, max_length=self.max_len,
                              padding="max_length", truncation=True)

    def __getitem__(self, i):
        row = self.df.iloc[i]

        item = {}
        # "less" first, then "more", so the key order of the result is stable.
        for side in ("less", "more"):
            # str() mirrors Dataset.__getitem__: a non-string cell (e.g. a NaN
            # float or a number parsed by pandas) would otherwise make the
            # tokenizer raise.
            encoded = self.tokenize(str(row[f"{side}_toxic"]))
            item[f"{side}_input_ids"] = torch.tensor(
                encoded["input_ids"], dtype=torch.long
            )
            item[f"{side}_attention_mask"] = torch.tensor(
                encoded["attention_mask"], dtype=torch.long
            )
        return item

## Validation


In [None]:
def validate(model_path, max_len, is_multioutput, device=None):
    """
    Measure pairwise ranking accuracy of a checkpoint on validation_data.csv.

    For every (less_toxic, more_toxic) pair the model scores both comments;
    a pair counts as a hit when the less_toxic score is strictly lower than
    the more_toxic score.

    Args:
        model_path: Path or name given to ``from_pretrained`` for both the
            sequence-classification model and its tokenizer.
        max_len: Maximum token length used for padding/truncation.
        is_multioutput: If True, the score is the sum of all logits (one per
            toxicity label). If False, the score is the logit at index 1.
        device: Device to run on. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.

    Returns:
        The fraction of pairs ranked correctly (hits / number of rows).
    """
    if device is None:
        # Fall back to CPU so the function also runs without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    on_cuda = torch.device(device).type == "cuda"

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

    dataset = ValidationDataset(df=df, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        # Pinned host memory only helps host-to-GPU copies.
        dataset, batch_size=64, num_workers=2, pin_memory=on_cuda, shuffle=False
    )

    n_samples = len(dataset)
    hits = 0

    with torch.no_grad():
        for data in data_loader:
            data = {key: value.to(device) for key, value in data.items()}

            less_logits = model(input_ids=data['less_input_ids'],
                                attention_mask=data['less_attention_mask']).logits
            more_logits = model(input_ids=data['more_input_ids'],
                                attention_mask=data['more_attention_mask']).logits

            if is_multioutput:
                # Sum the logits of all the toxic labels
                less_score = less_logits.sum(dim=1)
                more_score = more_logits.sum(dim=1)
            else:
                # Two-class classifier: use the logit at index 1
                less_score = less_logits[:, 1]
                more_score = more_logits[:, 1]

            hits += (less_score < more_score).sum().item()

    accuracy = hits / n_samples
    print(f"Validation Accuracy: {accuracy:4.2f}")

    if on_cuda:
        torch.cuda.empty_cache()
    return accuracy

### Check the performance of HF models on validation set

The best-performing models on the validation set are:
 
- Toxic BERT
- BERT Jigsaw
- Toxic detector Distil-RoBERTa

In [None]:
# --- Settings for the validate() run ---------------------------------------
# The number after each checkpoint is the score the author recorded for it
# (presumably the validation accuracy returned by validate()).

# Tokenizer padding/truncation length.
MAX_LENGTH = 192

# True  -> score = sum of all logits (multi-label checkpoints).
# False -> score = logit at index 1 (two-class checkpoints).
IS_MULTIOUTPUT = True
# IS_MULTIOUTPUT = False

# Checkpoint currently being evaluated.
# https://huggingface.co/jpcorb20/toxic-detector-distilroberta 0.6952
MODEL_PATH = "../input/hugging-face-models/toxic-detector-distilroberta"  # 0.6952

# Other local checkpoints (uncomment one to evaluate it instead):
# MODEL_PATH = "../input/toxic-bert" # 0.7058
# MODEL_PATH = '../input/hugging-face-models/BERT-Jigsaw' # 0.6952
# MODEL_PATH = "../input/roberta-toxicity-classifier" # 0.6858
# MODEL_PATH = "../input/roberta-base-toxicity" # 0.6616

# Hub checkpoints tried, with their recorded scores:
# https://huggingface.co/Cameron/BERT-Jigsaw 0.6952
# https://huggingface.co/abhishek/autonlp-toxic-new-30516963 0.6864
# https://huggingface.co/SkolkovoInstitute/roberta_toxicity_classifier_v1 0.686
# https://huggingface.co/unitary/unbiased-toxic-roberta 0.6848

# Toggles not referenced by the code in this notebook:
# DO_VALIDATE = False
# VALIDATION_SIZE = 5000

In [None]:
# Evaluate the configured checkpoint; the cell displays the returned accuracy.
validate(model_path=MODEL_PATH, max_len=MAX_LENGTH, is_multioutput=IS_MULTIOUTPUT)

## Prediction

In [None]:
def generate_predictions(model_path, max_len, is_multioutput, device=None):
    """
    Score every comment in comments_to_score.csv with one checkpoint.

    Args:
        model_path: Path or name given to ``from_pretrained`` for both the
            sequence-classification model and its tokenizer.
        max_len: Maximum token length used for padding/truncation.
        is_multioutput: If True, the score is the sum of all logits (one
            strategy out of various possible). If False, the score is the
            logit at index 1.
        device: Device to run on. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.

    Returns:
        A 1-D ``np.ndarray`` with one score per row of the CSV, in file order
        (the loader does not shuffle).
    """
    if device is None:
        # Fall back to CPU so the function also runs without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    on_cuda = torch.device(device).type == "cuda"

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

    dataset = Dataset(text=df.text.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        # Pinned host memory only helps host-to-GPU copies.
        dataset, batch_size=32, num_workers=2, pin_memory=on_cuda, shuffle=False
    )

    final_output = []

    with torch.no_grad():
        for data in data_loader:
            data = {key: value.to(device) for key, value in data.items()}
            logits = model(**data).logits

            if is_multioutput:
                # Sum the logits for all the toxic labels
                # One strategy out of various possible
                scores = logits.sum(dim=1)
            else:
                # Classifier. Get logits for "toxic"
                scores = logits[:, 1]

            # Under no_grad the tensor carries no graph, so it can be moved to
            # the host and converted to Python floats directly.
            final_output.extend(scores.cpu().tolist())

    if on_cuda:
        torch.cuda.empty_cache()
    return np.array(final_output)

### Get predictions from the best models

In [None]:
# Score the submission comments with each selected checkpoint.
# toxic-bert and distilroberta are scored by summing all logits;
# BERT-Jigsaw is scored with the logit at index 1.
preds1 = generate_predictions(
    "../input/toxic-bert",
    max_len=192,
    is_multioutput=True,
)
preds2 = generate_predictions(
    "../input/hugging-face-models/toxic-detector-distilroberta",
    max_len=192,
    is_multioutput=True,
)
preds3 = generate_predictions(
    "../input/hugging-face-models/BERT-Jigsaw",
    max_len=192,
    is_multioutput=False,
)

## Ensemble

Average the scores of the three models after min-max scaling each model's scores.

In [None]:
score_columns = ["score_bert", "score_distilrob", "score_bertjig"]

df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
for column, predictions in zip(score_columns, (preds1, preds2, preds3)):
    df_sub[column] = predictions

# The models' raw scores are on different scales, so min-max scale each
# column on its own before taking the row-wise mean.
sc = MinMaxScaler()
scaled_scores = sc.fit_transform(df_sub[score_columns])
df_sub[score_columns] = scaled_scores

df_sub["score"] = df_sub[score_columns].mean(axis=1)

# Count the rows whose ensemble score repeats an earlier row's score (ties).
tie_counts = df_sub.duplicated('score').value_counts()
print(tie_counts)

df_sub.head()

## View some results

In [None]:
# Show up to 500 characters per cell so long comments are readable.
pd.options.display.max_colwidth = 500

In [None]:
# The three comments with the lowest ensemble score.
df_sub.sort_values(by="score").loc[:, ['score', 'text']].iloc[:3]

In [None]:
# The three comments with the highest ensemble score.
df_sub.sort_values(by="score").loc[:, ['score', 'text']].iloc[-3:]

## Submit

In [None]:
# Replace the averaged score by its rank; method='first' gives tied scores
# distinct ranks in order of appearance, so the submission has no ties.
submission_columns = ["comment_id", "score"]
df_sub['score'] = df_sub['score'].rank(method='first')

# Keep only the columns written to the submission file.
df_sub = df_sub.loc[:, submission_columns]
df_sub.to_csv("submission.csv", index=False)
df_sub.head()