<span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">🎯 Training Kernel: <strong><a href="https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter">[Pytorch + W&B] Jigsaw Starter</a></strong>.</span>

In [None]:
import os
import gc
import cv2
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig

# Utils
from tqdm import tqdm

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Inference settings, one dict per backbone. The entries differ only in
# `model_name` (the directory holding the pretrained weights/tokenizer files);
# seed, batch size, sequence length, output size and device are the same.
CONFIG_hatebert = {
    'seed': 42,
    'model_name': '../input/hatebert/dehatebert/model/',
    'test_batch_size': 64,
    'max_length': 128,
    'num_classes': 1,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

CONFIG_distilbert = {
    'seed': 42,
    'model_name': '../input/distilbertbaseuncased/',
    'test_batch_size': 64,
    'max_length': 128,
    'num_classes': 1,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

CONFIG_robertabase = {
    'seed': 42,
    'model_name': '../input/roberta-base/',
    'test_batch_size': 64,
    'max_length': 128,
    'num_classes': 1,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

CONFIG_GroNLP = {
    'seed': 42,
    'model_name': '../input/hatebert/GroNLP/model/',
    'test_batch_size': 64,
    'max_length': 128,
    'num_classes': 1,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

# Attach to every config the tokenizer stored alongside its pretrained model.
for _cfg in (CONFIG_hatebert, CONFIG_distilbert, CONFIG_robertabase, CONFIG_GroNLP):
    _cfg["tokenizer"] = AutoTokenizer.from_pretrained(_cfg['model_name'])
del _cfg

In [None]:
# Fine-tuned checkpoints, one file per cross-validation fold (folds 0-4),
# grouped by backbone. All files live in the same input dataset directory.
MODEL_PATHS_hatebert = [
    f'../input/jigsaw-toxic-severity-hatebert-5f/Loss-Fold-{fold}.bin'
    for fold in range(5)
]

MODEL_PATHS_distilbert = [
    f'../input/jigsaw-toxic-severity-hatebert-5f/DistilBERT_Loss-Fold-{fold}.bin'
    for fold in range(5)
]

MODEL_PATHS_robertabase = [
    f'../input/jigsaw-toxic-severity-hatebert-5f/roberta_base_Loss-Fold-{fold}.bin'
    for fold in range(5)
]

MODEL_PATHS_GroNLP = [
    f'../input/jigsaw-toxic-severity-hatebert-5f/GroNLP_CustomOpt-Fold-{fold}.bin'
    for fold in range(5)
]

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    puts cuDNN into deterministic mode with autotuning disabled, and
    exports ``PYTHONHASHSEED`` so results are reproducible between runs.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Export a fixed hash seed as a string in the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # cuDNN needs both switches: deterministic kernels on, benchmarking off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Same seed for every generator.
    for seeder in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    
# Seed once for the whole notebook, using the seed stored in the hatebert config.
set_seed(CONFIG_hatebert['seed'])

In [None]:
# Validation data; in the visible notebook it is referenced only by the
# commented-out less_toxic / more_toxic evaluation cells.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")


# Comments to score; its 'text' column is what the test datasets tokenize.
df_test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
class JigsawDataset(Dataset):
    """Dataset that tokenizes one text column of a DataFrame on the fly.

    Args:
        df: DataFrame holding the texts.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask`` entries.
        max_length: Length every sequence is truncated / padded to.
        col_name: Name of the column in ``df`` that contains the text.
    """

    def __init__(self, df, tokenizer, max_length, col_name):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Keep the raw column values for index-based access in __getitem__.
        self.text = df[col_name].values

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'ids', 'mask'}`` long tensors for the text at ``index``."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        # Map tokenizer field names onto the keys the inference loop reads.
        field_map = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in field_map
        }

In [None]:
# less_toxic_test_dataset = JigsawDataset(df, CONFIG['tokenizer'], CONFIG['max_length'], 'less_toxic')
# less_toxic_test_loader = DataLoader(less_toxic_test_dataset, batch_size=CONFIG['test_batch_size'],
#                          num_workers=2, shuffle=False, pin_memory=True)

# more_toxic_test_dataset = JigsawDataset(df, CONFIG['tokenizer'], CONFIG['max_length'], 'more_toxic')
# more_toxic_test_loader = DataLoader(more_toxic_test_dataset, batch_size=CONFIG['test_batch_size'],
#                          num_workers=2, shuffle=False, pin_memory=True)

# One dataset / loader pair per backbone. Every pair reads the same 'text'
# column of df_test but tokenizes it with that backbone's own tokenizer.
# shuffle=False keeps predictions aligned with the row order of df_test.
test_dataset_distilbert = JigsawDataset(
    df=df_test,
    tokenizer=CONFIG_distilbert['tokenizer'],
    max_length=CONFIG_distilbert['max_length'],
    col_name='text',
)
test_loader_distilbert = DataLoader(
    test_dataset_distilbert,
    batch_size=CONFIG_distilbert['test_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

test_dataset_hatebert = JigsawDataset(
    df=df_test,
    tokenizer=CONFIG_hatebert['tokenizer'],
    max_length=CONFIG_hatebert['max_length'],
    col_name='text',
)
test_loader_hatebert = DataLoader(
    test_dataset_hatebert,
    batch_size=CONFIG_hatebert['test_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

test_dataset_robertabase = JigsawDataset(
    df=df_test,
    tokenizer=CONFIG_robertabase['tokenizer'],
    max_length=CONFIG_robertabase['max_length'],
    col_name='text',
)
test_loader_robertabase = DataLoader(
    test_dataset_robertabase,
    batch_size=CONFIG_robertabase['test_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

test_dataset_GroNLP = JigsawDataset(
    df=df_test,
    tokenizer=CONFIG_GroNLP['tokenizer'],
    max_length=CONFIG_GroNLP['max_length'],
    col_name='text',
)
test_loader_GroNLP = DataLoader(
    test_dataset_GroNLP,
    batch_size=CONFIG_GroNLP['test_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)


# more_toxic_test_dataset_distilbert = JigsawDataset(df, CONFIG_distilbert['tokenizer'], CONFIG_distilbert['max_length'], 'more_toxic')
# more_toxic_test_loader_distilbert = DataLoader(more_toxic_test_dataset_distilbert, batch_size=CONFIG_distilbert['test_batch_size'],
#                          num_workers=2, shuffle=False, pin_memory=True)

# more_toxic_test_dataset_hatebert = JigsawDataset(df, CONFIG_hatebert['tokenizer'], CONFIG_hatebert['max_length'], 'more_toxic')
# more_toxic_test_loader_hatebert = DataLoader(more_toxic_test_dataset_hatebert, batch_size=CONFIG_hatebert['test_batch_size'],
#                          num_workers=2, shuffle=False, pin_memory=True)

# more_toxic_test_dataset_robertabase = JigsawDataset(df, CONFIG_robertabase['tokenizer'], CONFIG_robertabase['max_length'], 'more_toxic')
# more_toxic_test_loader_robertabase = DataLoader(more_toxic_test_dataset_robertabase, batch_size=CONFIG_robertabase['test_batch_size'],
#                          num_workers=2, shuffle=False, pin_memory=True)

# more_toxic_test_dataset_GroNLP = JigsawDataset(df, CONFIG_GroNLP['tokenizer'], CONFIG_GroNLP['max_length'], 'more_toxic')
# more_toxic_test_loader_GroNLP = DataLoader(more_toxic_test_dataset_GroNLP, batch_size=CONFIG_GroNLP['test_batch_size'],
#                          num_workers=2, shuffle=False, pin_memory=True)

In [None]:
# NOTE(review): this definition is shadowed by a later class of the same name
# in this notebook (the one with a no-argument constructor), which is what
# `inference_distilbert` instantiates. This version is effectively unused.
class JigsawModel_distilbert(nn.Module):
    """Pretrained encoder followed by dropout and a single linear layer."""

    def __init__(self, model_name):
        super(JigsawModel_distilbert, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # NOTE(review): 1024 input features differs from the 768 used by the
        # sibling model heads — confirm against the backbone's hidden size.
        self.fc = nn.Linear(1024, CONFIG_distilbert['num_classes'])
        
    def forward(self, ids, mask):        
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        # Element 1 of the encoder output feeds the head; presumably the
        # pooled representation — TODO confirm the backbone provides one.
        out = self.drop(out[1])
        outputs = self.fc(out)
        return outputs
    
class JigsawModel_hatebert(nn.Module):
    """Pretrained encoder with a dropout + linear scoring head.

    Args:
        model_name: Path or identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG_hatebert['num_classes'])

    def forward(self, ids, mask):
        """Return the head's output for token ``ids`` and attention ``mask``."""
        encoder_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element 1 of the encoder output feeds the head; presumably the
        # pooled sentence representation for BERT-style models — confirm.
        pooled = self.drop(encoder_out[1])
        return self.fc(pooled)
    
class JigsawModel_robertabase(nn.Module):
    """Pretrained encoder with a dropout + linear scoring head (RoBERTa setup).

    Args:
        model_name: Path or identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from this model's own config rather than the hatebert one.
        self.fc = nn.Linear(768, CONFIG_robertabase['num_classes'])

    def forward(self, ids, mask):
        """Return the head's output for token ``ids`` and attention ``mask``."""
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        # Element 1 of the encoder output feeds the head; presumably the
        # pooled sentence representation — confirm for this backbone.
        out = self.drop(out[1])
        return self.fc(out)
    
class JigsawModel_GroNLP(nn.Module):
    """Pretrained encoder with a dropout + linear scoring head (GroNLP setup).

    Args:
        model_name: Path or identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from this model's own config rather than the hatebert one.
        self.fc = nn.Linear(768, CONFIG_GroNLP['num_classes'])

    def forward(self, ids, mask):
        """Return the head's output for token ``ids`` and attention ``mask``."""
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        # Element 1 of the encoder output feeds the head; presumably the
        # pooled sentence representation — confirm for this backbone.
        out = self.drop(out[1])
        return self.fc(out)

In [None]:
class JigsawModel_distilbert(nn.Module):
    """DistilBERT encoder with learned attention pooling and a linear regressor.

    The pretrained weights and config are read from
    ``CONFIG_distilbert['model_name']``; the constructor takes no arguments.
    """

    def __init__(self):
        super().__init__()

        pretrained_dir = CONFIG_distilbert['model_name']
        config = DistilBertConfig.from_pretrained(pretrained_dir)
        config.update({
            "layer_norm_eps": 1e-7,
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
        })
        self.distilbert = DistilBertModel.from_pretrained(pretrained_dir, config=config)

        # Scores each position with a small MLP, then normalises over dim 1.
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        # Maps the pooled 768-dimensional vector to a single value.
        self.regressor = nn.Sequential(
            nn.Linear(768, 1),
        )

    def forward(self, input_ids, attention_mask):
        """Return one regression value per input sequence."""
        hidden = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        # Weighted sum of the hidden states over dim 1 (assumed to be the
        # token axis of a (batch, seq, 768) tensor — TODO confirm).
        weights = self.attention(hidden)
        pooled = (weights * hidden).sum(dim=1)

        return self.regressor(pooled)

In [None]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and collect its outputs.

    Args:
        model: Module called as ``model(ids, mask)``.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'ids'`` and ``'mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        1-D NumPy array with the flattened outputs of all batches, in
        iteration order. Empty if the dataloader yields no batches.
    """
    model.eval()

    batch_preds = []

    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        # Flatten each batch and move it to host memory right away.
        batch_preds.append(outputs.view(-1).cpu().detach().numpy())

    gc.collect()

    # np.concatenate raises on an empty list, so handle "no batches" explicitly.
    if not batch_preds:
        return np.array([], dtype=np.float32)
    return np.concatenate(batch_preds)

In [None]:
def _ensemble_predict(build_model, model_paths, dataloader, device):
    """Average one architecture's predictions over several checkpoints.

    Args:
        build_model: Zero-argument callable returning a fresh, untrained-head
            model instance whose state dict matches the checkpoints.
        model_paths: Checkpoint files, one per fold.
        dataloader: Batches to score; passed unchanged to ``valid_fn``.
        device: Device used for the model, the checkpoint tensors and the batches.

    Returns:
        NumPy array holding the element-wise mean of the per-checkpoint predictions.
    """
    fold_preds = []
    for i, path in enumerate(model_paths):
        model = build_model()
        model.to(device)
        # map_location keeps checkpoints saved on a GPU loadable on other devices.
        model.load_state_dict(torch.load(path, map_location=device))

        print(f"Getting predictions for model {i+1}")
        fold_preds.append(valid_fn(model, dataloader, device))

    return np.mean(np.array(fold_preds), axis=0)


def inference_distilbert(model_paths, dataloader, device):
    """Fold-averaged predictions of the DistilBERT model on ``dataloader``."""
    return _ensemble_predict(JigsawModel_distilbert, model_paths, dataloader, device)


def inference_hatebert(model_paths, dataloader, device):
    """Fold-averaged predictions of the hatebert model on ``dataloader``."""
    return _ensemble_predict(
        lambda: JigsawModel_hatebert(CONFIG_hatebert['model_name']),
        model_paths, dataloader, device,
    )


def inference_GroNLP(model_paths, dataloader, device):
    """Fold-averaged predictions of the GroNLP model on ``dataloader``."""
    # Uses the dedicated GroNLP class; its layers carry the same names as the
    # hatebert class, so the same checkpoints load.
    return _ensemble_predict(
        lambda: JigsawModel_GroNLP(CONFIG_GroNLP['model_name']),
        model_paths, dataloader, device,
    )


def inference_robertabase(model_paths, dataloader, device):
    """Fold-averaged predictions of the RoBERTa-base model on ``dataloader``."""
    # Uses the dedicated RoBERTa class; its layers carry the same names as the
    # hatebert class, so the same checkpoints load.
    return _ensemble_predict(
        lambda: JigsawModel_robertabase(CONFIG_robertabase['model_name']),
        model_paths, dataloader, device,
    )

In [None]:
# less_toxic_preds_distilbert = inference_distilbert(MODEL_PATHS_distilbert, less_toxic_test_loader_distilbert, CONFIG_distilbert['device'])
# less_toxic_preds_robertabase = inference_robertabase(MODEL_PATHS_robertabase, less_toxic_test_loader_robertabase, CONFIG_robertabase['device'])
# less_toxic_preds_hatebert = inference_hatebert(MODEL_PATHS_hatebert, less_toxic_test_loader_hatebert, CONFIG_GroNLP['device'])
# less_toxic_preds_GroNLP = inference_GroNLP(MODEL_PATHS_GroNLP, less_toxic_test_loader_GroNLP, CONFIG_GroNLP['device'])

In [None]:
# more_toxic_preds = inference(MODEL_PATHS, more_toxic_test_loader, CONFIG['device'])

# more_toxic_preds_distilbert = inference_distilbert(MODEL_PATHS_distilbert, more_toxic_test_loader_distilbert, CONFIG_distilbert['device'])
# more_toxic_preds_robertabase = inference_robertabase(MODEL_PATHS_robertabase, more_toxic_test_loader_robertabase, CONFIG_robertabase['device'])
# more_toxic_preds_hatebert = inference_hatebert(MODEL_PATHS_hatebert, more_toxic_test_loader_hatebert, CONFIG_GroNLP['device'])
# more_toxic_preds_GroNLP = inference_GroNLP(MODEL_PATHS_GroNLP, more_toxic_test_loader_GroNLP, CONFIG_GroNLP['device'])

In [None]:
# less_toxic_preds_ensemble =  less_toxic_preds_distilbert * 0.10 + less_toxic_preds_hatebert * 0.35 + less_toxic_preds_robertabase * 0.20 + less_toxic_preds_GroNLP * 0.35
# more_toxic_preds_ensemble =  more_toxic_preds_distilbert * 0.10 + more_toxic_preds_hatebert * 0.35 + more_toxic_preds_robertabase * 0.20 + more_toxic_preds_GroNLP * 0.35

In [None]:
# less_toxic_preds_ensemble =  less_toxic_preds_distilbert * 0.40 + less_toxic_preds_GroNLP * 0.60
# more_toxic_preds_ensemble =  more_toxic_preds_distilbert * 0.40 + more_toxic_preds_GroNLP * 0.60

In [None]:
# (less_toxic_preds_ensemble < more_toxic_preds_ensemble).mean()

In [None]:
# Fold-averaged predictions on the comments to score, one array per backbone.
# Each call is given its own checkpoints, loader and config device.
preds_distilbert = inference_distilbert(MODEL_PATHS_distilbert, test_loader_distilbert, CONFIG_distilbert['device'])
preds_robertabase = inference_robertabase(MODEL_PATHS_robertabase, test_loader_robertabase, CONFIG_robertabase['device'])
preds_hatebert = inference_hatebert(MODEL_PATHS_hatebert, test_loader_hatebert, CONFIG_hatebert['device'])
preds_GroNLP = inference_GroNLP(MODEL_PATHS_GroNLP, test_loader_GroNLP, CONFIG_GroNLP['device'])

In [None]:
# print(f"Total Predictions: {preds.shape[0]}")
# print(f"Total Unique Predictions: {np.unique(preds).shape[0]}")

In [None]:
# Weighted blend of the four backbones; the weights (0.25, 0.50, 0.125, 0.125) sum to 1.0.
df_test['score'] = preds_distilbert * 0.25 + preds_GroNLP * 0.50 + preds_robertabase * 0.125 + preds_hatebert * 0.125
# df_test.head()

In [None]:
# Replace the blended scores by their rank. method='first' breaks ties by
# order of appearance, so every row ends up with a distinct value.
df_test['score'] = df_test['score'].rank(method='first')
df_test.head()

In [None]:
# Remove the raw text in place so only the remaining columns are written,
# then save the submission file without the DataFrame index.
df_test.drop('text', axis=1, inplace=True)
df_test.to_csv("submission.csv", index=False)