In [1]:
import os
import math
import random
import time
import glob
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold

import gc
gc.enable()

In [2]:
# Number of per-fold checkpoints loaded and averaged for an ensemble member
# (read by the Electra inference cell).
NUM_FOLDS = 5
# Batch size handed to every inference DataLoader.
BATCH_SIZE = 8
# Default padded/truncated token length used by LitDataset.
MAX_LEN = 256

# Run on the GPU when torch can see one, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def set_random_seed(random_seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into a
    deterministic configuration.

    Args:
        random_seed: Integer seed shared by all generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Only affects hash randomization of interpreters started after this point.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # undermines the deterministic flag above, so switch it off as well.
    torch.backends.cudnn.benchmark = False

In [4]:
# Comments to score, and the submission template that will receive the scores.
test_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
submission_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
# A template with exactly 7537 rows is special-cased: both frames are cut down
# to their first 15 rows so they stay aligned.
# NOTE(review): presumably 7537 is the size of the public sample file and the
# truncation only exists to keep interactive runs fast — confirm.
if len(submission_df) == 7537:
    test_df = test_df.head(15)
    submission_df = submission_df.head(15)

# Dataset

In [5]:

class LitDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a frame once, up front.

    Every item is an ``(input_ids, attention_mask)`` pair of tensors built
    from the pre-computed encodings.
    """

    def __init__(self, df, tokenizer, MAX_LEN=MAX_LEN):
        """Tokenize all texts of ``df`` with ``tokenizer``.

        Args:
            df: Frame exposing a ``text`` column.
            tokenizer: Object providing ``batch_encode_plus``.
            MAX_LEN: Length each sequence is padded / truncated to.
        """
        super().__init__()

        self.df = df
        self.text = df.text.tolist()

        # Fixed-length encoding so that default batch collation can stack items.
        encode_options = dict(
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )
        self.encoded = tokenizer.batch_encode_plus(self.text, **encode_options)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the ``(input_ids, attention_mask)`` tensors of row ``index``."""
        ids, mask = (
            torch.tensor(self.encoded[field][index])
            for field in ('input_ids', 'attention_mask')
        )
        return (ids, mask)


# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [6]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    The model is switched to eval mode (and left there) and run without
    gradient tracking. Batches are moved to the device the model's parameters
    live on, so the call works wherever the caller placed the model.

    Args:
        model: Callable ``nn.Module`` taking ``(input_ids, attention_mask)``
            and returning one value per sample.
        data_loader: Non-shuffling loader yielding ``(input_ids,
            attention_mask)`` batches; ``data_loader.dataset`` must support
            ``len``.

    Returns:
        1-D ``np.ndarray`` with one prediction per dataset item, in loader
        order.
    """
    model.eval()

    # Follow the model's own placement; only a parameter-less model falls back
    # to the notebook-wide DEVICE setting.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device(DEVICE)

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            pred = model(input_ids, attention_mask)
            batch_size = pred.shape[0]

            # Explicit host copy + numpy conversion before writing the slice.
            result[index : index + batch_size] = pred.flatten().cpu().numpy()
            index += batch_size

    return result

# Inference

## Roberta large att

In [7]:
class AttentionModel(nn.Module):
    """Pretrained encoder followed by attention pooling and a linear head.

    The last hidden-state tensor of the encoder is pooled with learned
    softmax weights and the pooled vector is mapped to a single output value.
    """

    def __init__(self, path):
        """Build the encoder and the pooling / regression heads.

        Args:
            path: Directory or identifier of the pretrained encoder, passed to
                ``AutoConfig`` / ``AutoModel``. The argument is honoured
                directly rather than read from a notebook-level global.
        """
        super().__init__()

        config = AutoConfig.from_pretrained(path)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        # The attribute keeps the name `roberta` whatever backbone is loaded:
        # renaming it would change the state_dict keys of saved checkpoints.
        self.roberta = AutoModel.from_pretrained(path, config=config)
        self.attention = nn.Sequential(
            nn.Linear(config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            # Normalises over dim 1 — assumes hidden states are
            # (batch, seq, hidden), TODO confirm.
            nn.Softmax(dim=1)
        )
        self.fc = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the output of the linear head for each input sequence."""
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)
        last_layer_hidden_states = roberta_output.hidden_states[-1]
        # NOTE(review): attention_mask is not applied to the pooling weights,
        # so padded positions take part in the softmax. Left unchanged so the
        # output matches what the saved checkpoints expect.
        weights = self.attention(last_layer_hidden_states)

        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)
        return self.fc(context_vector)

In [8]:
# Backbone / tokenizer directory for this ensemble member.
TOKENIZER_PATH = ROBERTA_PATH = "../input/roberta-transformers-pytorch/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
# One row of predictions per fold checkpoint; averaged after the loop.
all_predictions = np.zeros((NUM_FOLDS, len(test_df)))
test_dataset = LitDataset(test_df, tokenizer=tokenizer)
# shuffle=False keeps predictions aligned with the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=False, shuffle=False, num_workers=2)

for index in range(NUM_FOLDS):
    model_path = f"../input/robertalargeatt-models/roberta_large_att_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = AttentionModel(path=ROBERTA_PATH)
    # Deserialize onto the CPU first so checkpoints saved from a GPU also load
    # on CPU-only hosts; the model is moved to DEVICE right after.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    # Drop the model and return cached GPU blocks before loading the next one.
    del model
    gc.collect()
    torch.cuda.empty_cache()
pred1 = all_predictions.mean(axis=0)


Using ../input/robertalargeatt-models/roberta_large_att_1.pth

Using ../input/robertalargeatt-models/roberta_large_att_2.pth

Using ../input/robertalargeatt-models/roberta_large_att_3.pth

Using ../input/robertalargeatt-models/roberta_large_att_4.pth

Using ../input/robertalargeatt-models/roberta_large_att_5.pth


## Deberta large cnn

In [9]:
class CNNModel(nn.Module):
    """Pretrained encoder followed by a two-layer 1-D convolutional head.

    All hidden-state tensors returned by the encoder are averaged, passed
    through two width-3 convolutions and mean-pooled to one value per input.
    """

    def __init__(self, path):
        """Build the encoder and the convolutional head.

        Args:
            path: Directory or identifier of the pretrained encoder, passed to
                ``AutoConfig`` / ``AutoModel``. The argument is honoured
                directly rather than read from a notebook-level global.
        """
        super().__init__()

        config = AutoConfig.from_pretrained(path)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       'return_dict': True})

        # The attribute keeps the name `roberta` whatever backbone is loaded:
        # renaming it would change the state_dict keys of saved checkpoints.
        self.roberta = AutoModel.from_pretrained(path, config=config)

        # padding=1 with kernel size 3 keeps the convolved length unchanged.
        self.conv1 = nn.Conv1d(config.hidden_size, 512, 3, stride=1, padding=1, dilation=1, groups=1, bias=True, padding_mode='zeros')
        self.conv2 = nn.Conv1d(512, 1, 3, stride=1, padding=1, dilation=1, groups=1, bias=True, padding_mode='zeros')

    def forward(self, input_ids, attention_mask):
        """Return one pooled value per input sequence."""
        output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hs = output.hidden_states
        # Average every returned hidden-state tensor element-wise.
        x = torch.stack(hs)
        x = torch.mean(x, 0)
        # Conv1d convolves over the last axis, so swap axes 1 and 2 first
        # (assumes x is (batch, seq, hidden) — TODO confirm).
        conv1_logits = self.conv1(x.transpose(1, 2))
        conv2_logits = self.conv2(conv1_logits)
        logits = conv2_logits.transpose(1, 2)
        # Mean over axis 1 collapses the per-position outputs.
        x = torch.mean(logits, 1)
        return x

In [10]:
# Backbone / tokenizer directory for this ensemble member.
TOKENIZER_PATH = ROBERTA_PATH = "../input/deberta/large"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
# One row of predictions per fold checkpoint; averaged after the loop.
all_predictions = np.zeros((NUM_FOLDS, len(test_df)))
test_dataset = LitDataset(test_df, tokenizer=tokenizer)
# shuffle=False keeps predictions aligned with the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=False, shuffle=False, num_workers=2)

for index in range(NUM_FOLDS):
    model_path = f"../input/largecnn/deberta_large_cnn_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = CNNModel(path=ROBERTA_PATH)
    # Deserialize onto the CPU first so checkpoints saved from a GPU also load
    # on CPU-only hosts; the model is moved to DEVICE right after.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    # Drop the model and return cached GPU blocks before loading the next one.
    del model
    gc.collect()
    torch.cuda.empty_cache()
pred2 = all_predictions.mean(axis=0)


Using ../input/largecnn/deberta_large_cnn_1.pth

Using ../input/largecnn/deberta_large_cnn_2.pth

Using ../input/largecnn/deberta_large_cnn_3.pth

Using ../input/largecnn/deberta_large_cnn_4.pth

Using ../input/largecnn/deberta_large_cnn_5.pth


## Electra large cnn

In [11]:
# Backbone / tokenizer directory for this ensemble member.
TOKENIZER_PATH = ROBERTA_PATH = "../input/electra/large-discriminator"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
# One row of predictions per fold checkpoint; averaged after the loop.
all_predictions = np.zeros((NUM_FOLDS, len(test_df)))
test_dataset = LitDataset(test_df, tokenizer=tokenizer)
# shuffle=False keeps predictions aligned with the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=False, shuffle=False, num_workers=2)

for index in range(NUM_FOLDS):
    model_path = f"../input/largecnn/electra_large_cnn_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = CNNModel(path=ROBERTA_PATH)
    # Deserialize onto the CPU first so checkpoints saved from a GPU also load
    # on CPU-only hosts; the model is moved to DEVICE right after.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    # Drop the model and return cached GPU blocks before loading the next one.
    del model
    gc.collect()
    torch.cuda.empty_cache()
pred3 = all_predictions.mean(axis=0)


Using ../input/largecnn/electra_large_cnn_1.pth

Using ../input/largecnn/electra_large_cnn_2.pth

Using ../input/largecnn/electra_large_cnn_3.pth

Using ../input/largecnn/electra_large_cnn_4.pth

Using ../input/largecnn/electra_large_cnn_5.pth


In [12]:
# Blend the three ensemble members with equal weights.
# Bracket assignment always writes a real column; attribute-style assignment
# would set a plain Python attribute instead if the column were ever missing.
submission_df["score"] = (pred1 + pred2 + pred3) / 3
print(submission_df)
submission_df.to_csv("submission.csv", index=False)

    comment_id     score
0       114890  0.000995
1       732895  0.009851
2      1139051  0.004916
3      1434512  0.001150
4      2084821  0.062548
5      2452675  0.001626
6      3206615  0.018351
7      3665348  0.018531
8      4502494  0.000115
9      4804786  0.019353
10     4974702  0.001323
11     5407955  0.006877
12     5760889  0.004328
13     6468556  0.088157
14     6545351  0.320189
