In [None]:
import os
import gc
import cv2
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel

# Utils
from tqdm import tqdm

# For descriptive error messages: make CUDA kernel launches synchronous so a
# device-side failure is reported at the call that triggered it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Central settings read by the dataset, model and inference cells.
CONFIG = {
    'seed': 42,
    # Directory of the pretrained checkpoint used for the tokenizer and the first encoder.
    'model_name': '../input/roberta-base',
    'test_batch_size': 64,
    # Every text is truncated / padded to this many tokens.
    'max_length': 128,
    'num_classes': 1,
    'dropout': 0.2,
    # Input width of the LayerNorm / Linear heads built by the model class.
    'output_logits': 768,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

# The tokenizer is loaded once and shared through CONFIG.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

In [None]:
# One saved checkpoint per fold (folds 0 to 4), all stored in the same input directory.
_CHECKPOINT_DIR = '../input/pytorch-w-b-jigsaw-cardiffnlp-twitter-model'
MODEL_PATHS = [f'{_CHECKPOINT_DIR}/Loss-Fold-{fold}.bin' for fold in range(5)]

In [None]:
def set_seed(seed = 42):
    """Seed every random number generator used in the notebook (REPRODUCIBILITY).

    Args:
        seed: Integer applied to NumPy, Python's ``random``, PyTorch (CPU and
            CUDA) and exported as ``PYTHONHASHSEED``.

    Also switches cuDNN to deterministic mode and turns its benchmark
    auto-tuner off.
    """
    # The seeding calls are independent of each other; apply them in one pass.
    for seeder in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # When running on the CuDNN backend, two further options must be set
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed all random number generators once, with the configured seed.
set_seed(CONFIG['seed'])

In [None]:
# Comments to score for the submission; later cells read the `text` and
# `comment_id` columns of this frame.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df.head()

In [None]:
class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a DataFrame on access.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_length: Length every encoded text is truncated / padded to.

    Each item is a dict with two ``torch.long`` tensors: ``'ids'`` (token ids)
    and ``'mask'`` (attention mask).
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Raw texts as an array, for positional lookup in __getitem__.
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        # Output key -> tokenizer field it is built from.
        fields = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            key: torch.tensor(encoded[field], dtype=torch.long)
            for key, field in fields
        }

In [None]:
# Inference loader. shuffle=False keeps the batches in the row order of `df`,
# which matters because the predictions are later assigned back to `df` by position.
test_dataset = JigsawDataset(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

In [None]:
class JigsawModel_Novel(nn.Module):
    """Single-output head on top of a pretrained transformer encoder.

    The prediction is ``fc(dropout(h))`` where ``h`` is the hidden state of the
    first token produced by ``self.distilbert``.

    ``self.bert``, ``self.layer_norm`` and ``self.dense`` do not take part in
    the forward pass. They are still created so that the module keeps the same
    parameter names -- presumably the saved checkpoints contain these keys and
    a strict ``load_state_dict`` would otherwise fail; verify before removing.

    Args:
        model_name: Path / identifier passed to ``AutoModel.from_pretrained``
            for ``self.bert``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # NOTE(review): despite the attribute name, this loads the checkpoint stored under
        # "cardiffnlptwitterrobertabaseoffensive"; the name is kept for state-dict compatibility.
        self.distilbert = AutoModel.from_pretrained("../input/cardiffnlptwitterrobertabaseoffensive")
        self.drop = nn.Dropout(p=CONFIG['dropout'])
        self.layer_norm = nn.LayerNorm(CONFIG['output_logits'])
        self.fc = nn.Linear(CONFIG['output_logits'], CONFIG['num_classes'])
        self.dense = nn.Sequential(
            nn.Linear(CONFIG['output_logits'], CONFIG['num_classes']),
        )

    def forward(self, ids, mask):
        """Return the ``fc`` output for a batch of token ids and attention masks.

        Only ``self.distilbert`` is evaluated: the original code also ran
        ``self.bert`` and discarded its result, which doubled the encoder cost
        without affecting the output.
        """
        hidden_states = self.distilbert(input_ids=ids, attention_mask=mask).last_hidden_state
        # Hidden state of the first token of every sequence.
        first_token = hidden_states[:, 0, :]
        return self.fc(self.drop(first_token))

In [None]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and collect its outputs.

    Args:
        model: Callable module taking ``(ids, mask)`` and returning a tensor.
        dataloader: Sized iterable of dicts with ``'ids'`` and ``'mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        1-D NumPy array with the flattened outputs of all batches, in iteration
        order. An empty ``float32`` array is returned when the dataloader
        yields no batch (``np.concatenate`` would raise on an empty list).
    """
    model.eval()

    batch_preds = []

    bar = tqdm(dataloader, total=len(dataloader))
    for data in bar:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        batch_preds.append(outputs.view(-1).detach().cpu().numpy())

    gc.collect()

    if not batch_preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(batch_preds)

In [None]:
def inference(model_paths, dataloader, device):
    """Average the predictions of several saved checkpoints.

    Args:
        model_paths: Paths of the state dicts to load, one per model.
        dataloader: Loader passed unchanged to ``valid_fn`` for every model.
        device: Device used for both the model and the batches.

    Returns:
        NumPy array holding the element-wise mean of the per-model predictions.

    Raises:
        ValueError: If ``model_paths`` is empty (the mean would silently be NaN).
    """
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths must contain at least one checkpoint path")

    fold_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel_Novel(CONFIG['model_name'])
        # Load on CPU first so checkpoints saved on a GPU also work on a CPU-only
        # machine, then move the model to the same device the batches are sent to.
        state_dict = torch.load(path, map_location='cpu')
        model.load_state_dict(state_dict)
        model.to(device)

        print(f"Getting predictions for model {i+1}")
        fold_preds.append(valid_fn(model, dataloader, device))

        # Release this model before the next one is built.
        del model, state_dict
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return np.mean(np.array(fold_preds), axis=0)

In [None]:
# Predictions averaged over all checkpoints listed in MODEL_PATHS.
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'])

In [None]:
# Sanity check: how many predictions were produced and how many are distinct.
print(f"Total Predictions: {preds.shape[0]}")
print(f"Total Unique Predictions: {np.unique(preds).shape[0]}")

In [None]:
# Positional assignment: relies on `preds` being in the same order as the rows of `df`.
df['score'] = preds
df.head()

In [None]:
# Replace the raw score by its ascending rank. method='first' breaks ties by
# order of appearance, so every row receives a distinct rank.
df['score'] = df['score'].rank(method='first')
df.head()

In [None]:
# Load the annotated (less_toxic, more_toxic) pairs
df_train = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print('Dim Train :', df_train.shape)

# If a pair has been ranked by several workers, keep the ordering that is the most unanimous.
#   TEXT_ranked : key of the ordered pair (less_toxic first)
#   TEXT_paire  : key of the unordered pair (smaller string first), identical for both orderings
less_text, more_text = df_train['less_toxic'], df_train['more_toxic']
is_swapped = less_text > more_text
df_train['TEXT_ranked'] = less_text + ':' + more_text
df_train['TEXT_paire'] = less_text.where(~is_swapped, more_text) + ':' + more_text.where(~is_swapped, less_text)
df_train['Count_paire_ranked'] = df_train.groupby(['TEXT_ranked'])['TEXT_ranked'].transform('count')
df_train['Count_paire'] = df_train.groupby(['TEXT_paire'])['TEXT_ranked'].transform('count')
# String alias instead of the builtin `max`: passing the builtin is deprecated in recent pandas.
df_train['count_max'] = df_train.groupby(['TEXT_paire'])['Count_paire_ranked'].transform('max')

# Selection: keep the majority ordering of each pair ...
df_train = df_train[df_train['Count_paire_ranked'] == df_train['count_max']]
# ... and only orderings seen exactly 3 times ("every worker agreed").
# NOTE(review): this is unanimity only if each pair was rated by exactly 3 workers;
# Count_paire is computed above but not used in the filter -- TODO confirm.
df_train = df_train[df_train['Count_paire_ranked'] == 3]

# Delete duplicates (rows differing only by worker)
df_train = df_train.drop(columns = ['worker'])
df_train = df_train.drop_duplicates()

# Results
df_train = df_train.sort_values(by = ['TEXT_ranked'])
df_train = df_train.drop(columns = ['Count_paire', 'count_max', 'TEXT_ranked', 'TEXT_paire']).drop_duplicates()
print('Dim APRES :', df_train.shape)
df_train.head()

In [None]:
# Add the current score of both texts of every validation pair.
# Left joins: a text that is not present in `df` gets a NaN score.
df_train = df_train.merge(df[['text', 'score']], left_on = 'less_toxic', right_on = 'text', how = 'left').drop_duplicates()
df_train = df_train.rename(columns = {'score' : 'score_less'})
df_train = df_train.drop(columns = ['text'])
df_train = df_train.merge(df[['text', 'score']], left_on = 'more_toxic', right_on = 'text', how = 'left').drop_duplicates()
df_train = df_train.rename(columns = {'score' : 'score_more'})
df_train = df_train.drop(columns = ['text'])

# Stats: number of pairs whose scores contradict the annotated order.
# (The two bare expressions that used to surround this print were not the last
# statement of the cell, so they displayed nothing; they have been removed.)
print(len(df_train[df_train['score_more'] < df_train['score_less']]), '/', len(df_train))

# Correction of scores.
# String aliases instead of the builtins `min` / `max`: passing the builtins is deprecated in recent pandas.
# Upper bound of a less_toxic text: smallest score among the texts annotated as more toxic than it.
df_train['score_max_du_less_toxic'] = df_train.groupby(['less_toxic'])['score_more'].transform('min')
# Lower bound of a more_toxic text: largest score among the texts annotated as less toxic than it.
df_train['score_min_du_more_toxic'] = df_train.groupby(['more_toxic'])['score_less'].transform('max')

# Join the bounds back onto the comments to score
df = df.merge(df_train[['less_toxic', 'score_less', 'score_max_du_less_toxic']], left_on = ['text', 'score'], right_on = ['less_toxic', 'score_less'], how = 'left')
df = df.drop(columns = ['less_toxic', 'score_less'])
df = df.merge(df_train[['more_toxic', 'score_more', 'score_min_du_more_toxic']], left_on = ['text', 'score'], right_on = ['more_toxic', 'score_more'], how = 'left')
df = df.drop(columns = ['more_toxic', 'score_more'])

# Rename: the score is expected to be smaller than borne_max and larger than borne_min
df = df.rename(columns = {'score_max_du_less_toxic' : 'borne_max', 'score_min_du_more_toxic' : 'borne_min'})
# The joins above can repeat a comment once per matching pair; collapse the repeats.
df = df[['comment_id', 'text', 'score', 'borne_min', 'borne_max']].drop_duplicates()

# Preview
df.head()

In [None]:
# CORRECTION of the scores
def corrige(row) :
    """Return the score of ``row`` adjusted to its bounds.

    Args:
        row: Mapping with the keys ``'score'``, ``'borne_min'`` and
            ``'borne_max'``; a missing bound is NaN.

    Returns:
        * ``(borne_max + borne_min) // 2`` when both bounds exist and are
          inconsistent (``borne_max < borne_min``);
        * ``borne_min + 1`` when the score is below an existing lower bound;
        * ``borne_max - 1`` when the score is above an existing upper bound;
        * the unchanged score otherwise.
    """
    score = row['score']
    lower, upper = row['borne_min'], row['borne_max']
    has_lower = not pd.isna(lower)
    has_upper = not pd.isna(upper)

    # Contradictory bounds: settle on their (floored) midpoint.
    if has_lower and has_upper and upper < lower :
        return (upper + lower) // 2

    # The lower bound is checked before the upper one.
    if has_lower and score < lower :
        return lower + 1
    if has_upper and score > upper :
        return upper - 1

    return score
    
# --------------------

# Apply the correction to every row and keep the rows whose score changed
df['score_corrige'] = df.apply(corrige, axis=1)
corrections = df.loc[df['score'] != df['score_corrige']]
print("Nb of corrections : {}/{}.".format(len(corrections), len(df)))


# Show the corrected rows
corrections[['comment_id', 'text', 'score', 'score_corrige']]

In [None]:
# Re-rank after the correction: corrected values can collide with existing
# scores, and method='first' breaks such ties by row order so the final
# scores are distinct again.
df = df[['comment_id', 'text', 'score_corrige']].drop_duplicates()
df['score'] = df['score_corrige'].rank(method='first')
df = df[['comment_id', 'text', 'score']].drop_duplicates()

# Show
df.head()

In [None]:
# Keep only the two submission columns and write them without the index.
df = df[['comment_id', 'score']].drop_duplicates()
print(df.shape)
df.to_csv("submission.csv", index=False)