# deberta-v3-base Inference

**Train notebook** https://www.kaggle.com/gauravbrills/jigsaw-deberta-v3-base-train-model-3?scriptVersionId=80522297

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Directory that receives the artefacts written by this notebook
# (the log file and the saved tokenizer are created under it).
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Static settings shared by the tokenizer, dataset, models and loader."""

    # --- paths -----------------------------------------------------------
    # Directory holding the per-fold fine-tuned checkpoints.
    model_dir = '../input/jigsaw-deberta-v3-base-train-model-3'
    # Backbone passed to AutoTokenizer / AutoConfig / AutoModel.
    model = "../input/deberta-v3-base/deberta-v3-base"

    # --- data columns ----------------------------------------------------
    text = "text"
    target = "target"
    target_size = 1

    # --- data loader -----------------------------------------------------
    num_workers = 4
    batch_size = 128

    # --- model -----------------------------------------------------------
    fc_dropout = 0.0000001

    # --- sequence layout -------------------------------------------------
    # Over-long inputs keep their first `head` and last `tail` tokens.
    head = 32
    tail = 32
    max_len = head + tail

    # --- misc ------------------------------------------------------------
    seed = 2021
    n_fold = 5

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")
import logging
logging.basicConfig(level=logging.ERROR)
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig 
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(df):
    """Return the share of rows ranked in the expected order.

    A row counts as correct when its ``less_toxic_pred`` value is strictly
    smaller than its ``more_toxic_pred`` value. The count is divided by the
    total number of rows, so an empty frame raises ``ZeroDivisionError``.
    """
    correctly_ordered = df['less_toxic_pred'] < df['more_toxic_pred']
    return int(correctly_ordered.sum()) / len(df)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, echoing messages to the console and a file.

    Args:
        filename: Path of the log file without extension; ``.log`` is appended.

    Returns:
        The logger named ``__name__`` at INFO level, carrying exactly one
        stream handler and one file handler, both using a bare
        ``%(message)s`` format.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger() hands back the same object on every call, so calling this
    # function again (e.g. re-running the cell) would stack a second pair of
    # handlers and print every message twice. Detach and close the handlers
    # left over from earlier calls before attaching fresh ones.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=2021):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) random generators with ``seed``.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=2021)

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
# `df` holds the comments to score; `sub` is the sample submission file
# (`sub` is loaded but not referenced again in the visible cells).
df, sub = (
    pd.read_csv(f'../input/jigsaw-toxic-severity-rating/{stem}.csv')
    for stem in ('comments_to_score', 'sample_submission')
)


# tokenizer

In [None]:
# Load the tokenizer stored alongside the backbone in CFG.model.
# NOTE(review): `lowercase=True` does not look like a standard tokenizer
# option (the usual spelling is `do_lower_case`) and may be silently ignored
# — TODO confirm. Whatever it does must stay identical to the training
# notebook so that tokenization matches the fine-tuned checkpoints.
tokenizer = AutoTokenizer.from_pretrained(CFG.model, lowercase=True)
# Save a copy of the tokenizer files under OUTPUT_DIR/tokenizer/.
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# Attach the tokenizer to the config so prepare_input() can reach it as cfg.tokenizer.
CFG.tokenizer = tokenizer

# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(text, cfg):
    """Tokenize ``text`` into ``torch.long`` tensors of length ``cfg.max_len``.

    Two layouts are supported:

    * ``cfg.tail == 0``: the tokenizer itself truncates and pads the sequence
      to ``cfg.max_len``.
    * otherwise: a sequence longer than ``cfg.max_len`` keeps only its first
      ``cfg.head`` and last ``cfg.tail`` tokens; shorter sequences are
      right-padded (``pad_token_id`` for ``input_ids``, 0 for every other
      field).

    Args:
        text: The string to encode.
        cfg: Settings object exposing ``tokenizer``, ``head``, ``tail`` and
            ``max_len``. ``head + tail`` must not exceed ``max_len``.

    Returns:
        The mapping returned by ``cfg.tokenizer.encode_plus`` with every value
        replaced by a 1-D ``torch.long`` tensor.
    """
    if cfg.tail == 0:
        # `padding='max_length'` is the current spelling of the deprecated
        # `pad_to_max_length=True` flag.
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None,
                                           add_special_tokens=True,
                                           max_length=cfg.max_len,
                                           padding='max_length',
                                           truncation=True)
        for k, v in list(inputs.items()):
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = cfg.tokenizer.encode_plus(text,
                                       return_tensors=None,
                                       add_special_tokens=True,
                                       truncation=True)
    # Snapshot the items so values can be reassigned while looping.
    for k, v in list(inputs.items()):
        v = list(v)
        if len(v) > cfg.max_len:
            # Keep the beginning and the end of an over-long sequence.
            v = v[:cfg.head] + v[-cfg.tail:]
        fill_value = cfg.tokenizer.pad_token_id if k == 'input_ids' else 0
        # Integer buffer from the start: no float round-trip before the cast.
        new_v = np.full(cfg.max_len, fill_value, dtype=np.int64)
        # Use the length *after* truncation so the slice matches `v` exactly.
        new_v[:len(v)] = v
        inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Map-style dataset yielding tokenized inputs for the text column of a frame.

    The column named by ``cfg.text`` is read once at construction time and
    missing values are replaced by the literal string ``"none"``.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        text_column = df[cfg.text]
        self.text = text_column.fillna("none").values

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the output of ``prepare_input`` for the text at position ``item``."""
        return prepare_input(str(self.text[item]), self.cfg)

# Model

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of the input into a weighted sum.

    A score is produced per position by ``V(tanh(W(features)))``, the scores
    are soft-maxed along dim 1 and used to weight ``features`` before summing
    over that same dimension.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        # Attribute names W and V are part of the state_dict keys.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return ``features`` summed over dim 1 with learned soft-max weights."""
        projected = torch.tanh(self.W(features))
        weights = torch.softmax(self.V(projected), dim=1)
        return (weights * features).sum(dim=1)

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Backbone from ``cfg.model`` followed by attention pooling and a 1-unit linear layer.

    ``config_path`` and ``pretrained`` are accepted but not used: the
    configuration and the weights are always loaded from ``cfg.model``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        hidden_size = self.config.hidden_size
        # Attribute names below are part of the checkpoint's state_dict keys.
        self.head = AttentionHead(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, xb):
        """Run the backbone on the keyword inputs ``xb`` and return one value per sample."""
        # Index 0 of the backbone output — presumably the last hidden states.
        hidden_states = self.model(**xb)[0]
        pooled = self.head(hidden_states)
        return self.linear(self.dropout(pooled))

class CustomModel_Legacy(nn.Module):
    """Backbone from ``cfg.model`` with mean pooling, dropout and a linear output layer.

    ``config_path`` and ``pretrained`` are accepted but not used: the
    configuration and the weights are always loaded from ``cfg.model``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Return the first backbone output averaged over dim 1.

        No attention mask is applied to the average, so every position along
        dim 1 contributes equally.
        """
        last_hidden_states = self.model(**inputs)[0]
        return last_hidden_states.mean(dim=1)

    def forward(self, inputs):
        """Return the linear projection of the (dropped-out) pooled feature."""
        pooled = self.feature(inputs)
        return self.fc(self.fc_dropout(pooled))

# inference

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and return sigmoid outputs.

    The model is switched to eval mode and moved to ``device``; each batch is
    a mapping whose values are moved to ``device`` in place before the forward
    pass. Gradients are disabled during the forward pass.

    Returns:
        A NumPy array with the per-batch sigmoid outputs concatenated along
        the first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for inputs in progress:
        for name in list(inputs):
            inputs[name] = inputs[name].to(device)
        with torch.no_grad():
            logits = model(inputs)
        probabilities = logits.sigmoid()
        batch_outputs.append(probabilities.to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Wrap the comments to score in a dataset/loader. shuffle=False and
# drop_last=False keep one prediction per row, in the row order of `df`.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# Build the path with os.path.join so the separator between the directory and
# the file name cannot be forgotten (plain concatenation yields
# ".../jigsaw-deberta-v3-base-train-model-3config.pth").
config_path = os.path.join(CFG.model_dir, "config.pth")
predictions = []
for fold in range(CFG.n_fold):
    model = CustomModel(CFG, config_path=config_path, pretrained=False)
    # Load the checkpoint on CPU; inference_fn moves the model to `device`.
    checkpoint_path = os.path.join(CFG.model_dir, f"jigsaw_fold{fold}_best.pth")
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release the fold's model before the next one is instantiated.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

# submission

In [None]:
# Ensemble the folds: element-wise mean of the per-fold prediction arrays.
fold_average = np.mean(predictions, axis=0)
df['score'] = fold_average
df.to_csv("submission_nlp_raw.csv", index=False)

# Replace the averaged scores by their rank; ties are broken by row order.
ranked_score = df['score'].rank(method='first')
df['score'] = ranked_score
df.loc[:, ['comment_id', 'score']].to_csv("submission_ranked.csv", index=False)
df.head()
# NOTE: 0.766 with the logic above (presumably the leaderboard score).

In [None]:
# Load the annotated pairs (columns used below: worker, less_toxic, more_toxic).
df_train = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print('Dim Train :', df_train.columns)

# If a pair has been ranked by multiple workers, keep the order that is most unanimous.
# TEXT_ranked identifies a pair in its annotated order; TEXT_paire identifies
# the same pair regardless of order (smaller string first). Both keys are built
# with vectorised string operations instead of a row-wise apply.
less_text = df_train['less_toxic']
more_text = df_train['more_toxic']
df_train['TEXT_ranked'] = less_text + ':' + more_text
df_train['TEXT_paire'] = np.where(less_text <= more_text,
                                  less_text + ':' + more_text,
                                  more_text + ':' + less_text)
df_train['Count_paire_ranked'] = df_train.groupby(['TEXT_ranked'])['TEXT_ranked'].transform('count')
df_train['Count_paire'] = df_train.groupby(['TEXT_paire'])['TEXT_ranked'].transform('count')
# The string alias 'max' is used rather than the builtin `max`, whose use as a
# transform callable is deprecated in recent pandas releases.
df_train['count_max'] = df_train.groupby(['TEXT_paire'])['Count_paire_ranked'].transform('max')

# Selection: keep the majority ordering, then only pairs annotated 3 times in that order.
df_train = df_train[df_train['Count_paire_ranked'] == df_train['count_max']]
df_train = df_train[df_train['Count_paire_ranked'] == 3] # every worker agreed

# Delete duplicates
df_train = df_train.drop(columns = ['worker'])
df_train = df_train.drop_duplicates()

# Results
df_train = df_train.sort_values(by = ['TEXT_ranked'])
df_train = df_train.drop(columns = ['Count_paire', 'count_max', 'TEXT_ranked', 'TEXT_paire']).drop_duplicates()
print('Dim APRES :', df_train.shape)
df_train.head()
# Add the predicted score of each side of the pair to the validation dataset
df_train = df_train.merge(df[['text', 'score']], left_on = 'less_toxic', right_on = 'text', how = 'left').drop_duplicates()
df_train = df_train.rename(columns = {'score' : 'score_less'})
df_train = df_train.drop(columns = ['text'])
df_train = df_train.merge(df[['text', 'score']], left_on = 'more_toxic', right_on = 'text', how = 'left').drop_duplicates()
df_train = df_train.rename(columns = {'score' : 'score_more'})
df_train = df_train.drop(columns = ['text'])

# Stats
df_train.head()

In [None]:
# Stats: pairs whose "more toxic" text scored below its "less toxic" text / all pairs
inverted = df_train['score_more'] < df_train['score_less']
print(int(inverted.sum()), '/', len(df_train))

In [None]:
# Test: list the pairs scored in the wrong order, sorted by their less toxic text
wrong_order = df_train['score_more'] < df_train['score_less']
df_train.loc[wrong_order].sort_values(['less_toxic'])

In [None]:
# Correction of scores
# For a text on the "less toxic" side: the lowest score among the texts
# annotated as more toxic than it (an upper bound for its own score).
# String aliases ('min' / 'max') are used rather than the builtins, whose use
# as transform callables is deprecated in recent pandas releases.
df_train['score_max_du_less_toxic'] = df_train.groupby(['less_toxic'])['score_more'].transform('min') # min score of the more toxic texts
# For a text on the "more toxic" side: the highest score among the texts
# annotated as less toxic than it (a lower bound for its own score).
df_train['score_min_du_more_toxic'] = df_train.groupby(['more_toxic'])['score_less'].transform('max') # max score of the less toxic texts

# Join the bounds onto the comments, matching on both the text and its score
df = df.merge(df_train[['less_toxic', 'score_less', 'score_max_du_less_toxic']], left_on = ['text', 'score'], right_on = ['less_toxic', 'score_less'], how = 'left')
df = df.drop(columns = ['less_toxic', 'score_less'])
df = df.merge(df_train[['more_toxic', 'score_more', 'score_min_du_more_toxic']], left_on = ['text', 'score'], right_on = ['more_toxic', 'score_more'], how = 'left')
df = df.drop(columns = ['more_toxic', 'score_more'])

# Rename
df = df.rename(columns = {'score_max_du_less_toxic' : 'borne_max', 'score_min_du_more_toxic' : 'borne_min'}) # the score must be smaller than borne_max
df = df[['comment_id', 'text', 'score', 'borne_min', 'borne_max']].drop_duplicates()

# Preview
df.head()
# CORRECTION of the scores
def corrige(row):
    """Return the row's score adjusted to respect its lower/upper bounds.

    ``row`` must provide ``score``, ``borne_min`` (lower bound) and
    ``borne_max`` (upper bound); a missing bound is NaN and is ignored.

    * both bounds present but contradictory (upper < lower): their floored
      midpoint is returned;
    * score below the lower bound: ``borne_min + 1``;
    * score above the upper bound: ``borne_max - 1``;
    * otherwise the score is returned unchanged.
    """
    score = row['score']
    lower = row['borne_min']
    upper = row['borne_max']
    has_lower = not pd.isna(lower)
    has_upper = not pd.isna(upper)

    # Contradictory bounds: settle on the middle of the two.
    if has_lower and has_upper and upper < lower:
        return (upper + lower) // 2
    if has_lower and score < lower:
        return lower + 1
    if has_upper and score > upper:
        return upper - 1
    return score
    
# --------------------

# Apply the correction row by row and keep the rows whose score changed
df['score_corrige'] = df.apply(corrige, axis=1)
was_corrected = df['score'] != df['score_corrige']
corrections = df[was_corrected]
print("Nb of corrections : {}/{}.".format(len(corrections), len(df)))


# Show a few corrected rows
corrections[['comment_id', 'text', 'score', 'score_corrige']].head()

In [None]:
# Re-rank the corrected scores; ties are broken by row order (method='first')
df = df[['comment_id', 'text', 'score_corrige']].drop_duplicates()
reranked = df['score_corrige'].rank(method='first')
df = df.assign(score=reranked)[['comment_id', 'text', 'score']].drop_duplicates()

# Show
df.head()

In [None]:
# Keep only the two submission columns, report the shape and write the file
submission_columns = ['comment_id', 'score']
df = df.loc[:, submission_columns].drop_duplicates()
print(df.shape)
df.to_csv("submission.csv", index=False)