As a baseline solution we used this public [notebook](https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train)

In [None]:
# Uninstall allennlp so that the remaining dependencies resolve correctly on Kaggle
!pip uninstall allennlp -y

Found existing installation: allennlp 2.10.1
Uninstalling allennlp-2.10.1:
  Successfully uninstalled allennlp-2.10.1
[0m

### Global settings

In [None]:
import os

# Directory that receives every artefact written by this notebook
# (tokenizer files, model config, fold checkpoints, out-of-fold predictions).
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when it is missing, without the
# separate existence check that could race with another process.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CFG

Main points concerning the architecture:
* We use the pretrained weights of the deberta-v3-base model
* We fine-tune the last linear layer with respect to the 6 target metrics
* Training is done via 4-fold cross-validation
* The final score is averaged over the models from each fold

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Static configuration namespace shared by every cell of the notebook.

    The attributes are read directly from the class (``CFG.batch_size`` ...);
    the class is never instantiated in this notebook. Later cells also attach
    extra attributes to it (for example ``CFG.tokenizer``).
    """

    # --- run / logging switches ---
    wandb = False
    competition = 'FB3'
    _wandb_kernel = 'wallykop'
    debug = False
    apex = True
    print_freq = 20
    num_workers = 4  # passed to both DataLoaders

    # --- backbone ---
    model = "/kaggle/input/debertav3base/"  # local directory with the pretrained weights
    gradient_checkpointing = True

    # --- learning-rate schedule ---
    scheduler = 'cosine'  # one of: 'linear', 'cosine'
    batch_scheduler = True
    num_cycles = 0.5  # forwarded to the cosine schedule only
    num_warmup_steps = 0

    # --- optimisation ---
    epochs = 5
    encoder_lr = 2e-5  # learning rate of the backbone parameter groups
    decoder_lr = 2e-5  # learning rate of the remaining (head) parameters
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 8  # training batch size; validation uses twice this value
    max_len = 512  # placeholder: overwritten once the training texts are tokenised
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # --- data / cross-validation ---
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed = 2807
    n_fold = 4
    trn_fold = [0, 1, 2, 3]  # folds that are actually trained
    train = True  # gates the training driver cell

### Imports

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# Install iterative-stratification offline (--no-index) from the wheel files in
# ../input/fb3-pip-wheels. The exit status of os.system is not checked, so a
# failed install only shows up as an ImportError on the import that follows.
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Replace the preinstalled transformers/tokenizers packages with the versions
# shipped as wheels in ../input/fb3-pip-wheels (offline install, --no-index).
# The exit statuses are not checked; the version prints below show what was
# actually picked up.
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Looking in links: ../input/fb3-pip-wheels
Processing /kaggle/input/fb3-pip-wheels/iterative_stratification-0.1.7-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7




Found existing installation: transformers 4.20.1
Uninstalling transformers-4.20.1:
  Successfully uninstalled transformers-4.20.1




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/fb3-pip-wheels
Processing /kaggle/input/fb3-pip-wheels/transformers-4.21.2-py3-none-any.whl
Processing /kaggle/input/fb3-pip-wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.12.1 transformers-4.21.2




Looking in links: ../input/fb3-pip-wheels




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


## Definition of utils functions

In [None]:
from utils import (
    MCRMSE,
    get_score,
    get_logger,
    seed_everything
)

# Shared logger used by every later cell.
LOGGER = get_logger()
# Seed from the central config rather than a repeated literal, so the value
# cannot drift from the one used for the CV split (random_state=CFG.seed).
seed_everything(seed=CFG.seed)

# Data load

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Competition data directory, spelled once instead of once per file.
INPUT_DIR = '../input/feedback-prize-english-language-learning/'

train = pd.read_csv(INPUT_DIR + 'train.csv')
test = pd.read_csv(INPUT_DIR + 'test.csv')
submission = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

# Show the shape and the first rows of every frame, in the same order and with
# the same text as before.
for _name, _frame in (('train', train), ('test', test), ('submission', submission)):
    print(f"{_name}.shape: {_frame.shape}")
    display(_frame.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


# CV split

In [None]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# split() yields positional row indices, so the fold ids are collected in a
# positional array. This avoids label-based .loc (which is only correct for a
# default RangeIndex) and the float/NaN intermediate column.
fold_ids = np.full(len(train), -1, dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    fold_ids[val_index] = n
# Every row must land in exactly one validation fold.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
train['fold'] = fold_ids
display(train.groupby('fold').size())

fold
0    977
1    978
2    978
3    978
dtype: int64

# Deberta tokenizer load

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer from the local model directory only (local_files_only=True),
# store a copy next to the other outputs, and expose it through the shared
# config object.
tokenizer_dir = OUTPUT_DIR + 'tokenizer/'
tokenizer = AutoTokenizer.from_pretrained(CFG.model, local_files_only=True)
tokenizer.save_pretrained(tokenizer_dir)
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every training essay, without special tokens; missing texts
# are treated as empty strings.
texts = train['full_text'].fillna("").values
lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tqdm(texts, total=len(train))
]
# Longest essay plus head-room for the special tokens.
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1429


In [None]:
# ====================================================
# Dataset
# ====================================================
from dataset import (
    prepare_input,
    TrainDataset,
    collate
)

# Model import

In [None]:
# ====================================================
# Model
# ====================================================
from model import (
    MeanPooling,
    CustomModel
)

In [None]:
# ====================================================
# Loss
# ====================================================
from model import RMSELoss

# ====================================================
# Helper functions
# ====================================================
from model import AverageMeter
from utils import asMinutes, timeSince
from main_functions import train_fn, valid_fn

# train loop

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate the model on a single cross-validation fold.

    Rows of ``folds`` whose ``fold`` column equals ``fold`` form the validation
    split; all other rows are used for training. After every epoch the score
    returned by ``get_score`` is compared with the best one so far; on
    improvement the model weights and the validation predictions are written
    to a checkpoint in ``OUTPUT_DIR``.

    Args:
        folds: DataFrame containing a ``fold`` column and the columns listed in
            ``CFG.target_cols``.
        fold: Identifier of the fold held out for validation.

    Returns:
        The validation rows (index reset) extended with one ``pred_<target>``
        column per entry of ``CFG.target_cols``, holding the predictions stored
        in the best checkpoint of this fold.

    Raises:
        ValueError: If ``CFG.scheduler`` is neither ``'linear'`` nor ``'cosine'``.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    # drop_last=True: an incomplete final training batch is discarded.
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build the AdamW parameter groups.

        Parameters of ``model.model`` (the backbone) use ``encoder_lr``; those
        whose name contains a ``no_decay`` entry get no weight decay. Every
        other parameter (name without ``"model"``) uses ``decoder_lr`` and no
        weight decay.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the learning-rate scheduler selected by ``cfg.scheduler``.

        Raises:
            ValueError: For any value other than ``'linear'`` or ``'cosine'``
                (previously this surfaced as an unbound-variable error).
        """
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        raise ValueError(f"Unknown scheduler: {cfg.scheduler!r}; expected 'linear' or 'cosine'")

    # The loader drops the incomplete last batch, so the number of batches per
    # epoch is len(train_loader), not len(train_folds) / batch_size. Using the
    # loader length keeps the schedule aligned with the steps actually taken.
    # NOTE(review): gradient accumulation is not accounted for here, as before;
    # confirm against train_fn if gradient_accumulation_steps is raised above 1.
    num_train_steps = len(train_loader) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")

    # Checkpoint of the best epoch of this fold (written and read back below).
    best_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')

        # Lower is better: keep the checkpoint of the lowest-scoring epoch.
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # Read back the predictions stored with the best checkpoint.
    predictions = torch.load(best_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds


def get_result(oof_df):
    """Log the score of out-of-fold predictions.

    Args:
        oof_df: DataFrame holding the ``CFG.target_cols`` label columns and the
            matching ``pred_<target>`` prediction columns.
    """
    labels = oof_df[CFG.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

In [None]:
if __name__ == '__main__':
    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead
        # of re-concatenating a growing DataFrame on every iteration.
        oof_parts = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(train, fold)
            oof_parts.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
        # With no trained fold the frame stays empty, as it did before.
        oof_df = pd.concat(oof_parts).reset_index(drop=True) if oof_parts else pd.DataFrame()
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        

DebertaV2Config {
  "_name_or_path": "/kaggle/input/debertav3base/",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at /kaggle/input/debertav3base/ were not used when initializing DebertaV2

Epoch: [1][0/366] Elapsed 0m 2s (remain 17m 14s) Loss: 2.4604(2.4604) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 44s (remain 12m 3s) Loss: 0.2929(1.4000) Grad: 132482.9688  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 15s (remain 9m 58s) Loss: 0.1745(0.8305) Grad: 94354.8281  LR: 0.00001998  
Epoch: [1][60/366] Elapsed 1m 54s (remain 9m 33s) Loss: 0.1684(0.6046) Grad: 65162.9609  LR: 0.00001995  
Epoch: [1][80/366] Elapsed 2m 35s (remain 9m 7s) Loss: 0.1224(0.4876) Grad: 82596.6719  LR: 0.00001990  
Epoch: [1][100/366] Elapsed 3m 12s (remain 8m 26s) Loss: 0.1240(0.4257) Grad: 76056.7812  LR: 0.00001985  
Epoch: [1][120/366] Elapsed 3m 51s (remain 7m 48s) Loss: 0.1067(0.3777) Grad: 96725.1562  LR: 0.00001979  
Epoch: [1][140/366] Elapsed 4m 30s (remain 7m 12s) Loss: 0.1338(0.3427) Grad: 66807.3359  LR: 0.00001971  
Epoch: [1][160/366] Elapsed 5m 6s (remain 6m 30s) Loss: 0.2027(0.3150) Grad: 127323.6328  LR: 0.00001962  
Epoch: [1][180/366] Elapsed 5m 39s (remain 5m 47s)

Epoch 1 - avg_train_loss: 0.2093  avg_val_loss: 0.1172  time: 739s
Epoch 1 - Score: 0.4852  Scores: [0.5252723293837455, 0.4603949603507831, 0.4324231774772289, 0.46372561913896837, 0.49361884898336805, 0.5359030621072157]
Epoch 1 - Save Best Score: 0.4852 Model


EVAL: [60/62] Elapsed 1m 10s (remain 0m 1s) Loss: 0.0959(0.1172) 
EVAL: [61/62] Elapsed 1m 10s (remain 0m 0s) Loss: 0.1394(0.1172) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 9m 16s) Loss: 0.0993(0.0993) Grad: 132664.5781  LR: 0.00001809  
Epoch: [2][20/366] Elapsed 0m 43s (remain 11m 52s) Loss: 0.1007(0.1084) Grad: 105600.1641  LR: 0.00001788  
Epoch: [2][40/366] Elapsed 1m 22s (remain 10m 52s) Loss: 0.0884(0.1115) Grad: 49259.2461  LR: 0.00001766  
Epoch: [2][60/366] Elapsed 1m 52s (remain 9m 20s) Loss: 0.0680(0.1073) Grad: 108320.7578  LR: 0.00001744  
Epoch: [2][80/366] Elapsed 2m 26s (remain 8m 35s) Loss: 0.0809(0.1044) Grad: 57301.0391  LR: 0.00001721  
Epoch: [2][100/366] Elapsed 3m 4s (remain 8m 3s) Loss: 0.0765(0.1022) Grad: 109719.0078  LR: 0.00001696  
Epoch: [2][120/366] Elapsed 3m 40s (remain 7m 26s) Loss: 0.0681(0.1022) Grad: 94761.5312  LR: 0.00001671  
Epoch: [2][140/366] Elapsed 4m 17s (remain 6m 51s) Loss: 0.0658(0.1032) Grad: 120961.5547  LR: 0.00001646  
Epoch: [2][160

Epoch 2 - avg_train_loss: 0.1031  avg_val_loss: 0.1122  time: 741s
Epoch 2 - Score: 0.4748  Scores: [0.518505217112889, 0.44943228475971403, 0.4412834160887432, 0.4675421449245012, 0.48459885522931184, 0.48719429333924563]
Epoch 2 - Save Best Score: 0.4748 Model


EVAL: [60/62] Elapsed 1m 9s (remain 0m 1s) Loss: 0.1032(0.1120) 
EVAL: [61/62] Elapsed 1m 9s (remain 0m 0s) Loss: 0.3117(0.1122) 
Epoch: [3][0/366] Elapsed 0m 4s (remain 24m 24s) Loss: 0.0724(0.0724) Grad: 146796.7969  LR: 0.00001309  
Epoch: [3][20/366] Elapsed 0m 37s (remain 10m 19s) Loss: 0.0901(0.0947) Grad: 205534.8281  LR: 0.00001277  
Epoch: [3][40/366] Elapsed 1m 14s (remain 9m 51s) Loss: 0.1641(0.1011) Grad: 198201.6719  LR: 0.00001243  
Epoch: [3][60/366] Elapsed 1m 50s (remain 9m 11s) Loss: 0.1044(0.0983) Grad: 228772.2500  LR: 0.00001210  
Epoch: [3][80/366] Elapsed 2m 28s (remain 8m 41s) Loss: 0.0654(0.0954) Grad: 117233.8281  LR: 0.00001176  
Epoch: [3][100/366] Elapsed 3m 2s (remain 7m 59s) Loss: 0.0863(0.0941) Grad: 136152.3594  LR: 0.00001143  
Epoch: [3][120/366] Elapsed 3m 39s (remain 7m 23s) Loss: 0.1123(0.0957) Grad: 353823.6250  LR: 0.00001109  
Epoch: [3][140/366] Elapsed 4m 15s (remain 6m 47s) Loss: 0.1469(0.0948) Grad: 308057.7500  LR: 0.00001074  
Epoch: [3][1

Epoch 3 - avg_train_loss: 0.0932  avg_val_loss: 0.1026  time: 737s
Epoch 3 - Score: 0.4535  Scores: [0.48117085292346845, 0.44260119924765545, 0.40730468448399076, 0.45160246969464946, 0.48049994252348677, 0.45778602639632077]
Epoch 3 - Save Best Score: 0.4535 Model


EVAL: [60/62] Elapsed 1m 9s (remain 0m 1s) Loss: 0.1025(0.1025) 
EVAL: [61/62] Elapsed 1m 9s (remain 0m 0s) Loss: 0.1667(0.1026) 
Epoch: [4][0/366] Elapsed 0m 2s (remain 12m 50s) Loss: 0.1310(0.1310) Grad: 241771.5156  LR: 0.00000692  
Epoch: [4][20/366] Elapsed 0m 41s (remain 11m 23s) Loss: 0.0932(0.0920) Grad: 175292.1875  LR: 0.00000660  
Epoch: [4][40/366] Elapsed 1m 16s (remain 10m 3s) Loss: 0.0756(0.0874) Grad: 190115.8750  LR: 0.00000628  
Epoch: [4][60/366] Elapsed 1m 49s (remain 9m 8s) Loss: 0.0653(0.0880) Grad: 150851.5781  LR: 0.00000596  
Epoch: [4][80/366] Elapsed 2m 30s (remain 8m 49s) Loss: 0.0618(0.0877) Grad: 93135.4688  LR: 0.00000565  
Epoch: [4][100/366] Elapsed 3m 4s (remain 8m 4s) Loss: 0.1111(0.0900) Grad: 122823.4453  LR: 0.00000535  
Epoch: [4][120/366] Elapsed 3m 46s (remain 7m 38s) Loss: 0.0875(0.0898) Grad: 130664.5625  LR: 0.00000504  
Epoch: [4][140/366] Elapsed 4m 18s (remain 6m 51s) Loss: 0.0651(0.0890) Grad: 157491.5781  LR: 0.00000475  
Epoch: [4][160/

Epoch 4 - avg_train_loss: 0.0875  avg_val_loss: 0.1054  time: 738s
Epoch 4 - Score: 0.4600  Scores: [0.4930611698468935, 0.4438371756865151, 0.40814314134460855, 0.46035874571361546, 0.4876696564448261, 0.4668620834309125]


EVAL: [60/62] Elapsed 1m 9s (remain 0m 1s) Loss: 0.1051(0.1054) 
EVAL: [61/62] Elapsed 1m 9s (remain 0m 0s) Loss: 0.1193(0.1054) 
Epoch: [5][0/366] Elapsed 0m 2s (remain 15m 19s) Loss: 0.0915(0.0915) Grad: 211130.7188  LR: 0.00000192  
Epoch: [5][20/366] Elapsed 0m 30s (remain 8m 28s) Loss: 0.0943(0.0835) Grad: 214553.3125  LR: 0.00000173  
Epoch: [5][40/366] Elapsed 1m 11s (remain 9m 26s) Loss: 0.0784(0.0871) Grad: 200855.9531  LR: 0.00000154  
Epoch: [5][60/366] Elapsed 1m 50s (remain 9m 10s) Loss: 0.0721(0.0872) Grad: 157207.3281  LR: 0.00000136  
Epoch: [5][80/366] Elapsed 2m 28s (remain 8m 43s) Loss: 0.0822(0.0873) Grad: 172869.8125  LR: 0.00000119  
Epoch: [5][100/366] Elapsed 3m 2s (remain 7m 59s) Loss: 0.0680(0.0871) Grad: 126036.0391  LR: 0.00000104  
Epoch: [5][120/366] Elapsed 3m 40s (remain 7m 26s) Loss: 0.0623(0.0853) Grad: 96597.7812  LR: 0.00000089  
Epoch: [5][140/366] Elapsed 4m 12s (remain 6m 42s) Loss: 0.1045(0.0845) Grad: 149400.2969  LR: 0.00000075  
Epoch: [5][160

Epoch 5 - avg_train_loss: 0.0846  avg_val_loss: 0.1028  time: 740s
Epoch 5 - Score: 0.4541  Scores: [0.48104976580720415, 0.44073006876148846, 0.4084444241003377, 0.45126805628252614, 0.481057429026221, 0.4619987114760102]
Score: 0.4535  Scores: [0.48117085292346845, 0.44260119924765545, 0.40730468448399076, 0.45160246969464946, 0.48049994252348677, 0.45778602639632077]
DebertaV2Config {
  "_name_or_path": "/kaggle/input/debertav3base/",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidd

Epoch: [1][0/366] Elapsed 0m 2s (remain 15m 45s) Loss: 2.6186(2.6186) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 40s (remain 11m 4s) Loss: 0.1078(1.3688) Grad: 70585.6953  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 14s (remain 9m 51s) Loss: 0.1497(0.7886) Grad: 127961.9297  LR: 0.00001998  
Epoch: [1][60/366] Elapsed 1m 52s (remain 9m 22s) Loss: 0.1642(0.5829) Grad: 60625.3242  LR: 0.00001995  
Epoch: [1][80/366] Elapsed 2m 30s (remain 8m 49s) Loss: 0.1139(0.4770) Grad: 49952.0391  LR: 0.00001990  
Epoch: [1][100/366] Elapsed 3m 3s (remain 8m 0s) Loss: 0.1660(0.4106) Grad: 198367.7656  LR: 0.00001985  
Epoch: [1][120/366] Elapsed 3m 46s (remain 7m 38s) Loss: 0.2348(0.3670) Grad: 178132.9062  LR: 0.00001979  
Epoch: [1][140/366] Elapsed 4m 22s (remain 6m 58s) Loss: 0.1111(0.3334) Grad: 72750.3594  LR: 0.00001971  
Epoch: [1][160/366] Elapsed 5m 0s (remain 6m 22s) Loss: 0.1064(0.3071) Grad: 104039.9141  LR: 0.00001962  
Epoch: [1][180/366] Elapsed 5m 33s (remain 5m 41s

Epoch 1 - avg_train_loss: 0.2067  avg_val_loss: 0.1212  time: 740s
Epoch 1 - Score: 0.4925  Scores: [0.5950716692156336, 0.46834352305365384, 0.42985427299761353, 0.5003445408237472, 0.4875930877730655, 0.47361892520254006]
Epoch 1 - Save Best Score: 0.4925 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 9m 42s) Loss: 0.1290(0.1290) Grad: inf  LR: 0.00001809  
Epoch: [2][20/366] Elapsed 0m 38s (remain 10m 36s) Loss: 0.1628(0.1114) Grad: 225714.2031  LR: 0.00001788  
Epoch: [2][40/366] Elapsed 1m 16s (remain 10m 6s) Loss: 0.1559(0.1095) Grad: 184036.1719  LR: 0.00001766  
Epoch: [2][60/366] Elapsed 1m 49s (remain 9m 7s) Loss: 0.1133(0.1090) Grad: 144973.7656  LR: 0.00001744  
Epoch: [2][80/366] Elapsed 2m 30s (remain 8m 49s) Loss: 0.1828(0.1137) Grad: 143772.7500  LR: 0.00001721  
Epoch: [2][100/366] Elapsed 3m 7s (remain 8m 12s) Loss: 0.1088(0.1104) Grad: 97082.3828  LR: 0.00001696  
Epoch: [2][120/366] Elapsed 3m 42s (remain 7m 30s) Loss: 0.0991(0.1112) Grad: 71216.5234  LR: 0.00001671  
Epoch: [2][140/366] Elapsed 4m 20s (remain 6m 54s) Loss: 0.1364(0.1111) Grad: 216348.1719  LR: 0.00001646  
Epoch: [2][160/366] Elapsed 4m 57s (remain 6m 18s) Loss: 0.1137(0.1111) Grad: 146086.0625  LR: 0.00001619  
Epoch: [2][180/366] Elapsed 5m 33s (remain 5m 

Epoch 2 - avg_train_loss: 0.1061  avg_val_loss: 0.1103  time: 744s
Epoch 2 - Score: 0.4710  Scores: [0.5063506726500945, 0.44864769570711227, 0.4479158418230321, 0.4761561334626112, 0.4956693222657946, 0.4509865635569898]
Epoch 2 - Save Best Score: 0.4710 Model


Epoch: [3][0/366] Elapsed 0m 1s (remain 11m 55s) Loss: 0.0901(0.0901) Grad: 243087.0312  LR: 0.00001309  
Epoch: [3][20/366] Elapsed 0m 42s (remain 11m 30s) Loss: 0.1048(0.0949) Grad: 249317.8750  LR: 0.00001277  
Epoch: [3][40/366] Elapsed 1m 16s (remain 10m 3s) Loss: 0.1124(0.0928) Grad: 210587.7812  LR: 0.00001243  
Epoch: [3][60/366] Elapsed 1m 49s (remain 9m 8s) Loss: 0.0316(0.0895) Grad: 135745.7969  LR: 0.00001210  
Epoch: [3][80/366] Elapsed 2m 28s (remain 8m 41s) Loss: 0.0763(0.0904) Grad: 156789.4219  LR: 0.00001176  
Epoch: [3][100/366] Elapsed 3m 1s (remain 7m 56s) Loss: 0.0780(0.0919) Grad: 193278.4844  LR: 0.00001143  
Epoch: [3][120/366] Elapsed 3m 39s (remain 7m 23s) Loss: 0.1172(0.0925) Grad: 80283.8906  LR: 0.00001109  
Epoch: [3][140/366] Elapsed 4m 11s (remain 6m 40s) Loss: 0.1601(0.0927) Grad: 86895.2734  LR: 0.00001074  
Epoch: [3][160/366] Elapsed 4m 47s (remain 6m 6s) Loss: 0.1325(0.0943) Grad: 177370.5781  LR: 0.00001040  
Epoch: [3][180/366] Elapsed 5m 23s (re

Epoch 3 - avg_train_loss: 0.0957  avg_val_loss: 0.1096  time: 738s
Epoch 3 - Score: 0.4694  Scores: [0.5075549153818055, 0.4630397613660995, 0.42907840327263336, 0.4684789961711547, 0.4864373348060382, 0.46177043023965214]
Epoch 3 - Save Best Score: 0.4694 Model


Epoch: [4][0/366] Elapsed 0m 2s (remain 15m 17s) Loss: 0.1174(0.1174) Grad: 132398.0781  LR: 0.00000692  
Epoch: [4][20/366] Elapsed 0m 40s (remain 10m 59s) Loss: 0.0880(0.0868) Grad: 222821.0781  LR: 0.00000660  
Epoch: [4][40/366] Elapsed 1m 10s (remain 9m 21s) Loss: 0.0528(0.0896) Grad: 74502.5703  LR: 0.00000628  
Epoch: [4][60/366] Elapsed 1m 53s (remain 9m 27s) Loss: 0.0965(0.0926) Grad: 75628.3203  LR: 0.00000596  
Epoch: [4][80/366] Elapsed 2m 37s (remain 9m 14s) Loss: 0.0728(0.0913) Grad: 75084.0000  LR: 0.00000565  
Epoch: [4][100/366] Elapsed 3m 17s (remain 8m 37s) Loss: 0.0736(0.0906) Grad: 110710.2109  LR: 0.00000535  
Epoch: [4][120/366] Elapsed 3m 49s (remain 7m 44s) Loss: 0.1194(0.0918) Grad: 83585.7812  LR: 0.00000504  
Epoch: [4][140/366] Elapsed 4m 25s (remain 7m 4s) Loss: 0.0839(0.0903) Grad: 119024.8984  LR: 0.00000475  
Epoch: [4][160/366] Elapsed 4m 59s (remain 6m 21s) Loss: 0.0911(0.0903) Grad: 122904.0625  LR: 0.00000446  
Epoch: [4][180/366] Elapsed 5m 33s (re

Epoch 4 - avg_train_loss: 0.0893  avg_val_loss: 0.1078  time: 737s
Epoch 4 - Score: 0.4654  Scores: [0.5067774834391483, 0.45344570921870164, 0.4292903845739587, 0.47269260543314656, 0.4807932095865946, 0.44956432662885254]
Epoch 4 - Save Best Score: 0.4654 Model


Epoch: [5][0/366] Elapsed 0m 3s (remain 20m 18s) Loss: 0.0738(0.0738) Grad: 201444.0625  LR: 0.00000192  
Epoch: [5][20/366] Elapsed 0m 41s (remain 11m 18s) Loss: 0.0689(0.0875) Grad: 240021.3438  LR: 0.00000173  
Epoch: [5][40/366] Elapsed 1m 21s (remain 10m 43s) Loss: 0.0670(0.0843) Grad: 109692.1719  LR: 0.00000154  
Epoch: [5][60/366] Elapsed 1m 57s (remain 9m 49s) Loss: 0.0966(0.0852) Grad: 184033.2031  LR: 0.00000136  
Epoch: [5][80/366] Elapsed 2m 37s (remain 9m 13s) Loss: 0.0761(0.0853) Grad: 99488.9844  LR: 0.00000119  
Epoch: [5][100/366] Elapsed 3m 13s (remain 8m 26s) Loss: 0.1189(0.0863) Grad: 61058.0039  LR: 0.00000104  
Epoch: [5][120/366] Elapsed 3m 40s (remain 7m 26s) Loss: 0.0704(0.0858) Grad: 75297.6094  LR: 0.00000089  
Epoch: [5][140/366] Elapsed 4m 17s (remain 6m 50s) Loss: 0.0584(0.0848) Grad: 61393.8789  LR: 0.00000075  
Epoch: [5][160/366] Elapsed 4m 50s (remain 6m 10s) Loss: 0.0621(0.0845) Grad: 56876.7500  LR: 0.00000063  
Epoch: [5][180/366] Elapsed 5m 27s (r

Epoch 5 - avg_train_loss: 0.0845  avg_val_loss: 0.1073  time: 733s
Epoch 5 - Score: 0.4643  Scores: [0.5033233474032056, 0.4517743618555293, 0.428225181043949, 0.47143894696864036, 0.4820258599401173, 0.4488555485638754]
Epoch 5 - Save Best Score: 0.4643 Model
Score: 0.4643  Scores: [0.5033233474032056, 0.4517743618555293, 0.428225181043949, 0.47143894696864036, 0.4820258599401173, 0.4488555485638754]
DebertaV2Config {
  "_name_or_path": "/kaggle/input/debertav3base/",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hid

Epoch: [1][0/366] Elapsed 0m 2s (remain 14m 22s) Loss: 2.1853(2.1853) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 40s (remain 10m 57s) Loss: 0.2192(1.1833) Grad: 126762.8594  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 16s (remain 10m 6s) Loss: 0.3823(0.7148) Grad: 186256.7969  LR: 0.00001998  
Epoch: [1][60/366] Elapsed 1m 52s (remain 9m 24s) Loss: 0.2492(0.5455) Grad: 193189.8438  LR: 0.00001995  
Epoch: [1][80/366] Elapsed 2m 33s (remain 8m 59s) Loss: 0.1332(0.4476) Grad: 77358.5859  LR: 0.00001990  
Epoch: [1][100/366] Elapsed 3m 7s (remain 8m 11s) Loss: 0.1214(0.3868) Grad: 117346.1484  LR: 0.00001985  
Epoch: [1][120/366] Elapsed 3m 50s (remain 7m 46s) Loss: 0.1222(0.3447) Grad: 57240.1250  LR: 0.00001979  
Epoch: [1][140/366] Elapsed 4m 25s (remain 7m 2s) Loss: 0.1489(0.3131) Grad: 117089.4375  LR: 0.00001971  
Epoch: [1][160/366] Elapsed 4m 58s (remain 6m 19s) Loss: 0.1628(0.2916) Grad: 150434.7500  LR: 0.00001962  
Epoch: [1][180/366] Elapsed 5m 29s (remain 5m

Epoch 1 - avg_train_loss: 0.1990  avg_val_loss: 0.1326  time: 743s
Epoch 1 - Score: 0.5183  Scores: [0.5227589217268973, 0.5270870068347155, 0.51705989920806, 0.5435950290674316, 0.5169029096645373, 0.48221947016704536]
Epoch 1 - Save Best Score: 0.5183 Model


Epoch: [2][0/366] Elapsed 0m 2s (remain 17m 33s) Loss: 0.1519(0.1519) Grad: inf  LR: 0.00001809  
Epoch: [2][20/366] Elapsed 0m 39s (remain 10m 45s) Loss: 0.1144(0.1079) Grad: 135854.0000  LR: 0.00001788  
Epoch: [2][40/366] Elapsed 1m 14s (remain 9m 54s) Loss: 0.1143(0.1069) Grad: 200769.7188  LR: 0.00001766  
Epoch: [2][60/366] Elapsed 1m 49s (remain 9m 8s) Loss: 0.1214(0.1096) Grad: 153768.4375  LR: 0.00001744  
Epoch: [2][80/366] Elapsed 2m 27s (remain 8m 37s) Loss: 0.1472(0.1084) Grad: 149613.8438  LR: 0.00001721  
Epoch: [2][100/366] Elapsed 3m 0s (remain 7m 52s) Loss: 0.1255(0.1067) Grad: 72942.4453  LR: 0.00001696  
Epoch: [2][120/366] Elapsed 3m 38s (remain 7m 22s) Loss: 0.0993(0.1070) Grad: 158477.7812  LR: 0.00001671  
Epoch: [2][140/366] Elapsed 4m 14s (remain 6m 46s) Loss: 0.1489(0.1079) Grad: 106016.4766  LR: 0.00001646  
Epoch: [2][160/366] Elapsed 4m 50s (remain 6m 10s) Loss: 0.0562(0.1075) Grad: 126872.9141  LR: 0.00001619  
Epoch: [2][180/366] Elapsed 5m 31s (remain 5

Epoch 2 - avg_train_loss: 0.1069  avg_val_loss: 0.1159  time: 741s
Epoch 2 - Score: 0.4828  Scores: [0.5041247069781106, 0.487657123107037, 0.4357074165377607, 0.4739384958248586, 0.5266730666650974, 0.4684065638340271]
Epoch 2 - Save Best Score: 0.4828 Model


Epoch: [3][0/366] Elapsed 0m 3s (remain 21m 24s) Loss: 0.1824(0.1824) Grad: 250902.8438  LR: 0.00001309  
Epoch: [3][20/366] Elapsed 0m 37s (remain 10m 9s) Loss: 0.0878(0.1050) Grad: 108037.2734  LR: 0.00001277  
Epoch: [3][40/366] Elapsed 1m 22s (remain 10m 51s) Loss: 0.0851(0.0986) Grad: 111627.6875  LR: 0.00001243  
Epoch: [3][60/366] Elapsed 1m 59s (remain 9m 55s) Loss: 0.1309(0.1004) Grad: 125506.8750  LR: 0.00001210  
Epoch: [3][80/366] Elapsed 2m 39s (remain 9m 19s) Loss: 0.0918(0.1004) Grad: 160428.2969  LR: 0.00001176  
Epoch: [3][100/366] Elapsed 3m 17s (remain 8m 39s) Loss: 0.0877(0.1000) Grad: 124458.2734  LR: 0.00001143  
Epoch: [3][120/366] Elapsed 3m 53s (remain 7m 52s) Loss: 0.1279(0.0996) Grad: 80006.8047  LR: 0.00001109  
Epoch: [3][140/366] Elapsed 4m 30s (remain 7m 11s) Loss: 0.1596(0.1010) Grad: 140786.4219  LR: 0.00001074  
Epoch: [3][160/366] Elapsed 5m 6s (remain 6m 30s) Loss: 0.1403(0.1001) Grad: 106502.0156  LR: 0.00001040  
Epoch: [3][180/366] Elapsed 5m 38s 

Epoch 3 - avg_train_loss: 0.0991  avg_val_loss: 0.1048  time: 743s
Epoch 3 - Score: 0.4585  Scores: [0.49362733493896865, 0.45784587692501705, 0.4183949132215376, 0.44939004709746744, 0.48841406337862364, 0.443480341860978]
Epoch 3 - Save Best Score: 0.4585 Model


Epoch: [4][0/366] Elapsed 0m 2s (remain 13m 29s) Loss: 0.0996(0.0996) Grad: inf  LR: 0.00000692  
Epoch: [4][20/366] Elapsed 0m 42s (remain 11m 37s) Loss: 0.0757(0.0882) Grad: 103428.2812  LR: 0.00000660  
Epoch: [4][40/366] Elapsed 1m 19s (remain 10m 26s) Loss: 0.1130(0.0914) Grad: 100600.4453  LR: 0.00000628  
Epoch: [4][60/366] Elapsed 1m 48s (remain 9m 4s) Loss: 0.0695(0.0940) Grad: 36366.2344  LR: 0.00000596  
Epoch: [4][80/366] Elapsed 2m 26s (remain 8m 35s) Loss: 0.0945(0.0931) Grad: 98778.5703  LR: 0.00000565  
Epoch: [4][100/366] Elapsed 2m 59s (remain 7m 50s) Loss: 0.0901(0.0936) Grad: 77871.5391  LR: 0.00000535  
Epoch: [4][120/366] Elapsed 3m 37s (remain 7m 19s) Loss: 0.0675(0.0935) Grad: 136270.5469  LR: 0.00000504  
Epoch: [4][140/366] Elapsed 4m 8s (remain 6m 35s) Loss: 0.0617(0.0931) Grad: 95850.9219  LR: 0.00000475  
Epoch: [4][160/366] Elapsed 4m 49s (remain 6m 8s) Loss: 0.1134(0.0926) Grad: 150634.1250  LR: 0.00000446  
Epoch: [4][180/366] Elapsed 5m 27s (remain 5m 3

Epoch 4 - avg_train_loss: 0.0926  avg_val_loss: 0.1036  time: 739s
Epoch 4 - Score: 0.4559  Scores: [0.48864049488221334, 0.45854790837407083, 0.41772157913792524, 0.4486687917330637, 0.48005485601132636, 0.4416262970791042]
Epoch 4 - Save Best Score: 0.4559 Model


Epoch: [5][0/366] Elapsed 0m 1s (remain 10m 13s) Loss: 0.0739(0.0739) Grad: 182782.2344  LR: 0.00000192  
Epoch: [5][20/366] Elapsed 0m 34s (remain 9m 27s) Loss: 0.0907(0.0865) Grad: 95687.1953  LR: 0.00000173  
Epoch: [5][40/366] Elapsed 1m 8s (remain 9m 6s) Loss: 0.0721(0.0861) Grad: 60288.7539  LR: 0.00000154  
Epoch: [5][60/366] Elapsed 1m 44s (remain 8m 42s) Loss: 0.0430(0.0877) Grad: 80165.3203  LR: 0.00000136  
Epoch: [5][80/366] Elapsed 2m 21s (remain 8m 18s) Loss: 0.1048(0.0895) Grad: 117599.4297  LR: 0.00000119  
Epoch: [5][100/366] Elapsed 2m 58s (remain 7m 49s) Loss: 0.0725(0.0891) Grad: 55358.4805  LR: 0.00000104  
Epoch: [5][120/366] Elapsed 3m 31s (remain 7m 8s) Loss: 0.1315(0.0875) Grad: 80968.9766  LR: 0.00000089  
Epoch: [5][140/366] Elapsed 4m 11s (remain 6m 41s) Loss: 0.0650(0.0874) Grad: 122198.6719  LR: 0.00000075  
Epoch: [5][160/366] Elapsed 4m 54s (remain 6m 14s) Loss: 0.1103(0.0878) Grad: 195406.8125  LR: 0.00000063  
Epoch: [5][180/366] Elapsed 5m 31s (remain

Epoch 5 - avg_train_loss: 0.0883  avg_val_loss: 0.1038  time: 739s
Epoch 5 - Score: 0.4565  Scores: [0.48873735754230974, 0.4598431652748847, 0.41762654273580974, 0.4497675341817808, 0.4805897329796801, 0.4424896867091492]
Score: 0.4559  Scores: [0.48864049488221334, 0.45854790837407083, 0.41772157913792524, 0.4486687917330637, 0.48005485601132636, 0.4416262970791042]
DebertaV2Config {
  "_name_or_path": "/kaggle/input/debertav3base/",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden

Epoch: [1][0/366] Elapsed 0m 1s (remain 10m 58s) Loss: 2.4907(2.4907) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 46s (remain 12m 42s) Loss: 0.3629(1.3937) Grad: 164468.1875  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 23s (remain 10m 59s) Loss: 0.1169(0.8070) Grad: 48368.6953  LR: 0.00001998  
Epoch: [1][60/366] Elapsed 1m 52s (remain 9m 24s) Loss: 0.1376(0.5868) Grad: 73096.2500  LR: 0.00001995  
Epoch: [1][80/366] Elapsed 2m 26s (remain 8m 36s) Loss: 0.0934(0.4781) Grad: 55550.5781  LR: 0.00001990  
Epoch: [1][100/366] Elapsed 3m 7s (remain 8m 11s) Loss: 0.0784(0.4106) Grad: 35145.7031  LR: 0.00001985  
Epoch: [1][120/366] Elapsed 3m 45s (remain 7m 36s) Loss: 0.0938(0.3637) Grad: 40209.0469  LR: 0.00001979  
Epoch: [1][140/366] Elapsed 4m 22s (remain 6m 58s) Loss: 0.1643(0.3314) Grad: 74800.8906  LR: 0.00001971  
Epoch: [1][160/366] Elapsed 4m 56s (remain 6m 17s) Loss: 0.1731(0.3046) Grad: 80199.1094  LR: 0.00001962  
Epoch: [1][180/366] Elapsed 5m 33s (remain 5m 40

Epoch 1 - avg_train_loss: 0.2043  avg_val_loss: 0.1111  time: 729s
Epoch 1 - Score: 0.4722  Scores: [0.5080878678481079, 0.487717426449642, 0.4342749540602561, 0.45921968108685807, 0.4849055988885666, 0.4588816029517034]
Epoch 1 - Save Best Score: 0.4722 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 10m 5s) Loss: 0.1186(0.1186) Grad: 156966.3438  LR: 0.00001809  
Epoch: [2][20/366] Elapsed 0m 43s (remain 11m 48s) Loss: 0.0712(0.1022) Grad: 111189.9141  LR: 0.00001788  
Epoch: [2][40/366] Elapsed 1m 20s (remain 10m 37s) Loss: 0.0707(0.1048) Grad: 116930.9844  LR: 0.00001766  
Epoch: [2][60/366] Elapsed 2m 0s (remain 10m 1s) Loss: 0.0966(0.1048) Grad: 102143.5312  LR: 0.00001744  
Epoch: [2][80/366] Elapsed 2m 36s (remain 9m 10s) Loss: 0.0913(0.1054) Grad: 71865.5625  LR: 0.00001721  
Epoch: [2][100/366] Elapsed 3m 9s (remain 8m 16s) Loss: 0.0937(0.1045) Grad: 110377.0000  LR: 0.00001696  
Epoch: [2][120/366] Elapsed 3m 48s (remain 7m 42s) Loss: 0.0894(0.1056) Grad: 149451.3906  LR: 0.00001671  
Epoch: [2][140/366] Elapsed 4m 21s (remain 6m 57s) Loss: 0.1088(0.1039) Grad: 67825.3828  LR: 0.00001646  
Epoch: [2][160/366] Elapsed 4m 56s (remain 6m 17s) Loss: 0.0982(0.1036) Grad: 110690.3672  LR: 0.00001619  
Epoch: [2][180/366] Elapsed 5m 34s (r

Epoch 2 - avg_train_loss: 0.1039  avg_val_loss: 0.1083  time: 724s
Epoch 2 - Score: 0.4664  Scores: [0.5107878228886483, 0.45890497554896037, 0.44286678202107393, 0.4613802784780657, 0.477589243552756, 0.44684256122961546]
Epoch 2 - Save Best Score: 0.4664 Model


Epoch: [3][0/366] Elapsed 0m 3s (remain 22m 59s) Loss: 0.0951(0.0951) Grad: 133471.4375  LR: 0.00001309  
Epoch: [3][20/366] Elapsed 0m 39s (remain 10m 42s) Loss: 0.0909(0.0913) Grad: 218474.1094  LR: 0.00001277  
Epoch: [3][40/366] Elapsed 1m 14s (remain 9m 48s) Loss: 0.1155(0.0916) Grad: 136418.1250  LR: 0.00001243  
Epoch: [3][60/366] Elapsed 1m 51s (remain 9m 16s) Loss: 0.1604(0.0918) Grad: 182349.3594  LR: 0.00001210  
Epoch: [3][80/366] Elapsed 2m 28s (remain 8m 43s) Loss: 0.0563(0.0921) Grad: 87213.9375  LR: 0.00001176  
Epoch: [3][100/366] Elapsed 3m 5s (remain 8m 6s) Loss: 0.2013(0.0928) Grad: 157039.6719  LR: 0.00001143  
Epoch: [3][120/366] Elapsed 3m 45s (remain 7m 36s) Loss: 0.0891(0.0930) Grad: 109520.4922  LR: 0.00001109  
Epoch: [3][140/366] Elapsed 4m 17s (remain 6m 51s) Loss: 0.1218(0.0925) Grad: 102945.6562  LR: 0.00001074  
Epoch: [3][160/366] Elapsed 4m 53s (remain 6m 13s) Loss: 0.0910(0.0952) Grad: 115894.2969  LR: 0.00001040  
Epoch: [3][180/366] Elapsed 5m 32s (

Epoch 3 - avg_train_loss: 0.0941  avg_val_loss: 0.1035  time: 729s
Epoch 3 - Score: 0.4557  Scores: [0.49409977667841654, 0.4513598118150845, 0.4201327976858789, 0.45634952975229137, 0.46602727507509195, 0.4460431630969256]
Epoch 3 - Save Best Score: 0.4557 Model


Epoch: [4][0/366] Elapsed 0m 1s (remain 11m 59s) Loss: 0.0747(0.0747) Grad: 172744.1875  LR: 0.00000692  
Epoch: [4][20/366] Elapsed 0m 42s (remain 11m 32s) Loss: 0.0965(0.0808) Grad: 216642.2344  LR: 0.00000660  
Epoch: [4][40/366] Elapsed 1m 17s (remain 10m 10s) Loss: 0.0774(0.0802) Grad: 168718.2188  LR: 0.00000628  
Epoch: [4][60/366] Elapsed 1m 50s (remain 9m 12s) Loss: 0.0753(0.0826) Grad: 191281.5000  LR: 0.00000596  
Epoch: [4][80/366] Elapsed 2m 23s (remain 8m 23s) Loss: 0.0643(0.0855) Grad: 134483.3438  LR: 0.00000565  
Epoch: [4][100/366] Elapsed 2m 59s (remain 7m 50s) Loss: 0.0815(0.0876) Grad: 126995.8672  LR: 0.00000535  
Epoch: [4][120/366] Elapsed 3m 36s (remain 7m 17s) Loss: 0.0827(0.0883) Grad: 171864.2031  LR: 0.00000504  
Epoch: [4][140/366] Elapsed 4m 10s (remain 6m 39s) Loss: 0.1289(0.0878) Grad: 301860.0000  LR: 0.00000475  
Epoch: [4][160/366] Elapsed 4m 44s (remain 6m 1s) Loss: 0.1077(0.0875) Grad: 207718.3125  LR: 0.00000446  
Epoch: [4][180/366] Elapsed 5m 22

Epoch 4 - avg_train_loss: 0.0855  avg_val_loss: 0.1033  time: 720s
Epoch 4 - Score: 0.4554  Scores: [0.49056861926216083, 0.4522281905279417, 0.4199962631727616, 0.45556276268884976, 0.4675090988182876, 0.44633788321852924]
Epoch 4 - Save Best Score: 0.4554 Model


Epoch: [5][0/366] Elapsed 0m 1s (remain 10m 19s) Loss: 0.0627(0.0627) Grad: 220466.4844  LR: 0.00000192  
Epoch: [5][20/366] Elapsed 0m 46s (remain 12m 50s) Loss: 0.0950(0.0889) Grad: 119457.3281  LR: 0.00000173  
Epoch: [5][40/366] Elapsed 1m 20s (remain 10m 34s) Loss: 0.0815(0.0863) Grad: 161190.3438  LR: 0.00000154  
Epoch: [5][60/366] Elapsed 1m 48s (remain 9m 1s) Loss: 0.0677(0.0840) Grad: 128656.9609  LR: 0.00000136  
Epoch: [5][80/366] Elapsed 2m 24s (remain 8m 29s) Loss: 0.0742(0.0850) Grad: 154329.0469  LR: 0.00000119  
Epoch: [5][100/366] Elapsed 2m 57s (remain 7m 44s) Loss: 0.0556(0.0840) Grad: 130542.9688  LR: 0.00000104  
Epoch: [5][120/366] Elapsed 3m 28s (remain 7m 1s) Loss: 0.0762(0.0836) Grad: 186969.7031  LR: 0.00000089  
Epoch: [5][140/366] Elapsed 4m 7s (remain 6m 34s) Loss: 0.1138(0.0834) Grad: 177147.8125  LR: 0.00000075  
Epoch: [5][160/366] Elapsed 4m 45s (remain 6m 3s) Loss: 0.0664(0.0826) Grad: 116900.3750  LR: 0.00000063  
Epoch: [5][180/366] Elapsed 5m 19s (

Epoch 5 - avg_train_loss: 0.0814  avg_val_loss: 0.1026  time: 731s
Epoch 5 - Score: 0.4538  Scores: [0.4901420855573412, 0.4502627898082984, 0.41639586145381474, 0.45396874903176276, 0.46711645356694903, 0.44471446197691805]
Epoch 5 - Save Best Score: 0.4538 Model
Score: 0.4538  Scores: [0.4901420855573412, 0.4502627898082984, 0.41639586145381474, 0.45396874903176276, 0.46711645356694903, 0.44471446197691805]
Score: 0.4569  Scores: [0.4908865030819827, 0.45083427691211003, 0.41748033615805047, 0.456507211159442, 0.4774611486443539, 0.44828430864670243]


========== fold: 0 result ==========

Score: 0.4535 

========== fold: 1 result ==========

Score: 0.4643  

========== fold: 2 result ==========

Score: 0.4559  

========== fold: 3 result ==========

Score: 0.4538  


**After averaging the scores:**

**========== CV ==========**

**Score: 0.4569**

Error score for each individual target:

[0.4908865030819827, 0.45083427691211003, 0.41748033615805047, 0.456507211159442, 0.4774611486443539, 0.44828430864670243] 

for cohesion, syntax, vocabulary, phraseology, grammar, conventions

# Model inference

In [None]:
# Inference-time settings. This rebinds the name CFG, replacing the training
# configuration class defined earlier in the notebook.

class CFG:
    """Plain namespace of constants read by the inference cells below."""

    # Where the artifacts live: the tokenizer directory and the per-fold
    # checkpoints are looked up relative to `path`.
    path = "./"
    config_path = f"{path}config.pth"
    # Backbone location; the inference loop also derives checkpoint file names from it.
    model = "/kaggle/input/debertav3base/"
    gradient_checkpointing = False

    # DataLoader settings for the test set.
    num_workers = 4
    batch_size = 24

    # Columns to predict, in submission order.
    target_cols = [
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
    ]

    seed = 42
    n_fold = 4
    # Folds whose best checkpoint is loaded and averaged at inference.
    trn_fold = [0, 1, 2, 3]

# Load the tokenizer from the "tokenizer/" directory under CFG.path and attach it
# to the config so later cells can reach it as CFG.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

Load the test set and the sample submission file.

In [None]:
# Essays to be scored.
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
print(f"test.shape: {test.shape}")
display(test.head())

# Sample submission file: supplies the text_id rows and the target column layout.
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')
print(f"submission.shape: {submission.shape}")
display(submission.head())

test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


In [None]:
# Attach each essay's token count (length of the tokenizer's input_ids) and order
# the frame from shortest to longest essay.
# NOTE(review): presumably done so that batches padded to their longest member
# contain similar-length texts — confirm against the collator/inference code.
test = (
    test
    .assign(tokenize_length=[len(CFG.tokenizer(essay)['input_ids']) for essay in test['full_text'].values])
    .sort_values(by='tokenize_length', ascending=True)
    .reset_index(drop=True)
)
display(test.head())

Unnamed: 0,text_id,full_text,tokenize_length
0,000BAD50D026,Do you think students would benefit from being...,441
1,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",488
2,0000C359D63E,when a person has no experience on a job their...,897


In [None]:
# ====================================================
# Dataset
# ====================================================
# NOTE(review): this imports `TestDataSet` (capital "S"), but the inference cell
# below instantiates `TestDataset`. Confirm which spelling dataset.py actually
# exports; on a fresh kernel one of the two will fail.
from dataset import prepare_input, TestDataSet

# ====================================================
# inference
# ====================================================
# Project-local helper that runs a model over a DataLoader (called in the
# fold loop below as inference_fn(test_loader, model, device)).
from main_functions import inference_fn

# Collator used by the test DataLoader to pad each batch.
from transformers import DataCollatorWithPadding

For the final prediction, we average the predictions of the 4 models trained during cross-validation (one per fold).

In [None]:
# Pad every batch only up to its own longest sequence.
pad_to_longest = DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest')

# NOTE(review): the import cell above brings in `TestDataSet`, while this line uses
# `TestDataset` — confirm the class name exported by dataset.py.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,       # keep row order so predictions line up with `test`
    collate_fn=pad_to_longest,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,     # every test row must receive a prediction
)

# One prediction array per fold model; averaged after the loop.
fold_predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Checkpoint name is built from CFG.model with "/" replaced by "-".
    checkpoint_file = CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    # Load onto CPU first; the weights sit under the 'model' key of the checkpoint.
    state = torch.load(checkpoint_file, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    fold_predictions.append(prediction)
    # Release this fold's objects before building the next model.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Element-wise mean across the fold models.
predictions = np.mean(fold_predictions, axis=0)

In [None]:
# Columns that make up the submission file: the id plus one column per target.
id_and_targets = ['text_id'] + CFG.target_cols

# Store the averaged predictions next to their essays (rows of `test` are in
# token-length order at this point).
test[CFG.target_cols] = predictions

# Swap the placeholder targets in the sample submission for the predictions.
# The left merge on text_id keeps the sample submission's row order.
submission = (
    submission
    .drop(columns=CFG.target_cols)
    .merge(test[id_and_targets], on='text_id', how='left')
)
display(submission.head())
submission[id_and_targets].to_csv('submission.csv', index=False)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.936991,2.822174,3.145676,2.973789,2.714838,2.690477
1,000BAD50D026,2.574048,2.419027,2.668774,2.365274,2.152936,2.551006
2,00367BB2546B,3.459992,3.365598,3.575336,3.514001,3.433239,3.27455


This baseline solution gives a score of 0.44 on the public leaderboard.