# About this notebook
- Deberta-base starter code
- pip wheels is [here](https://www.kaggle.com/yasufuminakama/nbme-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=False
    competition='NBME'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=12
    fc_dropout=0.2
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='NBME-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ModuleNotFoundError: No module named 'tokenizers'

# Helper functions for scoring

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [None]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [None]:
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [None]:
# incorrect annotation
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [None]:
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

# CV split

In [None]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [None]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [None]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

# Dataset

In [None]:
# ====================================================
# Define max_len
# ====================================================
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 433


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 30
max_len: 466


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label = create_label(self.cfg, 
                             self.pn_historys[item], 
                             self.annotation_lengths[item], 
                             self.locations[item])
        return inputs, label

# Model

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Helpler functions

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [None]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()



Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/953] Elapsed 0m 2s (remain 36m 11s) Loss: 0.6152(0.6152) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 45s (remain 14m 50s) Loss: 0.0204(0.0787) Grad: 3683.7551  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 3m 28s (remain 13m 1s) Loss: 0.0123(0.0542) Grad: 3387.7908  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 5m 12s (remain 11m 16s) Loss: 0.0173(0.0439) Grad: 2971.7676  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 6m 55s (remain 9m 32s) Loss: 0.0166(0.0386) Grad: 9887.1963  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 8m 39s (remain 7m 48s) Loss: 0.0152(0.0346) Grad: 6478.6763  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 10m 22s (remain 6m 4s) Loss: 0.0252(0.0321) Grad: 9065.9648  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0059(0.0302) Grad: 1518.9550  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0206(0.0284) Grad: 4236.4106  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 15m 32s (remain 0m 53s)

Epoch 1 - avg_train_loss: 0.0262  avg_val_loss: 0.0137  time: 1068s
Epoch 1 - Score: 0.8319
Epoch 1 - Save Best Score: 0.8319 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 20m 43s) Loss: 0.0159(0.0159) Grad: 19950.5566  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0104(0.0119) Grad: 15412.7588  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0050(0.0121) Grad: 8479.6953  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0056(0.0118) Grad: 8105.8262  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0014(0.0115) Grad: 3589.2603  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0041(0.0111) Grad: 7462.8813  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 10m 21s (remain 6m 3s) Loss: 0.0097(0.0110) Grad: 20346.2246  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0158(0.0113) Grad: 22776.1172  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0258(0.0112) Grad: 53962.2773  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 15m 31s (re

Epoch 2 - avg_train_loss: 0.0108  avg_val_loss: 0.0124  time: 1067s
Epoch 2 - Score: 0.8569
Epoch 2 - Save Best Score: 0.8569 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 20m 17s) Loss: 0.0186(0.0186) Grad: 20406.9160  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0100(0.0093) Grad: 49438.2383  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0051(0.0086) Grad: 7300.5186  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0026(0.0086) Grad: 7038.8491  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0046(0.0091) Grad: 25473.1855  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0033(0.0090) Grad: 15953.5449  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0167(0.0094) Grad: 25778.5703  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0282(0.0093) Grad: 51500.9141  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0387(0.0094) Grad: 63309.2812  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 15m 31s (

Epoch 3 - avg_train_loss: 0.0094  avg_val_loss: 0.0120  time: 1067s
Epoch 3 - Score: 0.8613
Epoch 3 - Save Best Score: 0.8613 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 20m 20s) Loss: 0.0035(0.0035) Grad: 8735.9014  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0140(0.0080) Grad: 23477.3047  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0039(0.0083) Grad: 7484.7407  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0085(0.0080) Grad: 32885.7227  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0051(0.0079) Grad: 13357.5195  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0029(0.0079) Grad: 9817.5059  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0038(0.0079) Grad: 12878.0654  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0091(0.0081) Grad: 15060.7949  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0067(0.0081) Grad: 13976.8350  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 15m 31s (r

Epoch 4 - avg_train_loss: 0.0082  avg_val_loss: 0.0119  time: 1067s
Epoch 4 - Score: 0.8651
Epoch 4 - Save Best Score: 0.8651 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 20m 30s) Loss: 0.0221(0.0221) Grad: 54597.7383  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0033(0.0067) Grad: 7506.9854  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0121(0.0068) Grad: 28685.4551  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0026(0.0069) Grad: 6678.3521  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0007(0.0072) Grad: 3564.5271  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0089(0.0074) Grad: 16537.8926  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 10m 21s (remain 6m 3s) Loss: 0.0051(0.0075) Grad: 11085.3350  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0012(0.0076) Grad: 5565.9688  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0043(0.0076) Grad: 16203.3701  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 15m 31s (re

Epoch 5 - avg_train_loss: 0.0075  avg_val_loss: 0.0123  time: 1067s
Epoch 5 - Score: 0.8648
Score: 0.8651
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/953] Elapsed 0m 1s (remain 20m 36s) Loss: 0.8972(0.8972) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0366(0.1036) Grad: 2767.1121  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 3m 27s (remain 12m 58s) Loss: 0.0119(0.0661) Grad: 1229.3901  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0792(0.0521) Grad: 11543.2695  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0404(0.0444) Grad: 4422.9409  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 8m 37s (remain 7m 47s) Loss: 0.0222(0.0395) Grad: 2235.0764  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 10m 21s (remain 6m 3s) Loss: 0.0070(0.0361) Grad: 956.4985  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0153(0.0336) Grad: 1297.6082  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 13m 47s (remain 2m 37s) Loss: 0.0088(0.0314) Grad: 1474.4218  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 15m 30s (remain 0m 53s

Epoch 1 - avg_train_loss: 0.0291  avg_val_loss: 0.0128  time: 1066s
Epoch 1 - Score: 0.8329
Epoch 1 - Save Best Score: 0.8329 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 19m 57s) Loss: 0.0094(0.0094) Grad: 16259.5547  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0117(0.0113) Grad: 17293.4316  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0050(0.0112) Grad: 10829.0361  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0071(0.0114) Grad: 14014.2295  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0114(0.0114) Grad: 23252.6309  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0099(0.0112) Grad: 14844.9541  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0039(0.0110) Grad: 10484.8877  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0102(0.0110) Grad: 21072.5117  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0097(0.0108) Grad: 14240.5459  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 15m 31s

Epoch 2 - avg_train_loss: 0.0108  avg_val_loss: 0.0122  time: 1067s
Epoch 2 - Score: 0.8517
Epoch 2 - Save Best Score: 0.8517 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 21m 13s) Loss: 0.0133(0.0133) Grad: 32321.6660  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0070(0.0101) Grad: 10673.3389  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0027(0.0101) Grad: 7611.6230  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0144(0.0097) Grad: 34534.5664  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0108(0.0100) Grad: 24159.2188  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0084(0.0099) Grad: 23953.3535  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0022(0.0097) Grad: 5068.6357  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0061(0.0097) Grad: 18036.6426  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0053(0.0097) Grad: 15547.3545  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 15m 31s (

Epoch 3 - avg_train_loss: 0.0096  avg_val_loss: 0.0123  time: 1068s
Epoch 3 - Score: 0.8595
Epoch 3 - Save Best Score: 0.8595 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 21m 3s) Loss: 0.0135(0.0135) Grad: 30709.0918  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0011(0.0103) Grad: 5585.1938  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0162(0.0095) Grad: 22152.5312  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0086(0.0091) Grad: 27004.6191  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0231(0.0089) Grad: 65443.6914  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0109(0.0089) Grad: 58988.9922  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0041(0.0089) Grad: 9144.3652  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0092(0.0087) Grad: 12483.0117  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0029(0.0087) Grad: 6482.1162  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 15m 31s (re

Epoch 4 - avg_train_loss: 0.0088  avg_val_loss: 0.0121  time: 1067s
Epoch 4 - Score: 0.8599
Epoch 4 - Save Best Score: 0.8599 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 19m 41s) Loss: 0.0055(0.0055) Grad: 11747.1562  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 44s (remain 14m 41s) Loss: 0.0079(0.0078) Grad: 27258.3516  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 3m 27s (remain 12m 57s) Loss: 0.0117(0.0083) Grad: 59280.3008  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0074(0.0079) Grad: 17628.6465  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0037(0.0079) Grad: 9040.2295  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 8m 37s (remain 7m 47s) Loss: 0.0139(0.0078) Grad: 26971.9707  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 10m 21s (remain 6m 3s) Loss: 0.0068(0.0079) Grad: 20553.1328  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0211(0.0083) Grad: 118407.2969  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 13m 47s (remain 2m 37s) Loss: 0.0081(0.0083) Grad: 17488.5938  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 15m 31s

Epoch 5 - avg_train_loss: 0.0083  avg_val_loss: 0.0123  time: 1067s
Epoch 5 - Score: 0.8604
Epoch 5 - Save Best Score: 0.8604 Model
Score: 0.8604
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/953] Elapsed 0m 1s (remain 20m 38s) Loss: 1.0706(1.0706) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0300(0.1154) Grad: 2217.2114  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0214(0.0729) Grad: 1798.9280  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0086(0.0568) Grad: 966.3666  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0244(0.0485) Grad: 2235.9109  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0148(0.0428) Grad: 1696.2240  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0657(0.0388) Grad: 4781.5723  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0082(0.0359) Grad: 935.1599  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0196(0.0336) Grad: 1320.6721  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 15m 31s (remain 0m 53s) 

Epoch 1 - avg_train_loss: 0.0308  avg_val_loss: 0.0135  time: 1067s
Epoch 1 - Score: 0.8167
Epoch 1 - Save Best Score: 0.8167 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 19m 58s) Loss: 0.0085(0.0085) Grad: 11819.9238  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0186(0.0127) Grad: 32355.9121  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0059(0.0120) Grad: 30082.8359  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 5m 11s (remain 11m 15s) Loss: 0.0142(0.0116) Grad: 20360.4023  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 6m 55s (remain 9m 31s) Loss: 0.0256(0.0114) Grad: 33702.6641  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0074(0.0114) Grad: 12409.3135  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 10m 22s (remain 6m 4s) Loss: 0.0016(0.0112) Grad: 3786.5481  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0021(0.0112) Grad: 4698.5225  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0051(0.0111) Grad: 6151.5190  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 15m 32s (r

Epoch 2 - avg_train_loss: 0.0110  avg_val_loss: 0.0127  time: 1068s
Epoch 2 - Score: 0.8537
Epoch 2 - Save Best Score: 0.8537 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 22m 23s) Loss: 0.0133(0.0133) Grad: 26927.6074  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 44s (remain 14m 45s) Loss: 0.0234(0.0095) Grad: 100114.2969  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 3m 28s (remain 12m 59s) Loss: 0.0044(0.0095) Grad: 11644.0449  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 5m 12s (remain 11m 15s) Loss: 0.0148(0.0099) Grad: 28668.2539  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 6m 55s (remain 9m 32s) Loss: 0.0107(0.0098) Grad: 16855.7480  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 8m 39s (remain 7m 48s) Loss: 0.0079(0.0102) Grad: 16207.7822  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 10m 22s (remain 6m 4s) Loss: 0.0117(0.0099) Grad: 16883.0547  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 12m 6s (remain 4m 21s) Loss: 0.0062(0.0097) Grad: 22936.4980  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 13m 49s (remain 2m 37s) Loss: 0.0036(0.0095) Grad: 33636.9102  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 15m 33

Epoch 3 - avg_train_loss: 0.0096  avg_val_loss: 0.0125  time: 1069s
Epoch 3 - Score: 0.8580
Epoch 3 - Save Best Score: 0.8580 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 20m 58s) Loss: 0.0055(0.0055) Grad: 8374.0195  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0067(0.0090) Grad: 13717.9395  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0193(0.0088) Grad: 34727.5586  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0119(0.0090) Grad: 20391.7969  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 6m 55s (remain 9m 31s) Loss: 0.0079(0.0091) Grad: 33459.5625  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0098(0.0090) Grad: 20319.1953  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0087(0.0088) Grad: 12816.7627  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0049(0.0088) Grad: 18390.5312  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0034(0.0088) Grad: 15580.9580  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 15m 32s 

Epoch 4 - avg_train_loss: 0.0086  avg_val_loss: 0.0123  time: 1068s
Epoch 4 - Score: 0.8618
Epoch 4 - Save Best Score: 0.8618 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 21m 54s) Loss: 0.0063(0.0063) Grad: 15929.3213  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 44s (remain 14m 44s) Loss: 0.0088(0.0074) Grad: 20071.0117  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 3m 28s (remain 12m 59s) Loss: 0.0069(0.0077) Grad: 12056.4736  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 5m 11s (remain 11m 15s) Loss: 0.0041(0.0077) Grad: 11364.0762  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 6m 55s (remain 9m 31s) Loss: 0.0034(0.0077) Grad: 29427.9395  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0108(0.0079) Grad: 18412.2559  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 10m 22s (remain 6m 4s) Loss: 0.0129(0.0080) Grad: 31372.1250  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0007(0.0080) Grad: 3784.7502  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0054(0.0080) Grad: 29238.2930  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 15m 32s 

Epoch 5 - avg_train_loss: 0.0082  avg_val_loss: 0.0124  time: 1068s
Epoch 5 - Score: 0.8618
Score: 0.8618
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/953] Elapsed 0m 1s (remain 19m 18s) Loss: 0.7444(0.7444) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0369(0.0950) Grad: 2014.4661  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0162(0.0621) Grad: 2515.3689  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0135(0.0497) Grad: 1339.1007  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0235(0.0429) Grad: 1668.6628  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0129(0.0383) Grad: 1420.8352  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0324(0.0351) Grad: 2794.3689  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0069(0.0324) Grad: 879.9413  LR: 0.00001895  


wandb: Network error (ReadTimeout), entering retry loop.


Epoch: [1][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0141(0.0303) Grad: 1447.3911  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 15m 31s (remain 0m 53s) Loss: 0.0273(0.0287) Grad: 3760.0537  LR: 0.00001829  
Epoch: [1][952/953] Elapsed 16m 25s (remain 0m 0s) Loss: 0.0282(0.0280) Grad: 1580.2777  LR: 0.00001809  
EVAL: [0/239] Elapsed 0m 0s (remain 2m 31s) Loss: 0.0190(0.0190) 
EVAL: [100/239] Elapsed 0m 33s (remain 0m 45s) Loss: 0.0212(0.0152) 
EVAL: [200/239] Elapsed 1m 6s (remain 0m 12s) Loss: 0.0171(0.0163) 
EVAL: [238/239] Elapsed 1m 18s (remain 0m 0s) Loss: 0.0036(0.0150) 


Epoch 1 - avg_train_loss: 0.0280  avg_val_loss: 0.0150  time: 1067s
Epoch 1 - Score: 0.8218
Epoch 1 - Save Best Score: 0.8218 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 20m 13s) Loss: 0.0183(0.0183) Grad: 29542.5820  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0179(0.0119) Grad: 23301.8926  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0230(0.0117) Grad: 33708.7656  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0173(0.0111) Grad: 33921.0117  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0103(0.0108) Grad: 10503.6592  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0040(0.0110) Grad: 13777.8184  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0059(0.0112) Grad: 9933.0723  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0060(0.0110) Grad: 19482.2988  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0146(0.0109) Grad: 22599.5488  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 15m 31s 

Epoch 2 - avg_train_loss: 0.0108  avg_val_loss: 0.0141  time: 1068s
Epoch 2 - Score: 0.8446
Epoch 2 - Save Best Score: 0.8446 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 21m 34s) Loss: 0.0046(0.0046) Grad: 10505.8418  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 44s (remain 14m 44s) Loss: 0.0095(0.0097) Grad: 25874.6641  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 3m 28s (remain 12m 59s) Loss: 0.0081(0.0097) Grad: 14258.3066  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 5m 11s (remain 11m 15s) Loss: 0.0052(0.0092) Grad: 14731.6787  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 6m 55s (remain 9m 31s) Loss: 0.0077(0.0094) Grad: 13243.5166  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0118(0.0096) Grad: 34866.4258  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 10m 22s (remain 6m 4s) Loss: 0.0083(0.0095) Grad: 17213.8320  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0123(0.0094) Grad: 28980.7754  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0413(0.0095) Grad: 42464.1250  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 15m 32s

Epoch 3 - avg_train_loss: 0.0095  avg_val_loss: 0.0137  time: 1068s
Epoch 3 - Score: 0.8493
Epoch 3 - Save Best Score: 0.8493 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 21m 3s) Loss: 0.0063(0.0063) Grad: 14266.5908  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0108(0.0071) Grad: 15751.7422  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 3m 27s (remain 12m 57s) Loss: 0.0087(0.0083) Grad: 15937.2998  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0014(0.0085) Grad: 3825.4106  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0224(0.0084) Grad: 41536.5898  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 8m 37s (remain 7m 47s) Loss: 0.0051(0.0084) Grad: 29153.3867  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 10m 20s (remain 6m 3s) Loss: 0.0165(0.0084) Grad: 32568.6973  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0016(0.0085) Grad: 5932.8252  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 13m 47s (remain 2m 37s) Loss: 0.0242(0.0086) Grad: 42224.6289  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 15m 30s (r

Epoch 4 - avg_train_loss: 0.0087  avg_val_loss: 0.0138  time: 1067s
Epoch 4 - Score: 0.8515
Epoch 4 - Save Best Score: 0.8515 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 20m 53s) Loss: 0.0028(0.0028) Grad: 7100.0923  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 44s (remain 14m 42s) Loss: 0.0082(0.0080) Grad: 15449.9521  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 3m 27s (remain 12m 57s) Loss: 0.0054(0.0079) Grad: 12396.1475  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 5m 11s (remain 11m 13s) Loss: 0.0059(0.0080) Grad: 15647.4668  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0084(0.0079) Grad: 22105.2266  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 8m 37s (remain 7m 47s) Loss: 0.0121(0.0079) Grad: 25802.2363  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 10m 20s (remain 6m 3s) Loss: 0.0013(0.0079) Grad: 8066.8291  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0072(0.0081) Grad: 18752.2812  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 13m 47s (remain 2m 37s) Loss: 0.0200(0.0081) Grad: 36105.1797  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 15m 30s (

Epoch 5 - avg_train_loss: 0.0081  avg_val_loss: 0.0140  time: 1067s
Epoch 5 - Score: 0.8537
Epoch 5 - Save Best Score: 0.8537 Model
Score: 0.8537
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/953] Elapsed 0m 1s (remain 20m 31s) Loss: 0.8424(0.8424) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 44s (remain 14m 41s) Loss: 0.0479(0.1008) Grad: 3370.0044  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 3m 27s (remain 12m 57s) Loss: 0.0308(0.0650) Grad: 1454.5791  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 5m 11s (remain 11m 13s) Loss: 0.0517(0.0520) Grad: 2950.9194  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0337(0.0445) Grad: 4467.4341  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 8m 37s (remain 7m 46s) Loss: 0.0130(0.0397) Grad: 2592.0957  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 10m 20s (remain 6m 3s) Loss: 0.0056(0.0363) Grad: 947.5839  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 12m 3s (remain 4m 20s) Loss: 0.0295(0.0333) Grad: 2226.3984  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 13m 46s (remain 2m 36s) Loss: 0.0146(0.0311) Grad: 1676.4886  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 15m 30s (remain 0m 53s)

Epoch 1 - avg_train_loss: 0.0288  avg_val_loss: 0.0134  time: 1066s
Epoch 1 - Score: 0.8317
Epoch 1 - Save Best Score: 0.8317 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 20m 3s) Loss: 0.0078(0.0078) Grad: 14229.5693  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 44s (remain 14m 41s) Loss: 0.0049(0.0118) Grad: 20374.3672  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 3m 27s (remain 12m 57s) Loss: 0.0153(0.0108) Grad: 37161.9766  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0145(0.0113) Grad: 16582.0781  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 6m 54s (remain 9m 30s) Loss: 0.0031(0.0114) Grad: 14321.3828  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 8m 37s (remain 7m 47s) Loss: 0.0026(0.0114) Grad: 5321.2812  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 10m 21s (remain 6m 3s) Loss: 0.0070(0.0112) Grad: 13252.4580  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 12m 4s (remain 4m 20s) Loss: 0.0234(0.0111) Grad: 58960.8555  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 13m 47s (remain 2m 37s) Loss: 0.0044(0.0110) Grad: 8939.3818  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 15m 31s (r

Epoch 2 - avg_train_loss: 0.0109  avg_val_loss: 0.0125  time: 1067s
Epoch 2 - Score: 0.8539
Epoch 2 - Save Best Score: 0.8539 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 20m 39s) Loss: 0.0073(0.0073) Grad: 21636.5293  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0063(0.0094) Grad: 10119.6875  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0027(0.0101) Grad: 8758.4609  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0035(0.0099) Grad: 16130.5488  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0046(0.0099) Grad: 18105.9609  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0072(0.0098) Grad: 18800.1113  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0135(0.0097) Grad: 26138.4180  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0039(0.0098) Grad: 7231.1685  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0181(0.0097) Grad: 29482.9727  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 15m 32s (

Epoch 3 - avg_train_loss: 0.0096  avg_val_loss: 0.0123  time: 1068s
Epoch 3 - Score: 0.8580
Epoch 3 - Save Best Score: 0.8580 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 21m 8s) Loss: 0.0017(0.0017) Grad: 5737.9473  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0023(0.0089) Grad: 8004.7261  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0032(0.0093) Grad: 28960.4648  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 5m 11s (remain 11m 14s) Loss: 0.0045(0.0091) Grad: 9540.2324  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 6m 54s (remain 9m 31s) Loss: 0.0009(0.0089) Grad: 2664.8315  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0105(0.0088) Grad: 28169.2715  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0018(0.0086) Grad: 13613.2559  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0140(0.0087) Grad: 34775.3281  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0124(0.0088) Grad: 29334.9648  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 15m 31s (rem

Epoch 4 - avg_train_loss: 0.0088  avg_val_loss: 0.0123  time: 1067s
Epoch 4 - Score: 0.8625
Epoch 4 - Save Best Score: 0.8625 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 21m 5s) Loss: 0.0050(0.0050) Grad: 14657.9609  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 44s (remain 14m 43s) Loss: 0.0057(0.0078) Grad: 19832.9922  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 3m 28s (remain 12m 58s) Loss: 0.0023(0.0081) Grad: 11254.8945  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 5m 11s (remain 11m 15s) Loss: 0.0119(0.0082) Grad: 14650.4639  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 6m 55s (remain 9m 31s) Loss: 0.0043(0.0084) Grad: 10062.6553  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 8m 38s (remain 7m 47s) Loss: 0.0070(0.0085) Grad: 15090.0664  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 10m 21s (remain 6m 4s) Loss: 0.0055(0.0083) Grad: 19426.5781  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 12m 5s (remain 4m 20s) Loss: 0.0145(0.0084) Grad: 20737.1699  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 13m 48s (remain 2m 37s) Loss: 0.0054(0.0083) Grad: 16980.9023  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 15m 31s 

Epoch 5 - avg_train_loss: 0.0083  avg_val_loss: 0.0122  time: 1068s
Epoch 5 - Score: 0.8621
Score: 0.8625
Score: 0.8608


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
[fold0] avg_train_loss,█▂▂▁▁
[fold0] avg_val_loss,█▃▁▁▂
[fold0] epoch,▁▃▅▆█
[fold0] loss,█▃▂▂▃▃▂▂▄▁▁▁▂▁▄▂▂▁▁▂▂▂▁▁▁▂▂▁▁▁▂▂▁▁▁▂▁▄▁▁
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,▁▆▇██
[fold1] avg_train_loss,█▂▁▁▁
[fold1] avg_val_loss,█▁▃▁▃
[fold1] epoch,▁▃▅▆█
[fold1] loss,▃▂▂▆▅▃█▃▂▃▄▃▃▄▁▂▂▂▁▁▁▁▂▁▂▂▂▂▂▁▂▃▄▆▂▂▁▂▁▂

0,1
[fold0] avg_train_loss,0.00752
[fold0] avg_val_loss,0.01226
[fold0] epoch,5.0
[fold0] loss,0.01331
[fold0] lr,0.0
[fold0] score,0.86475
[fold1] avg_train_loss,0.00833
[fold1] avg_val_loss,0.01231
[fold1] epoch,5.0
[fold1] loss,0.01233
