# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

INPUT_DIR = '../input/nbme-score-clinical-patient-notes/'
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    env='kaggle' if os.getcwd() == '/kaggle/working' else 'colab'
    wandb=False # True
    competition='NBME'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4 # 5
    encoder_lr=2e-5
    decoder_lr=2e-5
    group_step=8
    lr_scale=1e-6
    reinit_layers=0 # https://openreview.net/pdf?id=cO1IH43yUF
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=3 # 6 # 8
    fc_dropout=0.6
    loss='bce' # ['bce', 'ce']
    target_size=1 if loss == 'bce' else 2
    max_len=512
    alpha=1
    gamma=2
    smoothing=0.0001
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=False # True
    # ====================================================
    # MLM
    # ====================================================
    mlm=True
    extension='csv'
    pad_to_max_length=True
    max_seq_length=512
    overwrite_cache=True
    mlm_probability=0.15
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# colab
# ====================================================
if CFG.env == 'colab':

    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Not connected to a GPU')
    else:
        print(gpu_info)

    from requests import get
    exp = get('http://172.28.0.2:9000/api/sessions').json()[0]['name'].split('.ipynb')[0]
    INPUT_DIR = './drive/MyDrive/Colab Notebooks/NBME/input/nbme-score-clinical-patient-notes/'
    OUTPUT_DIR = f'./drive/MyDrive/Colab Notebooks/NBME/train/{exp}/'

    from google.colab import drive
    drive.mount('/content/drive')

In [4]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='NBME', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

# Library

In [5]:
def prepare_deverta_v2_v3_tokenizer_fast():
    if CFG.env == 'kaggle':
        transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
        input_dir = Path("../input/deberta-v2-v3-tokenizer-fast")
    elif CFG.env == 'colab':
        transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")
        input_dir = Path("./drive/MyDrive/Colab Notebooks/NBME/input/deberta-v2-v3-tokenizer-fast")
    convert_file = input_dir / "convert_slow_tokenizer.py"
    conversion_path = transformers_path / convert_file.name
    if conversion_path.exists():
        conversion_path.unlink()
    shutil.copy(convert_file, transformers_path)
    deberta_v2_path = transformers_path / "models" / "deberta_v2"
    for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py', "deberta__init__.py"]:
        if str(filename).startswith("deberta"):
            filepath = deberta_v2_path / str(filename).replace("deberta", "")
        else:
            filepath = deberta_v2_path / filename
        if filepath.exists():
            filepath.unlink()
        shutil.copy(input_dir / filename, filepath)

In [6]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')

if CFG.env == 'colab':
    os.system('pip install transformers==4.16.2')
    os.system('pip install tokenizers==0.11.0')
    os.system('pip install sentencepiece==0.1.96')
elif CFG.env == 'kaggle':
    os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
    os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels tokenizers')
os.system('pip install datasets==1.18.3')
prepare_deverta_v2_v3_tokenizer_fast()

import datasets
import tokenizers
import transformers
print(f"datasets.__version__: {datasets.__version__}")
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from datasets import load_dataset
from transformers import AutoModelForMaskedLM
from transformers.modeling_outputs import MaskedLMOutput
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.models.deberta_v2 import DebertaV2TokenizerFast
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.9.1


Found existing installation: transformers 4.12.5


Uninstalling transformers-4.12.5:


  Successfully uninstalled transformers-4.12.5




Found existing installation: tokenizers 0.10.3
Uninstalling tokenizers-0.10.3:
  Successfully uninstalled tokenizers-0.10.3




Looking in links: ../input/nbme-pip-wheels
Processing /kaggle/input/nbme-pip-wheels/transformers-4.16.2-py3-none-any.whl


Processing /kaggle/input/nbme-pip-wheels/tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl


Installing collected packages: tokenizers, transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.8.0 requires transformers<4.13,>=4.1, but you have transformers 4.16.2 which is incompatible.


Successfully installed tokenizers-0.11.4 transformers-4.16.2


Looking in links: ../input/nbme-pip-wheels




Collecting datasets==1.18.3


  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)






Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 1.18.0
    Uninstalling datasets-1.18.0:


      Successfully uninstalled datasets-1.18.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.8.0 requires transformers<4.13,>=4.1, but you have transformers 4.16.2 which is incompatible.


Successfully installed datasets-1.18.3


datasets.__version__: 1.18.3
tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2


env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [7]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [8]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def compute_best_th(truths, char_probs):
    best_th = 0.5
    best_score = 0.
    for th in [0.5]:
        th = np.round(th, 2)
        results = get_results(char_probs, th=th)
        preds = get_predictions(results)
        score = get_score(truths, preds)
        if best_score < score:
            best_th = th
            best_score = score
    return best_th, best_score

# Utils

In [9]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [10]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(INPUT_DIR+'train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv(INPUT_DIR+'features.csv')
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv(INPUT_DIR+'patient_notes.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [11]:
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [12]:
# incorrect annotation
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [13]:
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

# CV split

In [14]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    3575
1    3575
2    3575
3    3575
dtype: int64

In [15]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [16]:
# ====================================================
# CV split for mlm
# ====================================================
if CFG.mlm:
        
    pn_tmp = patient_notes[~patient_notes['pn_num'].isin(train['pn_num'].unique())].reset_index(drop=True)
    Fold = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
    for n, (train_index, val_index) in enumerate(Fold.split(pn_tmp, pn_tmp['pn_history'])):
        pn_tmp.loc[val_index, 'fold'] = int(n)
    pn_tmp['fold'] = pn_tmp['fold'].astype(int)

    fold_dict = dict(train.groupby('pn_num').agg({'fold': 'mean'}).reset_index()[['pn_num', 'fold']].values)
    fold_dict.update(dict(pn_tmp[['pn_num', 'fold']].values))

    patient_notes['fold'] = patient_notes['pn_num'].map(fold_dict)
    patient_notes['fold'] = patient_notes['fold'].fillna(-1)
    patient_notes['fold'] = patient_notes['fold'].astype(int)
    display(patient_notes['fold'].value_counts())
    
    if CFG.debug:
        display(patient_notes.groupby('fold').size())
        patient_notes = patient_notes.sample(n=1000, random_state=0).reset_index(drop=True)
        display(patient_notes.groupby('fold').size())
        
    for fold in range(CFG.n_fold):
        trn_mlm_data = patient_notes[patient_notes['fold'] != fold][['pn_history']].reset_index(drop=True)
        val_mlm_data = patient_notes[patient_notes['fold'] == fold][['pn_history']].reset_index(drop=True)
        trn_mlm_data = trn_mlm_data.rename(columns={'pn_history': 'text'})
        val_mlm_data = val_mlm_data.rename(columns={'pn_history': 'text'})
        trn_csv_name = f'trn_mlm_data_fold{fold}.csv'
        val_csv_name = f'val_mlm_data_fold{fold}.csv'
        trn_mlm_data.to_csv(OUTPUT_DIR+trn_csv_name, index=False)
        val_mlm_data.to_csv(OUTPUT_DIR+val_csv_name, index=False)
        print(f"Saved train: {trn_csv_name}  valid: {val_csv_name}")

1    10537
0    10537
3    10536
2    10536
Name: fold, dtype: int64

Saved train: trn_mlm_data_fold0.csv  valid: val_mlm_data_fold0.csv


Saved train: trn_mlm_data_fold1.csv  valid: val_mlm_data_fold1.csv


Saved train: trn_mlm_data_fold2.csv  valid: val_mlm_data_fold2.csv


Saved train: trn_mlm_data_fold3.csv  valid: val_mlm_data_fold3.csv


# tokenizer

In [17]:
# ====================================================
# tokenizer
# ====================================================
if CFG.model.find("deberta-v2") >= 0 or CFG.model.find("deberta-v3") >= 0:
    tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model, add_prefix_space=False, trim_offsets=False)
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.model, add_prefix_space=False, trim_offsets=False)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [18]:
# ====================================================
# Define max_len
# ====================================================
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
CFG.max_seq_length = max(pn_history_lengths) + 3 # cls & sep & sep # mlm
LOGGER.info(f"max_len: {CFG.max_len}")
LOGGER.info(f"max_seq_length: {CFG.max_seq_length}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 323


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28


max_len: 354


max_seq_length: 326


In [19]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=cfg.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return label


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label = create_label(self.cfg,
                             self.pn_historys[item], 
                             self.annotation_lengths[item], 
                             self.locations[item])
        if self.cfg.loss == 'bce':
            return inputs, torch.tensor(label, dtype=torch.float)
        elif self.cfg.loss == 'ce':
            return inputs, torch.tensor(label, dtype=torch.long)

In [20]:
"""
train_dataset = TrainDataset(CFG, train)
inputs, label = train_dataset[0]
print(inputs)
print(label)
"""

'\ntrain_dataset = TrainDataset(CFG, train)\ninputs, label = train_dataset[0]\nprint(inputs)\nprint(label)\n'

# Model

In [21]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if cfg.reinit_layers != 0:
            reinit_encoder_layer = AutoModel.from_config(self.config).encoder.layer
            for i in np.arange(self.config.num_hidden_layers)[cfg.reinit_layers:]:
                LOGGER.info(f"reinit encoder layer #{i}")
                self.model.encoder.layer[i] = reinit_encoder_layer[i]
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [22]:
class CustomModelForMaskedLM(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if cfg.reinit_layers != 0:
            reinit_encoder_layer = AutoModel.from_config(self.config).encoder.layer
            for i in np.arange(self.config.num_hidden_layers)[cfg.reinit_layers:]:
                LOGGER.info(f"reinit encoder layer #{i}")
                self.model.encoder.layer[i] = reinit_encoder_layer[i]
        self.cls = AutoModelForMaskedLM.from_pretrained(cfg.model, config=self.config).cls

    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)
        
        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
        
        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Loss

In [23]:
class FocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super().__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * bce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss


class SmoothFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super().__init__()
        self.reduction = reduction
        self.focal_loss = FocalLoss(reduction='none', alpha=alpha, gamma=gamma)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothFocalLoss._smooth(targets, self.smoothing)
        loss = self.focal_loss(inputs, targets)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class CEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super(CEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class SmoothCEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super(SmoothCEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma
        self.smoothing = smoothing

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', label_smoothing=self.smoothing) # torch >= 1.10.0
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [24]:
"""
train_dataset = TrainDataset(CFG, train)
train_loader = DataLoader(train_dataset,
                          batch_size=2,
                          shuffle=True,
                          num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
model = CustomModel(CFG, config_path=None, pretrained=False)
model.to(device)
if CFG.loss == 'bce':
    criterion = SmoothFocalLoss(reduction='mean', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)
elif CFG.loss == 'ce':
    criterion = CEFocalLoss(reduction='mean', alpha=CFG.alpha, gamma=CFG.gamma)
for step, (inputs, labels) in enumerate(train_loader):
    for k, v in inputs.items():
        inputs[k] = v.to(device)
    labels = labels.to(device)
    batch_size = labels.size(0)
    y_preds = model(inputs)
    if CFG.loss == 'bce':
        loss_mask = labels.view(-1, 1) != -1
        loss = criterion(y_preds.view(-1, 1)[loss_mask], labels.view(-1, 1)[loss_mask])
    elif CFG.loss == 'ce':
        loss_mask = labels.view(-1) != -1
        loss = criterion(y_preds.view(-1, CFG.target_size)[loss_mask], labels.view(-1)[loss_mask])
    break
"""

"\ntrain_dataset = TrainDataset(CFG, train)\ntrain_loader = DataLoader(train_dataset,\n                          batch_size=2,\n                          shuffle=True,\n                          num_workers=CFG.num_workers, pin_memory=True, drop_last=True)\nmodel = CustomModel(CFG, config_path=None, pretrained=False)\nmodel.to(device)\nif CFG.loss == 'bce':\n    criterion = SmoothFocalLoss(reduction='mean', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)\nelif CFG.loss == 'ce':\n    criterion = CEFocalLoss(reduction='mean', alpha=CFG.alpha, gamma=CFG.gamma)\nfor step, (inputs, labels) in enumerate(train_loader):\n    for k, v in inputs.items():\n        inputs[k] = v.to(device)\n    labels = labels.to(device)\n    batch_size = labels.size(0)\n    y_preds = model(inputs)\n    if CFG.loss == 'bce':\n        loss_mask = labels.view(-1, 1) != -1\n        loss = criterion(y_preds.view(-1, 1)[loss_mask], labels.view(-1, 1)[loss_mask])\n    elif CFG.loss == 'ce':\n        loss_mask

# Helpler functions

In [25]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        if CFG.loss == 'bce':
            loss_mask = labels.view(-1, 1) != -1
            loss = criterion(y_preds.view(-1, 1)[loss_mask], labels.view(-1, 1)[loss_mask])
        elif CFG.loss == 'ce':
            loss_mask = labels.view(-1) != -1
            loss = criterion(y_preds.view(-1, CFG.target_size)[loss_mask], labels.view(-1)[loss_mask])
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        if CFG.loss == 'bce':
            loss_mask = labels.view(-1, 1) != -1
            loss = criterion(y_preds.view(-1, 1)[loss_mask], labels.view(-1, 1)[loss_mask])
        elif CFG.loss == 'ce':
            loss_mask = labels.view(-1) != -1
            loss = criterion(y_preds.view(-1, CFG.target_size)[loss_mask], labels.view(-1)[loss_mask])
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        if CFG.loss == 'bce':
            preds.append(y_preds.sigmoid().to('cpu').numpy())
        elif CFG.loss == 'ce':
            preds.append(y_preds.softmax(2).to('cpu').numpy()[:,:,1])
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        if CFG.loss == 'bce':
            preds.append(y_preds.sigmoid().to('cpu').numpy())
        elif CFG.loss == 'ce':
            preds.append(y_preds.softmax(2).to('cpu').numpy()[:,:,1])
    predictions = np.concatenate(preds)
    return predictions

In [26]:
def train_mlm_fn(fold, train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, inputs in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        batch_size = inputs['labels'].size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            outputs = model(**inputs)
        loss = outputs.loss
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_mlm_fn(valid_loader, model, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, inputs in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        batch_size = inputs['labels'].size(0)
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs.loss
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    return losses.avg

In [27]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, group_step=1, lr_scale=1.0, weight_decay=0.0):
        num_hidden_layers = model.config.num_hidden_layers
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        groups = np.array([f'layer.{i}.' for i in range(num_hidden_layers)]).reshape(-1, group_step).tolist()
        optimizer_parameters = []
        for i, group in enumerate(groups):
            lr_factor = (num_hidden_layers - (i * lr_scale)) / num_hidden_layers
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': weight_decay,
                 'lr': encoder_lr * lr_factor}
            )
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': 0.0,
                 'lr': encoder_lr * lr_factor}
            )
        optimizer_parameters.append(
            {'params': [p for n, p in model.named_parameters() if "model" not in n], 'weight_decay': 0.0,
             'lr': decoder_lr, "momentum" : 0.99}
        )
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                group_step=CFG.group_step,
                                                lr_scale=CFG.lr_scale,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    if CFG.loss == 'bce':
        criterion = SmoothFocalLoss(reduction='mean', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)
    elif CFG.loss == 'ce':
        criterion = CEFocalLoss(reduction='mean', alpha=CFG.alpha, gamma=CFG.gamma)
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        th, score = compute_best_th(valid_labels, char_probs)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  th: {th}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [28]:
# ====================================================
# train mlm loop
# ====================================================
def train_mlm_loop(fold):

    # ====================================================
    # loader
    # ====================================================
    args = CFG()
    args.train_file = f"trn_mlm_data_fold{fold}.csv"
    args.validation_file = f"val_mlm_data_fold{fold}.csv"
    data_files = {'train': OUTPUT_DIR+args.train_file, 'validation': OUTPUT_DIR+args.validation_file}
    raw_datasets = load_dataset(args.extension, data_files=data_files)
    LOGGER.info(f'raw_datasets: {raw_datasets}')
    
    max_seq_length = args.max_seq_length
    LOGGER.info(f"max_seq_length: {max_seq_length}")

    def tokenize_function(examples):
        return tokenizer(examples["text"], return_special_tokens_mask=True)

    tokenized_datasets = raw_datasets.map(tokenize_function,
                                          batched=True,
                                          num_proc=args.num_workers,
                                          remove_columns=["text"],
                                          load_from_cache_file=not args.overwrite_cache)
    LOGGER.info(f"tokenized_datasets: {tokenized_datasets}")
    
    def group_texts(examples):
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        total_length = (total_length // max_seq_length) * max_seq_length
        result = {
            k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
            for k, t in concatenated_examples.items()
        }
        return result

    tokenized_datasets = tokenized_datasets.map(group_texts,
                                                batched=True,
                                                num_proc=args.num_workers,
                                                load_from_cache_file=not args.overwrite_cache)
    train_dataset = tokenized_datasets["train"]
    valid_dataset = tokenized_datasets["validation"]
    LOGGER.info(f"train_dataset: {train_dataset}  valid_dataset: {valid_dataset}")

    data_collator = DataCollatorForLanguageModeling(tokenizer=args.tokenizer,
                                                    mlm_probability=args.mlm_probability)
    train_loader = DataLoader(train_dataset,
                              shuffle=True,
                              collate_fn=data_collator,
                              batch_size=args.batch_size)
    valid_loader = DataLoader(valid_dataset,
                              shuffle=False,
                              collate_fn=data_collator,
                              batch_size=args.batch_size)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModelForMaskedLM(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, group_step=1, lr_scale=1.0, weight_decay=0.0):
        num_hidden_layers = model.config.num_hidden_layers
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        groups = np.array([f'layer.{i}.' for i in range(num_hidden_layers)]).reshape(-1, group_step).tolist()
        optimizer_parameters = []
        for i, group in enumerate(groups):
            lr_factor = (num_hidden_layers - (i * lr_scale)) / num_hidden_layers
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': weight_decay,
                 'lr': encoder_lr * lr_factor}
            )
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': 0.0,
                 'lr': encoder_lr * lr_factor}
            )
        optimizer_parameters.append(
            {'params': [p for n, p in model.named_parameters() if "model" not in n], 'weight_decay': 0.0,
             'lr': decoder_lr, "momentum" : 0.99}
        )
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                group_step=CFG.group_step,
                                                lr_scale=CFG.lr_scale,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_mlm_fn(fold, train_loader, model, optimizer, epoch, scheduler, device)
        
        # eval
        avg_val_loss = valid_mlm_fn(valid_loader, model, device)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss})
        
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save({'model': model.state_dict()},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_mlm_fold{fold}_best.pth")

    torch.cuda.empty_cache()
    gc.collect()

In [29]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        th, score = compute_best_th(labels, char_probs)
        LOGGER.info(f'Score: {score:<.4f}  th: {th}')
        
    if CFG.mlm:
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                train_mlm_loop(fold)
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e3c5456beb82e398/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e3c5456beb82e398/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

raw_datasets: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 31609
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 10537
    })
})


max_seq_length: 326


tokenized_datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 31609
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 10537
    })
})


train_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 18400
})  valid_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 6144
})


Downloading:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/6134] Elapsed 0m 1s (remain 162m 33s) Loss: 12.1030(12.1030) Grad: inf  LR: 0.00002000  


Epoch: [1][100/6134] Elapsed 1m 16s (remain 76m 9s) Loss: 6.0849(7.5446) Grad: 98497.7734  LR: 0.00002000  


Epoch: [1][200/6134] Elapsed 2m 31s (remain 74m 31s) Loss: 4.6871(6.5824) Grad: 89444.6016  LR: 0.00002000  


Epoch: [1][300/6134] Elapsed 3m 46s (remain 73m 8s) Loss: 4.3961(5.9815) Grad: 86750.0156  LR: 0.00001999  


Epoch: [1][400/6134] Elapsed 5m 1s (remain 71m 50s) Loss: 4.4440(5.5788) Grad: 87827.3438  LR: 0.00001999  


Epoch: [1][500/6134] Elapsed 6m 16s (remain 70m 34s) Loss: 4.3342(5.2644) Grad: 89661.5312  LR: 0.00001998  


Epoch: [1][600/6134] Elapsed 7m 31s (remain 69m 19s) Loss: 3.3797(5.0130) Grad: 74946.6562  LR: 0.00001997  


Epoch: [1][700/6134] Elapsed 8m 46s (remain 68m 4s) Loss: 3.0219(4.7995) Grad: 97519.9062  LR: 0.00001996  


Epoch: [1][800/6134] Elapsed 10m 2s (remain 66m 49s) Loss: 3.4496(4.6162) Grad: 85672.5000  LR: 0.00001995  


Epoch: [1][900/6134] Elapsed 11m 17s (remain 65m 33s) Loss: 3.5967(4.4553) Grad: 86688.9688  LR: 0.00001993  


Epoch: [1][1000/6134] Elapsed 12m 32s (remain 64m 18s) Loss: 3.2843(4.3225) Grad: 83461.9766  LR: 0.00001992  


Epoch: [1][1100/6134] Elapsed 13m 47s (remain 63m 3s) Loss: 2.8708(4.2016) Grad: 88984.3828  LR: 0.00001990  


Epoch: [1][1200/6134] Elapsed 15m 2s (remain 61m 48s) Loss: 2.5271(4.0843) Grad: 86619.2422  LR: 0.00001988  


Epoch: [1][1300/6134] Elapsed 16m 18s (remain 60m 33s) Loss: 2.6245(3.9841) Grad: 82258.7188  LR: 0.00001986  


Epoch: [1][1400/6134] Elapsed 17m 33s (remain 59m 18s) Loss: 4.0668(3.8941) Grad: 89671.9141  LR: 0.00001984  


Epoch: [1][1500/6134] Elapsed 18m 48s (remain 58m 2s) Loss: 2.9969(3.8127) Grad: 80943.1719  LR: 0.00001982  


Epoch: [1][1600/6134] Elapsed 20m 3s (remain 56m 47s) Loss: 2.6325(3.7424) Grad: 78426.4141  LR: 0.00001979  


Epoch: [1][1700/6134] Elapsed 21m 18s (remain 55m 32s) Loss: 2.7650(3.6705) Grad: 77121.9531  LR: 0.00001976  


Epoch: [1][1800/6134] Elapsed 22m 34s (remain 54m 17s) Loss: 2.1763(3.6036) Grad: 75520.2734  LR: 0.00001974  


Epoch: [1][1900/6134] Elapsed 23m 49s (remain 53m 2s) Loss: 2.7821(3.5455) Grad: 85783.1406  LR: 0.00001971  


Epoch: [1][2000/6134] Elapsed 25m 4s (remain 51m 47s) Loss: 2.9412(3.4865) Grad: 83049.8516  LR: 0.00001967  


Epoch: [1][2100/6134] Elapsed 26m 19s (remain 50m 32s) Loss: 2.6254(3.4320) Grad: 164653.5938  LR: 0.00001964  


Epoch: [1][2200/6134] Elapsed 27m 34s (remain 49m 16s) Loss: 1.6417(3.3789) Grad: 124081.7031  LR: 0.00001961  


Epoch: [1][2300/6134] Elapsed 28m 49s (remain 48m 1s) Loss: 2.4510(3.3283) Grad: 170202.8594  LR: 0.00001957  


Epoch: [1][2400/6134] Elapsed 30m 5s (remain 46m 46s) Loss: 2.0994(3.2832) Grad: 147811.8281  LR: 0.00001953  


Epoch: [1][2500/6134] Elapsed 31m 20s (remain 45m 31s) Loss: 2.2294(3.2415) Grad: 158947.3906  LR: 0.00001949  


Epoch: [1][2600/6134] Elapsed 32m 35s (remain 44m 16s) Loss: 1.8312(3.2020) Grad: 148878.0312  LR: 0.00001945  


Epoch: [1][2700/6134] Elapsed 33m 50s (remain 43m 1s) Loss: 2.7409(3.1654) Grad: 154074.8125  LR: 0.00001941  


Epoch: [1][2800/6134] Elapsed 35m 6s (remain 41m 46s) Loss: 1.4201(3.1303) Grad: 129796.5234  LR: 0.00001936  


Epoch: [1][2900/6134] Elapsed 36m 21s (remain 40m 30s) Loss: 2.1872(3.0958) Grad: 178062.4688  LR: 0.00001932  


Epoch: [1][3000/6134] Elapsed 37m 36s (remain 39m 15s) Loss: 3.4589(3.0636) Grad: 217241.9062  LR: 0.00001927  


Epoch: [1][3100/6134] Elapsed 38m 51s (remain 38m 0s) Loss: 2.1778(3.0326) Grad: 164694.6875  LR: 0.00001922  


Epoch: [1][3200/6134] Elapsed 40m 6s (remain 36m 45s) Loss: 2.4850(3.0035) Grad: 152818.5625  LR: 0.00001917  


Epoch: [1][3300/6134] Elapsed 41m 21s (remain 35m 30s) Loss: 2.2632(2.9742) Grad: 177097.2344  LR: 0.00001912  


Epoch: [1][3400/6134] Elapsed 42m 37s (remain 34m 14s) Loss: 2.6429(2.9475) Grad: 164308.4062  LR: 0.00001907  


Epoch: [1][3500/6134] Elapsed 43m 52s (remain 32m 59s) Loss: 2.0351(2.9212) Grad: 165197.3438  LR: 0.00001901  


Epoch: [1][3600/6134] Elapsed 45m 7s (remain 31m 44s) Loss: 2.1423(2.8967) Grad: 154795.6875  LR: 0.00001896  


Epoch: [1][3700/6134] Elapsed 46m 22s (remain 30m 29s) Loss: 2.2337(2.8729) Grad: 161040.0781  LR: 0.00001890  


Epoch: [1][3800/6134] Elapsed 47m 37s (remain 29m 14s) Loss: 2.0655(2.8488) Grad: 145945.0156  LR: 0.00001884  


Epoch: [1][3900/6134] Elapsed 48m 53s (remain 27m 58s) Loss: 1.8768(2.8284) Grad: 145134.5312  LR: 0.00001878  


Epoch: [1][4000/6134] Elapsed 50m 8s (remain 26m 43s) Loss: 2.0665(2.8074) Grad: 175783.2812  LR: 0.00001872  


Epoch: [1][4100/6134] Elapsed 51m 23s (remain 25m 28s) Loss: 1.9330(2.7862) Grad: 252252.6562  LR: 0.00001865  


Epoch: [1][4200/6134] Elapsed 52m 38s (remain 24m 13s) Loss: 1.8661(2.7669) Grad: 284112.5000  LR: 0.00001859  


Epoch: [1][4300/6134] Elapsed 53m 53s (remain 22m 58s) Loss: 1.6465(2.7468) Grad: 279607.9688  LR: 0.00001852  


Epoch: [1][4400/6134] Elapsed 55m 9s (remain 21m 43s) Loss: 1.8168(2.7260) Grad: 308806.0938  LR: 0.00001845  


Epoch: [1][4500/6134] Elapsed 56m 24s (remain 20m 28s) Loss: 1.7843(2.7083) Grad: 247969.4062  LR: 0.00001838  


Epoch: [1][4600/6134] Elapsed 57m 39s (remain 19m 12s) Loss: 1.6917(2.6919) Grad: 273673.2500  LR: 0.00001831  


Epoch: [1][4700/6134] Elapsed 58m 55s (remain 17m 57s) Loss: 1.5948(2.6750) Grad: 255573.9375  LR: 0.00001824  


Epoch: [1][4800/6134] Elapsed 60m 10s (remain 16m 42s) Loss: 1.8378(2.6570) Grad: 293990.1562  LR: 0.00001817  


Epoch: [1][4900/6134] Elapsed 61m 25s (remain 15m 27s) Loss: 1.8392(2.6403) Grad: 293237.1562  LR: 0.00001809  


Epoch: [1][5000/6134] Elapsed 62m 40s (remain 14m 12s) Loss: 2.5946(2.6247) Grad: 323955.5000  LR: 0.00001802  


Epoch: [1][5100/6134] Elapsed 63m 56s (remain 12m 56s) Loss: 1.5843(2.6092) Grad: 303848.8438  LR: 0.00001794  


Epoch: [1][5200/6134] Elapsed 65m 11s (remain 11m 41s) Loss: 1.5627(2.5929) Grad: 245048.3750  LR: 0.00001786  


Epoch: [1][5300/6134] Elapsed 66m 26s (remain 10m 26s) Loss: 1.9240(2.5793) Grad: 283972.4688  LR: 0.00001778  


Epoch: [1][5400/6134] Elapsed 67m 41s (remain 9m 11s) Loss: 1.6893(2.5651) Grad: 324447.3750  LR: 0.00001770  


Epoch: [1][5500/6134] Elapsed 68m 56s (remain 7m 56s) Loss: 1.9542(2.5518) Grad: 311977.6562  LR: 0.00001762  


Epoch: [1][5600/6134] Elapsed 70m 12s (remain 6m 40s) Loss: 1.5794(2.5384) Grad: 412493.3125  LR: 0.00001754  


Epoch: [1][5700/6134] Elapsed 71m 27s (remain 5m 25s) Loss: 1.5404(2.5247) Grad: 244684.7656  LR: 0.00001745  


Epoch: [1][5800/6134] Elapsed 72m 42s (remain 4m 10s) Loss: 1.8630(2.5119) Grad: 264883.7812  LR: 0.00001737  


Epoch: [1][5900/6134] Elapsed 73m 57s (remain 2m 55s) Loss: 1.7812(2.4989) Grad: 322027.2500  LR: 0.00001728  


Epoch: [1][6000/6134] Elapsed 75m 12s (remain 1m 40s) Loss: 1.8016(2.4867) Grad: 271292.2188  LR: 0.00001719  


Epoch: [1][6100/6134] Elapsed 76m 28s (remain 0m 24s) Loss: 1.2445(2.4743) Grad: 648329.7500  LR: 0.00001710  


Epoch: [1][6133/6134] Elapsed 76m 52s (remain 0m 0s) Loss: 1.6403(2.4704) Grad: 1051674.8750  LR: 0.00001707  


EVAL: [0/2048] Elapsed 0m 0s (remain 7m 32s) Loss: 1.9806(1.9806) 


EVAL: [100/2048] Elapsed 0m 22s (remain 7m 5s) Loss: 1.2799(1.7801) 


EVAL: [200/2048] Elapsed 0m 43s (remain 6m 43s) Loss: 2.0271(1.7635) 


EVAL: [300/2048] Elapsed 1m 5s (remain 6m 22s) Loss: 1.5012(1.7342) 


EVAL: [400/2048] Elapsed 1m 27s (remain 6m 0s) Loss: 1.1810(1.6707) 


EVAL: [500/2048] Elapsed 1m 49s (remain 5m 39s) Loss: 1.2565(1.6321) 


EVAL: [600/2048] Elapsed 2m 11s (remain 5m 17s) Loss: 1.6081(1.6114) 


EVAL: [700/2048] Elapsed 2m 33s (remain 4m 55s) Loss: 1.4283(1.5966) 


EVAL: [800/2048] Elapsed 2m 55s (remain 4m 33s) Loss: 1.7422(1.6072) 


EVAL: [900/2048] Elapsed 3m 17s (remain 4m 11s) Loss: 2.1983(1.6271) 


EVAL: [1000/2048] Elapsed 3m 39s (remain 3m 49s) Loss: 1.0350(1.6321) 


EVAL: [1100/2048] Elapsed 4m 1s (remain 3m 27s) Loss: 1.8871(1.6282) 


EVAL: [1200/2048] Elapsed 4m 23s (remain 3m 6s) Loss: 1.8007(1.6206) 


EVAL: [1300/2048] Elapsed 4m 45s (remain 2m 43s) Loss: 1.2971(1.6111) 


EVAL: [1400/2048] Elapsed 5m 7s (remain 2m 22s) Loss: 1.8339(1.6232) 


EVAL: [1500/2048] Elapsed 5m 29s (remain 2m 0s) Loss: 1.9466(1.6283) 


EVAL: [1600/2048] Elapsed 5m 51s (remain 1m 38s) Loss: 2.1023(1.6307) 


EVAL: [1700/2048] Elapsed 6m 13s (remain 1m 16s) Loss: 1.9077(1.6369) 


EVAL: [1800/2048] Elapsed 6m 34s (remain 0m 54s) Loss: 1.0786(1.6392) 


EVAL: [1900/2048] Elapsed 6m 56s (remain 0m 32s) Loss: 1.8079(1.6421) 


EVAL: [2000/2048] Elapsed 7m 18s (remain 0m 10s) Loss: 1.9751(1.6427) 


Epoch 1 - avg_train_loss: 2.4704  avg_val_loss: 1.6433  time: 5062s


Epoch 1 - Save Best Loss: 1.6433 Model


EVAL: [2047/2048] Elapsed 7m 28s (remain 0m 0s) Loss: 1.4145(1.6433) 


Epoch: [2][0/6134] Elapsed 0m 0s (remain 75m 56s) Loss: 2.0458(2.0458) Grad: 686903.1250  LR: 0.00001707  


Epoch: [2][100/6134] Elapsed 1m 16s (remain 75m 40s) Loss: 1.7852(1.7849) Grad: 611870.1875  LR: 0.00001698  


Epoch: [2][200/6134] Elapsed 2m 31s (remain 74m 24s) Loss: 1.7019(1.7890) Grad: 537325.1250  LR: 0.00001689  


Epoch: [2][300/6134] Elapsed 3m 46s (remain 73m 9s) Loss: 2.0049(1.7495) Grad: 698338.3750  LR: 0.00001679  


Epoch: [2][400/6134] Elapsed 5m 1s (remain 71m 53s) Loss: 1.2356(1.7345) Grad: 512673.0625  LR: 0.00001670  


Epoch: [2][500/6134] Elapsed 6m 16s (remain 70m 38s) Loss: 1.8272(1.7398) Grad: 644502.0000  LR: 0.00001660  


Epoch: [2][600/6134] Elapsed 7m 32s (remain 69m 24s) Loss: 1.6303(1.7369) Grad: 569030.7500  LR: 0.00001651  


Epoch: [2][700/6134] Elapsed 8m 47s (remain 68m 8s) Loss: 1.4248(1.7306) Grad: 603685.1875  LR: 0.00001641  


Epoch: [2][800/6134] Elapsed 10m 2s (remain 66m 52s) Loss: 1.8645(1.7321) Grad: 660521.3750  LR: 0.00001631  


Epoch: [2][900/6134] Elapsed 11m 17s (remain 65m 37s) Loss: 2.2336(1.7236) Grad: 656508.2500  LR: 0.00001621  


Epoch: [2][1000/6134] Elapsed 12m 33s (remain 64m 22s) Loss: 1.9163(1.7234) Grad: 638413.2500  LR: 0.00001611  


Epoch: [2][1100/6134] Elapsed 13m 48s (remain 63m 6s) Loss: 1.7113(1.7199) Grad: 534549.3125  LR: 0.00001601  


Epoch: [2][1200/6134] Elapsed 15m 3s (remain 61m 51s) Loss: 1.3381(1.7159) Grad: 615330.8750  LR: 0.00001590  


Epoch: [2][1300/6134] Elapsed 16m 18s (remain 60m 36s) Loss: 1.5442(1.7113) Grad: 633361.7500  LR: 0.00001580  


Epoch: [2][1400/6134] Elapsed 17m 34s (remain 59m 21s) Loss: 1.7764(1.7084) Grad: 655568.0625  LR: 0.00001569  


Epoch: [2][1500/6134] Elapsed 18m 49s (remain 58m 5s) Loss: 1.4850(1.7113) Grad: 548487.8125  LR: 0.00001559  


Epoch: [2][1600/6134] Elapsed 20m 4s (remain 56m 50s) Loss: 1.3883(1.7092) Grad: 590423.4375  LR: 0.00001548  


Epoch: [2][1700/6134] Elapsed 21m 19s (remain 55m 35s) Loss: 1.6523(1.7040) Grad: 626890.7500  LR: 0.00001538  


Epoch: [2][1800/6134] Elapsed 22m 35s (remain 54m 20s) Loss: 1.3814(1.7018) Grad: 564758.0000  LR: 0.00001527  


Epoch: [2][1900/6134] Elapsed 23m 50s (remain 53m 5s) Loss: 1.5278(1.6978) Grad: 606434.1250  LR: 0.00001516  


Epoch: [2][2000/6134] Elapsed 25m 5s (remain 51m 50s) Loss: 1.9134(1.6978) Grad: 1205242.1250  LR: 0.00001505  


Epoch: [2][2100/6134] Elapsed 26m 21s (remain 50m 34s) Loss: 1.6810(1.6939) Grad: 670734.3125  LR: 0.00001494  


Epoch: [2][2200/6134] Elapsed 27m 36s (remain 49m 19s) Loss: 2.0539(1.6932) Grad: 612712.6250  LR: 0.00001482  


Epoch: [2][2300/6134] Elapsed 28m 51s (remain 48m 4s) Loss: 1.6979(1.6909) Grad: 605305.8125  LR: 0.00001471  


Epoch: [2][2400/6134] Elapsed 30m 6s (remain 46m 49s) Loss: 1.4902(1.6864) Grad: 587313.3750  LR: 0.00001460  


Epoch: [2][2500/6134] Elapsed 31m 22s (remain 45m 34s) Loss: 1.8479(1.6859) Grad: 562072.6250  LR: 0.00001448  


Epoch: [2][2600/6134] Elapsed 32m 37s (remain 44m 18s) Loss: 1.7376(1.6838) Grad: 664448.6250  LR: 0.00001437  


Epoch: [2][2700/6134] Elapsed 33m 52s (remain 43m 3s) Loss: 1.5037(1.6801) Grad: 600584.0000  LR: 0.00001425  


Epoch: [2][2800/6134] Elapsed 35m 7s (remain 41m 48s) Loss: 1.5908(1.6785) Grad: 595912.8750  LR: 0.00001414  


Epoch: [2][2900/6134] Elapsed 36m 22s (remain 40m 32s) Loss: 2.1278(1.6788) Grad: 666767.5000  LR: 0.00001402  


Epoch: [2][3000/6134] Elapsed 37m 38s (remain 39m 17s) Loss: 1.7663(1.6774) Grad: 611252.3750  LR: 0.00001390  


Epoch: [2][3100/6134] Elapsed 38m 53s (remain 38m 2s) Loss: 2.1490(1.6752) Grad: 726129.0625  LR: 0.00001379  


Epoch: [2][3200/6134] Elapsed 40m 8s (remain 36m 46s) Loss: 1.2116(1.6727) Grad: 484854.6875  LR: 0.00001367  


Epoch: [2][3300/6134] Elapsed 41m 23s (remain 35m 31s) Loss: 1.2637(1.6689) Grad: 572713.2500  LR: 0.00001355  


Epoch: [2][3400/6134] Elapsed 42m 39s (remain 34m 16s) Loss: 1.5260(1.6665) Grad: 678141.5625  LR: 0.00001343  


Epoch: [2][3500/6134] Elapsed 43m 54s (remain 33m 1s) Loss: 1.6025(1.6649) Grad: 517203.6875  LR: 0.00001331  


Epoch: [2][3600/6134] Elapsed 45m 9s (remain 31m 46s) Loss: 1.7456(1.6637) Grad: 594487.8750  LR: 0.00001319  


Epoch: [2][3700/6134] Elapsed 46m 24s (remain 30m 30s) Loss: 2.1132(1.6659) Grad: nan  LR: 0.00001306  


Epoch: [2][3800/6134] Elapsed 47m 38s (remain 29m 14s) Loss: 1.1253(1.6725) Grad: nan  LR: 0.00001294  


Epoch: [2][3900/6134] Elapsed 48m 53s (remain 27m 58s) Loss: 2.4051(1.6751) Grad: nan  LR: 0.00001282  


Epoch: [2][4000/6134] Elapsed 50m 7s (remain 26m 43s) Loss: 1.7561(1.6766) Grad: nan  LR: 0.00001270  


Epoch: [2][4100/6134] Elapsed 51m 21s (remain 25m 27s) Loss: 1.7812(1.6788) Grad: nan  LR: 0.00001257  


Epoch: [2][4200/6134] Elapsed 52m 36s (remain 24m 12s) Loss: 2.7805(1.6812) Grad: nan  LR: 0.00001245  


Epoch: [2][4300/6134] Elapsed 53m 50s (remain 22m 56s) Loss: 1.3748(1.6813) Grad: nan  LR: 0.00001232  


Epoch: [2][4400/6134] Elapsed 55m 4s (remain 21m 41s) Loss: 2.0939(1.6812) Grad: nan  LR: 0.00001220  


Epoch: [2][4500/6134] Elapsed 56m 19s (remain 20m 25s) Loss: 1.3009(1.6802) Grad: nan  LR: 0.00001207  


Epoch: [2][4600/6134] Elapsed 57m 33s (remain 19m 10s) Loss: 1.9865(1.6807) Grad: nan  LR: 0.00001195  


Epoch: [2][4700/6134] Elapsed 58m 47s (remain 17m 55s) Loss: 1.3725(1.6806) Grad: nan  LR: 0.00001182  


Epoch: [2][4800/6134] Elapsed 60m 2s (remain 16m 40s) Loss: 1.9487(1.6799) Grad: nan  LR: 0.00001170  


Epoch: [2][4900/6134] Elapsed 61m 16s (remain 15m 24s) Loss: 1.8420(1.6785) Grad: nan  LR: 0.00001157  


Epoch: [2][5000/6134] Elapsed 62m 30s (remain 14m 9s) Loss: 1.7819(1.6777) Grad: nan  LR: 0.00001144  


Epoch: [2][5100/6134] Elapsed 63m 45s (remain 12m 54s) Loss: 1.9183(1.6766) Grad: nan  LR: 0.00001132  


Epoch: [2][5200/6134] Elapsed 64m 59s (remain 11m 39s) Loss: 1.2509(1.6758) Grad: nan  LR: 0.00001119  


Epoch: [2][5300/6134] Elapsed 66m 13s (remain 10m 24s) Loss: 1.5313(1.6745) Grad: nan  LR: 0.00001106  


Epoch: [2][5400/6134] Elapsed 67m 28s (remain 9m 9s) Loss: 1.3024(1.6738) Grad: nan  LR: 0.00001094  


Epoch: [2][5500/6134] Elapsed 68m 42s (remain 7m 54s) Loss: 1.8408(1.6725) Grad: nan  LR: 0.00001081  


Epoch: [2][5600/6134] Elapsed 69m 56s (remain 6m 39s) Loss: 1.6275(1.6703) Grad: nan  LR: 0.00001068  


Epoch: [2][5700/6134] Elapsed 71m 10s (remain 5m 24s) Loss: 2.1910(1.6702) Grad: nan  LR: 0.00001055  


Epoch: [2][5800/6134] Elapsed 72m 24s (remain 4m 9s) Loss: 1.2555(1.6672) Grad: nan  LR: 0.00001042  


Epoch: [2][5900/6134] Elapsed 73m 39s (remain 2m 54s) Loss: 1.5734(1.6653) Grad: nan  LR: 0.00001030  


Epoch: [2][6000/6134] Elapsed 74m 53s (remain 1m 39s) Loss: 1.1059(1.6633) Grad: nan  LR: 0.00001017  


Epoch: [2][6100/6134] Elapsed 76m 7s (remain 0m 24s) Loss: 1.5435(1.6606) Grad: nan  LR: 0.00001004  


Epoch: [2][6133/6134] Elapsed 76m 31s (remain 0m 0s) Loss: 1.2287(1.6601) Grad: nan  LR: 0.00001000  


EVAL: [0/2048] Elapsed 0m 0s (remain 7m 26s) Loss: 1.3402(1.3402) 


EVAL: [100/2048] Elapsed 0m 22s (remain 7m 5s) Loss: 1.4173(1.5398) 


EVAL: [200/2048] Elapsed 0m 43s (remain 6m 43s) Loss: 1.8905(1.5339) 


EVAL: [300/2048] Elapsed 1m 5s (remain 6m 21s) Loss: 0.8224(1.5047) 


EVAL: [400/2048] Elapsed 1m 27s (remain 6m 0s) Loss: 1.2976(1.4485) 


EVAL: [500/2048] Elapsed 1m 49s (remain 5m 38s) Loss: 0.9064(1.4201) 


EVAL: [600/2048] Elapsed 2m 11s (remain 5m 16s) Loss: 1.4991(1.4053) 


EVAL: [700/2048] Elapsed 2m 33s (remain 4m 54s) Loss: 1.5466(1.3965) 


EVAL: [800/2048] Elapsed 2m 55s (remain 4m 32s) Loss: 1.4669(1.4046) 


EVAL: [900/2048] Elapsed 3m 17s (remain 4m 10s) Loss: 1.7306(1.4179) 


EVAL: [1000/2048] Elapsed 3m 39s (remain 3m 49s) Loss: 1.2759(1.4264) 


EVAL: [1100/2048] Elapsed 4m 0s (remain 3m 27s) Loss: 1.6157(1.4215) 


EVAL: [1200/2048] Elapsed 4m 22s (remain 3m 5s) Loss: 1.2409(1.4195) 


EVAL: [1300/2048] Elapsed 4m 44s (remain 2m 43s) Loss: 1.2367(1.4138) 


EVAL: [1400/2048] Elapsed 5m 6s (remain 2m 21s) Loss: 1.1488(1.4194) 


EVAL: [1500/2048] Elapsed 5m 28s (remain 1m 59s) Loss: 1.5569(1.4219) 


EVAL: [1600/2048] Elapsed 5m 50s (remain 1m 37s) Loss: 1.7673(1.4227) 


EVAL: [1700/2048] Elapsed 6m 11s (remain 1m 15s) Loss: 1.0298(1.4268) 


EVAL: [1800/2048] Elapsed 6m 33s (remain 0m 54s) Loss: 0.9541(1.4287) 


EVAL: [1900/2048] Elapsed 6m 55s (remain 0m 32s) Loss: 1.6836(1.4295) 


EVAL: [2000/2048] Elapsed 7m 17s (remain 0m 10s) Loss: 1.7309(1.4295) 


Epoch 2 - avg_train_loss: 1.6601  avg_val_loss: 1.4286  time: 5039s


Epoch 2 - Save Best Loss: 1.4286 Model


EVAL: [2047/2048] Elapsed 7m 27s (remain 0m 0s) Loss: 1.0754(1.4286) 


Epoch: [3][0/6134] Elapsed 0m 0s (remain 76m 23s) Loss: 1.1767(1.1767) Grad: nan  LR: 0.00001000  


Epoch: [3][100/6134] Elapsed 1m 15s (remain 74m 50s) Loss: 1.4286(1.4416) Grad: nan  LR: 0.00000987  


Epoch: [3][200/6134] Elapsed 2m 29s (remain 73m 28s) Loss: 1.6102(1.4655) Grad: nan  LR: 0.00000974  


Epoch: [3][300/6134] Elapsed 3m 43s (remain 72m 10s) Loss: 1.8452(1.4723) Grad: nan  LR: 0.00000961  


Epoch: [3][400/6134] Elapsed 4m 57s (remain 70m 55s) Loss: 1.2473(1.4836) Grad: nan  LR: 0.00000948  


Epoch: [3][500/6134] Elapsed 6m 11s (remain 69m 41s) Loss: 1.5073(1.4809) Grad: nan  LR: 0.00000936  


Epoch: [3][600/6134] Elapsed 7m 26s (remain 68m 26s) Loss: 1.4280(1.4855) Grad: nan  LR: 0.00000923  


Epoch: [3][700/6134] Elapsed 8m 40s (remain 67m 11s) Loss: 1.9655(1.4921) Grad: nan  LR: 0.00000910  


Epoch: [3][800/6134] Elapsed 9m 54s (remain 65m 56s) Loss: 1.2264(1.4862) Grad: nan  LR: 0.00000897  


Epoch: [3][900/6134] Elapsed 11m 8s (remain 64m 41s) Loss: 1.8855(1.5137) Grad: nan  LR: 0.00000885  


Epoch: [3][1000/6134] Elapsed 12m 22s (remain 63m 27s) Loss: 1.2899(1.5185) Grad: nan  LR: 0.00000872  


Epoch: [3][1100/6134] Elapsed 13m 36s (remain 62m 13s) Loss: 1.3736(1.5241) Grad: nan  LR: 0.00000859  


Epoch: [3][1200/6134] Elapsed 14m 50s (remain 60m 58s) Loss: 1.2350(1.5270) Grad: nan  LR: 0.00000847  


Epoch: [3][1300/6134] Elapsed 16m 4s (remain 59m 44s) Loss: 1.6288(1.5246) Grad: nan  LR: 0.00000834  


Epoch: [3][1400/6134] Elapsed 17m 18s (remain 58m 29s) Loss: 1.3689(1.5221) Grad: nan  LR: 0.00000821  


Epoch: [3][1500/6134] Elapsed 18m 33s (remain 57m 15s) Loss: 1.9652(1.5208) Grad: nan  LR: 0.00000809  


Epoch: [3][1600/6134] Elapsed 19m 47s (remain 56m 1s) Loss: 1.9202(1.5179) Grad: nan  LR: 0.00000796  


Epoch: [3][1700/6134] Elapsed 21m 1s (remain 54m 47s) Loss: 1.7922(1.5141) Grad: nan  LR: 0.00000784  


Epoch: [3][1800/6134] Elapsed 22m 15s (remain 53m 33s) Loss: 1.5032(1.5095) Grad: nan  LR: 0.00000771  


Epoch: [3][1900/6134] Elapsed 23m 29s (remain 52m 18s) Loss: 1.4602(1.5044) Grad: nan  LR: 0.00000759  


Epoch: [3][2000/6134] Elapsed 24m 43s (remain 51m 4s) Loss: 1.0314(1.5007) Grad: nan  LR: 0.00000746  


Epoch: [3][2100/6134] Elapsed 25m 58s (remain 49m 50s) Loss: 1.6712(1.4975) Grad: nan  LR: 0.00000734  


Epoch: [3][2200/6134] Elapsed 27m 12s (remain 48m 36s) Loss: 1.0515(1.4948) Grad: nan  LR: 0.00000722  


Epoch: [3][2300/6134] Elapsed 28m 26s (remain 47m 22s) Loss: 1.4589(1.4938) Grad: nan  LR: 0.00000709  


Epoch: [3][2400/6134] Elapsed 29m 40s (remain 46m 8s) Loss: 1.1135(1.4926) Grad: nan  LR: 0.00000697  


Epoch: [3][2500/6134] Elapsed 30m 54s (remain 44m 53s) Loss: 1.8739(1.4896) Grad: nan  LR: 0.00000685  


Epoch: [3][2600/6134] Elapsed 32m 8s (remain 43m 40s) Loss: 1.2362(1.4861) Grad: nan  LR: 0.00000673  


Epoch: [3][2700/6134] Elapsed 33m 22s (remain 42m 25s) Loss: 1.7037(1.4843) Grad: nan  LR: 0.00000661  


Epoch: [3][2800/6134] Elapsed 34m 37s (remain 41m 11s) Loss: 1.5348(1.4825) Grad: nan  LR: 0.00000649  


Epoch: [3][2900/6134] Elapsed 35m 51s (remain 39m 57s) Loss: 1.1493(1.4808) Grad: nan  LR: 0.00000637  


Epoch: [3][3000/6134] Elapsed 37m 5s (remain 38m 43s) Loss: 1.4600(1.4798) Grad: nan  LR: 0.00000625  


Epoch: [3][3100/6134] Elapsed 38m 19s (remain 37m 29s) Loss: 1.0953(1.4774) Grad: nan  LR: 0.00000613  


Epoch: [3][3200/6134] Elapsed 39m 33s (remain 36m 14s) Loss: 0.9933(1.4747) Grad: nan  LR: 0.00000601  


Epoch: [3][3300/6134] Elapsed 40m 47s (remain 35m 0s) Loss: 1.6427(1.4723) Grad: nan  LR: 0.00000590  


Epoch: [3][3400/6134] Elapsed 42m 2s (remain 33m 46s) Loss: 1.7655(1.4719) Grad: nan  LR: 0.00000578  


Epoch: [3][3500/6134] Elapsed 43m 16s (remain 32m 32s) Loss: 1.1915(1.4691) Grad: nan  LR: 0.00000566  


Epoch: [3][3600/6134] Elapsed 44m 30s (remain 31m 18s) Loss: 1.7774(1.4668) Grad: nan  LR: 0.00000555  


Epoch: [3][3700/6134] Elapsed 45m 44s (remain 30m 4s) Loss: 1.2285(1.4646) Grad: nan  LR: 0.00000543  


Epoch: [3][3800/6134] Elapsed 46m 58s (remain 28m 50s) Loss: 1.0088(1.4621) Grad: nan  LR: 0.00000532  


Epoch: [3][3900/6134] Elapsed 48m 12s (remain 27m 35s) Loss: 1.9226(1.4595) Grad: nan  LR: 0.00000521  


Epoch: [3][4000/6134] Elapsed 49m 26s (remain 26m 21s) Loss: 1.4941(1.4577) Grad: nan  LR: 0.00000510  


Epoch: [3][4100/6134] Elapsed 50m 41s (remain 25m 7s) Loss: 1.6181(1.4542) Grad: nan  LR: 0.00000498  


Epoch: [3][4200/6134] Elapsed 51m 55s (remain 23m 53s) Loss: 1.3600(1.4517) Grad: nan  LR: 0.00000487  


Epoch: [3][4300/6134] Elapsed 53m 9s (remain 22m 39s) Loss: 1.7671(1.4504) Grad: nan  LR: 0.00000476  


Epoch: [3][4400/6134] Elapsed 54m 23s (remain 21m 25s) Loss: 1.3803(1.4478) Grad: nan  LR: 0.00000466  


Epoch: [3][4500/6134] Elapsed 55m 37s (remain 20m 10s) Loss: 1.7401(1.4457) Grad: nan  LR: 0.00000455  


Epoch: [3][4600/6134] Elapsed 56m 51s (remain 18m 56s) Loss: 1.4934(1.4434) Grad: nan  LR: 0.00000444  


Epoch: [3][4700/6134] Elapsed 58m 6s (remain 17m 42s) Loss: 0.9994(1.4408) Grad: nan  LR: 0.00000434  


Epoch: [3][4800/6134] Elapsed 59m 20s (remain 16m 28s) Loss: 1.5664(1.4381) Grad: nan  LR: 0.00000423  


Epoch: [3][4900/6134] Elapsed 60m 34s (remain 15m 14s) Loss: 0.9921(1.4360) Grad: nan  LR: 0.00000413  


Epoch: [3][5000/6134] Elapsed 61m 48s (remain 14m 0s) Loss: 1.6646(1.4354) Grad: nan  LR: 0.00000402  


Epoch: [3][5100/6134] Elapsed 63m 2s (remain 12m 46s) Loss: 1.0415(1.4334) Grad: nan  LR: 0.00000392  


Epoch: [3][5200/6134] Elapsed 64m 16s (remain 11m 31s) Loss: 1.2155(1.4316) Grad: nan  LR: 0.00000382  


Epoch: [3][5300/6134] Elapsed 65m 31s (remain 10m 17s) Loss: 1.7424(1.4310) Grad: nan  LR: 0.00000372  


Epoch: [3][5400/6134] Elapsed 66m 45s (remain 9m 3s) Loss: 1.1320(1.4291) Grad: nan  LR: 0.00000362  


Epoch: [3][5500/6134] Elapsed 67m 59s (remain 7m 49s) Loss: 1.6346(1.4283) Grad: nan  LR: 0.00000352  


Epoch: [3][5600/6134] Elapsed 69m 13s (remain 6m 35s) Loss: 1.3406(1.4274) Grad: nan  LR: 0.00000343  


Epoch: [3][5700/6134] Elapsed 70m 27s (remain 5m 21s) Loss: 1.4115(1.4254) Grad: nan  LR: 0.00000333  


Epoch: [3][5800/6134] Elapsed 71m 42s (remain 4m 6s) Loss: 0.9265(1.4250) Grad: nan  LR: 0.00000323  


Epoch: [3][5900/6134] Elapsed 72m 56s (remain 2m 52s) Loss: 1.7053(1.4236) Grad: nan  LR: 0.00000314  


Epoch: [3][6000/6134] Elapsed 74m 10s (remain 1m 38s) Loss: 1.3995(1.4211) Grad: nan  LR: 0.00000305  


Epoch: [3][6100/6134] Elapsed 75m 24s (remain 0m 24s) Loss: 1.5248(1.4204) Grad: nan  LR: 0.00000296  


Epoch: [3][6133/6134] Elapsed 75m 49s (remain 0m 0s) Loss: 1.4445(1.4196) Grad: nan  LR: 0.00000293  


EVAL: [0/2048] Elapsed 0m 0s (remain 7m 27s) Loss: 1.7187(1.7187) 


EVAL: [100/2048] Elapsed 0m 22s (remain 7m 6s) Loss: 1.2660(1.3604) 


EVAL: [200/2048] Elapsed 0m 43s (remain 6m 43s) Loss: 1.8246(1.3429) 


EVAL: [300/2048] Elapsed 1m 5s (remain 6m 21s) Loss: 0.8846(1.3298) 


EVAL: [400/2048] Elapsed 1m 27s (remain 6m 0s) Loss: 1.0372(1.2832) 


EVAL: [500/2048] Elapsed 1m 49s (remain 5m 38s) Loss: 1.1508(1.2601) 


EVAL: [600/2048] Elapsed 2m 11s (remain 5m 16s) Loss: 0.9428(1.2480) 


EVAL: [700/2048] Elapsed 2m 33s (remain 4m 54s) Loss: 1.2509(1.2397) 


EVAL: [800/2048] Elapsed 2m 55s (remain 4m 32s) Loss: 1.0796(1.2446) 


EVAL: [900/2048] Elapsed 3m 16s (remain 4m 10s) Loss: 1.4896(1.2544) 


EVAL: [1000/2048] Elapsed 3m 38s (remain 3m 48s) Loss: 1.1176(1.2618) 


EVAL: [1100/2048] Elapsed 4m 0s (remain 3m 27s) Loss: 1.1924(1.2595) 


EVAL: [1200/2048] Elapsed 4m 22s (remain 3m 5s) Loss: 0.9690(1.2579) 


EVAL: [1300/2048] Elapsed 4m 44s (remain 2m 43s) Loss: 0.9798(1.2545) 


EVAL: [1400/2048] Elapsed 5m 6s (remain 2m 21s) Loss: 1.2478(1.2601) 


EVAL: [1500/2048] Elapsed 5m 28s (remain 1m 59s) Loss: 1.2030(1.2628) 


EVAL: [1600/2048] Elapsed 5m 50s (remain 1m 37s) Loss: 1.2594(1.2634) 


EVAL: [1700/2048] Elapsed 6m 11s (remain 1m 15s) Loss: 0.7726(1.2672) 


EVAL: [1800/2048] Elapsed 6m 33s (remain 0m 54s) Loss: 1.0529(1.2696) 


EVAL: [1900/2048] Elapsed 6m 55s (remain 0m 32s) Loss: 1.4114(1.2678) 


EVAL: [2000/2048] Elapsed 7m 17s (remain 0m 10s) Loss: 1.3096(1.2689) 


Epoch 3 - avg_train_loss: 1.4196  avg_val_loss: 1.2701  time: 4997s


Epoch 3 - Save Best Loss: 1.2701 Model


EVAL: [2047/2048] Elapsed 7m 27s (remain 0m 0s) Loss: 0.7776(1.2701) 


Epoch: [4][0/6134] Elapsed 0m 0s (remain 77m 27s) Loss: 1.0879(1.0879) Grad: nan  LR: 0.00000293  


Epoch: [4][100/6134] Elapsed 1m 15s (remain 74m 46s) Loss: 1.5532(1.3265) Grad: nan  LR: 0.00000284  


Epoch: [4][200/6134] Elapsed 2m 29s (remain 73m 29s) Loss: 1.5943(1.3512) Grad: nan  LR: 0.00000275  


Epoch: [4][300/6134] Elapsed 3m 43s (remain 72m 15s) Loss: 1.0763(1.3508) Grad: nan  LR: 0.00000266  


Epoch: [4][400/6134] Elapsed 4m 58s (remain 71m 1s) Loss: 0.6944(1.3331) Grad: nan  LR: 0.00000257  


Epoch: [4][500/6134] Elapsed 6m 12s (remain 69m 46s) Loss: 1.0133(1.3258) Grad: nan  LR: 0.00000249  


Epoch: [4][600/6134] Elapsed 7m 26s (remain 68m 30s) Loss: 1.7044(1.3168) Grad: nan  LR: 0.00000240  


Epoch: [4][700/6134] Elapsed 8m 40s (remain 67m 16s) Loss: 1.2267(1.3200) Grad: nan  LR: 0.00000232  


Epoch: [4][800/6134] Elapsed 9m 55s (remain 66m 1s) Loss: 1.3931(1.3241) Grad: nan  LR: 0.00000224  


Epoch: [4][900/6134] Elapsed 11m 9s (remain 64m 47s) Loss: 1.4222(1.3218) Grad: nan  LR: 0.00000216  


Epoch: [4][1000/6134] Elapsed 12m 23s (remain 63m 34s) Loss: 1.5838(1.3241) Grad: nan  LR: 0.00000208  


Epoch: [4][1100/6134] Elapsed 13m 38s (remain 62m 20s) Loss: 1.5626(1.3297) Grad: nan  LR: 0.00000200  


Epoch: [4][1200/6134] Elapsed 14m 52s (remain 61m 5s) Loss: 1.5620(1.3329) Grad: nan  LR: 0.00000193  


Epoch: [4][1300/6134] Elapsed 16m 6s (remain 59m 50s) Loss: 1.5057(1.3320) Grad: nan  LR: 0.00000185  


Epoch: [4][1400/6134] Elapsed 17m 20s (remain 58m 36s) Loss: 1.5748(1.3311) Grad: nan  LR: 0.00000178  


Epoch: [4][1500/6134] Elapsed 18m 35s (remain 57m 21s) Loss: 0.8999(1.3326) Grad: nan  LR: 0.00000171  


Epoch: [4][1600/6134] Elapsed 19m 49s (remain 56m 7s) Loss: 1.0935(1.3328) Grad: nan  LR: 0.00000164  


Epoch: [4][1700/6134] Elapsed 21m 3s (remain 54m 53s) Loss: 0.9111(1.3315) Grad: nan  LR: 0.00000157  


Epoch: [4][1800/6134] Elapsed 22m 18s (remain 53m 39s) Loss: 0.9029(1.3301) Grad: nan  LR: 0.00000150  


Epoch: [4][1900/6134] Elapsed 23m 32s (remain 52m 25s) Loss: 1.5304(1.3275) Grad: nan  LR: 0.00000143  


Epoch: [4][2000/6134] Elapsed 24m 46s (remain 51m 10s) Loss: 1.9487(1.3282) Grad: nan  LR: 0.00000137  


Epoch: [4][2100/6134] Elapsed 26m 1s (remain 49m 56s) Loss: 0.9750(1.3259) Grad: nan  LR: 0.00000130  


Epoch: [4][2200/6134] Elapsed 27m 15s (remain 48m 42s) Loss: 1.2659(1.3256) Grad: nan  LR: 0.00000124  


Epoch: [4][2300/6134] Elapsed 28m 29s (remain 47m 27s) Loss: 1.4660(1.3261) Grad: nan  LR: 0.00000118  


Epoch: [4][2400/6134] Elapsed 29m 43s (remain 46m 13s) Loss: 1.7419(1.3273) Grad: nan  LR: 0.00000112  


Epoch: [4][2500/6134] Elapsed 30m 57s (remain 44m 58s) Loss: 1.4614(1.3249) Grad: nan  LR: 0.00000106  


Epoch: [4][2600/6134] Elapsed 32m 12s (remain 43m 44s) Loss: 1.1562(1.3249) Grad: nan  LR: 0.00000100  


Epoch: [4][2700/6134] Elapsed 33m 26s (remain 42m 30s) Loss: 1.3970(1.3237) Grad: nan  LR: 0.00000095  


Epoch: [4][2800/6134] Elapsed 34m 40s (remain 41m 15s) Loss: 1.1869(1.3239) Grad: nan  LR: 0.00000090  


Epoch: [4][2900/6134] Elapsed 35m 54s (remain 40m 1s) Loss: 1.2277(1.3231) Grad: nan  LR: 0.00000084  


Epoch: [4][3000/6134] Elapsed 37m 9s (remain 38m 47s) Loss: 1.6832(1.3230) Grad: nan  LR: 0.00000079  


Epoch: [4][3100/6134] Elapsed 38m 23s (remain 37m 33s) Loss: 1.2689(1.3235) Grad: nan  LR: 0.00000074  


Epoch: [4][3200/6134] Elapsed 39m 37s (remain 36m 18s) Loss: 1.6197(1.3229) Grad: nan  LR: 0.00000070  


Epoch: [4][3300/6134] Elapsed 40m 52s (remain 35m 4s) Loss: 1.3449(1.3225) Grad: nan  LR: 0.00000065  


Epoch: [4][3400/6134] Elapsed 42m 6s (remain 33m 50s) Loss: 1.0306(1.3224) Grad: nan  LR: 0.00000060  


Epoch: [4][3500/6134] Elapsed 43m 20s (remain 32m 35s) Loss: 1.1192(1.3216) Grad: nan  LR: 0.00000056  


Epoch: [4][3600/6134] Elapsed 44m 34s (remain 31m 21s) Loss: 1.0709(1.3211) Grad: nan  LR: 0.00000052  


Epoch: [4][3700/6134] Elapsed 45m 49s (remain 30m 7s) Loss: 1.4470(1.3217) Grad: nan  LR: 0.00000048  


Epoch: [4][3800/6134] Elapsed 47m 3s (remain 28m 53s) Loss: 1.4661(1.3194) Grad: nan  LR: 0.00000044  


Epoch: [4][3900/6134] Elapsed 48m 17s (remain 27m 38s) Loss: 1.4637(1.3202) Grad: nan  LR: 0.00000040  


Epoch: [4][4000/6134] Elapsed 49m 32s (remain 26m 24s) Loss: 1.1599(1.3192) Grad: nan  LR: 0.00000037  


Epoch: [4][4100/6134] Elapsed 50m 46s (remain 25m 10s) Loss: 1.3990(1.3184) Grad: nan  LR: 0.00000034  


Epoch: [4][4200/6134] Elapsed 52m 0s (remain 23m 55s) Loss: 1.7513(1.3202) Grad: nan  LR: 0.00000030  


Epoch: [4][4300/6134] Elapsed 53m 14s (remain 22m 41s) Loss: 1.2872(1.3191) Grad: nan  LR: 0.00000027  


Epoch: [4][4400/6134] Elapsed 54m 28s (remain 21m 27s) Loss: 0.8999(1.3180) Grad: nan  LR: 0.00000024  


Epoch: [4][4500/6134] Elapsed 55m 43s (remain 20m 12s) Loss: 1.2677(1.3167) Grad: nan  LR: 0.00000022  


Epoch: [4][4600/6134] Elapsed 56m 57s (remain 18m 58s) Loss: 1.6225(1.3163) Grad: nan  LR: 0.00000019  


Epoch: [4][4700/6134] Elapsed 58m 11s (remain 17m 44s) Loss: 1.1934(1.3170) Grad: nan  LR: 0.00000017  


Epoch: [4][4800/6134] Elapsed 59m 25s (remain 16m 30s) Loss: 1.4623(1.3175) Grad: nan  LR: 0.00000014  


Epoch: [4][4900/6134] Elapsed 60m 40s (remain 15m 15s) Loss: 1.5029(1.3174) Grad: nan  LR: 0.00000012  


Epoch: [4][5000/6134] Elapsed 61m 54s (remain 14m 1s) Loss: 0.7193(1.3170) Grad: nan  LR: 0.00000010  


Epoch: [4][5100/6134] Elapsed 63m 8s (remain 12m 47s) Loss: 1.2624(1.3162) Grad: nan  LR: 0.00000009  


Epoch: [4][5200/6134] Elapsed 64m 22s (remain 11m 32s) Loss: 1.2747(1.3156) Grad: nan  LR: 0.00000007  


Epoch: [4][5300/6134] Elapsed 65m 36s (remain 10m 18s) Loss: 0.9691(1.3149) Grad: nan  LR: 0.00000006  


Epoch: [4][5400/6134] Elapsed 66m 51s (remain 9m 4s) Loss: 1.1155(1.3159) Grad: nan  LR: 0.00000004  


Epoch: [4][5500/6134] Elapsed 68m 5s (remain 7m 50s) Loss: 1.2724(1.3150) Grad: nan  LR: 0.00000003  


Epoch: [4][5600/6134] Elapsed 69m 19s (remain 6m 35s) Loss: 1.4751(1.3143) Grad: nan  LR: 0.00000002  


Epoch: [4][5700/6134] Elapsed 70m 33s (remain 5m 21s) Loss: 2.1167(1.3138) Grad: nan  LR: 0.00000002  


Epoch: [4][5800/6134] Elapsed 71m 48s (remain 4m 7s) Loss: 1.2235(1.3130) Grad: nan  LR: 0.00000001  


Epoch: [4][5900/6134] Elapsed 73m 2s (remain 2m 53s) Loss: 1.2809(1.3134) Grad: nan  LR: 0.00000000  


Epoch: [4][6000/6134] Elapsed 74m 16s (remain 1m 38s) Loss: 1.2068(1.3135) Grad: nan  LR: 0.00000000  


Epoch: [4][6100/6134] Elapsed 75m 30s (remain 0m 24s) Loss: 1.3190(1.3130) Grad: nan  LR: 0.00000000  


Epoch: [4][6133/6134] Elapsed 75m 55s (remain 0m 0s) Loss: 1.0698(1.3128) Grad: nan  LR: 0.00000000  


EVAL: [0/2048] Elapsed 0m 0s (remain 7m 29s) Loss: 1.2215(1.2215) 


EVAL: [100/2048] Elapsed 0m 22s (remain 7m 8s) Loss: 1.2735(1.3344) 


EVAL: [200/2048] Elapsed 0m 44s (remain 6m 47s) Loss: 1.5356(1.3195) 


EVAL: [300/2048] Elapsed 1m 6s (remain 6m 23s) Loss: 0.9357(1.2914) 


EVAL: [400/2048] Elapsed 1m 28s (remain 6m 2s) Loss: 1.0281(1.2490) 


EVAL: [500/2048] Elapsed 1m 50s (remain 5m 39s) Loss: 0.9278(1.2260) 


EVAL: [600/2048] Elapsed 2m 11s (remain 5m 17s) Loss: 1.4686(1.2209) 


EVAL: [700/2048] Elapsed 2m 33s (remain 4m 55s) Loss: 1.4898(1.2145) 


EVAL: [800/2048] Elapsed 2m 55s (remain 4m 33s) Loss: 1.3089(1.2213) 


EVAL: [900/2048] Elapsed 3m 17s (remain 4m 11s) Loss: 1.5992(1.2357) 


EVAL: [1000/2048] Elapsed 3m 39s (remain 3m 49s) Loss: 1.2342(1.2450) 


EVAL: [1100/2048] Elapsed 4m 1s (remain 3m 27s) Loss: 1.0764(1.2417) 


EVAL: [1200/2048] Elapsed 4m 23s (remain 3m 5s) Loss: 1.3775(1.2412) 


EVAL: [1300/2048] Elapsed 4m 44s (remain 2m 43s) Loss: 0.9673(1.2358) 


EVAL: [1400/2048] Elapsed 5m 6s (remain 2m 21s) Loss: 0.9305(1.2385) 


EVAL: [1500/2048] Elapsed 5m 28s (remain 1m 59s) Loss: 1.6773(1.2400) 


EVAL: [1600/2048] Elapsed 5m 50s (remain 1m 37s) Loss: 1.4167(1.2428) 


EVAL: [1700/2048] Elapsed 6m 12s (remain 1m 15s) Loss: 1.4013(1.2458) 


EVAL: [1800/2048] Elapsed 6m 34s (remain 0m 54s) Loss: 0.6570(1.2491) 


EVAL: [1900/2048] Elapsed 6m 56s (remain 0m 32s) Loss: 0.9183(1.2494) 


EVAL: [2000/2048] Elapsed 7m 17s (remain 0m 10s) Loss: 1.4324(1.2489) 


Epoch 4 - avg_train_loss: 1.3128  avg_val_loss: 1.2487  time: 5003s


Epoch 4 - Save Best Loss: 1.2487 Model


EVAL: [2047/2048] Elapsed 7m 28s (remain 0m 0s) Loss: 0.8142(1.2487) 


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b92bbe5b9e2da6cd/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b92bbe5b9e2da6cd/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

raw_datasets: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 31609
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 10537
    })
})


max_seq_length: 326


tokenized_datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 31609
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 10537
    })
})


train_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 18429
})  valid_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 6113
})


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/6143] Elapsed 0m 0s (remain 70m 16s) Loss: 12.7627(12.7627) Grad: inf  LR: 0.00002000  


Epoch: [1][100/6143] Elapsed 1m 15s (remain 75m 18s) Loss: 5.6148(7.5288) Grad: 80536.3203  LR: 0.00002000  


Epoch: [1][200/6143] Elapsed 2m 30s (remain 74m 9s) Loss: 4.1739(6.5859) Grad: 105636.6562  LR: 0.00002000  


Epoch: [1][300/6143] Elapsed 3m 45s (remain 72m 58s) Loss: 4.7301(6.0205) Grad: 93860.2578  LR: 0.00001999  


Epoch: [1][400/6143] Elapsed 5m 0s (remain 71m 46s) Loss: 3.9173(5.6194) Grad: 92970.0625  LR: 0.00001999  


Epoch: [1][500/6143] Elapsed 6m 16s (remain 70m 35s) Loss: 4.9749(5.3135) Grad: 100767.5312  LR: 0.00001998  


Epoch: [1][600/6143] Elapsed 7m 31s (remain 69m 21s) Loss: 3.6342(5.0496) Grad: 100914.4922  LR: 0.00001997  


Epoch: [1][700/6143] Elapsed 8m 46s (remain 68m 6s) Loss: 3.2340(4.8580) Grad: 88747.1328  LR: 0.00001996  


Epoch: [1][800/6143] Elapsed 10m 1s (remain 66m 52s) Loss: 3.5790(4.6703) Grad: 87201.6797  LR: 0.00001995  


Epoch: [1][900/6143] Elapsed 11m 16s (remain 65m 37s) Loss: 4.0324(4.5197) Grad: 96874.2891  LR: 0.00001993  


Epoch: [1][1000/6143] Elapsed 12m 32s (remain 64m 22s) Loss: 3.1557(4.3858) Grad: 85757.6016  LR: 0.00001992  


Epoch: [1][1100/6143] Elapsed 13m 47s (remain 63m 8s) Loss: 3.1412(4.2570) Grad: 95330.3828  LR: 0.00001990  


Epoch: [1][1200/6143] Elapsed 15m 2s (remain 61m 53s) Loss: 2.4977(4.1518) Grad: 85630.1250  LR: 0.00001988  


Epoch: [1][1300/6143] Elapsed 16m 17s (remain 60m 38s) Loss: 3.0593(4.0525) Grad: 89698.6953  LR: 0.00001986  


Epoch: [1][1400/6143] Elapsed 17m 32s (remain 59m 23s) Loss: 2.5263(3.9635) Grad: 88893.6562  LR: 0.00001984  


Epoch: [1][1500/6143] Elapsed 18m 48s (remain 58m 8s) Loss: 3.2908(3.8797) Grad: 78408.3828  LR: 0.00001982  


Epoch: [1][1600/6143] Elapsed 20m 3s (remain 56m 53s) Loss: 2.5485(3.8002) Grad: 82308.2812  LR: 0.00001979  


Epoch: [1][1700/6143] Elapsed 21m 18s (remain 55m 38s) Loss: 2.7127(3.7273) Grad: 90180.2812  LR: 0.00001976  


Epoch: [1][1800/6143] Elapsed 22m 33s (remain 54m 23s) Loss: 2.8110(3.6581) Grad: 86073.0625  LR: 0.00001974  


Epoch: [1][1900/6143] Elapsed 23m 48s (remain 53m 8s) Loss: 2.8665(3.6013) Grad: 81026.4141  LR: 0.00001971  


Epoch: [1][2000/6143] Elapsed 25m 3s (remain 51m 52s) Loss: 2.5014(3.5432) Grad: 87304.8203  LR: 0.00001967  


Epoch: [1][2100/6143] Elapsed 26m 19s (remain 50m 38s) Loss: 1.7973(3.4871) Grad: 207260.7344  LR: 0.00001964  


Epoch: [1][2200/6143] Elapsed 27m 34s (remain 49m 22s) Loss: 2.1755(3.4358) Grad: 152185.7500  LR: 0.00001961  


Epoch: [1][2300/6143] Elapsed 28m 49s (remain 48m 7s) Loss: 2.3808(3.3852) Grad: 162724.2812  LR: 0.00001957  


Epoch: [1][2400/6143] Elapsed 30m 4s (remain 46m 52s) Loss: 2.0444(3.3399) Grad: 158575.1250  LR: 0.00001953  


Epoch: [1][2500/6143] Elapsed 31m 19s (remain 45m 37s) Loss: 2.2525(3.2959) Grad: 174459.5000  LR: 0.00001949  


Epoch: [1][2600/6143] Elapsed 32m 35s (remain 44m 22s) Loss: 1.6857(3.2537) Grad: 178526.3750  LR: 0.00001945  


Epoch: [1][2700/6143] Elapsed 33m 50s (remain 43m 7s) Loss: 1.9918(3.2135) Grad: 132671.3281  LR: 0.00001941  


Epoch: [1][2800/6143] Elapsed 35m 5s (remain 41m 52s) Loss: 1.9485(3.1755) Grad: 149165.8750  LR: 0.00001937  


Epoch: [1][2900/6143] Elapsed 36m 21s (remain 40m 37s) Loss: 2.2335(3.1414) Grad: 155860.5469  LR: 0.00001932  


Epoch: [1][3000/6143] Elapsed 37m 36s (remain 39m 22s) Loss: 1.9242(3.1075) Grad: 144684.5312  LR: 0.00001927  


Epoch: [1][3100/6143] Elapsed 38m 51s (remain 38m 7s) Loss: 2.1895(3.0761) Grad: 152296.8594  LR: 0.00001922  


Epoch: [1][3200/6143] Elapsed 40m 6s (remain 36m 52s) Loss: 1.7871(3.0466) Grad: 146268.3281  LR: 0.00001917  


Epoch: [1][3300/6143] Elapsed 41m 22s (remain 35m 36s) Loss: 1.8805(3.0164) Grad: 162118.2969  LR: 0.00001912  


Epoch: [1][3400/6143] Elapsed 42m 37s (remain 34m 21s) Loss: 1.5191(2.9898) Grad: 155254.9219  LR: 0.00001907  


Epoch: [1][3500/6143] Elapsed 43m 52s (remain 33m 6s) Loss: 2.1894(2.9618) Grad: 167484.0938  LR: 0.00001901  


Epoch: [1][3600/6143] Elapsed 45m 7s (remain 31m 51s) Loss: 1.3190(2.9362) Grad: 125616.5547  LR: 0.00001896  


Epoch: [1][3700/6143] Elapsed 46m 23s (remain 30m 36s) Loss: 1.8804(2.9116) Grad: 181213.1250  LR: 0.00001890  


Epoch: [1][3800/6143] Elapsed 47m 38s (remain 29m 21s) Loss: 2.1577(2.8882) Grad: 149366.4375  LR: 0.00001884  


Epoch: [1][3900/6143] Elapsed 48m 53s (remain 28m 5s) Loss: 1.7390(2.8656) Grad: 157697.5000  LR: 0.00001878  


Epoch: [1][4000/6143] Elapsed 50m 8s (remain 26m 50s) Loss: 1.8800(2.8439) Grad: 171742.9062  LR: 0.00001872  


Epoch: [1][4100/6143] Elapsed 51m 24s (remain 25m 35s) Loss: 2.4625(2.8226) Grad: 336455.7500  LR: 0.00001866  


Epoch: [1][4200/6143] Elapsed 52m 39s (remain 24m 20s) Loss: 2.2604(2.8015) Grad: 332604.6562  LR: 0.00001859  


Epoch: [1][4300/6143] Elapsed 53m 54s (remain 23m 5s) Loss: 1.7051(2.7800) Grad: 479481.0312  LR: 0.00001853  


Epoch: [1][4400/6143] Elapsed 55m 10s (remain 21m 50s) Loss: 1.7123(2.7588) Grad: 259920.2812  LR: 0.00001846  


Epoch: [1][4500/6143] Elapsed 56m 25s (remain 20m 35s) Loss: 1.5356(2.7405) Grad: 255192.7812  LR: 0.00001839  


Epoch: [1][4600/6143] Elapsed 57m 40s (remain 19m 19s) Loss: 1.9343(2.7226) Grad: 318410.5312  LR: 0.00001832  


Epoch: [1][4700/6143] Elapsed 58m 56s (remain 18m 4s) Loss: 2.4444(2.7052) Grad: 324261.8125  LR: 0.00001825  


Epoch: [1][4800/6143] Elapsed 60m 11s (remain 16m 49s) Loss: 2.7196(2.6884) Grad: 427368.2500  LR: 0.00001817  


Epoch: [1][4900/6143] Elapsed 61m 26s (remain 15m 34s) Loss: 2.0209(2.6707) Grad: 327956.4688  LR: 0.00001810  


Epoch: [1][5000/6143] Elapsed 62m 41s (remain 14m 19s) Loss: 1.4383(2.6543) Grad: 349466.1875  LR: 0.00001802  


Epoch: [1][5100/6143] Elapsed 63m 57s (remain 13m 3s) Loss: 2.5974(2.6381) Grad: 349103.4375  LR: 0.00001795  


Epoch: [1][5200/6143] Elapsed 65m 12s (remain 11m 48s) Loss: 2.0450(2.6221) Grad: 390482.5000  LR: 0.00001787  


Epoch: [1][5300/6143] Elapsed 66m 27s (remain 10m 33s) Loss: 2.0200(2.6071) Grad: 314878.0938  LR: 0.00001779  


Epoch: [1][5400/6143] Elapsed 67m 42s (remain 9m 18s) Loss: 2.4228(2.5916) Grad: 298657.4062  LR: 0.00001771  


Epoch: [1][5500/6143] Elapsed 68m 58s (remain 8m 2s) Loss: 1.9921(2.5781) Grad: 306637.9062  LR: 0.00001763  


Epoch: [1][5600/6143] Elapsed 70m 13s (remain 6m 47s) Loss: 1.7689(2.5652) Grad: 340161.8438  LR: 0.00001754  


Epoch: [1][5700/6143] Elapsed 71m 28s (remain 5m 32s) Loss: 1.6152(2.5521) Grad: 322377.7500  LR: 0.00001746  


Epoch: [1][5800/6143] Elapsed 72m 44s (remain 4m 17s) Loss: 1.6540(2.5386) Grad: 291336.1250  LR: 0.00001737  


Epoch: [1][5900/6143] Elapsed 73m 59s (remain 3m 2s) Loss: 1.9412(2.5258) Grad: 292073.1875  LR: 0.00001729  


Epoch: [1][6000/6143] Elapsed 75m 15s (remain 1m 46s) Loss: 1.9046(2.5124) Grad: 262827.3125  LR: 0.00001720  


Epoch: [1][6100/6143] Elapsed 76m 30s (remain 0m 31s) Loss: 1.9272(2.5000) Grad: 556905.5000  LR: 0.00001711  


Epoch: [1][6142/6143] Elapsed 77m 2s (remain 0m 0s) Loss: 1.4573(2.4952) Grad: 495672.5625  LR: 0.00001707  


EVAL: [0/2038] Elapsed 0m 0s (remain 7m 30s) Loss: 2.3909(2.3909) 


EVAL: [100/2038] Elapsed 0m 22s (remain 7m 6s) Loss: 2.3540(1.7800) 


EVAL: [200/2038] Elapsed 0m 44s (remain 6m 43s) Loss: 1.6618(1.7520) 


EVAL: [300/2038] Elapsed 1m 5s (remain 6m 20s) Loss: 1.5814(1.6814) 


EVAL: [400/2038] Elapsed 1m 27s (remain 5m 58s) Loss: 1.5708(1.6359) 


EVAL: [500/2038] Elapsed 1m 49s (remain 5m 36s) Loss: 1.2067(1.6167) 


EVAL: [600/2038] Elapsed 2m 11s (remain 5m 14s) Loss: 1.7915(1.5932) 


EVAL: [700/2038] Elapsed 2m 33s (remain 4m 52s) Loss: 1.2190(1.5756) 


EVAL: [800/2038] Elapsed 2m 55s (remain 4m 30s) Loss: 1.8267(1.5858) 


EVAL: [900/2038] Elapsed 3m 17s (remain 4m 8s) Loss: 1.9152(1.6039) 


EVAL: [1000/2038] Elapsed 3m 39s (remain 3m 46s) Loss: 1.9577(1.6215) 


EVAL: [1100/2038] Elapsed 4m 0s (remain 3m 25s) Loss: 1.0812(1.6197) 


EVAL: [1200/2038] Elapsed 4m 22s (remain 3m 3s) Loss: 1.2539(1.6153) 


EVAL: [1300/2038] Elapsed 4m 44s (remain 2m 41s) Loss: 1.5520(1.6112) 


EVAL: [1400/2038] Elapsed 5m 6s (remain 2m 19s) Loss: 1.5060(1.6238) 


EVAL: [1500/2038] Elapsed 5m 28s (remain 1m 57s) Loss: 1.5964(1.6266) 


EVAL: [1600/2038] Elapsed 5m 50s (remain 1m 35s) Loss: 1.4034(1.6279) 


EVAL: [1700/2038] Elapsed 6m 12s (remain 1m 13s) Loss: 1.5056(1.6335) 


EVAL: [1800/2038] Elapsed 6m 33s (remain 0m 51s) Loss: 1.4954(1.6387) 


EVAL: [1900/2038] Elapsed 6m 55s (remain 0m 29s) Loss: 1.4267(1.6376) 


EVAL: [2000/2038] Elapsed 7m 17s (remain 0m 8s) Loss: 1.3673(1.6385) 


Epoch 1 - avg_train_loss: 2.4952  avg_val_loss: 1.6374  time: 5068s


Epoch 1 - Save Best Loss: 1.6374 Model


EVAL: [2037/2038] Elapsed 7m 25s (remain 0m 0s) Loss: 1.8392(1.6374) 


Epoch: [2][0/6143] Elapsed 0m 0s (remain 75m 52s) Loss: 1.8265(1.8265) Grad: 582418.3125  LR: 0.00001707  


Epoch: [2][100/6143] Elapsed 1m 15s (remain 75m 36s) Loss: 3.0349(1.7842) Grad: nan  LR: 0.00001698  


Epoch: [2][200/6143] Elapsed 2m 30s (remain 74m 0s) Loss: 2.0543(2.0365) Grad: nan  LR: 0.00001689  


Epoch: [2][300/6143] Elapsed 3m 44s (remain 72m 38s) Loss: 2.5628(2.0504) Grad: nan  LR: 0.00001679  


Epoch: [2][400/6143] Elapsed 4m 58s (remain 71m 19s) Loss: 2.0491(2.0546) Grad: nan  LR: 0.00001670  


Epoch: [2][500/6143] Elapsed 6m 13s (remain 70m 2s) Loss: 1.8789(2.0228) Grad: nan  LR: 0.00001660  


Epoch: [2][600/6143] Elapsed 7m 27s (remain 68m 47s) Loss: 1.9495(1.9987) Grad: nan  LR: 0.00001651  


Epoch: [2][700/6143] Elapsed 8m 41s (remain 67m 31s) Loss: 2.0783(1.9853) Grad: nan  LR: 0.00001641  


Epoch: [2][800/6143] Elapsed 9m 56s (remain 66m 15s) Loss: 2.2482(1.9697) Grad: nan  LR: 0.00001631  


Epoch: [2][900/6143] Elapsed 11m 10s (remain 65m 0s) Loss: 1.2275(1.9485) Grad: nan  LR: 0.00001621  


Epoch: [2][1000/6143] Elapsed 12m 24s (remain 63m 45s) Loss: 1.8425(1.9355) Grad: nan  LR: 0.00001611  


Epoch: [2][1100/6143] Elapsed 13m 39s (remain 62m 31s) Loss: 1.5139(1.9179) Grad: nan  LR: 0.00001601  


Epoch: [2][1200/6143] Elapsed 14m 53s (remain 61m 16s) Loss: 1.7607(1.9020) Grad: nan  LR: 0.00001591  


Epoch: [2][1300/6143] Elapsed 16m 7s (remain 60m 2s) Loss: 1.4673(1.8871) Grad: nan  LR: 0.00001580  


Epoch: [2][1400/6143] Elapsed 17m 22s (remain 58m 48s) Loss: 1.4399(1.8745) Grad: nan  LR: 0.00001570  


Epoch: [2][1500/6143] Elapsed 18m 36s (remain 57m 34s) Loss: 1.2184(1.8657) Grad: nan  LR: 0.00001559  


Epoch: [2][1600/6143] Elapsed 19m 51s (remain 56m 19s) Loss: 1.9472(1.8569) Grad: nan  LR: 0.00001549  


Epoch: [2][1700/6143] Elapsed 21m 5s (remain 55m 5s) Loss: 1.4763(1.8471) Grad: nan  LR: 0.00001538  


Epoch: [2][1800/6143] Elapsed 22m 19s (remain 53m 50s) Loss: 1.2553(1.8349) Grad: nan  LR: 0.00001527  


Epoch: [2][1900/6143] Elapsed 23m 34s (remain 52m 35s) Loss: 2.6753(1.8250) Grad: nan  LR: 0.00001516  


Epoch: [2][2000/6143] Elapsed 24m 48s (remain 51m 21s) Loss: 1.5690(1.8180) Grad: nan  LR: 0.00001505  


Epoch: [2][2100/6143] Elapsed 26m 2s (remain 50m 6s) Loss: 2.1968(1.8119) Grad: nan  LR: 0.00001494  


Epoch: [2][2200/6143] Elapsed 27m 17s (remain 48m 52s) Loss: 1.5435(1.8012) Grad: nan  LR: 0.00001483  


Epoch: [2][2300/6143] Elapsed 28m 31s (remain 47m 37s) Loss: 1.9452(1.7949) Grad: nan  LR: 0.00001472  


Epoch: [2][2400/6143] Elapsed 29m 45s (remain 46m 23s) Loss: 1.6300(1.7873) Grad: nan  LR: 0.00001460  


Epoch: [2][2500/6143] Elapsed 31m 0s (remain 45m 8s) Loss: 1.6392(1.7805) Grad: nan  LR: 0.00001449  


Epoch: [2][2600/6143] Elapsed 32m 14s (remain 43m 54s) Loss: 1.7814(1.7722) Grad: nan  LR: 0.00001438  


Epoch: [2][2700/6143] Elapsed 33m 28s (remain 42m 40s) Loss: 1.8236(1.7647) Grad: nan  LR: 0.00001426  


Epoch: [2][2800/6143] Elapsed 34m 43s (remain 41m 25s) Loss: 1.1626(1.7600) Grad: nan  LR: 0.00001414  


Epoch: [2][2900/6143] Elapsed 35m 58s (remain 40m 11s) Loss: 1.2413(1.7541) Grad: nan  LR: 0.00001403  


Epoch: [2][3000/6143] Elapsed 37m 12s (remain 38m 57s) Loss: 1.6438(1.7488) Grad: nan  LR: 0.00001391  


Epoch: [2][3100/6143] Elapsed 38m 26s (remain 37m 42s) Loss: 1.8116(1.7410) Grad: nan  LR: 0.00001379  


Epoch: [2][3200/6143] Elapsed 39m 41s (remain 36m 28s) Loss: 1.7434(1.7354) Grad: nan  LR: 0.00001367  


Epoch: [2][3300/6143] Elapsed 40m 55s (remain 35m 14s) Loss: 0.9900(1.7298) Grad: nan  LR: 0.00001355  


Epoch: [2][3400/6143] Elapsed 42m 9s (remain 33m 59s) Loss: 2.0378(1.7240) Grad: nan  LR: 0.00001343  


Epoch: [2][3500/6143] Elapsed 43m 24s (remain 32m 45s) Loss: 1.4277(1.7177) Grad: nan  LR: 0.00001331  


Epoch: [2][3600/6143] Elapsed 44m 38s (remain 31m 30s) Loss: 1.2886(1.7132) Grad: nan  LR: 0.00001319  


Epoch: [2][3700/6143] Elapsed 45m 52s (remain 30m 16s) Loss: 1.4036(1.7083) Grad: nan  LR: 0.00001307  


Epoch: [2][3800/6143] Elapsed 47m 7s (remain 29m 1s) Loss: 1.3978(1.7028) Grad: nan  LR: 0.00001295  


Epoch: [2][3900/6143] Elapsed 48m 21s (remain 27m 47s) Loss: 1.1637(1.6985) Grad: nan  LR: 0.00001283  


Epoch: [2][4000/6143] Elapsed 49m 35s (remain 26m 33s) Loss: 1.1823(1.6942) Grad: nan  LR: 0.00001270  


Epoch: [2][4100/6143] Elapsed 50m 50s (remain 25m 18s) Loss: 0.9803(1.6890) Grad: nan  LR: 0.00001258  


Epoch: [2][4200/6143] Elapsed 52m 4s (remain 24m 4s) Loss: 1.5117(1.6846) Grad: nan  LR: 0.00001246  


Epoch: [2][4300/6143] Elapsed 53m 19s (remain 22m 50s) Loss: 1.1297(1.6794) Grad: nan  LR: 0.00001233  


Epoch: [2][4400/6143] Elapsed 54m 33s (remain 21m 35s) Loss: 1.9361(1.6743) Grad: nan  LR: 0.00001221  


Epoch: [2][4500/6143] Elapsed 55m 48s (remain 20m 21s) Loss: 1.1893(1.6697) Grad: nan  LR: 0.00001208  


Epoch: [2][4600/6143] Elapsed 57m 2s (remain 19m 6s) Loss: 1.7688(1.6649) Grad: nan  LR: 0.00001196  


Epoch: [2][4700/6143] Elapsed 58m 16s (remain 17m 52s) Loss: 1.3226(1.6613) Grad: nan  LR: 0.00001183  


Epoch: [2][4800/6143] Elapsed 59m 31s (remain 16m 38s) Loss: 1.6600(1.6577) Grad: nan  LR: 0.00001171  


Epoch: [2][4900/6143] Elapsed 60m 45s (remain 15m 23s) Loss: 1.6322(1.6548) Grad: nan  LR: 0.00001158  


Epoch: [2][5000/6143] Elapsed 61m 59s (remain 14m 9s) Loss: 1.8638(1.6507) Grad: nan  LR: 0.00001145  


Epoch: [2][5100/6143] Elapsed 63m 14s (remain 12m 55s) Loss: 1.3658(1.6462) Grad: nan  LR: 0.00001133  


Epoch: [2][5200/6143] Elapsed 64m 28s (remain 11m 40s) Loss: 1.6403(1.6424) Grad: nan  LR: 0.00001120  


Epoch: [2][5300/6143] Elapsed 65m 43s (remain 10m 26s) Loss: 2.1412(1.6377) Grad: nan  LR: 0.00001107  


Epoch: [2][5400/6143] Elapsed 66m 57s (remain 9m 11s) Loss: 1.8958(1.6330) Grad: nan  LR: 0.00001095  


Epoch: [2][5500/6143] Elapsed 68m 11s (remain 7m 57s) Loss: 2.0407(1.6287) Grad: nan  LR: 0.00001082  


Epoch: [2][5600/6143] Elapsed 69m 26s (remain 6m 43s) Loss: 2.0611(1.6257) Grad: nan  LR: 0.00001069  


Epoch: [2][5700/6143] Elapsed 70m 40s (remain 5m 28s) Loss: 1.7211(1.6222) Grad: nan  LR: 0.00001056  


Epoch: [2][5800/6143] Elapsed 71m 54s (remain 4m 14s) Loss: 1.6906(1.6193) Grad: nan  LR: 0.00001044  


Epoch: [2][5900/6143] Elapsed 73m 9s (remain 2m 59s) Loss: 1.5366(1.6156) Grad: nan  LR: 0.00001031  


Epoch: [2][6000/6143] Elapsed 74m 23s (remain 1m 45s) Loss: 1.3293(1.6128) Grad: nan  LR: 0.00001018  


Epoch: [2][6100/6143] Elapsed 75m 37s (remain 0m 31s) Loss: 1.1825(1.6091) Grad: nan  LR: 0.00001005  


Epoch: [2][6142/6143] Elapsed 76m 8s (remain 0m 0s) Loss: 1.8024(1.6080) Grad: nan  LR: 0.00001000  


EVAL: [0/2038] Elapsed 0m 0s (remain 7m 25s) Loss: 1.9761(1.9761) 


EVAL: [100/2038] Elapsed 0m 22s (remain 7m 3s) Loss: 1.8538(1.4275) 


EVAL: [200/2038] Elapsed 0m 43s (remain 6m 41s) Loss: 1.4593(1.3861) 


EVAL: [300/2038] Elapsed 1m 5s (remain 6m 19s) Loss: 0.9771(1.3443) 


EVAL: [400/2038] Elapsed 1m 27s (remain 5m 57s) Loss: 1.2289(1.3180) 


EVAL: [500/2038] Elapsed 1m 49s (remain 5m 36s) Loss: 1.2420(1.3012) 


EVAL: [600/2038] Elapsed 2m 11s (remain 5m 14s) Loss: 1.0977(1.2858) 


EVAL: [700/2038] Elapsed 2m 33s (remain 4m 52s) Loss: 0.9610(1.2799) 


EVAL: [800/2038] Elapsed 2m 55s (remain 4m 30s) Loss: 1.4036(1.2870) 


EVAL: [900/2038] Elapsed 3m 16s (remain 4m 8s) Loss: 0.9810(1.2967) 


EVAL: [1000/2038] Elapsed 3m 38s (remain 3m 46s) Loss: 1.3069(1.3092) 


EVAL: [1100/2038] Elapsed 4m 0s (remain 3m 24s) Loss: 1.1885(1.3094) 


EVAL: [1200/2038] Elapsed 4m 22s (remain 3m 3s) Loss: 1.0300(1.3082) 


EVAL: [1300/2038] Elapsed 4m 44s (remain 2m 41s) Loss: 1.1492(1.3083) 


EVAL: [1400/2038] Elapsed 5m 6s (remain 2m 19s) Loss: 0.9038(1.3189) 


EVAL: [1500/2038] Elapsed 5m 28s (remain 1m 57s) Loss: 1.2827(1.3222) 


EVAL: [1600/2038] Elapsed 5m 50s (remain 1m 35s) Loss: 1.8140(1.3246) 


EVAL: [1700/2038] Elapsed 6m 12s (remain 1m 13s) Loss: 1.4846(1.3279) 


EVAL: [1800/2038] Elapsed 6m 34s (remain 0m 51s) Loss: 1.1767(1.3340) 


EVAL: [1900/2038] Elapsed 6m 56s (remain 0m 29s) Loss: 1.0562(1.3357) 


EVAL: [2000/2038] Elapsed 7m 17s (remain 0m 8s) Loss: 1.6377(1.3376) 


Epoch 2 - avg_train_loss: 1.6080  avg_val_loss: 1.3367  time: 5015s


Epoch 2 - Save Best Loss: 1.3367 Model


EVAL: [2037/2038] Elapsed 7m 26s (remain 0m 0s) Loss: 1.0438(1.3367) 


Epoch: [3][0/6143] Elapsed 0m 0s (remain 75m 40s) Loss: 1.2628(1.2628) Grad: nan  LR: 0.00001000  


Epoch: [3][100/6143] Elapsed 1m 15s (remain 74m 48s) Loss: 1.2132(1.3404) Grad: nan  LR: 0.00000987  


Epoch: [3][200/6143] Elapsed 2m 29s (remain 73m 33s) Loss: 1.4430(1.3551) Grad: nan  LR: 0.00000974  


Epoch: [3][300/6143] Elapsed 3m 43s (remain 72m 18s) Loss: 0.8877(1.3623) Grad: nan  LR: 0.00000962  


Epoch: [3][400/6143] Elapsed 4m 57s (remain 71m 4s) Loss: 1.7587(1.3684) Grad: nan  LR: 0.00000949  


Epoch: [3][500/6143] Elapsed 6m 12s (remain 69m 49s) Loss: 1.2309(1.3723) Grad: nan  LR: 0.00000936  


Epoch: [3][600/6143] Elapsed 7m 26s (remain 68m 36s) Loss: 1.4452(1.3718) Grad: nan  LR: 0.00000923  


Epoch: [3][700/6143] Elapsed 8m 40s (remain 67m 21s) Loss: 1.5920(1.3785) Grad: nan  LR: 0.00000910  


Epoch: [3][800/6143] Elapsed 9m 54s (remain 66m 7s) Loss: 1.6672(1.3804) Grad: nan  LR: 0.00000898  


Epoch: [3][900/6143] Elapsed 11m 9s (remain 64m 52s) Loss: 1.4771(1.3727) Grad: nan  LR: 0.00000885  


Epoch: [3][1000/6143] Elapsed 12m 23s (remain 63m 38s) Loss: 1.7372(1.3778) Grad: nan  LR: 0.00000872  


Epoch: [3][1100/6143] Elapsed 13m 37s (remain 62m 24s) Loss: 0.9755(1.3746) Grad: nan  LR: 0.00000860  


Epoch: [3][1200/6143] Elapsed 14m 51s (remain 61m 9s) Loss: 1.2465(1.3756) Grad: nan  LR: 0.00000847  


Epoch: [3][1300/6143] Elapsed 16m 6s (remain 59m 56s) Loss: 1.1102(1.3718) Grad: nan  LR: 0.00000834  


Epoch: [3][1400/6143] Elapsed 17m 20s (remain 58m 41s) Loss: 1.5867(1.3718) Grad: nan  LR: 0.00000822  


Epoch: [3][1500/6143] Elapsed 18m 34s (remain 57m 27s) Loss: 1.0788(1.3679) Grad: nan  LR: 0.00000809  


Epoch: [3][1600/6143] Elapsed 19m 49s (remain 56m 13s) Loss: 1.3144(1.3651) Grad: nan  LR: 0.00000797  


Epoch: [3][1700/6143] Elapsed 21m 3s (remain 54m 59s) Loss: 1.6041(1.3647) Grad: nan  LR: 0.00000784  


Epoch: [3][1800/6143] Elapsed 22m 17s (remain 53m 45s) Loss: 1.4561(1.3637) Grad: nan  LR: 0.00000772  


Epoch: [3][1900/6143] Elapsed 23m 31s (remain 52m 30s) Loss: 1.4964(1.3620) Grad: nan  LR: 0.00000759  


Epoch: [3][2000/6143] Elapsed 24m 46s (remain 51m 16s) Loss: 1.1686(1.3588) Grad: nan  LR: 0.00000747  


Epoch: [3][2100/6143] Elapsed 26m 0s (remain 50m 2s) Loss: 1.1161(1.3579) Grad: nan  LR: 0.00000735  


Epoch: [3][2200/6143] Elapsed 27m 14s (remain 48m 48s) Loss: 1.7918(1.3593) Grad: nan  LR: 0.00000722  


Epoch: [3][2300/6143] Elapsed 28m 29s (remain 47m 33s) Loss: 0.9155(1.3591) Grad: nan  LR: 0.00000710  


Epoch: [3][2400/6143] Elapsed 29m 43s (remain 46m 19s) Loss: 1.0921(1.3593) Grad: nan  LR: 0.00000698  


Epoch: [3][2500/6143] Elapsed 30m 57s (remain 45m 5s) Loss: 1.6478(1.3593) Grad: nan  LR: 0.00000686  


Epoch: [3][2600/6143] Elapsed 32m 12s (remain 43m 51s) Loss: 1.2412(1.3576) Grad: nan  LR: 0.00000674  


Epoch: [3][2700/6143] Elapsed 33m 26s (remain 42m 36s) Loss: 1.2803(1.3549) Grad: nan  LR: 0.00000661  


Epoch: [3][2800/6143] Elapsed 34m 40s (remain 41m 22s) Loss: 1.3842(1.3566) Grad: nan  LR: 0.00000649  


Epoch: [3][2900/6143] Elapsed 35m 55s (remain 40m 8s) Loss: 1.1426(1.3544) Grad: nan  LR: 0.00000638  


Epoch: [3][3000/6143] Elapsed 37m 9s (remain 38m 54s) Loss: 1.4655(1.3546) Grad: nan  LR: 0.00000626  


Epoch: [3][3100/6143] Elapsed 38m 23s (remain 37m 39s) Loss: 1.7372(1.3542) Grad: nan  LR: 0.00000614  


Epoch: [3][3200/6143] Elapsed 39m 38s (remain 36m 25s) Loss: 1.1641(1.3507) Grad: nan  LR: 0.00000602  


Epoch: [3][3300/6143] Elapsed 40m 52s (remain 35m 11s) Loss: 1.1043(1.3492) Grad: nan  LR: 0.00000590  


Epoch: [3][3400/6143] Elapsed 42m 6s (remain 33m 57s) Loss: 1.3725(1.3494) Grad: nan  LR: 0.00000579  


Epoch: [3][3500/6143] Elapsed 43m 21s (remain 32m 42s) Loss: 1.0748(1.3475) Grad: nan  LR: 0.00000567  


Epoch: [3][3600/6143] Elapsed 44m 35s (remain 31m 28s) Loss: 1.2758(1.3453) Grad: nan  LR: 0.00000556  


Epoch: [3][3700/6143] Elapsed 45m 49s (remain 30m 14s) Loss: 1.3103(1.3456) Grad: nan  LR: 0.00000544  


Epoch: [3][3800/6143] Elapsed 47m 3s (remain 28m 59s) Loss: 1.5999(1.3453) Grad: nan  LR: 0.00000533  


Epoch: [3][3900/6143] Elapsed 48m 18s (remain 27m 45s) Loss: 1.0310(1.3446) Grad: nan  LR: 0.00000522  


Epoch: [3][4000/6143] Elapsed 49m 32s (remain 26m 31s) Loss: 1.2026(1.3424) Grad: nan  LR: 0.00000510  


Epoch: [3][4100/6143] Elapsed 50m 46s (remain 25m 16s) Loss: 1.4171(1.3429) Grad: nan  LR: 0.00000499  


Epoch: [3][4200/6143] Elapsed 52m 0s (remain 24m 2s) Loss: 0.9006(1.3429) Grad: nan  LR: 0.00000488  


Epoch: [3][4300/6143] Elapsed 53m 15s (remain 22m 48s) Loss: 1.2796(1.3413) Grad: nan  LR: 0.00000477  


Epoch: [3][4400/6143] Elapsed 54m 29s (remain 21m 34s) Loss: 1.2888(1.3407) Grad: nan  LR: 0.00000467  


Epoch: [3][4500/6143] Elapsed 55m 43s (remain 20m 19s) Loss: 1.5895(1.3400) Grad: nan  LR: 0.00000456  


Epoch: [3][4600/6143] Elapsed 56m 58s (remain 19m 5s) Loss: 1.1364(1.3392) Grad: nan  LR: 0.00000445  


Epoch: [3][4700/6143] Elapsed 58m 12s (remain 17m 51s) Loss: 0.9050(1.3379) Grad: nan  LR: 0.00000435  


Epoch: [3][4800/6143] Elapsed 59m 26s (remain 16m 36s) Loss: 0.9018(1.3364) Grad: nan  LR: 0.00000424  


Epoch: [3][4900/6143] Elapsed 60m 40s (remain 15m 22s) Loss: 1.4733(1.3351) Grad: nan  LR: 0.00000414  


Epoch: [3][5000/6143] Elapsed 61m 55s (remain 14m 8s) Loss: 1.1487(1.3338) Grad: nan  LR: 0.00000403  


Epoch: [3][5100/6143] Elapsed 63m 9s (remain 12m 54s) Loss: 1.0713(1.3333) Grad: nan  LR: 0.00000393  


Epoch: [3][5200/6143] Elapsed 64m 24s (remain 11m 39s) Loss: 1.0130(1.3327) Grad: nan  LR: 0.00000383  


Epoch: [3][5300/6143] Elapsed 65m 38s (remain 10m 25s) Loss: 1.5309(1.3320) Grad: nan  LR: 0.00000373  


Epoch: [3][5400/6143] Elapsed 66m 52s (remain 9m 11s) Loss: 1.0697(1.3308) Grad: nan  LR: 0.00000363  


Epoch: [3][5500/6143] Elapsed 68m 7s (remain 7m 57s) Loss: 1.8673(1.3301) Grad: nan  LR: 0.00000353  


Epoch: [3][5600/6143] Elapsed 69m 21s (remain 6m 42s) Loss: 1.1154(1.3297) Grad: nan  LR: 0.00000344  


Epoch: [3][5700/6143] Elapsed 70m 36s (remain 5m 28s) Loss: 0.9965(1.3281) Grad: nan  LR: 0.00000334  


Epoch: [3][5800/6143] Elapsed 71m 50s (remain 4m 14s) Loss: 1.0921(1.3269) Grad: nan  LR: 0.00000324  


Epoch: [3][5900/6143] Elapsed 73m 5s (remain 2m 59s) Loss: 1.2452(1.3252) Grad: nan  LR: 0.00000315  


Epoch: [3][6000/6143] Elapsed 74m 19s (remain 1m 45s) Loss: 1.3321(1.3246) Grad: nan  LR: 0.00000306  


Epoch: [3][6100/6143] Elapsed 75m 33s (remain 0m 31s) Loss: 1.0302(1.3237) Grad: nan  LR: 0.00000297  


Epoch: [3][6142/6143] Elapsed 76m 4s (remain 0m 0s) Loss: 1.1652(1.3232) Grad: nan  LR: 0.00000293  


EVAL: [0/2038] Elapsed 0m 0s (remain 7m 24s) Loss: 1.5297(1.5297) 


EVAL: [100/2038] Elapsed 0m 22s (remain 7m 2s) Loss: 1.5943(1.2889) 


EVAL: [200/2038] Elapsed 0m 43s (remain 6m 40s) Loss: 1.0602(1.2548) 


EVAL: [300/2038] Elapsed 1m 5s (remain 6m 19s) Loss: 0.9744(1.2089) 


EVAL: [400/2038] Elapsed 1m 27s (remain 5m 57s) Loss: 0.7162(1.1897) 


EVAL: [500/2038] Elapsed 1m 49s (remain 5m 35s) Loss: 0.9659(1.1753) 


EVAL: [600/2038] Elapsed 2m 11s (remain 5m 13s) Loss: 1.1367(1.1643) 


EVAL: [700/2038] Elapsed 2m 33s (remain 4m 51s) Loss: 0.6192(1.1589) 


EVAL: [800/2038] Elapsed 2m 54s (remain 4m 30s) Loss: 1.4258(1.1693) 


EVAL: [900/2038] Elapsed 3m 16s (remain 4m 8s) Loss: 1.6622(1.1849) 


EVAL: [1000/2038] Elapsed 3m 38s (remain 3m 46s) Loss: 1.2627(1.1975) 


EVAL: [1100/2038] Elapsed 4m 0s (remain 3m 24s) Loss: 1.2883(1.1961) 


EVAL: [1200/2038] Elapsed 4m 22s (remain 3m 2s) Loss: 0.8958(1.1903) 


EVAL: [1300/2038] Elapsed 4m 44s (remain 2m 40s) Loss: 0.8542(1.1890) 


EVAL: [1400/2038] Elapsed 5m 5s (remain 2m 19s) Loss: 1.2840(1.1957) 


EVAL: [1500/2038] Elapsed 5m 27s (remain 1m 57s) Loss: 1.0206(1.1967) 


EVAL: [1600/2038] Elapsed 5m 49s (remain 1m 35s) Loss: 1.2760(1.1996) 


EVAL: [1700/2038] Elapsed 6m 11s (remain 1m 13s) Loss: 1.4985(1.2042) 


EVAL: [1800/2038] Elapsed 6m 33s (remain 0m 51s) Loss: 0.9810(1.2100) 


EVAL: [1900/2038] Elapsed 6m 55s (remain 0m 29s) Loss: 0.9687(1.2112) 


EVAL: [2000/2038] Elapsed 7m 16s (remain 0m 8s) Loss: 1.0428(1.2134) 


Epoch 3 - avg_train_loss: 1.3232  avg_val_loss: 1.2124  time: 5010s


Epoch 3 - Save Best Loss: 1.2124 Model


EVAL: [2037/2038] Elapsed 7m 24s (remain 0m 0s) Loss: 1.2511(1.2124) 


Epoch: [4][0/6143] Elapsed 0m 0s (remain 77m 19s) Loss: 1.2967(1.2967) Grad: nan  LR: 0.00000293  


Epoch: [4][100/6143] Elapsed 1m 15s (remain 74m 47s) Loss: 1.4528(1.2632) Grad: nan  LR: 0.00000284  


Epoch: [4][200/6143] Elapsed 2m 29s (remain 73m 35s) Loss: 1.1988(1.2514) Grad: nan  LR: 0.00000275  


Epoch: [4][300/6143] Elapsed 3m 43s (remain 72m 21s) Loss: 0.9651(1.2551) Grad: nan  LR: 0.00000266  


Epoch: [4][400/6143] Elapsed 4m 58s (remain 71m 7s) Loss: 1.9327(1.2693) Grad: nan  LR: 0.00000258  


Epoch: [4][500/6143] Elapsed 6m 12s (remain 69m 54s) Loss: 0.8343(1.2608) Grad: nan  LR: 0.00000249  


Epoch: [4][600/6143] Elapsed 7m 26s (remain 68m 39s) Loss: 1.3377(1.2648) Grad: nan  LR: 0.00000241  


Epoch: [4][700/6143] Elapsed 8m 40s (remain 67m 24s) Loss: 1.1362(1.2649) Grad: nan  LR: 0.00000232  


Epoch: [4][800/6143] Elapsed 9m 55s (remain 66m 9s) Loss: 1.2972(1.2631) Grad: nan  LR: 0.00000224  


Epoch: [4][900/6143] Elapsed 11m 9s (remain 64m 55s) Loss: 0.8312(1.2592) Grad: nan  LR: 0.00000216  


Epoch: [4][1000/6143] Elapsed 12m 23s (remain 63m 40s) Loss: 1.2486(1.2556) Grad: nan  LR: 0.00000208  


Epoch: [4][1100/6143] Elapsed 13m 38s (remain 62m 26s) Loss: 1.0277(1.2567) Grad: nan  LR: 0.00000201  


Epoch: [4][1200/6143] Elapsed 14m 52s (remain 61m 12s) Loss: 1.7365(1.2533) Grad: nan  LR: 0.00000193  


Epoch: [4][1300/6143] Elapsed 16m 6s (remain 59m 58s) Loss: 2.2552(1.2565) Grad: nan  LR: 0.00000186  


Epoch: [4][1400/6143] Elapsed 17m 21s (remain 58m 44s) Loss: 1.7886(1.2544) Grad: nan  LR: 0.00000178  


Epoch: [4][1500/6143] Elapsed 18m 35s (remain 57m 29s) Loss: 1.1195(1.2553) Grad: nan  LR: 0.00000171  


Epoch: [4][1600/6143] Elapsed 19m 49s (remain 56m 15s) Loss: 1.3398(1.2523) Grad: nan  LR: 0.00000164  


Epoch: [4][1700/6143] Elapsed 21m 4s (remain 55m 0s) Loss: 1.2699(1.2539) Grad: nan  LR: 0.00000157  


Epoch: [4][1800/6143] Elapsed 22m 18s (remain 53m 46s) Loss: 1.2168(1.2521) Grad: nan  LR: 0.00000150  


Epoch: [4][1900/6143] Elapsed 23m 32s (remain 52m 32s) Loss: 1.5152(1.2534) Grad: nan  LR: 0.00000144  


Epoch: [4][2000/6143] Elapsed 24m 47s (remain 51m 18s) Loss: 0.7730(1.2563) Grad: nan  LR: 0.00000137  


Epoch: [4][2100/6143] Elapsed 26m 1s (remain 50m 3s) Loss: 1.3206(1.2551) Grad: nan  LR: 0.00000131  


Epoch: [4][2200/6143] Elapsed 27m 15s (remain 48m 49s) Loss: 1.3626(1.2568) Grad: nan  LR: 0.00000124  


Epoch: [4][2300/6143] Elapsed 28m 29s (remain 47m 35s) Loss: 1.2888(1.2560) Grad: nan  LR: 0.00000118  


Epoch: [4][2400/6143] Elapsed 29m 44s (remain 46m 20s) Loss: 0.8488(1.2556) Grad: nan  LR: 0.00000112  


Epoch: [4][2500/6143] Elapsed 30m 58s (remain 45m 6s) Loss: 0.9513(1.2565) Grad: nan  LR: 0.00000106  


Epoch: [4][2600/6143] Elapsed 32m 13s (remain 43m 52s) Loss: 1.1250(1.2546) Grad: nan  LR: 0.00000101  


Epoch: [4][2700/6143] Elapsed 33m 27s (remain 42m 38s) Loss: 1.2082(1.2558) Grad: nan  LR: 0.00000095  


Epoch: [4][2800/6143] Elapsed 34m 41s (remain 41m 23s) Loss: 1.6089(1.2557) Grad: nan  LR: 0.00000090  


Epoch: [4][2900/6143] Elapsed 35m 56s (remain 40m 9s) Loss: 1.0068(1.2549) Grad: nan  LR: 0.00000085  


Epoch: [4][3000/6143] Elapsed 37m 10s (remain 38m 55s) Loss: 1.5767(1.2551) Grad: nan  LR: 0.00000080  


Epoch: [4][3100/6143] Elapsed 38m 24s (remain 37m 40s) Loss: 0.9166(1.2544) Grad: nan  LR: 0.00000075  


Epoch: [4][3200/6143] Elapsed 39m 38s (remain 36m 26s) Loss: 1.1136(1.2540) Grad: nan  LR: 0.00000070  


Epoch: [4][3300/6143] Elapsed 40m 53s (remain 35m 12s) Loss: 1.8605(1.2538) Grad: nan  LR: 0.00000065  


Epoch: [4][3400/6143] Elapsed 42m 7s (remain 33m 57s) Loss: 1.0584(1.2534) Grad: nan  LR: 0.00000061  


Epoch: [4][3500/6143] Elapsed 43m 21s (remain 32m 43s) Loss: 1.0291(1.2531) Grad: nan  LR: 0.00000057  


Epoch: [4][3600/6143] Elapsed 44m 36s (remain 31m 29s) Loss: 1.0652(1.2538) Grad: nan  LR: 0.00000052  


Epoch: [4][3700/6143] Elapsed 45m 50s (remain 30m 14s) Loss: 1.4178(1.2544) Grad: nan  LR: 0.00000048  


Epoch: [4][3800/6143] Elapsed 47m 4s (remain 29m 0s) Loss: 1.3269(1.2545) Grad: nan  LR: 0.00000044  


Epoch: [4][3900/6143] Elapsed 48m 18s (remain 27m 46s) Loss: 1.3347(1.2538) Grad: nan  LR: 0.00000041  


Epoch: [4][4000/6143] Elapsed 49m 33s (remain 26m 31s) Loss: 1.1243(1.2520) Grad: nan  LR: 0.00000037  


Epoch: [4][4100/6143] Elapsed 50m 47s (remain 25m 17s) Loss: 1.1249(1.2521) Grad: nan  LR: 0.00000034  


Epoch: [4][4200/6143] Elapsed 52m 1s (remain 24m 3s) Loss: 1.8114(1.2516) Grad: nan  LR: 0.00000031  


Epoch: [4][4300/6143] Elapsed 53m 15s (remain 22m 48s) Loss: 1.3881(1.2503) Grad: nan  LR: 0.00000028  


Epoch: [4][4400/6143] Elapsed 54m 30s (remain 21m 34s) Loss: 1.3238(1.2498) Grad: nan  LR: 0.00000025  


Epoch: [4][4500/6143] Elapsed 55m 44s (remain 20m 20s) Loss: 0.7530(1.2485) Grad: nan  LR: 0.00000022  


Epoch: [4][4600/6143] Elapsed 56m 58s (remain 19m 5s) Loss: 0.8045(1.2476) Grad: nan  LR: 0.00000019  


Epoch: [4][4700/6143] Elapsed 58m 12s (remain 17m 51s) Loss: 0.9808(1.2480) Grad: nan  LR: 0.00000017  


Epoch: [4][4800/6143] Elapsed 59m 27s (remain 16m 37s) Loss: 1.1700(1.2478) Grad: nan  LR: 0.00000015  


Epoch: [4][4900/6143] Elapsed 60m 41s (remain 15m 22s) Loss: 1.6219(1.2481) Grad: nan  LR: 0.00000013  


Epoch: [4][5000/6143] Elapsed 61m 55s (remain 14m 8s) Loss: 1.0501(1.2483) Grad: nan  LR: 0.00000011  


Epoch: [4][5100/6143] Elapsed 63m 10s (remain 12m 54s) Loss: 1.2011(1.2477) Grad: nan  LR: 0.00000009  


Epoch: [4][5200/6143] Elapsed 64m 24s (remain 11m 39s) Loss: 0.8760(1.2475) Grad: nan  LR: 0.00000007  


Epoch: [4][5300/6143] Elapsed 65m 38s (remain 10m 25s) Loss: 1.1582(1.2481) Grad: nan  LR: 0.00000006  


Epoch: [4][5400/6143] Elapsed 66m 53s (remain 9m 11s) Loss: 1.1917(1.2477) Grad: nan  LR: 0.00000004  


Epoch: [4][5500/6143] Elapsed 68m 7s (remain 7m 57s) Loss: 1.4695(1.2471) Grad: nan  LR: 0.00000003  


Epoch: [4][5600/6143] Elapsed 69m 21s (remain 6m 42s) Loss: 1.2853(1.2463) Grad: nan  LR: 0.00000002  


Epoch: [4][5700/6143] Elapsed 70m 35s (remain 5m 28s) Loss: 1.6529(1.2464) Grad: nan  LR: 0.00000002  


Epoch: [4][5800/6143] Elapsed 71m 50s (remain 4m 14s) Loss: 1.2267(1.2461) Grad: nan  LR: 0.00000001  


Epoch: [4][5900/6143] Elapsed 73m 4s (remain 2m 59s) Loss: 1.3323(1.2470) Grad: nan  LR: 0.00000000  


Epoch: [4][6000/6143] Elapsed 74m 18s (remain 1m 45s) Loss: 1.0489(1.2462) Grad: nan  LR: 0.00000000  


Epoch: [4][6100/6143] Elapsed 75m 32s (remain 0m 31s) Loss: 1.1049(1.2460) Grad: nan  LR: 0.00000000  


Epoch: [4][6142/6143] Elapsed 76m 4s (remain 0m 0s) Loss: 1.1972(1.2460) Grad: nan  LR: 0.00000000  


EVAL: [0/2038] Elapsed 0m 0s (remain 7m 24s) Loss: 2.0171(2.0171) 


EVAL: [100/2038] Elapsed 0m 22s (remain 7m 2s) Loss: 1.6380(1.2940) 


EVAL: [200/2038] Elapsed 0m 43s (remain 6m 40s) Loss: 1.3450(1.2434) 


EVAL: [300/2038] Elapsed 1m 5s (remain 6m 18s) Loss: 0.7229(1.2010) 


EVAL: [400/2038] Elapsed 1m 27s (remain 5m 57s) Loss: 1.0630(1.1769) 


EVAL: [500/2038] Elapsed 1m 49s (remain 5m 35s) Loss: 0.9076(1.1624) 


EVAL: [600/2038] Elapsed 2m 11s (remain 5m 13s) Loss: 1.3016(1.1509) 


EVAL: [700/2038] Elapsed 2m 32s (remain 4m 51s) Loss: 0.9168(1.1390) 


EVAL: [800/2038] Elapsed 2m 54s (remain 4m 29s) Loss: 1.3999(1.1486) 


EVAL: [900/2038] Elapsed 3m 16s (remain 4m 7s) Loss: 0.8658(1.1621) 


EVAL: [1000/2038] Elapsed 3m 38s (remain 3m 46s) Loss: 0.9347(1.1744) 


EVAL: [1100/2038] Elapsed 3m 59s (remain 3m 24s) Loss: 1.0889(1.1742) 


EVAL: [1200/2038] Elapsed 4m 21s (remain 3m 2s) Loss: 0.9408(1.1728) 


EVAL: [1300/2038] Elapsed 4m 43s (remain 2m 40s) Loss: 0.8561(1.1703) 


EVAL: [1400/2038] Elapsed 5m 5s (remain 2m 18s) Loss: 1.1271(1.1767) 


EVAL: [1500/2038] Elapsed 5m 27s (remain 1m 57s) Loss: 0.7129(1.1791) 


EVAL: [1600/2038] Elapsed 5m 48s (remain 1m 35s) Loss: 1.7687(1.1815) 


EVAL: [1700/2038] Elapsed 6m 10s (remain 1m 13s) Loss: 0.9288(1.1820) 


EVAL: [1800/2038] Elapsed 6m 32s (remain 0m 51s) Loss: 1.1224(1.1864) 


EVAL: [1900/2038] Elapsed 6m 54s (remain 0m 29s) Loss: 1.2384(1.1876) 


EVAL: [2000/2038] Elapsed 7m 16s (remain 0m 8s) Loss: 1.6677(1.1886) 


Epoch 4 - avg_train_loss: 1.2460  avg_val_loss: 1.1880  time: 5009s


Epoch 4 - Save Best Loss: 1.1880 Model


EVAL: [2037/2038] Elapsed 7m 24s (remain 0m 0s) Loss: 0.5582(1.1880) 


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1d4f13d80cdee2ad/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1d4f13d80cdee2ad/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

raw_datasets: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 31610
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 10536
    })
})


max_seq_length: 326


tokenized_datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 31610
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 10536
    })
})


train_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 18402
})  valid_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 6147
})


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/6134] Elapsed 0m 0s (remain 70m 27s) Loss: 13.0324(13.0324) Grad: inf  LR: 0.00002000  


Epoch: [1][100/6134] Elapsed 1m 15s (remain 75m 9s) Loss: 6.3428(7.6837) Grad: 81288.3203  LR: 0.00002000  


Epoch: [1][200/6134] Elapsed 2m 30s (remain 74m 7s) Loss: 5.1865(6.6673) Grad: 80466.4141  LR: 0.00002000  


Epoch: [1][300/6134] Elapsed 3m 45s (remain 72m 57s) Loss: 4.8597(6.0692) Grad: 90805.4688  LR: 0.00001999  


Epoch: [1][400/6134] Elapsed 5m 1s (remain 71m 44s) Loss: 3.7412(5.6609) Grad: 86346.3047  LR: 0.00001999  


Epoch: [1][500/6134] Elapsed 6m 16s (remain 70m 30s) Loss: 4.1156(5.3307) Grad: 79680.9141  LR: 0.00001998  


Epoch: [1][600/6134] Elapsed 7m 31s (remain 69m 15s) Loss: 3.7088(5.0843) Grad: 85853.1641  LR: 0.00001997  


Epoch: [1][700/6134] Elapsed 8m 46s (remain 68m 0s) Loss: 2.8710(4.8785) Grad: 87977.6328  LR: 0.00001996  


Epoch: [1][800/6134] Elapsed 10m 1s (remain 66m 45s) Loss: 3.0890(4.6937) Grad: 77934.8594  LR: 0.00001995  


Epoch: [1][900/6134] Elapsed 11m 16s (remain 65m 30s) Loss: 3.2867(4.5373) Grad: 87363.0703  LR: 0.00001993  


Epoch: [1][1000/6134] Elapsed 12m 32s (remain 64m 16s) Loss: 2.5572(4.4003) Grad: 83054.9297  LR: 0.00001992  


Epoch: [1][1100/6134] Elapsed 13m 47s (remain 63m 1s) Loss: 3.8669(4.2707) Grad: 94113.9844  LR: 0.00001990  


Epoch: [1][1200/6134] Elapsed 15m 2s (remain 61m 46s) Loss: 2.9510(4.1560) Grad: 79842.5781  LR: 0.00001988  


Epoch: [1][1300/6134] Elapsed 16m 17s (remain 60m 30s) Loss: 3.5082(4.0523) Grad: 93068.0938  LR: 0.00001986  


Epoch: [1][1400/6134] Elapsed 17m 32s (remain 59m 16s) Loss: 2.9730(3.9556) Grad: 95534.1016  LR: 0.00001984  


Epoch: [1][1500/6134] Elapsed 18m 47s (remain 58m 0s) Loss: 3.1080(3.8745) Grad: 77952.0078  LR: 0.00001982  


Epoch: [1][1600/6134] Elapsed 20m 2s (remain 56m 45s) Loss: 2.5939(3.7986) Grad: 78744.5781  LR: 0.00001979  


Epoch: [1][1700/6134] Elapsed 21m 18s (remain 55m 30s) Loss: 2.6657(3.7259) Grad: 83728.9844  LR: 0.00001976  


Epoch: [1][1800/6134] Elapsed 22m 33s (remain 54m 15s) Loss: 2.4833(3.6601) Grad: 84902.1250  LR: 0.00001974  


Epoch: [1][1900/6134] Elapsed 23m 48s (remain 53m 0s) Loss: 2.3769(3.5939) Grad: 78736.6016  LR: 0.00001971  


Epoch: [1][2000/6134] Elapsed 25m 3s (remain 51m 45s) Loss: 2.6152(3.5347) Grad: 80412.2266  LR: 0.00001967  


Epoch: [1][2100/6134] Elapsed 26m 18s (remain 50m 30s) Loss: 2.4938(3.4781) Grad: 152936.1562  LR: 0.00001964  


Epoch: [1][2200/6134] Elapsed 27m 33s (remain 49m 15s) Loss: 2.5120(3.4258) Grad: 234156.7344  LR: 0.00001961  


Epoch: [1][2300/6134] Elapsed 28m 48s (remain 48m 0s) Loss: 2.0739(3.3764) Grad: 161402.6875  LR: 0.00001957  


Epoch: [1][2400/6134] Elapsed 30m 4s (remain 46m 44s) Loss: 2.7216(3.3307) Grad: 178528.9375  LR: 0.00001953  


Epoch: [1][2500/6134] Elapsed 31m 19s (remain 45m 29s) Loss: 2.3707(3.2868) Grad: 181561.2031  LR: 0.00001949  


Epoch: [1][2600/6134] Elapsed 32m 34s (remain 44m 14s) Loss: 2.2070(3.2435) Grad: 154111.8125  LR: 0.00001945  


Epoch: [1][2700/6134] Elapsed 33m 49s (remain 42m 59s) Loss: 2.6256(3.2046) Grad: 174623.6562  LR: 0.00001941  


Epoch: [1][2800/6134] Elapsed 35m 4s (remain 41m 44s) Loss: 2.7673(3.1693) Grad: 185264.8594  LR: 0.00001936  


Epoch: [1][2900/6134] Elapsed 36m 19s (remain 40m 29s) Loss: 1.5585(3.1321) Grad: 174026.0469  LR: 0.00001932  


Epoch: [1][3000/6134] Elapsed 37m 34s (remain 39m 14s) Loss: 2.0953(3.1019) Grad: 128577.5859  LR: 0.00001927  


Epoch: [1][3100/6134] Elapsed 38m 50s (remain 37m 59s) Loss: 2.3721(3.0723) Grad: 168787.8750  LR: 0.00001922  


Epoch: [1][3200/6134] Elapsed 40m 5s (remain 36m 43s) Loss: 1.5823(3.0431) Grad: 139854.5781  LR: 0.00001917  


Epoch: [1][3300/6134] Elapsed 41m 20s (remain 35m 28s) Loss: 1.9765(3.0136) Grad: 157820.3750  LR: 0.00001912  
