In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

# Root folder for everything this experiment writes (tokenizer files, checkpoints, logs).
OUTPUT_DIR = './Comonlist_024_3'
# exist_ok=True keeps the cell idempotent: re-running it (or a concurrent run creating
# the folder first) no longer depends on a separate existence check.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# NOTE(review): this shells out to pip every time the cell runs, so the cell may need
# network access. The only import from this package (next line) is commented out.
os.system('pip install iterative-stratification==0.1.7')
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Both installs are unpinned, so the library versions printed below depend on when the
# cell is executed. NOTE(review): consider pinning them for reproducibility.
os.system('pip install -q transformers')
os.system('pip install -q tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.11.6
transformers.__version__: 4.29.2
env: TOKENIZERS_PARALLELISM=true


In [4]:
from torch.cuda.amp import autocast, GradScaler
from sklearn import metrics
from src.machine_learning_util import set_seed, set_device, init_logger, AverageMeter, to_pickle, unpickle, asMinutes, timeSince

In [5]:
class CFG:
    """Experiment configuration; every setting is a class attribute read as ``CFG.<name>``."""

    # ---- run identity / switches ----
    EXP_ID = '024'
    seed = 2022
    train = True
    debug = False

    # ---- backbone ----
    model = 'microsoft/deberta-v3-base'
    dropout = 0                  # not referenced by the visible model code
    freezing = True              # FeedBackModel freezes the embeddings and the first two encoder layers
    gradient_checkpoint = True   # only referenced by commented-out code in FeedBackModel
    reinit_layers = 4            # only referenced by commented-out code in FeedBackModel

    # ---- data / folds ----
    max_len = 512                # starting value; later cells overwrite it from measured token counts
    targets = ['content', 'wording']
    target_size = len(targets)
    n_splits = 4                 # not referenced in the visible code; n_fold drives the CV split
    n_fold = 4
    trn_fold = list(range(n_fold))
    batch_size = 1
    num_workers = 0

    # ---- optimisation ----
    apex = True                  # enables autocast / GradScaler in train_one_epoch
    epochs = 4
    lr = 5e-6
    min_lr = 1e-6
    weigth_decay = 0.01          # (sic) spelling kept on purpose: train_loop reads CFG.weigth_decay
    n_accumulate = 1
    scheduler = 'cosine'
    num_warmup_steps = 0
    num_cycles = 0.5

    # ---- logging / evaluation cadence (in training steps) ----
    print_freq = 100
    eval_freq = 780 * 2

    # Loaded once, when the class body is executed, and shared through CFG.tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    The RMSE is computed with NumPy instead of
    ``mean_squared_error(..., squared=False)``, whose ``squared`` argument is
    deprecated in recent scikit-learn releases.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) with ground truth.
        y_preds: 2-D array-like with at least ``n_targets`` columns of predictions.

    Returns:
        Tuple ``(mcrmse_score, scores)``: the mean of the per-target RMSEs and
        the list of per-target RMSEs, in column order.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    n_targets = y_trues.shape[1]
    # Only the first n_targets prediction columns are scored, matching the
    # column-by-column loop this replaces.
    squared_error = (y_trues - y_preds[:, :n_targets]) ** 2
    scores = np.sqrt(squared_error.mean(axis=0)).tolist()
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Thin alias of MCRMSE: returns ``(mcrmse_score, per_target_scores)``.

    NOTE: a later cell defines another ``get_score(outputs, targets)`` that
    returns only the scalar; once that cell has run, it replaces this one.
    """
    return MCRMSE(y_trues, y_preds)


def get_logger(filename=OUTPUT_DIR+'_train'):
    """Build the module logger writing bare messages to the console and to a file.

    Args:
        filename: path prefix of the log file; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under ``__name__``, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object for the same name, so re-running this cell
    # used to stack extra handlers and print every message several times.
    # Drop (and close) whatever a previous call attached before adding fresh ones.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random generators.

    Args:
        seed: integer seed applied to every generator.

    Besides the CPU and current CUDA device, this seeds all CUDA devices and
    disables the cuDNN autotuner, which can otherwise pick different kernels
    from run to run.
    """
    random.seed(seed)
    # Recorded for any subprocess started later; it does not change hashing in the
    # already-running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Seed every random generator once, up front, with the configured experiment seed.
seed_everything(CFG.seed)
# seed_everything(seed=60)

# Data Loading

In [7]:
# ====================================================
# Data Loading
# ====================================================
# The dataset folder can be overridden with the COMMONLIT_DATA_DIR environment
# variable, so the notebook is not tied to one machine. The default is the
# original local path, which keeps existing runs unchanged.
_DEFAULT_DATA_DIR = "C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/CommonLit - Evaluate Student Summaries/input/commonlit-evaluate-student-summaries/"
# Joining with "" guarantees a trailing separator, because file names are appended
# to `path` by plain string concatenation below.
path = os.path.join(os.environ.get("COMMONLIT_DATA_DIR", _DEFAULT_DATA_DIR), "")

prompts_train = pd.read_csv(path+'prompts_train.csv')
prompts_test = pd.read_csv(path+'prompts_test.csv')

summaries_train = pd.read_csv(path+'summaries_train.csv')
summaries_test = pd.read_csv(path+'summaries_test.csv')

submission = pd.read_csv(path+'sample_submission.csv')

# Quick sanity check: print the shape of every loaded table.
for _frame_name, _frame in (
    ("prompts_train", prompts_train),
    ("prompts_test", prompts_test),
    ("summaries_train", summaries_train),
    ("summaries_test", summaries_test),
    ("submission", submission),
):
    print(f"{_frame_name}.shape: {_frame.shape}")

prompts_train.shape: (4, 4)
prompts_test.shape: (2, 4)
summaries_train.shape: (7165, 5)
summaries_test.shape: (4, 3)
submission.shape: (4, 3)


# Merge Prompts Data 

In [8]:
# Attach the prompt columns to every summary row. A left join on "prompt_id"
# keeps one output row per summary.
train = pd.merge(summaries_train, prompts_train, how="left", on="prompt_id")
test = pd.merge(summaries_test, prompts_test, how="left", on="prompt_id")

# CV split (GroupKFold)

In [9]:
# ====================================================
# CV split
# ====================================================
# Grouping by prompt_id keeps all summaries of one prompt in the same fold.
Fold = GroupKFold(n_splits=CFG.n_fold)
# split() yields *positional* row indices, so the fold ids are collected in a
# positional integer array. The previous version used them as .loc labels and went
# through a NaN-filled float column before casting to int.
fold_ids = np.full(len(train), -1, dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(train, groups=train["prompt_id"])):
    fold_ids[val_index] = n
assert (fold_ids >= 0).all(), "every row must belong to exactly one validation fold"
train['fold'] = fold_ids
display(train.groupby('fold').size())

fold
0    2057
1    2009
2    1996
3    1103
dtype: int64

In [10]:
# Write the tokenizer files into OUTPUT_DIR/tokenizer/ next to the other experiment outputs.
CFG.tokenizer.save_pretrained(OUTPUT_DIR+'/tokenizer/')

('./Comonlist_024_3/tokenizer/tokenizer_config.json',
 './Comonlist_024_3/tokenizer/special_tokens_map.json',
 './Comonlist_024_3/tokenizer/spm.model',
 './Comonlist_024_3/tokenizer/added_tokens.json',
 './Comonlist_024_3/tokenizer/tokenizer.json')

# Text max Length

In [11]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every summary text, measured without special tokens.
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
lengths = [
    len(CFG.tokenizer(summary_text, add_special_tokens=False)['input_ids'])
    for summary_text in tk0
]
# The longest text plus two positions for the cls & sep tokens.
CFG.max_len = max(lengths) + 2
LOGGER.info(f"text column max_len: {CFG.max_len}")

  0%|          | 0/7165 [00:00<?, ?it/s]

text column max_len: 822


In [12]:
input_col = "input"
sep = CFG.tokenizer.sep_token
# Model input = prompt title, prompt question and the student's text, joined by the
# tokenizer's separator token. The prompt text itself is deliberately left out.
title_and_question = train["prompt_title"] + sep + train["prompt_question"]
train[input_col] = title_and_question + sep + train["text"]

# Input max length

In [13]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every assembled model input, measured without special tokens.
tk0 = tqdm(train['input'].fillna("").values, total=len(train))
lengths = [
    len(CFG.tokenizer(model_input, add_special_tokens=False)['input_ids'])
    for model_input in tk0
]
# The longest input plus two positions for the cls & sep tokens. This overwrites
# the value derived from the text column alone.
CFG.max_len = max(lengths) + 2
LOGGER.info(f"input column max_len: {CFG.max_len}")

  0%|          | 0/7165 [00:00<?, ?it/s]

input column max_len: 858


In [14]:
class FeedBackDataset(Dataset):
    """Dataset turning the ``input`` column into fixed-length token ids plus targets.

    Every sample is encoded with the tokenizer and then padded, or head-and-tail
    truncated, to exactly ``max_length`` tokens.

    Args:
        df: DataFrame holding an ``input`` column and the ``CFG.targets`` columns.
        tokenizer: tokenizer exposing ``encode`` and ``pad_token_id``.
        max_length: length of every returned token sequence.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Use the constructor arguments; they used to be ignored in favour of the
        # CFG globals (callers pass CFG.tokenizer / CFG.max_len, so nothing changes).
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df['input'].values
        self.targets = df[CFG.targets].values

    def __len__(self):
        return len(self.df)

    # The original author wanted to turn this into a staticmethod.
    def cut_head_and_tail(self, text):
        """Encode ``text`` and force the result to exactly ``self.max_len`` tokens.

        Shorter sequences are right-padded with the tokenizer's pad id (mask 0).
        Longer ones keep their first and last special tokens plus the head and
        the tail of the body, dropping the middle.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``,
            each a list of length ``self.max_len``.
        """
        input_ids = self.tokenizer.encode(text)
        n_token = len(input_ids)

        if n_token <= self.max_len:
            n_pad = self.max_len - n_token
            # Pad with the tokenizer's own pad id. The literal 1 used before is not
            # the pad id of every tokenizer.
            input_ids = input_ids + [self.tokenizer.pad_token_id] * n_pad
            attention_mask = [1] * n_token + [0] * n_pad
            token_type_ids = [1] * n_token + [0] * n_pad
        else:
            body = input_ids[1:-1]
            head_len = (self.max_len - 2) // 2
            # The tail takes the remaining budget, so odd max_len values are filled too.
            tail_len = self.max_len - 2 - head_len
            # Reuse the special tokens the tokenizer produced instead of hard-coded
            # ids 0 / 2. Slicing from an explicit start index also stays correct
            # when tail_len is 0 (a "-0" slice would return the whole body).
            input_ids = (
                [input_ids[0]]
                + body[:head_len]
                + body[len(body) - tail_len:]
                + [input_ids[-1]]
            )
            attention_mask = [1] * self.max_len
            token_type_ids = [1] * self.max_len

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    def __getitem__(self, index):
        """Return token ids, attention mask and target row for sample ``index``.

        ``token_type_ids`` is computed by ``cut_head_and_tail`` but not returned here.
        """
        inputs = self.cut_head_and_tail(self.text[index])
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'target': self.targets[index],
        }

In [15]:
class Collate:
    """Batch collator: pads samples to the longest one in the batch and builds tensors.

    Args:
        tokenizer: object exposing ``padding_side`` and ``pad_token_id``.
        isTrain: when True, each sample's ``target`` is collated as well.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        """Collate a list of sample dicts into a dict of tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (long tensors of shape
            (batch, longest_sample)) and, when ``isTrain``, ``target`` (float tensor).
        """
        # Longest sequence in this batch; shorter samples are padded up to it.
        batch_max = max(len(sample["input_ids"]) for sample in batch)
        pad_right = self.tokenizer.padding_side == "right"

        def _pad(seq, fill):
            # Put the filler on the side the tokenizer is configured to pad on.
            filler = [fill] * (batch_max - len(seq))
            return seq + filler if pad_right else filler + seq

        # NOTE: the original author marked this padding as pointless, presumably
        # because the dataset already pads every sample to one common length.
        output = dict()
        output["input_ids"] = torch.tensor(
            [_pad(sample["input_ids"], self.tokenizer.pad_token_id) for sample in batch],
            dtype=torch.long,
        )
        output["attention_mask"] = torch.tensor(
            [_pad(sample["attention_mask"], 0) for sample in batch],
            dtype=torch.long,
        )
        if self.isTrain:
            # Stack into one ndarray first: building a tensor straight from a list
            # of ndarrays takes a slow path and emits a warning.
            output["target"] = torch.tensor(
                np.asarray([sample["target"] for sample in batch]),
                dtype=torch.float,
            )

        return output

In [16]:
# Shared collator for the DataLoaders built in train_loop; isTrain=True also collates targets.
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [17]:
def freeze(module):
    """Disable gradient tracking for every parameter of ``module``."""
    for weight in module.parameters():
        weight.requires_grad_(False)

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: config exposing ``scheduler`` ('linear' or 'cosine'),
            ``num_warmup_steps`` and, for 'cosine', ``num_cycles``.
        optimizer: optimizer whose learning rate is scheduled.
        num_train_steps: total number of scheduler steps for the whole run.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: if ``cfg.scheduler`` is not a supported name. Previously this
            surfaced as an UnboundLocalError on the return statement.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
    )


In [18]:
class FeedBackModel(nn.Module):
    """Transformer backbone with a LayerNorm + Linear regression head.

    The head reads the hidden state of the first token and outputs
    ``CFG.target_size`` values per sample.

    Args:
        model_name: name or path handed to ``AutoConfig`` / ``AutoModel``.
    """

    def __init__(self, model_name):
        super(FeedBackModel, self).__init__()

        self.cfg = CFG
        self.config = AutoConfig.from_pretrained(model_name)
        # Dropout is switched off in the backbone.
        self.config.hidden_dropout_prob = 0
        self.config.attention_probs_dropout_prob = 0

        self.model = AutoModel.from_pretrained(model_name, config=self.config)

        self.output = nn.Sequential(
            nn.LayerNorm(self.config.hidden_size),
            nn.Linear(self.config.hidden_size, self.cfg.target_size)
        )

        # Freeze the embeddings and the first two encoder layers.
        if self.cfg.freezing:
            freeze(self.model.embeddings)
            freeze(self.model.encoder.layer[:2])

        # Gradient Checkpointing (currently disabled)
        #if self.cfg.gradient_checkpoint:
        #    self.model.gradient_checkpointing_enable()

        # Re-initialisation of the last encoder layers (currently disabled)
        #if self.cfg.reinit_layers > 0:
        #    layers = self.model.encoder.layer[-self.cfg.reinit_layers:]
        #    for layer in layers:
        #        for module in layer.modules():
        #            self._init_weights(module)

    def _init_weights(self, module):
        """Re-initialise one Linear / Embedding / LayerNorm module in place.

        Only used by the disabled re-initialisation code in ``__init__``.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask, token_type_ids=None, targets=None):
        """Return the regression outputs for a batch.

        Args:
            ids: token id tensor passed to the backbone.
            mask: attention mask tensor passed to the backbone.
            token_type_ids: optional segment ids; forwarded only when given.
            targets: accepted for interface compatibility, not used.

        Returns:
            Tensor produced by the head from the first-token hidden state.
        """
        # Compare against None: taking the truth value of a multi-element tensor
        # raises a RuntimeError.
        if token_type_ids is not None:
            transformer_out = self.model(ids, mask, token_type_ids)
        else:
            transformer_out = self.model(ids, mask)

        # simple CLS pooling: hidden state of the first token
        sequence_output = transformer_out[0][:, 0, :]

        logits = self.output(sequence_output)

        return logits

In [19]:
def criterion(outputs, targets):
    """Mean squared error between ``outputs`` and ``targets``, averaged over all elements."""
    return F.mse_loss(outputs, targets)

def get_score(outputs, targets):
    """Mean column-wise RMSE (MCRMSE) between predictions and labels.

    Implemented with NumPy rather than
    ``metrics.mean_squared_error(..., squared=False)``, whose ``squared``
    argument is deprecated in recent scikit-learn releases.

    Args:
        outputs: 2-D array-like of predictions, one column per target.
        targets: 2-D array-like of labels with the same shape.

    Returns:
        Scalar mean of the per-column RMSE values.
    """
    outputs = np.asarray(outputs, dtype=float)
    targets = np.asarray(targets, dtype=float)
    # RMSE of each target column, then the plain average across columns.
    rmse_per_target = np.sqrt(np.mean((targets - outputs) ** 2, axis=0))
    mcrmse = np.mean(rmse_per_target)
    return mcrmse
def get_result(oof_df):
    """Log the score of the predictions stored in ``oof_df``.

    Reads the label columns named in ``CFG.targets`` and the prediction columns
    ``pred_0`` / ``pred_1``.
    """
    prediction_columns = ['pred_0', 'pred_1']
    score = get_score(
        oof_df[prediction_columns].values,
        oof_df[CFG.targets].values,
    )
    LOGGER.info(f'Score: {score:<.4f}')

# Training And Validation Per Epoch

In [20]:
def train_one_epoch(model, optimizer, scheduler, dataloader, valid_loader, device, epoch, best_score, valid_labels):
    """Train for one epoch, validating every ``CFG.eval_freq`` steps.

    Args:
        model: network to train.
        optimizer: optimizer stepped every ``CFG.n_accumulate`` batches.
        scheduler: LR scheduler stepped with the optimizer, or None.
        dataloader: training batches (dicts with input_ids / attention_mask / target).
        valid_loader: validation batches passed to ``valid_one_epoch``.
        device: device the batches are moved to.
        epoch: zero-based epoch index, used for logging.
        best_score: best (lowest) validation score so far.
        valid_labels: label array scored against the validation predictions.

    Returns:
        Tuple ``(epoch_loss, valid_epoch_loss, pred, best_score)``. The validation
        loss and ``pred`` come from the most recent validation run of this epoch,
        which is not necessarily at the very end of the epoch.

    NOTE(review): the checkpoint path uses the name ``fold``, which is not a
    parameter here; it resolves to the module-level variable of the driver loop.
    """
    model.train()
    scaler = GradScaler(enabled=CFG.apex)

    dataset_size = 0
    running_loss = 0
    # Set by the periodic validation; stay None until one has actually run.
    valid_epoch_loss = None
    pred = None

    start = time.time()

    for step, data in enumerate(dataloader):

        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.float)

        batch_size = ids.size(0)

        with autocast(enabled=CFG.apex):
            outputs = model(ids, mask)
            loss = criterion(outputs, targets)

        # gradient accumulation: scale the loss so accumulated gradients average out
        loss = loss / CFG.n_accumulate
        scaler.scale(loss).backward()
        if (step + 1) % CFG.n_accumulate == 0:
            # torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        if step % CFG.print_freq == 0 or step == (len(dataloader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Loss: [{3}]'
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, len(dataloader), epoch_loss,
                          remain=timeSince(start, float(step+1)/len(dataloader))))

        if step > 0 and step % CFG.eval_freq == 0:

            valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

            score = get_score(pred, valid_labels)

            LOGGER.info(f'Epoch {epoch+1} Step {step} - avg_train_loss: {epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}')
            LOGGER.info(f'Epoch {epoch+1} Step {step} - Score: {score:.4f}')

            if score < best_score:
                best_score = score
                LOGGER.info(f'Epoch {epoch+1} Step {step} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(),
                            'predictions': pred},
                            OUTPUT_DIR+f"/model/{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

            # valid_one_epoch leaves the model in eval mode; switch back so the
            # remaining steps of this epoch really run in training mode.
            model.train()

    if pred is None:
        # No periodic validation happened (epoch shorter than CFG.eval_freq steps).
        # Validate once here so the return values below are always defined.
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

    gc.collect()

    return epoch_loss, valid_epoch_loss, pred, best_score

@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Run the model over ``dataloader`` in eval mode, without gradients.

    Args:
        model: network to evaluate; it is left in eval mode on return.
        dataloader: validation batches (dicts with input_ids / attention_mask / target).
        device: device the batches are moved to.
        epoch: accepted for interface symmetry; not used.

    Returns:
        Tuple ``(epoch_loss, pred)``: the sample-weighted mean loss and the
        concatenated model outputs as one NumPy array.
    """
    model.eval()

    n_batches = len(dataloader)
    seen_samples = 0
    loss_sum = 0
    batch_preds = []
    started_at = time.time()

    for step, batch in enumerate(dataloader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['target'].to(device, dtype=torch.float)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        batch_preds.append(outputs.to('cpu').numpy())

        # Weight each batch loss by its size so the running mean is per sample.
        n_in_batch = ids.size(0)
        loss_sum += (loss.item() * n_in_batch)
        seen_samples += n_in_batch
        epoch_loss = loss_sum / seen_samples

        is_last_step = step == (n_batches - 1)
        if is_last_step or step % CFG.print_freq == 0:
            remain = timeSince(started_at, float(step+1)/n_batches)
            print(f'EVAL: [{step}/{n_batches}] Elapsed {remain:s} ')

    pred = np.concatenate(batch_preds)
    return epoch_loss, pred

# Training And Validation Loop

In [21]:
def train_loop(fold):
    """Train and validate one cross-validation fold.

    Rows with ``train.fold == fold`` are the validation set; all other rows are
    used for training.

    Args:
        fold: index of the fold held out for validation.

    Returns:
        The validation rows with one ``pred_<i>`` column per model output, filled
        from the predictions stored in this fold's best checkpoint.
    """
    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.fold != fold].reset_index(drop=True)
    valid_data = train[train.fold == fold].reset_index(drop=True)
    valid_labels = valid_data[CFG.targets].values

    trainDataset = FeedBackDataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = FeedBackDataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)

    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size * 2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

    # Checkpoints are written under OUTPUT_DIR/model, which nothing else creates.
    # Without this, the first torch.save fails on a fresh output directory.
    os.makedirs(OUTPUT_DIR+'/model', exist_ok=True)
    best_checkpoint = OUTPUT_DIR+f"/model/{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    model = FeedBackModel(CFG.model)
    torch.save(model.config, OUTPUT_DIR+'/model/config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    # The scheduler is stepped once per optimizer step, so count those: batches per
    # epoch (already reflecting drop_last) divided by the accumulation factor.
    num_train_steps = len(train_loader) // CFG.n_accumulate * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # loop
    best_score = 100

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss, valid_epoch_loss, pred, best_score = train_one_epoch(model, optimizer, scheduler, train_loader, valid_loader, device, epoch, best_score, valid_labels)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if score < best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        best_checkpoint)

    # The best checkpoint may have been written mid-epoch inside train_one_epoch,
    # so its predictions are read back from disk.
    predictions = torch.load(best_checkpoint,
                             map_location=torch.device('cpu'))['predictions']
    # One pred_<i> column per model output instead of a hard-coded pair.
    for target_idx in range(predictions.shape[1]):
        valid_data[f'pred_{target_idx}'] = predictions[:, target_idx]

    # Drop the references first, so that emptying the CUDA cache can release
    # this fold's memory before the next fold starts.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

    return valid_data

In [22]:
if CFG.train:
    # Collect the per-fold frames and concatenate once. Growing a DataFrame by
    # concatenating onto an initially empty frame relies on pandas' deprecated
    # handling of empty entries.
    fold_frames = []
    # NOTE: the loop variable must remain the module-level name `fold`;
    # train_one_epoch reads it when building its checkpoint path.
    for fold in range(CFG.n_fold):
        if fold in CFG.trn_fold:
            _oof_df = train_loop(fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
    oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    oof_df.to_csv(OUTPUT_DIR+f'_oof_df.csv', index=False)

-------------fold:0 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

Epoch: [1][0/5108] Loss: [2.671222686767578]Elapsed 0m 1s (remain 142m 16s) 
Epoch: [1][100/5108] Loss: [1.0035157612142254]Elapsed 0m 10s (remain 8m 30s) 
Epoch: [1][200/5108] Loss: [0.9806683814855042]Elapsed 0m 18s (remain 7m 42s) 
Epoch: [1][300/5108] Loss: [0.8687996745080164]Elapsed 0m 27s (remain 7m 20s) 
Epoch: [1][400/5108] Loss: [0.7793878867857604]Elapsed 0m 36s (remain 7m 5s) 
Epoch: [1][500/5108] Loss: [0.7169835375895642]Elapsed 0m 44s (remain 6m 52s) 
Epoch: [1][600/5108] Loss: [0.6722009869712181]Elapsed 0m 53s (remain 6m 41s) 
Epoch: [1][700/5108] Loss: [0.636118996496992]Elapsed 1m 2s (remain 6m 30s) 
Epoch: [1][800/5108] Loss: [0.599341884369154]Elapsed 1m 10s (remain 6m 20s) 
Epoch: [1][900/5108] Loss: [0.5793823420322097]Elapsed 1m 19s (remain 6m 12s) 
Epoch: [1][1000/5108] Loss: [0.5566850868251007]Elapsed 1m 29s (remain 6m 6s) 
Epoch: [1][1100/5108] Loss: [0.5421935644879673]Elapsed 1m 38s (remain 5m 59s) 
Epoch: [1][1200/5108] Loss: [0.531760514137139]Elapsed 1m

Epoch 1 Step 1560 - avg_train_loss: 0.4893  avg_val_loss: 0.3078
Epoch 1 Step 1560 - Score: 0.5500
Epoch 1 Step 1560 - Save Best Score: 0.5500 Model


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [1][1600/5108] Loss: [0.48678002259118325]Elapsed 4m 19s (remain 9m 29s) 
Epoch: [1][1700/5108] Loss: [0.4762185636668466]Elapsed 4m 29s (remain 8m 59s) 
Epoch: [1][1800/5108] Loss: [0.46885177440245523]Elapsed 4m 38s (remain 8m 31s) 
Epoch: [1][1900/5108] Loss: [0.45924669681219343]Elapsed 4m 48s (remain 8m 6s) 
Epoch: [1][2000/5108] Loss: [0.44988548614957874]Elapsed 4m 57s (remain 7m 42s) 
Epoch: [1][2100/5108] Loss: [0.4440156893883529]Elapsed 5m 7s (remain 7m 19s) 
Epoch: [1][2200/5108] Loss: [0.4373977525700866]Elapsed 5m 16s (remain 6m 58s) 
Epoch: [1][2300/5108] Loss: [0.4294021329410401]Elapsed 5m 26s (remain 6m 37s) 
Epoch: [1][2400/5108] Loss: [0.42101600636578]Elapsed 5m 35s (remain 6m 18s) 
Epoch: [1][2500/5108] Loss: [0.41762667971920503]Elapsed 5m 45s (remain 5m 59s) 
Epoch: [1][2600/5108] Loss: [0.4093976663247953]Elapsed 5m 54s (remain 5m 41s) 
Epoch: [1][2700/5108] Loss: [0.4053047892166705]Elapsed 6m 4s (remain 

Epoch 1 Step 3120 - avg_train_loss: 0.3911  avg_val_loss: 0.2587
Epoch 1 Step 3120 - Score: 0.5064
Epoch 1 Step 3120 - Save Best Score: 0.5064 Model


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [1][3200/5108] Loss: [0.3886875995492338]Elapsed 8m 44s (remain 5m 12s) 
Epoch: [1][3300/5108] Loss: [0.38468658634526115]Elapsed 8m 54s (remain 4m 52s) 
Epoch: [1][3400/5108] Loss: [0.3827701811234903]Elapsed 9m 3s (remain 4m 32s) 
Epoch: [1][3500/5108] Loss: [0.3817216397359492]Elapsed 9m 13s (remain 4m 13s) 
Epoch: [1][3600/5108] Loss: [0.38035355873270454]Elapsed 9m 22s (remain 3m 55s) 
Epoch: [1][3700/5108] Loss: [0.3759933146824872]Elapsed 9m 31s (remain 3m 37s) 
Epoch: [1][3800/5108] Loss: [0.3716341864559415]Elapsed 9m 41s (remain 3m 19s) 
Epoch: [1][3900/5108] Loss: [0.3675498794876175]Elapsed 9m 50s (remain 3m 2s) 
Epoch: [1][4000/5108] Loss: [0.36528760961370726]Elapsed 10m 0s (remain 2m 46s) 
Epoch: [1][4100/5108] Loss: [0.36218092948790853]Elapsed 10m 9s (remain 2m 29s) 
Epoch: [1][4200/5108] Loss: [0.35952882040370315]Elapsed 10m 19s (remain 2m 13s) 
Epoch: [1][4300/5108] Loss: [0.35800756307008114]Elapsed 10m 28s (r

Epoch 1 Step 4680 - avg_train_loss: 0.3539  avg_val_loss: 0.3613
Epoch 1 Step 4680 - Score: 0.6010


EVAL: [1028/1029] Elapsed 1m 47s (remain 0m 0s) 
Epoch: [1][4700/5108] Loss: [0.35301869167197597]Elapsed 12m 54s (remain 1m 7s) 
Epoch: [1][4800/5108] Loss: [0.3503628119270441]Elapsed 13m 3s (remain 0m 50s) 
Epoch: [1][4900/5108] Loss: [0.3473649296878122]Elapsed 13m 13s (remain 0m 33s) 
Epoch: [1][5000/5108] Loss: [0.3458356210261119]Elapsed 13m 22s (remain 0m 17s) 
Epoch: [1][5100/5108] Loss: [0.34444214328049955]Elapsed 13m 32s (remain 0m 1s) 


Epoch 1 - avg_train_loss: 0.3442  avg_val_loss: 0.3613  time: 813s
Epoch 1 - Score: 0.6010


Epoch: [1][5107/5108] Loss: [0.3442399825651007]Elapsed 13m 32s (remain 0m 0s) 
Epoch: [2][0/5108] Loss: [0.16139928996562958]Elapsed 0m 0s (remain 6m 52s) 
Epoch: [2][100/5108] Loss: [0.21285053608607096]Elapsed 0m 9s (remain 7m 51s) 
Epoch: [2][200/5108] Loss: [0.19944580363408565]Elapsed 0m 18s (remain 7m 43s) 
Epoch: [2][300/5108] Loss: [0.19752105675363896]Elapsed 0m 28s (remain 7m 34s) 
Epoch: [2][400/5108] Loss: [0.20530925470214462]Elapsed 0m 37s (remain 7m 24s) 
Epoch: [2][500/5108] Loss: [0.20492681853929554]Elapsed 0m 47s (remain 7m 15s) 
Epoch: [2][600/5108] Loss: [0.2268856804569588]Elapsed 0m 56s (remain 7m 6s) 
Epoch: [2][700/5108] Loss: [0.23484436606784936]Elapsed 1m 6s (remain 6m 57s) 
Epoch: [2][800/5108] Loss: [0.23580897756766078]Elapsed 1m 15s (remain 6m 47s) 
Epoch: [2][900/5108] Loss: [0.236228506130886]Elapsed 1m 25s (remain 6m 38s) 
Epoch: [2][1000/5108] Loss: [0.22894639119714102]Elapsed 1m 34s (remain 6m 29s) 
Epoch: [2][1100/5108] Loss: [0.23418009066158527

Epoch 2 Step 1560 - avg_train_loss: 0.2274  avg_val_loss: 0.2855
Epoch 2 Step 1560 - Score: 0.5321


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [2][1600/5108] Loss: [0.22737604209021817]Elapsed 4m 20s (remain 9m 30s) 
Epoch: [2][1700/5108] Loss: [0.22807523495544355]Elapsed 4m 29s (remain 9m 0s) 
Epoch: [2][1800/5108] Loss: [0.22611398076244918]Elapsed 4m 39s (remain 8m 32s) 
Epoch: [2][1900/5108] Loss: [0.22778644515458155]Elapsed 4m 48s (remain 8m 7s) 
Epoch: [2][2000/5108] Loss: [0.2254048867018763]Elapsed 4m 58s (remain 7m 43s) 
Epoch: [2][2100/5108] Loss: [0.2250955975926478]Elapsed 5m 7s (remain 7m 20s) 
Epoch: [2][2200/5108] Loss: [0.2222026547530584]Elapsed 5m 17s (remain 6m 59s) 
Epoch: [2][2300/5108] Loss: [0.22136291042930073]Elapsed 5m 26s (remain 6m 38s) 
Epoch: [2][2400/5108] Loss: [0.2196638596105292]Elapsed 5m 36s (remain 6m 19s) 
Epoch: [2][2500/5108] Loss: [0.21889152639538612]Elapsed 5m 45s (remain 6m 0s) 
Epoch: [2][2600/5108] Loss: [0.21825286336870658]Elapsed 5m 55s (remain 5m 42s) 
Epoch: [2][2700/5108] Loss: [0.21622696180789297]Elapsed 6m 4s (rema

Epoch 2 Step 3120 - avg_train_loss: 0.2154  avg_val_loss: 0.2871
Epoch 2 Step 3120 - Score: 0.5334


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [2][3200/5108] Loss: [0.21652670740129004]Elapsed 8m 40s (remain 5m 10s) 
Epoch: [2][3300/5108] Loss: [0.21656013627940082]Elapsed 8m 50s (remain 4m 50s) 
Epoch: [2][3400/5108] Loss: [0.21575682917433014]Elapsed 8m 59s (remain 4m 30s) 
Epoch: [2][3500/5108] Loss: [0.2153696867339182]Elapsed 9m 9s (remain 4m 12s) 
Epoch: [2][3600/5108] Loss: [0.21479607505692766]Elapsed 9m 18s (remain 3m 53s) 
Epoch: [2][3700/5108] Loss: [0.2144474270649682]Elapsed 9m 28s (remain 3m 36s) 
Epoch: [2][3800/5108] Loss: [0.213775616119748]Elapsed 9m 37s (remain 3m 18s) 
Epoch: [2][3900/5108] Loss: [0.2138337212034444]Elapsed 9m 47s (remain 3m 1s) 
Epoch: [2][4000/5108] Loss: [0.21266002465543357]Elapsed 9m 56s (remain 2m 45s) 
Epoch: [2][4100/5108] Loss: [0.21201049299943509]Elapsed 10m 6s (remain 2m 28s) 
Epoch: [2][4200/5108] Loss: [0.2116817269438368]Elapsed 10m 15s (remain 2m 12s) 
Epoch: [2][4300/5108] Loss: [0.21122061800742534]Elapsed 10m 25s (r

Epoch 2 Step 4680 - avg_train_loss: 0.2105  avg_val_loss: 0.2763
Epoch 2 Step 4680 - Score: 0.5255


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [2][4700/5108] Loss: [0.21022875614328973]Elapsed 12m 51s (remain 1m 6s) 
Epoch: [2][4800/5108] Loss: [0.2103320408920053]Elapsed 13m 1s (remain 0m 49s) 
Epoch: [2][4900/5108] Loss: [0.21027807067490853]Elapsed 13m 10s (remain 0m 33s) 
Epoch: [2][5000/5108] Loss: [0.2103509696961921]Elapsed 13m 20s (remain 0m 17s) 
Epoch: [2][5100/5108] Loss: [0.21061211176800973]Elapsed 13m 29s (remain 0m 1s) 


Epoch 2 - avg_train_loss: 0.2107  avg_val_loss: 0.2763  time: 811s
Epoch 2 - Score: 0.5255


Epoch: [2][5107/5108] Loss: [0.2107251823122116]Elapsed 13m 30s (remain 0m 0s) 
Epoch: [3][0/5108] Loss: [0.08696401864290237]Elapsed 0m 0s (remain 6m 57s) 
Epoch: [3][100/5108] Loss: [0.13361788850328377]Elapsed 0m 9s (remain 7m 51s) 
Epoch: [3][200/5108] Loss: [0.14197342374480024]Elapsed 0m 18s (remain 7m 43s) 
Epoch: [3][300/5108] Loss: [0.14438490376860227]Elapsed 0m 28s (remain 7m 35s) 
Epoch: [3][400/5108] Loss: [0.14139088307050873]Elapsed 0m 37s (remain 7m 25s) 
Epoch: [3][500/5108] Loss: [0.1386193704038959]Elapsed 0m 47s (remain 7m 16s) 
Epoch: [3][600/5108] Loss: [0.14067842415684986]Elapsed 0m 56s (remain 7m 7s) 
Epoch: [3][700/5108] Loss: [0.13772149655153157]Elapsed 1m 6s (remain 6m 57s) 
Epoch: [3][800/5108] Loss: [0.13860605345544127]Elapsed 1m 15s (remain 6m 48s) 
Epoch: [3][900/5108] Loss: [0.13731394042470058]Elapsed 1m 25s (remain 6m 38s) 
Epoch: [3][1000/5108] Loss: [0.13543522554503284]Elapsed 1m 34s (remain 6m 29s) 
Epoch: [3][1100/5108] Loss: [0.136588945577609

Epoch 3 Step 1560 - avg_train_loss: 0.1472  avg_val_loss: 0.2704
Epoch 3 Step 1560 - Score: 0.5198


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [3][1600/5108] Loss: [0.14677338092792283]Elapsed 4m 20s (remain 9m 30s) 
Epoch: [3][1700/5108] Loss: [0.14797434663661693]Elapsed 4m 30s (remain 9m 0s) 
Epoch: [3][1800/5108] Loss: [0.14813335001698705]Elapsed 4m 39s (remain 8m 33s) 
Epoch: [3][1900/5108] Loss: [0.14761680148386805]Elapsed 4m 49s (remain 8m 7s) 
Epoch: [3][2000/5108] Loss: [0.14752267088690735]Elapsed 4m 58s (remain 7m 43s) 
Epoch: [3][2100/5108] Loss: [0.1473223063951886]Elapsed 5m 8s (remain 7m 20s) 
Epoch: [3][2200/5108] Loss: [0.14861373611606274]Elapsed 5m 17s (remain 6m 59s) 
Epoch: [3][2300/5108] Loss: [0.147467589692613]Elapsed 5m 27s (remain 6m 38s) 
Epoch: [3][2400/5108] Loss: [0.1467384055751892]Elapsed 5m 36s (remain 6m 19s) 
Epoch: [3][2500/5108] Loss: [0.14558662081036614]Elapsed 5m 46s (remain 6m 0s) 
Epoch: [3][2600/5108] Loss: [0.14554626777232335]Elapsed 5m 55s (remain 5m 42s) 
Epoch: [3][2700/5108] Loss: [0.1449580915810914]Elapsed 6m 4s (remai

Epoch 3 Step 3120 - avg_train_loss: 0.1438  avg_val_loss: 0.2952
Epoch 3 Step 3120 - Score: 0.5432


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [3][3200/5108] Loss: [0.14404309811975577]Elapsed 8m 41s (remain 5m 10s) 
Epoch: [3][3300/5108] Loss: [0.14384988218032807]Elapsed 8m 50s (remain 4m 50s) 
Epoch: [3][3400/5108] Loss: [0.14314217298217233]Elapsed 9m 0s (remain 4m 31s) 
Epoch: [3][3500/5108] Loss: [0.1428260538587053]Elapsed 9m 9s (remain 4m 12s) 
Epoch: [3][3600/5108] Loss: [0.14306421303221073]Elapsed 9m 19s (remain 3m 53s) 
Epoch: [3][3700/5108] Loss: [0.14210789772268073]Elapsed 9m 28s (remain 3m 36s) 
Epoch: [3][3800/5108] Loss: [0.14149334882370168]Elapsed 9m 38s (remain 3m 18s) 
Epoch: [3][3900/5108] Loss: [0.14135145971177707]Elapsed 9m 47s (remain 3m 1s) 
Epoch: [3][4000/5108] Loss: [0.14154045364449466]Elapsed 9m 57s (remain 2m 45s) 
Epoch: [3][4100/5108] Loss: [0.14160668293538298]Elapsed 10m 6s (remain 2m 28s) 
Epoch: [3][4200/5108] Loss: [0.14141263139153268]Elapsed 10m 16s (remain 2m 13s) 
Epoch: [3][4300/5108] Loss: [0.14091277769328117]Elapsed 10m 25

Epoch 3 Step 4680 - avg_train_loss: 0.1409  avg_val_loss: 0.2771
Epoch 3 Step 4680 - Score: 0.5263


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [3][4700/5108] Loss: [0.1408061511505849]Elapsed 12m 52s (remain 1m 6s) 
Epoch: [3][4800/5108] Loss: [0.14094308203634767]Elapsed 13m 1s (remain 0m 49s) 
Epoch: [3][4900/5108] Loss: [0.14061089670615629]Elapsed 13m 11s (remain 0m 33s) 
Epoch: [3][5000/5108] Loss: [0.14053606971493873]Elapsed 13m 20s (remain 0m 17s) 
Epoch: [3][5100/5108] Loss: [0.1400637319332963]Elapsed 13m 30s (remain 0m 1s) 


Epoch 3 - avg_train_loss: 0.1401  avg_val_loss: 0.2771  time: 811s
Epoch 3 - Score: 0.5263


Epoch: [3][5107/5108] Loss: [0.14005531195893856]Elapsed 13m 30s (remain 0m 0s) 
Epoch: [4][0/5108] Loss: [0.02368536964058876]Elapsed 0m 0s (remain 6m 52s) 
Epoch: [4][100/5108] Loss: [0.10509998442911736]Elapsed 0m 9s (remain 7m 53s) 
Epoch: [4][200/5108] Loss: [0.10201675171149542]Elapsed 0m 19s (remain 7m 44s) 
Epoch: [4][300/5108] Loss: [0.10063953776735553]Elapsed 0m 28s (remain 7m 35s) 
Epoch: [4][400/5108] Loss: [0.10380945290747869]Elapsed 0m 38s (remain 7m 26s) 
Epoch: [4][500/5108] Loss: [0.10260048506983414]Elapsed 0m 47s (remain 7m 16s) 
Epoch: [4][600/5108] Loss: [0.10231344815208245]Elapsed 0m 57s (remain 7m 7s) 
Epoch: [4][700/5108] Loss: [0.1008139742651022]Elapsed 1m 6s (remain 6m 58s) 
Epoch: [4][800/5108] Loss: [0.10245417971560124]Elapsed 1m 16s (remain 6m 48s) 
Epoch: [4][900/5108] Loss: [0.10482090979100549]Elapsed 1m 25s (remain 6m 39s) 
Epoch: [4][1000/5108] Loss: [0.10265899284742089]Elapsed 1m 34s (remain 6m 29s) 
Epoch: [4][1100/5108] Loss: [0.10203739021374

Epoch 4 Step 1560 - avg_train_loss: 0.1042  avg_val_loss: 0.2758
Epoch 4 Step 1560 - Score: 0.5249


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [4][1600/5108] Loss: [0.10423299238548638]Elapsed 4m 20s (remain 9m 30s) 
Epoch: [4][1700/5108] Loss: [0.10467566067510158]Elapsed 4m 30s (remain 9m 1s) 
Epoch: [4][1800/5108] Loss: [0.10438967593293605]Elapsed 4m 39s (remain 8m 33s) 
Epoch: [4][1900/5108] Loss: [0.1044876216476619]Elapsed 4m 49s (remain 8m 7s) 
Epoch: [4][2000/5108] Loss: [0.10400466369861416]Elapsed 4m 58s (remain 7m 43s) 
Epoch: [4][2100/5108] Loss: [0.10369686751096185]Elapsed 5m 8s (remain 7m 20s) 
Epoch: [4][2200/5108] Loss: [0.10283998764457938]Elapsed 5m 17s (remain 6m 59s) 
Epoch: [4][2300/5108] Loss: [0.10280053561271069]Elapsed 5m 27s (remain 6m 38s) 
Epoch: [4][2400/5108] Loss: [0.10213415515464604]Elapsed 5m 36s (remain 6m 19s) 
Epoch: [4][2500/5108] Loss: [0.10202688091699494]Elapsed 5m 46s (remain 6m 0s) 
Epoch: [4][2600/5108] Loss: [0.10381715049581274]Elapsed 5m 55s (remain 5m 42s) 
Epoch: [4][2700/5108] Loss: [0.10363550261789264]Elapsed 6m 4s (r

Epoch 4 Step 3120 - avg_train_loss: 0.1036  avg_val_loss: 0.2798
Epoch 4 Step 3120 - Score: 0.5289


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [4][3200/5108] Loss: [0.10326580359856415]Elapsed 8m 41s (remain 5m 10s) 
Epoch: [4][3300/5108] Loss: [0.10369050222126074]Elapsed 8m 50s (remain 4m 50s) 
Epoch: [4][3400/5108] Loss: [0.10347300186003287]Elapsed 9m 0s (remain 4m 31s) 
Epoch: [4][3500/5108] Loss: [0.10365383014146441]Elapsed 9m 9s (remain 4m 12s) 
Epoch: [4][3600/5108] Loss: [0.10313520605044067]Elapsed 9m 19s (remain 3m 53s) 
Epoch: [4][3700/5108] Loss: [0.10275329962287369]Elapsed 9m 28s (remain 3m 36s) 
Epoch: [4][3800/5108] Loss: [0.10312982892065573]Elapsed 9m 38s (remain 3m 18s) 
Epoch: [4][3900/5108] Loss: [0.10335396601122596]Elapsed 9m 47s (remain 3m 1s) 
Epoch: [4][4000/5108] Loss: [0.10296721769658594]Elapsed 9m 57s (remain 2m 45s) 
Epoch: [4][4100/5108] Loss: [0.10275702163298905]Elapsed 10m 6s (remain 2m 28s) 
Epoch: [4][4200/5108] Loss: [0.10258784090761648]Elapsed 10m 16s (remain 2m 13s) 
Epoch: [4][4300/5108] Loss: [0.10263796200880788]Elapsed 10m 2

Epoch 4 Step 4680 - avg_train_loss: 0.1025  avg_val_loss: 0.2731
Epoch 4 Step 4680 - Score: 0.5224


EVAL: [1028/1029] Elapsed 1m 48s (remain 0m 0s) 
Epoch: [4][4700/5108] Loss: [0.10246834682002941]Elapsed 12m 52s (remain 1m 6s) 
Epoch: [4][4800/5108] Loss: [0.10273817950456895]Elapsed 13m 1s (remain 0m 49s) 
Epoch: [4][4900/5108] Loss: [0.10270073263011761]Elapsed 13m 11s (remain 0m 33s) 
Epoch: [4][5000/5108] Loss: [0.10241121533492144]Elapsed 13m 20s (remain 0m 17s) 
Epoch: [4][5100/5108] Loss: [0.10248583028644148]Elapsed 13m 30s (remain 0m 1s) 


Epoch 4 - avg_train_loss: 0.1024  avg_val_loss: 0.2731  time: 811s
Epoch 4 - Score: 0.5224


Epoch: [4][5107/5108] Loss: [0.1024373471216418]Elapsed 13m 30s (remain 0m 0s) 


Score: 0.5064
-------------fold:1 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Ber

Epoch: [1][0/5156] Loss: [0.7775920629501343]Elapsed 0m 0s (remain 9m 4s) 
Epoch: [1][100/5156] Loss: [1.143198475377671]Elapsed 0m 9s (remain 7m 58s) 
Epoch: [1][200/5156] Loss: [0.9212338381055372]Elapsed 0m 19s (remain 7m 49s) 
Epoch: [1][300/5156] Loss: [0.7795278827411871]Elapsed 0m 28s (remain 7m 40s) 
Epoch: [1][400/5156] Loss: [0.7014294286434738]Elapsed 0m 38s (remain 7m 30s) 
Epoch: [1][500/5156] Loss: [0.6454587011966816]Elapsed 0m 47s (remain 7m 21s) 
Epoch: [1][600/5156] Loss: [0.6158055745826657]Elapsed 0m 56s (remain 7m 11s) 
Epoch: [1][700/5156] Loss: [0.5895905475391735]Elapsed 1m 6s (remain 7m 2s) 
Epoch: [1][800/5156] Loss: [0.5670146398484206]Elapsed 1m 15s (remain 6m 52s) 
Epoch: [1][900/5156] Loss: [0.5398633215059195]Elapsed 1m 25s (remain 6m 43s) 
Epoch: [1][1000/5156] Loss: [0.5118660388279359]Elapsed 1m 34s (remain 6m 33s) 
Epoch: [1][1100/5156] Loss: [0.49186399653528584]Elapsed 1m 44s (remain 6m 24s) 
Epoch: [1][1200/5156] Loss: [0.47187679209013617]Elapsed 

Epoch 1 Step 1560 - avg_train_loss: 0.4386  avg_val_loss: 0.5584
Epoch 1 Step 1560 - Score: 0.7406
Epoch 1 Step 1560 - Save Best Score: 0.7406 Model


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [1][1600/5156] Loss: [0.43591216008582745]Elapsed 4m 23s (remain 9m 44s) 
Epoch: [1][1700/5156] Loss: [0.4248494786618051]Elapsed 4m 32s (remain 9m 13s) 
Epoch: [1][1800/5156] Loss: [0.4165978325146735]Elapsed 4m 42s (remain 8m 45s) 
Epoch: [1][1900/5156] Loss: [0.40945489333552687]Elapsed 4m 51s (remain 8m 19s) 
Epoch: [1][2000/5156] Loss: [0.4067660198390998]Elapsed 5m 1s (remain 7m 54s) 
Epoch: [1][2100/5156] Loss: [0.39983167652371127]Elapsed 5m 10s (remain 7m 31s) 
Epoch: [1][2200/5156] Loss: [0.3918577025639377]Elapsed 5m 20s (remain 7m 9s) 
Epoch: [1][2300/5156] Loss: [0.3863912100101861]Elapsed 5m 29s (remain 6m 48s) 
Epoch: [1][2400/5156] Loss: [0.3803528420563867]Elapsed 5m 39s (remain 6m 29s) 
Epoch: [1][2500/5156] Loss: [0.37545453848656646]Elapsed 5m 48s (remain 6m 9s) 
Epoch: [1][2600/5156] Loss: [0.37377084902072477]Elapsed 5m 58s (remain 5m 51s) 
Epoch: [1][2700/5156] Loss: [0.36867949498168184]Elapsed 6m 7s (remai

Epoch 1 Step 3120 - avg_train_loss: 0.3580  avg_val_loss: 0.7580
Epoch 1 Step 3120 - Score: 0.8535


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [1][3200/5156] Loss: [0.3548899382220157]Elapsed 8m 41s (remain 5m 18s) 
Epoch: [1][3300/5156] Loss: [0.3524412958914595]Elapsed 8m 50s (remain 4m 58s) 
Epoch: [1][3400/5156] Loss: [0.34835604960642086]Elapsed 9m 0s (remain 4m 38s) 
Epoch: [1][3500/5156] Loss: [0.3460905009221308]Elapsed 9m 9s (remain 4m 19s) 
Epoch: [1][3600/5156] Loss: [0.3444446937821453]Elapsed 9m 19s (remain 4m 1s) 
Epoch: [1][3700/5156] Loss: [0.34226952620786805]Elapsed 9m 28s (remain 3m 43s) 
Epoch: [1][3800/5156] Loss: [0.339460109818352]Elapsed 9m 38s (remain 3m 26s) 
Epoch: [1][3900/5156] Loss: [0.3367236529510991]Elapsed 9m 47s (remain 3m 9s) 
Epoch: [1][4000/5156] Loss: [0.33478271435595963]Elapsed 9m 57s (remain 2m 52s) 
Epoch: [1][4100/5156] Loss: [0.3317470747958731]Elapsed 10m 6s (remain 2m 36s) 
Epoch: [1][4200/5156] Loss: [0.32850219415560006]Elapsed 10m 15s (remain 2m 20s) 
Epoch: [1][4300/5156] Loss: [0.3261562640272445]Elapsed 10m 25s (remain

Epoch 1 Step 4680 - avg_train_loss: 0.3233  avg_val_loss: 0.4927
Epoch 1 Step 4680 - Score: 0.6942
Epoch 1 Step 4680 - Save Best Score: 0.6942 Model


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [1][4700/5156] Loss: [0.3231209622254575]Elapsed 12m 54s (remain 1m 14s) 
Epoch: [1][4800/5156] Loss: [0.3211688612115469]Elapsed 13m 4s (remain 0m 57s) 
Epoch: [1][4900/5156] Loss: [0.31915969534705985]Elapsed 13m 13s (remain 0m 41s) 
Epoch: [1][5000/5156] Loss: [0.3178370722335088]Elapsed 13m 23s (remain 0m 24s) 
Epoch: [1][5100/5156] Loss: [0.3176269321711435]Elapsed 13m 32s (remain 0m 8s) 


Epoch 1 - avg_train_loss: 0.3164  avg_val_loss: 0.4927  time: 818s
Epoch 1 - Score: 0.6942


Epoch: [1][5155/5156] Loss: [0.3164249666056196]Elapsed 13m 37s (remain 0m 0s) 
Epoch: [2][0/5156] Loss: [0.12335623800754547]Elapsed 0m 0s (remain 7m 1s) 
Epoch: [2][100/5156] Loss: [0.2611541556134488]Elapsed 0m 9s (remain 7m 56s) 
Epoch: [2][200/5156] Loss: [0.2332833937035679]Elapsed 0m 18s (remain 7m 47s) 
Epoch: [2][300/5156] Loss: [0.22036172304006774]Elapsed 0m 28s (remain 7m 39s) 
Epoch: [2][400/5156] Loss: [0.20735939326672148]Elapsed 0m 37s (remain 7m 29s) 
Epoch: [2][500/5156] Loss: [0.20220542970045954]Elapsed 0m 47s (remain 7m 20s) 
Epoch: [2][600/5156] Loss: [0.20043042314768333]Elapsed 0m 56s (remain 7m 11s) 
Epoch: [2][700/5156] Loss: [0.1989409888934232]Elapsed 1m 6s (remain 7m 1s) 
Epoch: [2][800/5156] Loss: [0.20012288312681678]Elapsed 1m 15s (remain 6m 52s) 
Epoch: [2][900/5156] Loss: [0.1993677829783008]Elapsed 1m 25s (remain 6m 43s) 
Epoch: [2][1000/5156] Loss: [0.20690939987441917]Elapsed 1m 34s (remain 6m 33s) 
Epoch: [2][1100/5156] Loss: [0.2109554782571263]El

Epoch 2 Step 1560 - avg_train_loss: 0.2106  avg_val_loss: 0.5960
Epoch 2 Step 1560 - Score: 0.7527


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [2][1600/5156] Loss: [0.20980522456919176]Elapsed 4m 18s (remain 9m 33s) 
Epoch: [2][1700/5156] Loss: [0.21245740806125227]Elapsed 4m 27s (remain 9m 3s) 
Epoch: [2][1800/5156] Loss: [0.20924647909480582]Elapsed 4m 37s (remain 8m 36s) 
Epoch: [2][1900/5156] Loss: [0.2099959040981827]Elapsed 4m 46s (remain 8m 10s) 
Epoch: [2][2000/5156] Loss: [0.21211327036022037]Elapsed 4m 56s (remain 7m 46s) 
Epoch: [2][2100/5156] Loss: [0.21587226768389414]Elapsed 5m 5s (remain 7m 24s) 
Epoch: [2][2200/5156] Loss: [0.21505849146206835]Elapsed 5m 15s (remain 7m 2s) 
Epoch: [2][2300/5156] Loss: [0.21465980447421493]Elapsed 5m 24s (remain 6m 42s) 
Epoch: [2][2400/5156] Loss: [0.21249392240166937]Elapsed 5m 33s (remain 6m 23s) 
Epoch: [2][2500/5156] Loss: [0.21398920448103984]Elapsed 5m 43s (remain 6m 4s) 
Epoch: [2][2600/5156] Loss: [0.21439682058246573]Elapsed 5m 52s (remain 5m 46s) 
Epoch: [2][2700/5156] Loss: [0.21714779617038932]Elapsed 6m 2s (r

Epoch 2 Step 3120 - avg_train_loss: 0.2139  avg_val_loss: 0.4789
Epoch 2 Step 3120 - Score: 0.6736
Epoch 2 Step 3120 - Save Best Score: 0.6736 Model


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [2][3200/5156] Loss: [0.21318695567388726]Elapsed 8m 40s (remain 5m 18s) 
Epoch: [2][3300/5156] Loss: [0.21125040401722983]Elapsed 8m 50s (remain 4m 58s) 
Epoch: [2][3400/5156] Loss: [0.21071635063209843]Elapsed 8m 59s (remain 4m 38s) 
Epoch: [2][3500/5156] Loss: [0.21018902449000243]Elapsed 9m 9s (remain 4m 19s) 
Epoch: [2][3600/5156] Loss: [0.2098422087055653]Elapsed 9m 18s (remain 4m 1s) 
Epoch: [2][3700/5156] Loss: [0.2098058121260654]Elapsed 9m 28s (remain 3m 43s) 
Epoch: [2][3800/5156] Loss: [0.20832391996418556]Elapsed 9m 37s (remain 3m 25s) 
Epoch: [2][3900/5156] Loss: [0.2073087040674174]Elapsed 9m 47s (remain 3m 8s) 
Epoch: [2][4000/5156] Loss: [0.20603088976843595]Elapsed 9m 56s (remain 2m 52s) 
Epoch: [2][4100/5156] Loss: [0.20536735820928703]Elapsed 10m 6s (remain 2m 35s) 
Epoch: [2][4200/5156] Loss: [0.20508948845590397]Elapsed 10m 15s (remain 2m 19s) 
Epoch: [2][4300/5156] Loss: [0.20472490898238024]Elapsed 10m 25s 

Epoch 2 Step 4680 - avg_train_loss: 0.2045  avg_val_loss: 0.4513
Epoch 2 Step 4680 - Score: 0.6545
Epoch 2 Step 4680 - Save Best Score: 0.6545 Model


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [2][4700/5156] Loss: [0.20432854484592705]Elapsed 12m 53s (remain 1m 14s) 
Epoch: [2][4800/5156] Loss: [0.2038009011059017]Elapsed 13m 3s (remain 0m 57s) 
Epoch: [2][4900/5156] Loss: [0.20332488583590302]Elapsed 13m 12s (remain 0m 41s) 
Epoch: [2][5000/5156] Loss: [0.20293921109648722]Elapsed 13m 22s (remain 0m 24s) 
Epoch: [2][5100/5156] Loss: [0.20185738568133343]Elapsed 13m 31s (remain 0m 8s) 


Epoch 2 - avg_train_loss: 0.2013  avg_val_loss: 0.4513  time: 817s
Epoch 2 - Score: 0.6545


Epoch: [2][5155/5156] Loss: [0.20131360270537166]Elapsed 13m 36s (remain 0m 0s) 
Epoch: [3][0/5156] Loss: [0.19489657878875732]Elapsed 0m 0s (remain 7m 1s) 
Epoch: [3][100/5156] Loss: [0.13970210835266175]Elapsed 0m 9s (remain 7m 59s) 
Epoch: [3][200/5156] Loss: [0.13890750986200742]Elapsed 0m 19s (remain 7m 49s) 
Epoch: [3][300/5156] Loss: [0.1374548675248359]Elapsed 0m 28s (remain 7m 40s) 
Epoch: [3][400/5156] Loss: [0.13509525676003015]Elapsed 0m 38s (remain 7m 30s) 
Epoch: [3][500/5156] Loss: [0.14465630147138497]Elapsed 0m 47s (remain 7m 21s) 
Epoch: [3][600/5156] Loss: [0.14856108217700148]Elapsed 0m 56s (remain 7m 11s) 
Epoch: [3][700/5156] Loss: [0.14969428152843767]Elapsed 1m 6s (remain 7m 2s) 
Epoch: [3][800/5156] Loss: [0.1506473164580951]Elapsed 1m 15s (remain 6m 53s) 
Epoch: [3][900/5156] Loss: [0.1511142773827292]Elapsed 1m 25s (remain 6m 43s) 
Epoch: [3][1000/5156] Loss: [0.15471195693633538]Elapsed 1m 34s (remain 6m 34s) 
Epoch: [3][1100/5156] Loss: [0.15696150000330256

Epoch 3 Step 1560 - avg_train_loss: 0.1505  avg_val_loss: 0.4581
Epoch 3 Step 1560 - Score: 0.6658


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [3][1600/5156] Loss: [0.15021528426209224]Elapsed 4m 18s (remain 9m 33s) 
Epoch: [3][1700/5156] Loss: [0.14915799566530802]Elapsed 4m 27s (remain 9m 3s) 
Epoch: [3][1800/5156] Loss: [0.15029626936143345]Elapsed 4m 37s (remain 8m 36s) 
Epoch: [3][1900/5156] Loss: [0.14906959046484863]Elapsed 4m 46s (remain 8m 10s) 
Epoch: [3][2000/5156] Loss: [0.14853984936198977]Elapsed 4m 56s (remain 7m 47s) 
Epoch: [3][2100/5156] Loss: [0.1486544317129938]Elapsed 5m 5s (remain 7m 24s) 
Epoch: [3][2200/5156] Loss: [0.14908491145067274]Elapsed 5m 15s (remain 7m 3s) 
Epoch: [3][2300/5156] Loss: [0.14883151245147497]Elapsed 5m 24s (remain 6m 42s) 
Epoch: [3][2400/5156] Loss: [0.1497256812965216]Elapsed 5m 34s (remain 6m 23s) 
Epoch: [3][2500/5156] Loss: [0.14800619928018005]Elapsed 5m 43s (remain 6m 4s) 
Epoch: [3][2600/5156] Loss: [0.14670307310009564]Elapsed 5m 53s (remain 5m 46s) 
Epoch: [3][2700/5156] Loss: [0.14727747796523064]Elapsed 6m 2s (re

Epoch 3 Step 3120 - avg_train_loss: 0.1474  avg_val_loss: 0.4519
Epoch 3 Step 3120 - Score: 0.6553


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [3][3200/5156] Loss: [0.1470120672308486]Elapsed 8m 36s (remain 5m 15s) 
Epoch: [3][3300/5156] Loss: [0.146605132791829]Elapsed 8m 45s (remain 4m 55s) 
Epoch: [3][3400/5156] Loss: [0.14592882555382694]Elapsed 8m 55s (remain 4m 36s) 
Epoch: [3][3500/5156] Loss: [0.1456791313181268]Elapsed 9m 4s (remain 4m 17s) 
Epoch: [3][3600/5156] Loss: [0.14534230139363039]Elapsed 9m 14s (remain 3m 59s) 
Epoch: [3][3700/5156] Loss: [0.14447835162006287]Elapsed 9m 23s (remain 3m 41s) 
Epoch: [3][3800/5156] Loss: [0.1441467015541504]Elapsed 9m 33s (remain 3m 24s) 
Epoch: [3][3900/5156] Loss: [0.14373671761989815]Elapsed 9m 42s (remain 3m 7s) 
Epoch: [3][4000/5156] Loss: [0.14361132988368275]Elapsed 9m 52s (remain 2m 50s) 
Epoch: [3][4100/5156] Loss: [0.14365328429052487]Elapsed 10m 1s (remain 2m 34s) 
Epoch: [3][4200/5156] Loss: [0.14294324435874614]Elapsed 10m 11s (remain 2m 18s) 
Epoch: [3][4300/5156] Loss: [0.14285581572798411]Elapsed 10m 20s (

Epoch 3 Step 4680 - avg_train_loss: 0.1440  avg_val_loss: 0.4235
Epoch 3 Step 4680 - Score: 0.6410
Epoch 3 Step 4680 - Save Best Score: 0.6410 Model


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [3][4700/5156] Loss: [0.1440166277970089]Elapsed 12m 50s (remain 1m 14s) 
Epoch: [3][4800/5156] Loss: [0.1435394500372436]Elapsed 12m 59s (remain 0m 57s) 
Epoch: [3][4900/5156] Loss: [0.14326402365225724]Elapsed 13m 9s (remain 0m 41s) 
Epoch: [3][5000/5156] Loss: [0.1434690428867325]Elapsed 13m 18s (remain 0m 24s) 
Epoch: [3][5100/5156] Loss: [0.1436406145166153]Elapsed 13m 28s (remain 0m 8s) 


Epoch 3 - avg_train_loss: 0.1435  avg_val_loss: 0.4235  time: 813s
Epoch 3 - Score: 0.6410


Epoch: [3][5155/5156] Loss: [0.14352098671983307]Elapsed 13m 33s (remain 0m 0s) 
Epoch: [4][0/5156] Loss: [0.3596523702144623]Elapsed 0m 0s (remain 6m 56s) 
Epoch: [4][100/5156] Loss: [0.12165030580109658]Elapsed 0m 9s (remain 7m 57s) 
Epoch: [4][200/5156] Loss: [0.11732503325175679]Elapsed 0m 19s (remain 7m 50s) 
Epoch: [4][300/5156] Loss: [0.11337473384739732]Elapsed 0m 28s (remain 7m 41s) 
Epoch: [4][400/5156] Loss: [0.11357765797605625]Elapsed 0m 38s (remain 7m 31s) 
Epoch: [4][500/5156] Loss: [0.11206803828818392]Elapsed 0m 47s (remain 7m 22s) 
Epoch: [4][600/5156] Loss: [0.11356302110633534]Elapsed 0m 57s (remain 7m 12s) 
Epoch: [4][700/5156] Loss: [0.1132812391098324]Elapsed 1m 6s (remain 7m 3s) 
Epoch: [4][800/5156] Loss: [0.11157642503724431]Elapsed 1m 16s (remain 6m 53s) 
Epoch: [4][900/5156] Loss: [0.11010114309051011]Elapsed 1m 25s (remain 6m 44s) 
Epoch: [4][1000/5156] Loss: [0.10916529705130822]Elapsed 1m 35s (remain 6m 34s) 
Epoch: [4][1100/5156] Loss: [0.108615114321810

Epoch 4 Step 1560 - avg_train_loss: 0.1104  avg_val_loss: 0.4700
Epoch 4 Step 1560 - Score: 0.6696


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [4][1600/5156] Loss: [0.10992806842077266]Elapsed 4m 18s (remain 9m 33s) 
Epoch: [4][1700/5156] Loss: [0.10988256315051768]Elapsed 4m 27s (remain 9m 4s) 
Epoch: [4][1800/5156] Loss: [0.10826692200065573]Elapsed 4m 37s (remain 8m 36s) 
Epoch: [4][1900/5156] Loss: [0.10806402219539638]Elapsed 4m 46s (remain 8m 11s) 
Epoch: [4][2000/5156] Loss: [0.10802792163359454]Elapsed 4m 56s (remain 7m 47s) 
Epoch: [4][2100/5156] Loss: [0.10769647206083939]Elapsed 5m 5s (remain 7m 24s) 
Epoch: [4][2200/5156] Loss: [0.10807310636495535]Elapsed 5m 15s (remain 7m 3s) 
Epoch: [4][2300/5156] Loss: [0.10703262197251952]Elapsed 5m 24s (remain 6m 43s) 
Epoch: [4][2400/5156] Loss: [0.10711684851733347]Elapsed 5m 34s (remain 6m 23s) 
Epoch: [4][2500/5156] Loss: [0.10723054425721947]Elapsed 5m 43s (remain 6m 4s) 
Epoch: [4][2600/5156] Loss: [0.1072558708056826]Elapsed 5m 53s (remain 5m 47s) 
Epoch: [4][2700/5156] Loss: [0.10796164885607978]Elapsed 6m 2s (r

Epoch 4 Step 3120 - avg_train_loss: 0.1090  avg_val_loss: 0.4336
Epoch 4 Step 3120 - Score: 0.6459


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [4][3200/5156] Loss: [0.10895452305304286]Elapsed 8m 36s (remain 5m 15s) 
Epoch: [4][3300/5156] Loss: [0.10894434153602525]Elapsed 8m 46s (remain 4m 55s) 
Epoch: [4][3400/5156] Loss: [0.1089104669474109]Elapsed 8m 55s (remain 4m 36s) 
Epoch: [4][3500/5156] Loss: [0.10888528326112197]Elapsed 9m 5s (remain 4m 17s) 
Epoch: [4][3600/5156] Loss: [0.10927597533830193]Elapsed 9m 14s (remain 3m 59s) 
Epoch: [4][3700/5156] Loss: [0.11016650306375399]Elapsed 9m 24s (remain 3m 41s) 
Epoch: [4][3800/5156] Loss: [0.10984029003246058]Elapsed 9m 33s (remain 3m 24s) 
Epoch: [4][3900/5156] Loss: [0.10998209284726407]Elapsed 9m 43s (remain 3m 7s) 
Epoch: [4][4000/5156] Loss: [0.1099327251602038]Elapsed 9m 52s (remain 2m 51s) 
Epoch: [4][4100/5156] Loss: [0.10980309442415111]Elapsed 10m 2s (remain 2m 34s) 
Epoch: [4][4200/5156] Loss: [0.10973939191492599]Elapsed 10m 11s (remain 2m 19s) 
Epoch: [4][4300/5156] Loss: [0.10946994675718817]Elapsed 10m 21

Epoch 4 Step 4680 - avg_train_loss: 0.1095  avg_val_loss: 0.4450
Epoch 4 Step 4680 - Score: 0.6528


EVAL: [1004/1005] Elapsed 1m 46s (remain 0m 0s) 
Epoch: [4][4700/5156] Loss: [0.10939820151217598]Elapsed 12m 45s (remain 1m 14s) 
Epoch: [4][4800/5156] Loss: [0.10987232855973952]Elapsed 12m 54s (remain 0m 57s) 
Epoch: [4][4900/5156] Loss: [0.10976858238118428]Elapsed 13m 4s (remain 0m 40s) 
Epoch: [4][5000/5156] Loss: [0.10951913019722638]Elapsed 13m 13s (remain 0m 24s) 
Epoch: [4][5100/5156] Loss: [0.10910378464108875]Elapsed 13m 23s (remain 0m 8s) 


Epoch 4 - avg_train_loss: 0.1087  avg_val_loss: 0.4450  time: 809s
Epoch 4 - Score: 0.6528


Epoch: [4][5155/5156] Loss: [0.10868026811944821]Elapsed 13m 28s (remain 0m 0s) 


Score: 0.6410
-------------fold:2 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Ber

Epoch: [1][0/5169] Loss: [0.7186216115951538]Elapsed 0m 0s (remain 14m 22s) 
Epoch: [1][100/5169] Loss: [1.2382605705539336]Elapsed 0m 9s (remain 8m 1s) 
Epoch: [1][200/5169] Loss: [1.0703202268746983]Elapsed 0m 19s (remain 7m 51s) 
Epoch: [1][300/5169] Loss: [0.913399021323686]Elapsed 0m 28s (remain 7m 41s) 
Epoch: [1][400/5169] Loss: [0.7966479592576439]Elapsed 0m 38s (remain 7m 32s) 
Epoch: [1][500/5169] Loss: [0.7072433354454454]Elapsed 0m 47s (remain 7m 22s) 
Epoch: [1][600/5169] Loss: [0.6689856578278748]Elapsed 0m 56s (remain 7m 12s) 
Epoch: [1][700/5169] Loss: [0.6268757625464128]Elapsed 1m 6s (remain 7m 3s) 
Epoch: [1][800/5169] Loss: [0.6033743309327178]Elapsed 1m 15s (remain 6m 53s) 
Epoch: [1][900/5169] Loss: [0.5698282637288853]Elapsed 1m 25s (remain 6m 44s) 
Epoch: [1][1000/5169] Loss: [0.5499733734646547]Elapsed 1m 34s (remain 6m 35s) 
Epoch: [1][1100/5169] Loss: [0.526234544845182]Elapsed 1m 44s (remain 6m 25s) 
Epoch: [1][1200/5169] Loss: [0.5133897214979033]Elapsed 1m

Epoch 1 Step 1560 - avg_train_loss: 0.4730  avg_val_loss: 0.4534
Epoch 1 Step 1560 - Score: 0.6607
Epoch 1 Step 1560 - Save Best Score: 0.6607 Model


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [1][1600/5169] Loss: [0.4707996358022426]Elapsed 4m 21s (remain 9m 43s) 
Epoch: [1][1700/5169] Loss: [0.4594683956244686]Elapsed 4m 31s (remain 9m 13s) 
Epoch: [1][1800/5169] Loss: [0.45352220486777056]Elapsed 4m 40s (remain 8m 45s) 
Epoch: [1][1900/5169] Loss: [0.4444681268635499]Elapsed 4m 50s (remain 8m 19s) 
Epoch: [1][2000/5169] Loss: [0.44300777223196375]Elapsed 4m 59s (remain 7m 54s) 
Epoch: [1][2100/5169] Loss: [0.43797369041393935]Elapsed 5m 9s (remain 7m 31s) 
Epoch: [1][2200/5169] Loss: [0.43694165766148774]Elapsed 5m 18s (remain 7m 9s) 
Epoch: [1][2300/5169] Loss: [0.43288514031834086]Elapsed 5m 28s (remain 6m 49s) 
Epoch: [1][2400/5169] Loss: [0.42988586125335415]Elapsed 5m 37s (remain 6m 29s) 
Epoch: [1][2500/5169] Loss: [0.42245481896646103]Elapsed 5m 47s (remain 6m 10s) 
Epoch: [1][2600/5169] Loss: [0.4168175391051398]Elapsed 5m 56s (remain 5m 52s) 
Epoch: [1][2700/5169] Loss: [0.41159183768544444]Elapsed 6m 6s (rema

Epoch 1 Step 3120 - avg_train_loss: 0.3965  avg_val_loss: 0.3940
Epoch 1 Step 3120 - Score: 0.6251
Epoch 1 Step 3120 - Save Best Score: 0.6251 Model


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [1][3200/5169] Loss: [0.3941287857339888]Elapsed 8m 44s (remain 5m 22s) 
Epoch: [1][3300/5169] Loss: [0.3924960427275329]Elapsed 8m 53s (remain 5m 2s) 
Epoch: [1][3400/5169] Loss: [0.3879467462949734]Elapsed 9m 3s (remain 4m 42s) 
Epoch: [1][3500/5169] Loss: [0.3869557216225641]Elapsed 9m 12s (remain 4m 23s) 
Epoch: [1][3600/5169] Loss: [0.38403781687749805]Elapsed 9m 22s (remain 4m 4s) 
Epoch: [1][3700/5169] Loss: [0.38390428592274795]Elapsed 9m 31s (remain 3m 46s) 
Epoch: [1][3800/5169] Loss: [0.3800610767119787]Elapsed 9m 41s (remain 3m 29s) 
Epoch: [1][3900/5169] Loss: [0.3800365217271679]Elapsed 9m 50s (remain 3m 11s) 
Epoch: [1][4000/5169] Loss: [0.37875915630514373]Elapsed 10m 0s (remain 2m 55s) 
Epoch: [1][4100/5169] Loss: [0.3784096760042148]Elapsed 10m 9s (remain 2m 38s) 
Epoch: [1][4200/5169] Loss: [0.375892237312133]Elapsed 10m 19s (remain 2m 22s) 
Epoch: [1][4300/5169] Loss: [0.37500691066380776]Elapsed 10m 28s (remain 

Epoch 1 Step 4680 - avg_train_loss: 0.3678  avg_val_loss: 0.2904
Epoch 1 Step 4680 - Score: 0.5346
Epoch 1 Step 4680 - Save Best Score: 0.5346 Model


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [1][4700/5169] Loss: [0.3675674776960305]Elapsed 12m 56s (remain 1m 17s) 
Epoch: [1][4800/5169] Loss: [0.3659943007954446]Elapsed 13m 5s (remain 1m 0s) 
Epoch: [1][4900/5169] Loss: [0.3642941978340475]Elapsed 13m 15s (remain 0m 43s) 
Epoch: [1][5000/5169] Loss: [0.36200437877117425]Elapsed 13m 24s (remain 0m 27s) 
Epoch: [1][5100/5169] Loss: [0.36036547085933773]Elapsed 13m 34s (remain 0m 10s) 


Epoch 1 - avg_train_loss: 0.3601  avg_val_loss: 0.2904  time: 821s
Epoch 1 - Score: 0.5346


Epoch: [1][5168/5169] Loss: [0.3601202807923612]Elapsed 13m 40s (remain 0m 0s) 
Epoch: [2][0/5169] Loss: [0.04282988980412483]Elapsed 0m 0s (remain 7m 2s) 
Epoch: [2][100/5169] Loss: [0.1898835084636207]Elapsed 0m 9s (remain 7m 56s) 
Epoch: [2][200/5169] Loss: [0.17906623973513477]Elapsed 0m 18s (remain 7m 49s) 
Epoch: [2][300/5169] Loss: [0.18405404721310925]Elapsed 0m 28s (remain 7m 40s) 
Epoch: [2][400/5169] Loss: [0.18981263581506602]Elapsed 0m 37s (remain 7m 31s) 
Epoch: [2][500/5169] Loss: [0.1957902883160548]Elapsed 0m 47s (remain 7m 21s) 
Epoch: [2][600/5169] Loss: [0.19256172934997426]Elapsed 0m 56s (remain 7m 12s) 
Epoch: [2][700/5169] Loss: [0.19900166304283323]Elapsed 1m 6s (remain 7m 3s) 
Epoch: [2][800/5169] Loss: [0.20210536543766622]Elapsed 1m 15s (remain 6m 53s) 
Epoch: [2][900/5169] Loss: [0.2016199873384674]Elapsed 1m 25s (remain 6m 44s) 
Epoch: [2][1000/5169] Loss: [0.2000327585000751]Elapsed 1m 34s (remain 6m 34s) 
Epoch: [2][1100/5169] Loss: [0.20545768818625487]E

Epoch 2 Step 1560 - avg_train_loss: 0.2062  avg_val_loss: 0.3186
Epoch 2 Step 1560 - Score: 0.5616


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [2][1600/5169] Loss: [0.20676781853877557]Elapsed 4m 17s (remain 9m 33s) 
Epoch: [2][1700/5169] Loss: [0.20886305686590334]Elapsed 4m 26s (remain 9m 4s) 
Epoch: [2][1800/5169] Loss: [0.2111262070973964]Elapsed 4m 36s (remain 8m 36s) 
Epoch: [2][1900/5169] Loss: [0.21234553394133265]Elapsed 4m 45s (remain 8m 11s) 
Epoch: [2][2000/5169] Loss: [0.21201821751922093]Elapsed 4m 55s (remain 7m 47s) 
Epoch: [2][2100/5169] Loss: [0.21561742081133414]Elapsed 5m 4s (remain 7m 25s) 
Epoch: [2][2200/5169] Loss: [0.216526689291082]Elapsed 5m 14s (remain 7m 3s) 
Epoch: [2][2300/5169] Loss: [0.2160521908439401]Elapsed 5m 23s (remain 6m 43s) 
Epoch: [2][2400/5169] Loss: [0.21456346676654078]Elapsed 5m 33s (remain 6m 24s) 
Epoch: [2][2500/5169] Loss: [0.21499407697194925]Elapsed 5m 42s (remain 6m 5s) 
Epoch: [2][2600/5169] Loss: [0.2181338234493503]Elapsed 5m 52s (remain 5m 47s) 
Epoch: [2][2700/5169] Loss: [0.21854198354030283]Elapsed 6m 1s (remain 

Epoch 2 Step 3120 - avg_train_loss: 0.2205  avg_val_loss: 0.2450
Epoch 2 Step 3120 - Score: 0.4932
Epoch 2 Step 3120 - Save Best Score: 0.4932 Model


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [2][3200/5169] Loss: [0.2199209710349877]Elapsed 8m 39s (remain 5m 19s) 
Epoch: [2][3300/5169] Loss: [0.21886944023397517]Elapsed 8m 48s (remain 4m 59s) 
Epoch: [2][3400/5169] Loss: [0.2181996689364104]Elapsed 8m 58s (remain 4m 39s) 
Epoch: [2][3500/5169] Loss: [0.21912871516183544]Elapsed 9m 7s (remain 4m 20s) 
Epoch: [2][3600/5169] Loss: [0.22094980830239735]Elapsed 9m 17s (remain 4m 2s) 
Epoch: [2][3700/5169] Loss: [0.22178051556435605]Elapsed 9m 26s (remain 3m 44s) 
Epoch: [2][3800/5169] Loss: [0.22153111852923874]Elapsed 9m 36s (remain 3m 27s) 
Epoch: [2][3900/5169] Loss: [0.22103766149388113]Elapsed 9m 45s (remain 3m 10s) 
Epoch: [2][4000/5169] Loss: [0.2212523562535717]Elapsed 9m 55s (remain 2m 53s) 
Epoch: [2][4100/5169] Loss: [0.22111891087024982]Elapsed 10m 4s (remain 2m 37s) 
Epoch: [2][4200/5169] Loss: [0.22113427778936115]Elapsed 10m 14s (remain 2m 21s) 
Epoch: [2][4300/5169] Loss: [0.2210945343581941]Elapsed 10m 23s (r

Epoch 2 Step 4680 - avg_train_loss: 0.2183  avg_val_loss: 0.2406
Epoch 2 Step 4680 - Score: 0.4891
Epoch 2 Step 4680 - Save Best Score: 0.4891 Model


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [2][4700/5169] Loss: [0.2181584204785249]Elapsed 12m 51s (remain 1m 16s) 
Epoch: [2][4800/5169] Loss: [0.2180755715209566]Elapsed 13m 1s (remain 0m 59s) 
Epoch: [2][4900/5169] Loss: [0.21732900851565226]Elapsed 13m 10s (remain 0m 43s) 
Epoch: [2][5000/5169] Loss: [0.21658746451102082]Elapsed 13m 20s (remain 0m 26s) 
Epoch: [2][5100/5169] Loss: [0.2167274381930058]Elapsed 13m 29s (remain 0m 10s) 


Epoch 2 - avg_train_loss: 0.2158  avg_val_loss: 0.2406  time: 816s
Epoch 2 - Score: 0.4891


Epoch: [2][5168/5169] Loss: [0.21578276312958908]Elapsed 13m 35s (remain 0m 0s) 
Epoch: [3][0/5169] Loss: [0.04211118817329407]Elapsed 0m 0s (remain 6m 57s) 
Epoch: [3][100/5169] Loss: [0.17352112483101814]Elapsed 0m 9s (remain 7m 57s) 
Epoch: [3][200/5169] Loss: [0.165450346831925]Elapsed 0m 18s (remain 7m 49s) 
Epoch: [3][300/5169] Loss: [0.16399691652826054]Elapsed 0m 28s (remain 7m 40s) 
Epoch: [3][400/5169] Loss: [0.15904594981619632]Elapsed 0m 37s (remain 7m 31s) 
Epoch: [3][500/5169] Loss: [0.1555413794177473]Elapsed 0m 47s (remain 7m 22s) 
Epoch: [3][600/5169] Loss: [0.15435576556122996]Elapsed 0m 56s (remain 7m 12s) 
Epoch: [3][700/5169] Loss: [0.15434499029573331]Elapsed 1m 6s (remain 7m 3s) 
Epoch: [3][800/5169] Loss: [0.15788447343614698]Elapsed 1m 15s (remain 6m 53s) 
Epoch: [3][900/5169] Loss: [0.15815751897323604]Elapsed 1m 25s (remain 6m 44s) 
Epoch: [3][1000/5169] Loss: [0.1582367616008035]Elapsed 1m 34s (remain 6m 35s) 
Epoch: [3][1100/5169] Loss: [0.15498439676033587

Epoch 3 Step 1560 - avg_train_loss: 0.1557  avg_val_loss: 0.2382
Epoch 3 Step 1560 - Score: 0.4850
Epoch 3 Step 1560 - Save Best Score: 0.4850 Model


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [3][1600/5169] Loss: [0.15744261459724776]Elapsed 4m 21s (remain 9m 43s) 
Epoch: [3][1700/5169] Loss: [0.1575605545851275]Elapsed 4m 31s (remain 9m 13s) 
Epoch: [3][1800/5169] Loss: [0.15586775334049424]Elapsed 4m 40s (remain 8m 45s) 
Epoch: [3][1900/5169] Loss: [0.15567189447411156]Elapsed 4m 50s (remain 8m 18s) 
Epoch: [3][2000/5169] Loss: [0.15336483669937528]Elapsed 4m 59s (remain 7m 54s) 
Epoch: [3][2100/5169] Loss: [0.15201521721152514]Elapsed 5m 9s (remain 7m 31s) 
Epoch: [3][2200/5169] Loss: [0.15251639159142077]Elapsed 5m 18s (remain 7m 9s) 
Epoch: [3][2300/5169] Loss: [0.15232697043946322]Elapsed 5m 28s (remain 6m 49s) 
Epoch: [3][2400/5169] Loss: [0.1521772293526207]Elapsed 5m 37s (remain 6m 29s) 
Epoch: [3][2500/5169] Loss: [0.15175159389184206]Elapsed 5m 47s (remain 6m 10s) 
Epoch: [3][2600/5169] Loss: [0.15303455107506805]Elapsed 5m 56s (remain 5m 52s) 
Epoch: [3][2700/5169] Loss: [0.15200474404123895]Elapsed 6m 6s (re

Epoch 3 Step 3120 - avg_train_loss: 0.1513  avg_val_loss: 0.2654
Epoch 3 Step 3120 - Score: 0.5093


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [3][3200/5169] Loss: [0.15068537058546597]Elapsed 8m 39s (remain 5m 19s) 
Epoch: [3][3300/5169] Loss: [0.14960767971358752]Elapsed 8m 48s (remain 4m 59s) 
Epoch: [3][3400/5169] Loss: [0.1495861176512465]Elapsed 8m 58s (remain 4m 39s) 
Epoch: [3][3500/5169] Loss: [0.14899401945293939]Elapsed 9m 7s (remain 4m 20s) 
Epoch: [3][3600/5169] Loss: [0.14880259011358726]Elapsed 9m 17s (remain 4m 2s) 
Epoch: [3][3700/5169] Loss: [0.14884255985558775]Elapsed 9m 26s (remain 3m 44s) 
Epoch: [3][3800/5169] Loss: [0.14850159523985715]Elapsed 9m 36s (remain 3m 27s) 
Epoch: [3][3900/5169] Loss: [0.14798742163737214]Elapsed 9m 45s (remain 3m 10s) 
Epoch: [3][4000/5169] Loss: [0.14803132154045562]Elapsed 9m 55s (remain 2m 53s) 
Epoch: [3][4100/5169] Loss: [0.14785961068017828]Elapsed 10m 4s (remain 2m 37s) 
Epoch: [3][4200/5169] Loss: [0.14762235327410345]Elapsed 10m 14s (remain 2m 21s) 
Epoch: [3][4300/5169] Loss: [0.14794410216661458]Elapsed 10m 23s

Epoch 3 Step 4680 - avg_train_loss: 0.1479  avg_val_loss: 0.2666
Epoch 3 Step 4680 - Score: 0.5134


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [3][4700/5169] Loss: [0.14769107456280323]Elapsed 12m 47s (remain 1m 16s) 
Epoch: [3][4800/5169] Loss: [0.1470126795443189]Elapsed 12m 56s (remain 0m 59s) 
Epoch: [3][4900/5169] Loss: [0.14666891675070926]Elapsed 13m 6s (remain 0m 42s) 
Epoch: [3][5000/5169] Loss: [0.14696745642206793]Elapsed 13m 15s (remain 0m 26s) 
Epoch: [3][5100/5169] Loss: [0.14645364166642613]Elapsed 13m 25s (remain 0m 10s) 


Epoch 3 - avg_train_loss: 0.1462  avg_val_loss: 0.2666  time: 812s
Epoch 3 - Score: 0.5134


Epoch: [3][5168/5169] Loss: [0.14623382489742992]Elapsed 13m 31s (remain 0m 0s) 
Epoch: [4][0/5169] Loss: [0.03491131588816643]Elapsed 0m 0s (remain 7m 2s) 
Epoch: [4][100/5169] Loss: [0.1136519296947467]Elapsed 0m 9s (remain 7m 57s) 
Epoch: [4][200/5169] Loss: [0.12576663182138945]Elapsed 0m 19s (remain 7m 49s) 
Epoch: [4][300/5169] Loss: [0.11789882930746676]Elapsed 0m 28s (remain 7m 40s) 
Epoch: [4][400/5169] Loss: [0.10935077222938504]Elapsed 0m 37s (remain 7m 31s) 
Epoch: [4][500/5169] Loss: [0.10884395292650552]Elapsed 0m 47s (remain 7m 22s) 
Epoch: [4][600/5169] Loss: [0.10785148359279573]Elapsed 0m 56s (remain 7m 12s) 
Epoch: [4][700/5169] Loss: [0.10654338661977159]Elapsed 1m 6s (remain 7m 3s) 
Epoch: [4][800/5169] Loss: [0.1042019620011895]Elapsed 1m 15s (remain 6m 54s) 
Epoch: [4][900/5169] Loss: [0.1096580401495263]Elapsed 1m 25s (remain 6m 44s) 
Epoch: [4][1000/5169] Loss: [0.1110602249776999]Elapsed 1m 34s (remain 6m 35s) 
Epoch: [4][1100/5169] Loss: [0.10922399529141077]

Epoch 4 Step 1560 - avg_train_loss: 0.1074  avg_val_loss: 0.2557
Epoch 4 Step 1560 - Score: 0.5004


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [4][1600/5169] Loss: [0.10684301251839737]Elapsed 4m 17s (remain 9m 33s) 
Epoch: [4][1700/5169] Loss: [0.10760219161984091]Elapsed 4m 26s (remain 9m 4s) 
Epoch: [4][1800/5169] Loss: [0.10766207928852242]Elapsed 4m 36s (remain 8m 36s) 
Epoch: [4][1900/5169] Loss: [0.10834967198668458]Elapsed 4m 45s (remain 8m 11s) 
Epoch: [4][2000/5169] Loss: [0.10889286141527468]Elapsed 4m 55s (remain 7m 47s) 
Epoch: [4][2100/5169] Loss: [0.10914790302719979]Elapsed 5m 4s (remain 7m 25s) 
Epoch: [4][2200/5169] Loss: [0.10877262935139216]Elapsed 5m 14s (remain 7m 3s) 
Epoch: [4][2300/5169] Loss: [0.10811104001525061]Elapsed 5m 23s (remain 6m 43s) 
Epoch: [4][2400/5169] Loss: [0.1084868706064167]Elapsed 5m 33s (remain 6m 24s) 
Epoch: [4][2500/5169] Loss: [0.10868739367503136]Elapsed 5m 42s (remain 6m 5s) 
Epoch: [4][2600/5169] Loss: [0.10813567325983109]Elapsed 5m 52s (remain 5m 47s) 
Epoch: [4][2700/5169] Loss: [0.10727045844267646]Elapsed 6m 1s (rem

Epoch 4 Step 3120 - avg_train_loss: 0.1076  avg_val_loss: 0.2483
Epoch 4 Step 3120 - Score: 0.4944


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [4][3200/5169] Loss: [0.10742786460453378]Elapsed 8m 34s (remain 5m 16s) 
Epoch: [4][3300/5169] Loss: [0.10703719295782359]Elapsed 8m 44s (remain 4m 56s) 
Epoch: [4][3400/5169] Loss: [0.10642170832608444]Elapsed 8m 53s (remain 4m 37s) 
Epoch: [4][3500/5169] Loss: [0.10648722904515442]Elapsed 9m 3s (remain 4m 18s) 
Epoch: [4][3600/5169] Loss: [0.10627726442874275]Elapsed 9m 12s (remain 4m 0s) 
Epoch: [4][3700/5169] Loss: [0.10640005020936617]Elapsed 9m 22s (remain 3m 43s) 
Epoch: [4][3800/5169] Loss: [0.10669493186359796]Elapsed 9m 31s (remain 3m 25s) 
Epoch: [4][3900/5169] Loss: [0.1071311567260041]Elapsed 9m 41s (remain 3m 8s) 
Epoch: [4][4000/5169] Loss: [0.10710613131715756]Elapsed 9m 50s (remain 2m 52s) 
Epoch: [4][4100/5169] Loss: [0.10753179545048969]Elapsed 10m 0s (remain 2m 36s) 
Epoch: [4][4200/5169] Loss: [0.10741492135597007]Elapsed 10m 9s (remain 2m 20s) 
Epoch: [4][4300/5169] Loss: [0.10734756825984898]Elapsed 10m 19s (

Epoch 4 Step 4680 - avg_train_loss: 0.1076  avg_val_loss: 0.2525
Epoch 4 Step 4680 - Score: 0.4983


EVAL: [997/998] Elapsed 1m 45s (remain 0m 0s) 
Epoch: [4][4700/5169] Loss: [0.1073990323543136]Elapsed 12m 42s (remain 1m 15s) 
Epoch: [4][4800/5169] Loss: [0.10702753007570107]Elapsed 12m 52s (remain 0m 59s) 
Epoch: [4][4900/5169] Loss: [0.10669875216908917]Elapsed 13m 1s (remain 0m 42s) 
Epoch: [4][5000/5169] Loss: [0.10665950402152821]Elapsed 13m 11s (remain 0m 26s) 
Epoch: [4][5100/5169] Loss: [0.10625903452656454]Elapsed 13m 20s (remain 0m 10s) 


Epoch 4 - avg_train_loss: 0.1060  avg_val_loss: 0.2525  time: 807s
Epoch 4 - Score: 0.4983


Epoch: [4][5168/5169] Loss: [0.10598306629529343]Elapsed 13m 27s (remain 0m 0s) 


Score: 0.4850
-------------fold:3 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Ber

Epoch: [1][0/6062] Loss: [0.1896665096282959]Elapsed 0m 0s (remain 16m 55s) 
Epoch: [1][100/6062] Loss: [0.65332771289983]Elapsed 0m 9s (remain 9m 28s) 
Epoch: [1][200/6062] Loss: [0.6752003793609306]Elapsed 0m 19s (remain 9m 17s) 
Epoch: [1][300/6062] Loss: [0.6086599206462181]Elapsed 0m 28s (remain 9m 7s) 
Epoch: [1][400/6062] Loss: [0.5497779313537601]Elapsed 0m 38s (remain 8m 57s) 
Epoch: [1][500/6062] Loss: [0.5026494975068407]Elapsed 0m 47s (remain 8m 48s) 
Epoch: [1][600/6062] Loss: [0.48300058119941947]Elapsed 0m 57s (remain 8m 38s) 
Epoch: [1][700/6062] Loss: [0.4646298679532802]Elapsed 1m 6s (remain 8m 29s) 
Epoch: [1][800/6062] Loss: [0.44650868176916886]Elapsed 1m 16s (remain 8m 19s) 
Epoch: [1][900/6062] Loss: [0.4396764524605334]Elapsed 1m 25s (remain 8m 9s) 
Epoch: [1][1000/6062] Loss: [0.42762225964881173]Elapsed 1m 35s (remain 8m 0s) 
Epoch: [1][1100/6062] Loss: [0.4156910356355799]Elapsed 1m 44s (remain 7m 50s) 
Epoch: [1][1200/6062] Loss: [0.40712683140373884]Elapsed

Epoch 1 Step 1560 - avg_train_loss: 0.3908  avg_val_loss: 0.5895
Epoch 1 Step 1560 - Score: 0.7436
Epoch 1 Step 1560 - Save Best Score: 0.7436 Model


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [1][1600/6062] Loss: [0.38987834797408616]Elapsed 3m 35s (remain 9m 59s) 
Epoch: [1][1700/6062] Loss: [0.38281877774488987]Elapsed 3m 44s (remain 9m 35s) 
Epoch: [1][1800/6062] Loss: [0.37462663775996907]Elapsed 3m 54s (remain 9m 13s) 
Epoch: [1][1900/6062] Loss: [0.36935260557938343]Elapsed 4m 3s (remain 8m 53s) 
Epoch: [1][2000/6062] Loss: [0.3642257893662972]Elapsed 4m 13s (remain 8m 33s) 
Epoch: [1][2100/6062] Loss: [0.3600704957681366]Elapsed 4m 22s (remain 8m 14s) 
Epoch: [1][2200/6062] Loss: [0.3568729425713355]Elapsed 4m 32s (remain 7m 57s) 
Epoch: [1][2300/6062] Loss: [0.35298617939766386]Elapsed 4m 41s (remain 7m 40s) 
Epoch: [1][2400/6062] Loss: [0.3490457681235035]Elapsed 4m 50s (remain 7m 23s) 
Epoch: [1][2500/6062] Loss: [0.34987630507748546]Elapsed 5m 0s (remain 7m 7s) 
Epoch: [1][2600/6062] Loss: [0.3445729597060374]Elapsed 5m 9s (remain 6m 52s) 
Epoch: [1][2700/6062] Loss: [0.3402886048072209]Elapsed 5m 19s (remain 

Epoch 1 Step 3120 - avg_train_loss: 0.3297  avg_val_loss: 0.6222
Epoch 1 Step 3120 - Score: 0.7756


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [1][3200/6062] Loss: [0.32599322652643253]Elapsed 7m 5s (remain 6m 19s) 
Epoch: [1][3300/6062] Loss: [0.32341900217535596]Elapsed 7m 14s (remain 6m 3s) 
Epoch: [1][3400/6062] Loss: [0.3229806652568398]Elapsed 7m 24s (remain 5m 47s) 
Epoch: [1][3500/6062] Loss: [0.3195790195697445]Elapsed 7m 33s (remain 5m 31s) 
Epoch: [1][3600/6062] Loss: [0.3176325974779331]Elapsed 7m 43s (remain 5m 16s) 
Epoch: [1][3700/6062] Loss: [0.31659871546846935]Elapsed 7m 52s (remain 5m 1s) 
Epoch: [1][3800/6062] Loss: [0.3141616873733493]Elapsed 8m 2s (remain 4m 46s) 
Epoch: [1][3900/6062] Loss: [0.31333016057608026]Elapsed 8m 11s (remain 4m 32s) 
Epoch: [1][4000/6062] Loss: [0.31153521188790717]Elapsed 8m 21s (remain 4m 18s) 
Epoch: [1][4100/6062] Loss: [0.30932768230867747]Elapsed 8m 30s (remain 4m 4s) 
Epoch: [1][4200/6062] Loss: [0.30821373192082335]Elapsed 8m 39s (remain 3m 50s) 
Epoch: [1][4300/6062] Loss: [0.3076154176829636]Elapsed 8m 49s (remain 

Epoch 1 Step 4680 - avg_train_loss: 0.3029  avg_val_loss: 0.4779
Epoch 1 Step 4680 - Score: 0.6836
Epoch 1 Step 4680 - Save Best Score: 0.6836 Model


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [1][4700/6062] Loss: [0.30306017030902704]Elapsed 10m 29s (remain 3m 2s) 
Epoch: [1][4800/6062] Loss: [0.3017465070659084]Elapsed 10m 38s (remain 2m 47s) 
Epoch: [1][4900/6062] Loss: [0.30123235999837367]Elapsed 10m 48s (remain 2m 33s) 
Epoch: [1][5000/6062] Loss: [0.29990470405463987]Elapsed 10m 57s (remain 2m 19s) 
Epoch: [1][5100/6062] Loss: [0.2988849766033482]Elapsed 11m 7s (remain 2m 5s) 
Epoch: [1][5200/6062] Loss: [0.2981482792528501]Elapsed 11m 16s (remain 1m 52s) 
Epoch: [1][5300/6062] Loss: [0.29697487809112705]Elapsed 11m 26s (remain 1m 38s) 
Epoch: [1][5400/6062] Loss: [0.29601067466340414]Elapsed 11m 35s (remain 1m 25s) 
Epoch: [1][5500/6062] Loss: [0.2960083160789265]Elapsed 11m 45s (remain 1m 11s) 
Epoch: [1][5600/6062] Loss: [0.29483724313339565]Elapsed 11m 54s (remain 0m 58s) 
Epoch: [1][5700/6062] Loss: [0.2962906988232846]Elapsed 12m 4s (remain 0m 45s) 
Epoch: [1][5800/6062] Loss: [0.29444810003190786]Elapsed 12m

Epoch 1 - avg_train_loss: 0.2934  avg_val_loss: 0.4779  time: 758s
Epoch 1 - Score: 0.6836


Epoch: [1][6061/6062] Loss: [0.2933798605731986]Elapsed 12m 38s (remain 0m 0s) 
Epoch: [2][0/6062] Loss: [0.06096861511468887]Elapsed 0m 0s (remain 8m 3s) 
Epoch: [2][100/6062] Loss: [0.1887563468570277]Elapsed 0m 9s (remain 9m 21s) 
Epoch: [2][200/6062] Loss: [0.18790491885535734]Elapsed 0m 18s (remain 9m 13s) 
Epoch: [2][300/6062] Loss: [0.18965984005256684]Elapsed 0m 28s (remain 9m 5s) 
Epoch: [2][400/6062] Loss: [0.18879168423007653]Elapsed 0m 37s (remain 8m 55s) 
Epoch: [2][500/6062] Loss: [0.1933738444183587]Elapsed 0m 47s (remain 8m 46s) 
Epoch: [2][600/6062] Loss: [0.19638079417247276]Elapsed 0m 56s (remain 8m 37s) 
Epoch: [2][700/6062] Loss: [0.1999980617139705]Elapsed 1m 6s (remain 8m 27s) 
Epoch: [2][800/6062] Loss: [0.19988126974468864]Elapsed 1m 15s (remain 8m 18s) 
Epoch: [2][900/6062] Loss: [0.19981337753733744]Elapsed 1m 25s (remain 8m 9s) 
Epoch: [2][1000/6062] Loss: [0.19788668841881502]Elapsed 1m 34s (remain 7m 59s) 
Epoch: [2][1100/6062] Loss: [0.1961632664627588]El

Epoch 2 Step 1560 - avg_train_loss: 0.1958  avg_val_loss: 0.3837
Epoch 2 Step 1560 - Score: 0.6156
Epoch 2 Step 1560 - Save Best Score: 0.6156 Model


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [2][1600/6062] Loss: [0.19602225991892522]Elapsed 3m 33s (remain 9m 54s) 
Epoch: [2][1700/6062] Loss: [0.1989233910375106]Elapsed 3m 42s (remain 9m 31s) 
Epoch: [2][1800/6062] Loss: [0.20057216492730034]Elapsed 3m 52s (remain 9m 9s) 
Epoch: [2][1900/6062] Loss: [0.1998467464340552]Elapsed 4m 1s (remain 8m 49s) 
Epoch: [2][2000/6062] Loss: [0.19909272004158843]Elapsed 4m 11s (remain 8m 30s) 
Epoch: [2][2100/6062] Loss: [0.1968946154948558]Elapsed 4m 20s (remain 8m 11s) 
Epoch: [2][2200/6062] Loss: [0.19741072158566356]Elapsed 4m 30s (remain 7m 54s) 
Epoch: [2][2300/6062] Loss: [0.1977147618538863]Elapsed 4m 39s (remain 7m 37s) 
Epoch: [2][2400/6062] Loss: [0.19760973723573275]Elapsed 4m 49s (remain 7m 21s) 
Epoch: [2][2500/6062] Loss: [0.19686235275615663]Elapsed 4m 58s (remain 7m 5s) 
Epoch: [2][2600/6062] Loss: [0.19668934647402173]Elapsed 5m 8s (remain 6m 50s) 
Epoch: [2][2700/6062] Loss: [0.19647324063701752]Elapsed 5m 17s (remai

Epoch 2 Step 3120 - avg_train_loss: 0.1935  avg_val_loss: 0.4466
Epoch 2 Step 3120 - Score: 0.6661


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [2][3200/6062] Loss: [0.19288912101980238]Elapsed 7m 3s (remain 6m 18s) 
Epoch: [2][3300/6062] Loss: [0.1945344665606528]Elapsed 7m 13s (remain 6m 2s) 
Epoch: [2][3400/6062] Loss: [0.19254554725087986]Elapsed 7m 22s (remain 5m 46s) 
Epoch: [2][3500/6062] Loss: [0.19312373538436944]Elapsed 7m 32s (remain 5m 30s) 
Epoch: [2][3600/6062] Loss: [0.19187757022500293]Elapsed 7m 41s (remain 5m 15s) 
Epoch: [2][3700/6062] Loss: [0.190937191654395]Elapsed 7m 51s (remain 5m 0s) 
Epoch: [2][3800/6062] Loss: [0.19151637554178041]Elapsed 8m 0s (remain 4m 45s) 
Epoch: [2][3900/6062] Loss: [0.19231789217248776]Elapsed 8m 9s (remain 4m 31s) 
Epoch: [2][4000/6062] Loss: [0.19183121064708647]Elapsed 8m 19s (remain 4m 17s) 
Epoch: [2][4100/6062] Loss: [0.19104056985285858]Elapsed 8m 28s (remain 4m 3s) 
Epoch: [2][4200/6062] Loss: [0.19254746366184247]Elapsed 8m 38s (remain 3m 49s) 
Epoch: [2][4300/6062] Loss: [0.1919010171991857]Elapsed 8m 47s (remain 

Epoch 2 Step 4680 - avg_train_loss: 0.1915  avg_val_loss: 0.4058
Epoch 2 Step 4680 - Score: 0.6342


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [2][4700/6062] Loss: [0.19230487099551816]Elapsed 10m 24s (remain 3m 0s) 
Epoch: [2][4800/6062] Loss: [0.19221985211311604]Elapsed 10m 33s (remain 2m 46s) 
Epoch: [2][4900/6062] Loss: [0.19163758138841433]Elapsed 10m 43s (remain 2m 32s) 
Epoch: [2][5000/6062] Loss: [0.19186524630546284]Elapsed 10m 52s (remain 2m 18s) 
Epoch: [2][5100/6062] Loss: [0.19177867617839778]Elapsed 11m 2s (remain 2m 4s) 
Epoch: [2][5200/6062] Loss: [0.19170314174951608]Elapsed 11m 11s (remain 1m 51s) 
Epoch: [2][5300/6062] Loss: [0.19144762083301825]Elapsed 11m 21s (remain 1m 37s) 
Epoch: [2][5400/6062] Loss: [0.19087707793702038]Elapsed 11m 30s (remain 1m 24s) 
Epoch: [2][5500/6062] Loss: [0.1905554378782179]Elapsed 11m 40s (remain 1m 11s) 
Epoch: [2][5600/6062] Loss: [0.19060607677210592]Elapsed 11m 49s (remain 0m 58s) 
Epoch: [2][5700/6062] Loss: [0.19054380961537537]Elapsed 11m 59s (remain 0m 45s) 
Epoch: [2][5800/6062] Loss: [0.19056280419345462]Elapse

Epoch 2 - avg_train_loss: 0.1906  avg_val_loss: 0.4058  time: 753s
Epoch 2 - Score: 0.6342


Epoch: [2][6061/6062] Loss: [0.1905801381283759]Elapsed 12m 33s (remain 0m 0s) 
Epoch: [3][0/6062] Loss: [0.047522541135549545]Elapsed 0m 0s (remain 8m 15s) 
Epoch: [3][100/6062] Loss: [0.11553307589994154]Elapsed 0m 9s (remain 9m 23s) 
Epoch: [3][200/6062] Loss: [0.12616857987613098]Elapsed 0m 19s (remain 9m 14s) 
Epoch: [3][300/6062] Loss: [0.14451098628941633]Elapsed 0m 28s (remain 9m 5s) 
Epoch: [3][400/6062] Loss: [0.14573657507511747]Elapsed 0m 37s (remain 8m 56s) 
Epoch: [3][500/6062] Loss: [0.14059873503721607]Elapsed 0m 47s (remain 8m 46s) 
Epoch: [3][600/6062] Loss: [0.13581537738552443]Elapsed 0m 56s (remain 8m 37s) 
Epoch: [3][700/6062] Loss: [0.13611596613191068]Elapsed 1m 6s (remain 8m 27s) 
Epoch: [3][800/6062] Loss: [0.13579553597013813]Elapsed 1m 15s (remain 8m 18s) 
Epoch: [3][900/6062] Loss: [0.13735452691158273]Elapsed 1m 25s (remain 8m 9s) 
Epoch: [3][1000/6062] Loss: [0.13678706471612465]Elapsed 1m 34s (remain 7m 59s) 
Epoch: [3][1100/6062] Loss: [0.13794194367972

Epoch 3 Step 1560 - avg_train_loss: 0.1363  avg_val_loss: 0.3705
Epoch 3 Step 1560 - Score: 0.6056
Epoch 3 Step 1560 - Save Best Score: 0.6056 Model


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [3][1600/6062] Loss: [0.13574285672108524]Elapsed 3m 34s (remain 9m 57s) 
Epoch: [3][1700/6062] Loss: [0.13556875570915972]Elapsed 3m 43s (remain 9m 33s) 
Epoch: [3][1800/6062] Loss: [0.13454784344033577]Elapsed 3m 53s (remain 9m 12s) 
Epoch: [3][1900/6062] Loss: [0.13532318803980856]Elapsed 4m 2s (remain 8m 51s) 
Epoch: [3][2000/6062] Loss: [0.1343308017196272]Elapsed 4m 12s (remain 8m 32s) 
Epoch: [3][2100/6062] Loss: [0.13551551452779703]Elapsed 4m 21s (remain 8m 13s) 
Epoch: [3][2200/6062] Loss: [0.1347875875260976]Elapsed 4m 31s (remain 7m 56s) 
Epoch: [3][2300/6062] Loss: [0.1350073754415938]Elapsed 4m 41s (remain 7m 39s) 
Epoch: [3][2400/6062] Loss: [0.13444317530051572]Elapsed 4m 50s (remain 7m 23s) 
Epoch: [3][2500/6062] Loss: [0.1355478006460988]Elapsed 5m 0s (remain 7m 7s) 
Epoch: [3][2600/6062] Loss: [0.13498997073598404]Elapsed 5m 9s (remain 6m 51s) 
Epoch: [3][2700/6062] Loss: [0.13560594396197143]Elapsed 5m 19s (remai

Epoch 3 Step 3120 - avg_train_loss: 0.1347  avg_val_loss: 0.3737
Epoch 3 Step 3120 - Score: 0.6094


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [3][3200/6062] Loss: [0.13421874990865088]Elapsed 7m 5s (remain 6m 20s) 
Epoch: [3][3300/6062] Loss: [0.133498624293749]Elapsed 7m 14s (remain 6m 3s) 
Epoch: [3][3400/6062] Loss: [0.13337177825386753]Elapsed 7m 24s (remain 5m 47s) 
Epoch: [3][3500/6062] Loss: [0.13417954608499658]Elapsed 7m 33s (remain 5m 31s) 
Epoch: [3][3600/6062] Loss: [0.13532756874239998]Elapsed 7m 43s (remain 5m 16s) 
Epoch: [3][3700/6062] Loss: [0.13585904316289577]Elapsed 7m 52s (remain 5m 1s) 
Epoch: [3][3800/6062] Loss: [0.13585871213137696]Elapsed 8m 2s (remain 4m 46s) 
Epoch: [3][3900/6062] Loss: [0.13569281497214747]Elapsed 8m 11s (remain 4m 32s) 
Epoch: [3][4000/6062] Loss: [0.13466199631515335]Elapsed 8m 21s (remain 4m 18s) 
Epoch: [3][4100/6062] Loss: [0.13464455309205123]Elapsed 8m 30s (remain 4m 4s) 
Epoch: [3][4200/6062] Loss: [0.13457079853258583]Elapsed 8m 40s (remain 3m 50s) 
Epoch: [3][4300/6062] Loss: [0.13386979121728634]Elapsed 8m 49s (rema

Epoch 3 Step 4680 - avg_train_loss: 0.1336  avg_val_loss: 0.4210
Epoch 3 Step 4680 - Score: 0.6485


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [3][4700/6062] Loss: [0.13386136413206545]Elapsed 10m 26s (remain 3m 1s) 
Epoch: [3][4800/6062] Loss: [0.1336195418714969]Elapsed 10m 35s (remain 2m 46s) 
Epoch: [3][4900/6062] Loss: [0.13306552419796822]Elapsed 10m 45s (remain 2m 32s) 
Epoch: [3][5000/6062] Loss: [0.1327375942277634]Elapsed 10m 54s (remain 2m 18s) 
Epoch: [3][5100/6062] Loss: [0.13261789612349953]Elapsed 11m 3s (remain 2m 5s) 
Epoch: [3][5200/6062] Loss: [0.1319589854027436]Elapsed 11m 13s (remain 1m 51s) 
Epoch: [3][5300/6062] Loss: [0.13157346495735606]Elapsed 11m 22s (remain 1m 38s) 
Epoch: [3][5400/6062] Loss: [0.13126892451243072]Elapsed 11m 32s (remain 1m 24s) 
Epoch: [3][5500/6062] Loss: [0.1315647483825497]Elapsed 11m 41s (remain 1m 11s) 
Epoch: [3][5600/6062] Loss: [0.1309486798299786]Elapsed 11m 51s (remain 0m 58s) 
Epoch: [3][5700/6062] Loss: [0.13037094132698496]Elapsed 12m 0s (remain 0m 45s) 
Epoch: [3][5800/6062] Loss: [0.13011464064413614]Elapsed 12m

Epoch 3 - avg_train_loss: 0.1303  avg_val_loss: 0.4210  time: 755s
Epoch 3 - Score: 0.6485


Epoch: [3][6061/6062] Loss: [0.1303443963376076]Elapsed 12m 35s (remain 0m 0s) 
Epoch: [4][0/6062] Loss: [0.04673971235752106]Elapsed 0m 0s (remain 8m 21s) 
Epoch: [4][100/6062] Loss: [0.10441842465445349]Elapsed 0m 9s (remain 9m 21s) 
Epoch: [4][200/6062] Loss: [0.10069541650084635]Elapsed 0m 18s (remain 9m 13s) 
Epoch: [4][300/6062] Loss: [0.09503700531970019]Elapsed 0m 28s (remain 9m 5s) 
Epoch: [4][400/6062] Loss: [0.09244749564211525]Elapsed 0m 37s (remain 8m 55s) 
Epoch: [4][500/6062] Loss: [0.0930389059646991]Elapsed 0m 47s (remain 8m 46s) 
Epoch: [4][600/6062] Loss: [0.09469403779503303]Elapsed 0m 56s (remain 8m 37s) 
Epoch: [4][700/6062] Loss: [0.09423489460360035]Elapsed 1m 6s (remain 8m 28s) 
Epoch: [4][800/6062] Loss: [0.09460789506519224]Elapsed 1m 15s (remain 8m 18s) 
Epoch: [4][900/6062] Loss: [0.09409748089225563]Elapsed 1m 25s (remain 8m 9s) 
Epoch: [4][1000/6062] Loss: [0.09409479023939152]Elapsed 1m 34s (remain 7m 59s) 
Epoch: [4][1100/6062] Loss: [0.0951138298801746

Epoch 4 Step 1560 - avg_train_loss: 0.0929  avg_val_loss: 0.4293
Epoch 4 Step 1560 - Score: 0.6548


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [4][1600/6062] Loss: [0.09344134552072628]Elapsed 3m 30s (remain 9m 45s) 
Epoch: [4][1700/6062] Loss: [0.09374832198786613]Elapsed 3m 39s (remain 9m 23s) 
Epoch: [4][1800/6062] Loss: [0.09353180018270035]Elapsed 3m 49s (remain 9m 2s) 
Epoch: [4][1900/6062] Loss: [0.09336595118875175]Elapsed 3m 58s (remain 8m 42s) 
Epoch: [4][2000/6062] Loss: [0.0949219496786355]Elapsed 4m 8s (remain 8m 23s) 
Epoch: [4][2100/6062] Loss: [0.09406215595829884]Elapsed 4m 17s (remain 8m 5s) 
Epoch: [4][2200/6062] Loss: [0.09317443266992458]Elapsed 4m 27s (remain 7m 48s) 
Epoch: [4][2300/6062] Loss: [0.09334792424868632]Elapsed 4m 36s (remain 7m 32s) 
Epoch: [4][2400/6062] Loss: [0.09312947308397365]Elapsed 4m 46s (remain 7m 16s) 
Epoch: [4][2500/6062] Loss: [0.09282424123170686]Elapsed 4m 55s (remain 7m 0s) 
Epoch: [4][2600/6062] Loss: [0.09318571661867846]Elapsed 5m 5s (remain 6m 45s) 
Epoch: [4][2700/6062] Loss: [0.09260562644114728]Elapsed 5m 14s (rem

Epoch 4 Step 3120 - avg_train_loss: 0.0922  avg_val_loss: 0.4200
Epoch 4 Step 3120 - Score: 0.6478


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [4][3200/6062] Loss: [0.09246472015560836]Elapsed 7m 0s (remain 6m 15s) 
Epoch: [4][3300/6062] Loss: [0.09264850061348087]Elapsed 7m 9s (remain 5m 59s) 
Epoch: [4][3400/6062] Loss: [0.09278029147568385]Elapsed 7m 19s (remain 5m 43s) 
Epoch: [4][3500/6062] Loss: [0.09263897473189354]Elapsed 7m 28s (remain 5m 28s) 
Epoch: [4][3600/6062] Loss: [0.09244285354660196]Elapsed 7m 38s (remain 5m 13s) 
Epoch: [4][3700/6062] Loss: [0.09230330238240528]Elapsed 7m 47s (remain 4m 58s) 
Epoch: [4][3800/6062] Loss: [0.09260756919641978]Elapsed 7m 57s (remain 4m 43s) 
Epoch: [4][3900/6062] Loss: [0.09238250232756419]Elapsed 8m 6s (remain 4m 29s) 
Epoch: [4][4000/6062] Loss: [0.09242257903340413]Elapsed 8m 16s (remain 4m 15s) 
Epoch: [4][4100/6062] Loss: [0.09308923590913012]Elapsed 8m 25s (remain 4m 1s) 
Epoch: [4][4200/6062] Loss: [0.09321432214471524]Elapsed 8m 35s (remain 3m 48s) 
Epoch: [4][4300/6062] Loss: [0.09360982022350817]Elapsed 8m 44s (r

Epoch 4 Step 4680 - avg_train_loss: 0.0927  avg_val_loss: 0.4309
Epoch 4 Step 4680 - Score: 0.6562


EVAL: [551/552] Elapsed 0m 58s (remain 0m 0s) 
Epoch: [4][4700/6062] Loss: [0.09259802100141602]Elapsed 10m 20s (remain 2m 59s) 
Epoch: [4][4800/6062] Loss: [0.09277069952296692]Elapsed 10m 30s (remain 2m 45s) 
Epoch: [4][4900/6062] Loss: [0.0928406447903042]Elapsed 10m 39s (remain 2m 31s) 
Epoch: [4][5000/6062] Loss: [0.09275733919158025]Elapsed 10m 49s (remain 2m 17s) 
Epoch: [4][5100/6062] Loss: [0.09285939716493964]Elapsed 10m 58s (remain 2m 4s) 
Epoch: [4][5200/6062] Loss: [0.09261499285555504]Elapsed 11m 8s (remain 1m 50s) 
Epoch: [4][5300/6062] Loss: [0.09260875183244074]Elapsed 11m 17s (remain 1m 37s) 
Epoch: [4][5400/6062] Loss: [0.09285678580976171]Elapsed 11m 27s (remain 1m 24s) 
Epoch: [4][5500/6062] Loss: [0.09292557010090899]Elapsed 11m 36s (remain 1m 11s) 
Epoch: [4][5600/6062] Loss: [0.09320859423792013]Elapsed 11m 46s (remain 0m 58s) 
Epoch: [4][5700/6062] Loss: [0.09320084460013278]Elapsed 11m 55s (remain 0m 45s) 
Epoch: [4][5800/6062] Loss: [0.09327824713774432]Elaps

Epoch 4 - avg_train_loss: 0.0934  avg_val_loss: 0.4309  time: 750s
Epoch 4 - Score: 0.6562


Epoch: [4][6061/6062] Loss: [0.09339652683837883]Elapsed 12m 29s (remain 0m 0s) 


Score: 0.6056
Score: 0.5579
