# Config

In [1]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=10
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=2
    fc_dropout=0.2
    max_len=450
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    # MLM setting
    mlm_probability=0.15 # 0.15
    max_seq_length=None
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [2]:
# ====================================================
# Define path
# ====================================================
import os

INPUT_DIR = '../data/'
OUTPUT_DIR = '../output/pretrained/{}/'.format(CFG.model.replace('/', '-'))
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Library

In [3]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers.modeling_outputs import MaskedLMOutput
from transformers import DataCollatorForLanguageModeling
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2


2022-03-23 11:54:25.192345: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


# Utils

In [4]:
# ====================================================
# Utils
# ===================================================
def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

# Trainer

In [5]:
######################################################

# Trainer
def trainer(model, data_loader, optimizer, scheduler, CFG):
    model.train()

    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = []

    optimizer.zero_grad()

    for idx, batch in enumerate(data_loader):
        for k, v in batch.items():
            batch[k] = v.to(device, dtype=torch.long)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            outputs = model(**batch)

        loss = outputs.loss

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (idx + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            CFG.global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        
        losses.append(loss.detach().cpu().item())

        # if (CFG.global_step % CFG.save_step) == 0:
        #     LOGGER.info(
        #         "Epoch {} Step {}: Train Loss {:.4f}, elapsed {:.4f}s".format(
        #             CFG.epoch + 1, CFG.global_step, np.mean(losses), time.time() - start)
        #         )
        #     torch.save(
        #         model.state_dict(),
        #         OUTPUT_DIR + '{}-mlm-step-{}.bin'.format(
        #             CFG.model.replace('/', '-'),
        #             CFG.global_step))
        
        if idx % 500 == 0 or idx == (len(data_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss:.4f} '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, idx, len(data_loader), 
                          remain=timeSince(start, float(idx+1)/len(data_loader)),
                          loss=np.mean(losses),
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    
    return np.mean(losses)

# Model

In [6]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

In [7]:
def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler=='linear':
        scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
    return scheduler

In [8]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.model,
                output_hidden_states=False
                )
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
            self.lm_head = AutoModelForMaskedLM.from_pretrained(cfg.model, config=self.config).cls # [cls, lm_head]
        else:
            self.model = AutoModel(self.config)
            self.lm_head = AutoModelForMaskedLM(self.config).cls # [cls, lm_head]
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
            self, 
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            #position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,)
        
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(loss=masked_lm_loss,
                              logits=prediction_scores,
                              hidden_states=outputs.hidden_states,
                              attentions=outputs.attentions)

# Main

In [9]:
# ====================================================
# data loading
# ====================================================
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
features = pd.read_csv(os.path.join(INPUT_DIR, 'features.csv'))
patient_notes = pd.read_csv(os.path.join(INPUT_DIR, 'patient_notes.csv'))
display(patient_notes.head())
display(patient_notes['pn_history'].nunique(), len(patient_notes))

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


42146

42146

In [10]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
mlm_data = patient_notes[['pn_history']].reset_index(drop=True)
mlm_data = mlm_data.rename(columns={'pn_history': 'text'})
csv_name = f'mlm_data.csv'
mlm_data.to_csv(OUTPUT_DIR+csv_name, index=False)
print(f"Saved mlm data: {csv_name}")
print(f"mlm data: {mlm_data.shape}")

Saved mlm data: mlm_data.csv
mlm data: (42146, 1)


In [12]:
#####################################################

# Training support

def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // CFG.max_len) * CFG.max_len
    result = {
        k: [t[i : i + CFG.max_len] for i in range(0, total_length, CFG.max_len)]
        for k, t in concatenated_examples.items()
    }
    return result

In [13]:
seed_everything(CFG.seed)

CFG.train_file = f"mlm_data.csv"
data_files = {'train': OUTPUT_DIR+CFG.train_file}
raw_datasets = load_dataset('csv', data_files=data_files)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a2589d65210d3a53/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a2589d65210d3a53/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
if CFG.max_seq_length is None:
    max_seq_length = tokenizer.model_max_length
else:
    if CFG.max_seq_length > tokenizer.model_max_length:
        max_seq_length = min(CFG.max_seq_length, tokenizer.model_max_length)

In [15]:
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
    load_from_cache_file=not True,
    )
LOGGER.info(f"tokenized_datasets: {tokenized_datasets}")

tokenized_datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 42146
    })
})


In [16]:
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
    load_from_cache_file=not True,
    )
train_dataset = tokenized_datasets["train"]
LOGGER.info(f"train_dataset: {train_dataset}")

train_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 17763
})


In [17]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=CFG.mlm_probability
    )
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=CFG.batch_size
    )

In [19]:
model = CustomModel(CFG, config_path=None, pretrained=True)
model.to(device)
optimizer_parameters = get_optimizer_params(
    model,
    encoder_lr=CFG.encoder_lr, 
    decoder_lr=CFG.decoder_lr,
    weight_decay=CFG.weight_decay)
optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

In [20]:
num_train_steps = int(len(mlm_data) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [22]:
CFG.global_step = 0
CFG.save_step = 25000

for epoch in range(CFG.epochs):
    CFG.epoch = epoch
    start = time.time()
        
    train_loss = trainer(model, train_loader, optimizer, scheduler, CFG)

    LOGGER.info(
        "Epoch {}: Train Loss {:.4f}, elapsed {:.4f}s".format(
            epoch + 1, train_loss, time.time() - start)
        )
    torch.save(
        model.state_dict(),
        OUTPUT_DIR + '{}-mlm-epoch-{}.bin'.format(
            CFG.model.replace('/', '-'),
            CFG.epoch + 1))

Epoch: [1][0/8882] Elapsed 0m 0s (remain 50m 2s) Loss: 12.2199 Grad: inf  LR: 0.00002000  
Epoch: [1][500/8882] Elapsed 2m 24s (remain 40m 20s) Loss: 5.3418 Grad: 88372.2891  LR: 0.00002000  
Epoch: [1][1000/8882] Elapsed 4m 49s (remain 37m 59s) Loss: 4.3881 Grad: 89154.5234  LR: 0.00002000  
Epoch: [1][1500/8882] Elapsed 7m 14s (remain 35m 34s) Loss: 3.8857 Grad: 77663.2656  LR: 0.00002000  
Epoch: [1][2000/8882] Elapsed 9m 38s (remain 33m 10s) Loss: 3.5395 Grad: 70033.1562  LR: 0.00002000  
Epoch: [1][2500/8882] Elapsed 12m 3s (remain 30m 45s) Loss: 3.2938 Grad: 178959.0781  LR: 0.00001999  
Epoch: [1][3000/8882] Elapsed 14m 27s (remain 28m 19s) Loss: 3.1100 Grad: 138382.5625  LR: 0.00001999  
Epoch: [1][3500/8882] Elapsed 16m 52s (remain 25m 55s) Loss: 2.9651 Grad: 179101.8750  LR: 0.00001999  
Epoch: [1][4000/8882] Elapsed 19m 16s (remain 23m 30s) Loss: 2.8452 Grad: 165747.7656  LR: 0.00001998  
Epoch: [1][4500/8882] Elapsed 21m 41s (remain 21m 6s) Loss: 2.7455 Grad: 285073.6875  L

Epoch 1: Train Loss 2.2637, elapsed 2544.2021s


Epoch: [1][8881/8882] Elapsed 42m 24s (remain 0m 0s) Loss: 2.2637 Grad: 40322.5742  LR: 0.00001991  
Epoch: [2][0/8882] Elapsed 0m 0s (remain 45m 31s) Loss: 1.8409 Grad: 570392.0000  LR: 0.00001991  
Epoch: [2][500/8882] Elapsed 2m 20s (remain 39m 11s) Loss: 1.6108 Grad: 602080.7500  LR: 0.00001990  
Epoch: [2][1000/8882] Elapsed 4m 40s (remain 36m 48s) Loss: 1.5941 Grad: 441119.0938  LR: 0.00001989  
Epoch: [2][1500/8882] Elapsed 7m 1s (remain 34m 30s) Loss: 1.5679 Grad: 510324.4688  LR: 0.00001988  
Epoch: [2][2000/8882] Elapsed 9m 21s (remain 32m 9s) Loss: 1.5535 Grad: 1189315.7500  LR: 0.00001987  
Epoch: [2][2500/8882] Elapsed 11m 40s (remain 29m 47s) Loss: 1.5457 Grad: 499084.5625  LR: 0.00001986  
Epoch: [2][3000/8882] Elapsed 14m 0s (remain 27m 27s) Loss: 1.5384 Grad: 310575.7812  LR: 0.00001984  
Epoch: [2][3500/8882] Elapsed 16m 20s (remain 25m 7s) Loss: 1.5302 Grad: 269342.4062  LR: 0.00001983  
Epoch: [2][4000/8882] Elapsed 18m 39s (remain 22m 46s) Loss: 1.5273 Grad: 121803

Epoch 2: Train Loss 1.4542, elapsed 2480.6889s


Epoch: [2][8881/8882] Elapsed 41m 20s (remain 0m 0s) Loss: 1.4542 Grad: 661538.3750  LR: 0.00001965  
Epoch: [3][0/8882] Elapsed 0m 0s (remain 45m 8s) Loss: 1.0780 Grad: 515136.4375  LR: 0.00001965  
Epoch: [3][500/8882] Elapsed 2m 19s (remain 38m 49s) Loss: 1.2921 Grad: 532908.7500  LR: 0.00001963  
Epoch: [3][1000/8882] Elapsed 4m 38s (remain 36m 33s) Loss: 1.2919 Grad: 440827.2188  LR: 0.00001961  
Epoch: [3][1500/8882] Elapsed 6m 57s (remain 34m 13s) Loss: 1.2939 Grad: 409911.8125  LR: 0.00001959  
Epoch: [3][2000/8882] Elapsed 9m 16s (remain 31m 53s) Loss: 1.2947 Grad: 240010.0312  LR: 0.00001957  
Epoch: [3][2500/8882] Elapsed 11m 35s (remain 29m 35s) Loss: 1.3032 Grad: 271265.8125  LR: 0.00001955  
Epoch: [3][3000/8882] Elapsed 13m 55s (remain 27m 16s) Loss: 1.2997 Grad: 276147.1562  LR: 0.00001952  
Epoch: [3][3500/8882] Elapsed 16m 14s (remain 24m 57s) Loss: 1.2971 Grad: 241681.3906  LR: 0.00001950  
Epoch: [3][4000/8882] Elapsed 18m 33s (remain 22m 38s) Loss: 1.2932 Grad: 416

Epoch 3: Train Loss 1.2579, elapsed 2471.6646s


Epoch: [3][8881/8882] Elapsed 41m 11s (remain 0m 0s) Loss: 1.2579 Grad: 867947.8750  LR: 0.00001922  
Epoch: [4][0/8882] Elapsed 0m 0s (remain 45m 6s) Loss: 1.0336 Grad: 523120.6875  LR: 0.00001922  
Epoch: [4][500/8882] Elapsed 2m 19s (remain 38m 58s) Loss: 1.1944 Grad: 488208.3438  LR: 0.00001919  
Epoch: [4][1000/8882] Elapsed 4m 38s (remain 36m 34s) Loss: 1.1894 Grad: 451028.3438  LR: 0.00001916  
Epoch: [4][1500/8882] Elapsed 6m 57s (remain 34m 13s) Loss: 1.1915 Grad: 471103.9062  LR: 0.00001913  
Epoch: [4][2000/8882] Elapsed 9m 16s (remain 31m 53s) Loss: 1.1884 Grad: 826846.2500  LR: 0.00001910  
Epoch: [4][2500/8882] Elapsed 11m 35s (remain 29m 35s) Loss: 1.1861 Grad: 381155.0000  LR: 0.00001907  
Epoch: [4][3000/8882] Elapsed 13m 54s (remain 27m 15s) Loss: 1.1832 Grad: 429648.0625  LR: 0.00001904  
Epoch: [4][3500/8882] Elapsed 16m 13s (remain 24m 56s) Loss: 1.1830 Grad: 456696.2500  LR: 0.00001901  
Epoch: [4][4000/8882] Elapsed 18m 32s (remain 22m 37s) Loss: 1.1787 Grad: 406

Epoch 4: Train Loss 1.1564, elapsed 2469.6004s


Epoch: [4][8881/8882] Elapsed 41m 9s (remain 0m 0s) Loss: 1.1564 Grad: 1419945.2500  LR: 0.00001863  
Epoch: [5][0/8882] Elapsed 0m 0s (remain 44m 23s) Loss: 1.1357 Grad: 427693.3438  LR: 0.00001863  
Epoch: [5][500/8882] Elapsed 2m 19s (remain 38m 56s) Loss: 1.1309 Grad: 587525.8125  LR: 0.00001859  
Epoch: [5][1000/8882] Elapsed 4m 38s (remain 36m 33s) Loss: 1.1214 Grad: 351896.3438  LR: 0.00001855  
Epoch: [5][1500/8882] Elapsed 6m 57s (remain 34m 11s) Loss: 1.1178 Grad: 549988.7500  LR: 0.00001851  
Epoch: [5][2000/8882] Elapsed 9m 16s (remain 31m 54s) Loss: 1.1197 Grad: 890172.3750  LR: 0.00001848  
Epoch: [5][2500/8882] Elapsed 11m 35s (remain 29m 35s) Loss: 1.1107 Grad: 923711.0000  LR: 0.00001844  
Epoch: [5][3000/8882] Elapsed 13m 55s (remain 27m 16s) Loss: 1.1061 Grad: 409044.9688  LR: 0.00001840  
Epoch: [5][3500/8882] Elapsed 16m 14s (remain 24m 57s) Loss: 1.1048 Grad: 457682.0000  LR: 0.00001835  
Epoch: [5][4000/8882] Elapsed 18m 33s (remain 22m 38s) Loss: 1.1033 Grad: 45

Epoch 5: Train Loss 1.1107, elapsed 2472.8222s


Epoch: [5][8881/8882] Elapsed 41m 12s (remain 0m 0s) Loss: 1.1107 Grad: 209301.3281  LR: 0.00001789  
Epoch: [6][0/8882] Elapsed 0m 0s (remain 43m 44s) Loss: 1.2515 Grad: 515217.2188  LR: 0.00001789  
Epoch: [6][500/8882] Elapsed 2m 19s (remain 38m 52s) Loss: 1.0926 Grad: 344880.9375  LR: 0.00001784  
Epoch: [6][1000/8882] Elapsed 4m 38s (remain 36m 30s) Loss: 1.0877 Grad: 469839.3125  LR: 0.00001779  
Epoch: [6][1500/8882] Elapsed 6m 57s (remain 34m 14s) Loss: 1.0796 Grad: 428581.8438  LR: 0.00001775  
Epoch: [6][2000/8882] Elapsed 9m 16s (remain 31m 53s) Loss: 1.0751 Grad: 929533.8750  LR: 0.00001770  
Epoch: [6][2500/8882] Elapsed 11m 35s (remain 29m 33s) Loss: 1.0681 Grad: 421754.9688  LR: 0.00001765  
Epoch: [6][3000/8882] Elapsed 13m 54s (remain 27m 15s) Loss: 1.0669 Grad: 568144.4375  LR: 0.00001760  
Epoch: [6][3500/8882] Elapsed 16m 13s (remain 24m 56s) Loss: 1.0651 Grad: 357577.4375  LR: 0.00001756  
Epoch: [6][4000/8882] Elapsed 18m 32s (remain 22m 37s) Loss: 1.0646 Grad: 33

Epoch 6: Train Loss 1.0773, elapsed 2473.9778s


Epoch: [6][8881/8882] Elapsed 41m 13s (remain 0m 0s) Loss: 1.0773 Grad: 115218.9531  LR: 0.00001701  
Epoch: [7][0/8882] Elapsed 0m 0s (remain 45m 15s) Loss: 0.7541 Grad: 362171.0000  LR: 0.00001701  
Epoch: [7][500/8882] Elapsed 2m 19s (remain 38m 50s) Loss: 1.0169 Grad: 407283.7812  LR: 0.00001695  
Epoch: [7][1000/8882] Elapsed 4m 38s (remain 36m 34s) Loss: 1.0089 Grad: 446485.1250  LR: 0.00001690  
Epoch: [7][1500/8882] Elapsed 6m 57s (remain 34m 14s) Loss: 1.0072 Grad: 470946.3125  LR: 0.00001685  
Epoch: [7][2000/8882] Elapsed 9m 16s (remain 31m 55s) Loss: 1.0079 Grad: 947581.3125  LR: 0.00001679  
Epoch: [7][2500/8882] Elapsed 11m 36s (remain 29m 36s) Loss: 1.0090 Grad: 300586.2500  LR: 0.00001674  
Epoch: [7][3000/8882] Elapsed 13m 55s (remain 27m 16s) Loss: 1.0073 Grad: 475308.3438  LR: 0.00001668  
Epoch: [7][3500/8882] Elapsed 16m 14s (remain 24m 57s) Loss: 1.0070 Grad: 474177.8438  LR: 0.00001662  
Epoch: [7][4000/8882] Elapsed 18m 33s (remain 22m 38s) Loss: 1.0089 Grad: 44

Epoch 7: Train Loss 1.0156, elapsed 2472.7728s


Epoch: [7][8881/8882] Elapsed 41m 12s (remain 0m 0s) Loss: 1.0156 Grad: 345009.6250  LR: 0.00001600  
Epoch: [8][0/8882] Elapsed 0m 0s (remain 44m 51s) Loss: 1.4057 Grad: 528218.5000  LR: 0.00001600  
Epoch: [8][500/8882] Elapsed 2m 19s (remain 39m 1s) Loss: 1.0030 Grad: 351230.1250  LR: 0.00001594  
Epoch: [8][1000/8882] Elapsed 4m 38s (remain 36m 35s) Loss: 0.9859 Grad: 181243.9844  LR: 0.00001588  
Epoch: [8][1500/8882] Elapsed 6m 57s (remain 34m 13s) Loss: 0.9918 Grad: 120553.6406  LR: 0.00001582  
Epoch: [8][2000/8882] Elapsed 9m 16s (remain 31m 55s) Loss: 0.9993 Grad: 112168.5234  LR: 0.00001576  
Epoch: [8][2500/8882] Elapsed 11m 36s (remain 29m 35s) Loss: 1.0083 Grad: 48980.0117  LR: 0.00001570  
Epoch: [8][3000/8882] Elapsed 13m 54s (remain 27m 16s) Loss: 1.0160 Grad: 25369.5645  LR: 0.00001564  
Epoch: [8][3500/8882] Elapsed 16m 14s (remain 24m 57s) Loss: 1.0223 Grad: 26950.9668  LR: 0.00001558  
Epoch: [8][4000/8882] Elapsed 18m 33s (remain 22m 38s) Loss: 1.0243 Grad: 27361.

Epoch 8: Train Loss 1.0057, elapsed 2472.7191s


Epoch: [8][8881/8882] Elapsed 41m 12s (remain 0m 0s) Loss: 1.0057 Grad: 344239.5312  LR: 0.00001489  
Epoch: [9][0/8882] Elapsed 0m 0s (remain 44m 50s) Loss: 0.7049 Grad: 394879.3750  LR: 0.00001489  
Epoch: [9][500/8882] Elapsed 2m 19s (remain 38m 48s) Loss: 0.9530 Grad: 233606.9062  LR: 0.00001483  
Epoch: [9][1000/8882] Elapsed 4m 37s (remain 36m 28s) Loss: 0.9546 Grad: 173878.2188  LR: 0.00001476  
Epoch: [9][1500/8882] Elapsed 6m 57s (remain 34m 12s) Loss: 0.9547 Grad: 185162.7969  LR: 0.00001470  
Epoch: [9][2000/8882] Elapsed 9m 16s (remain 31m 53s) Loss: 0.9576 Grad: 254563.2812  LR: 0.00001463  
Epoch: [9][2500/8882] Elapsed 11m 35s (remain 29m 34s) Loss: 0.9561 Grad: 523314.8125  LR: 0.00001457  
Epoch: [9][3000/8882] Elapsed 13m 55s (remain 27m 17s) Loss: 0.9567 Grad: 378725.6875  LR: 0.00001450  
Epoch: [9][3500/8882] Elapsed 16m 14s (remain 24m 58s) Loss: 0.9545 Grad: 407960.1562  LR: 0.00001443  
Epoch: [9][4000/8882] Elapsed 18m 34s (remain 22m 39s) Loss: 0.9550 Grad: 32

Epoch 9: Train Loss 0.9483, elapsed 2477.8738s


Epoch: [9][8881/8882] Elapsed 41m 17s (remain 0m 0s) Loss: 0.9483 Grad: 234696.2500  LR: 0.00001370  
Epoch: [10][0/8882] Elapsed 0m 0s (remain 45m 9s) Loss: 0.9182 Grad: 386109.7188  LR: 0.00001370  
Epoch: [10][500/8882] Elapsed 2m 19s (remain 38m 54s) Loss: 0.9322 Grad: 360702.8125  LR: 0.00001363  
Epoch: [10][1000/8882] Elapsed 4m 39s (remain 36m 40s) Loss: 0.9282 Grad: 439410.9375  LR: 0.00001356  
Epoch: [10][1500/8882] Elapsed 6m 58s (remain 34m 18s) Loss: 0.9323 Grad: 165555.0625  LR: 0.00001349  
Epoch: [10][2000/8882] Elapsed 9m 17s (remain 31m 57s) Loss: 0.9301 Grad: 220607.1875  LR: 0.00001342  
Epoch: [10][2500/8882] Elapsed 11m 36s (remain 29m 37s) Loss: 0.9331 Grad: 175563.1094  LR: 0.00001335  
Epoch: [10][3000/8882] Elapsed 13m 55s (remain 27m 17s) Loss: 0.9295 Grad: 83053.7266  LR: 0.00001328  
Epoch: [10][3500/8882] Elapsed 16m 12s (remain 24m 55s) Loss: 0.9321 Grad: 100628.5547  LR: 0.00001321  
Epoch: [10][4000/8882] Elapsed 18m 31s (remain 22m 35s) Loss: 0.9338 G

Epoch 10: Train Loss 0.9270, elapsed 2457.8330s


Epoch: [10][8881/8882] Elapsed 40m 57s (remain 0m 0s) Loss: 0.9270 Grad: 567497.4375  LR: 0.00001244  
