# 🏋️ Model Training

## ⚙️ Setup 

### 📚 Importing Libraries

Importing from packages

In [1]:
import os
import gc
import pandas as pd
import time
import numpy as np
import torch
from torch.optim import AdamW
from torch import nn
from transformers import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("../")

Importing user defined packages

In [3]:
from lib.config import Config
from lib.paths import Paths
from lib.model.deberta import CustomModel
from lib.model.epoch_functions import train_epoch, valid_epoch
from lib.model.utils import get_score
from lib.utils.utils import get_logger, seed_everything
from lib.data import read_data_loader_from_disk

In [4]:
seed_everything(Config.RANDOM_SEED)

## 📖 Definitions

### 🌎 Global Variables

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
LOGGER = get_logger(Paths.MODEL_OUTPUT_PATH)

### 🛠️ Functions

In [7]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in model.named_parameters() if "model" not in n],
            "lr": decoder_lr,
            "weight_decay": 0.0,
        },
    ]

    return optimizer_parameters

In [8]:
def get_scheduler(cfg: Config, optimizer, num_train_steps):
    if cfg.SCHEDULER == "linear":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
        )
    
    if cfg.SCHEDULER == "cosine":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
            num_cycles=cfg.NUM_CYCLES,
        )

In [9]:
def get_model_optimizer_and_scheduler(train_loader):
    model = CustomModel(Config, config_path=None, pretrained=True)
    torch.save(model.config, Paths.MODEL_OUTPUT_PATH + "/config.pth")
    model.to(device)

    optimizer = AdamW(
        get_optimizer_params(
            model,
            encoder_lr=Config.ENCODER_LR,
            decoder_lr=Config.DECODER_LR,
            weight_decay=Config.WEIGHT_DECAY,
        ),
        lr=Config.ENCODER_LR,
        eps=Config.EPS,
        betas=Config.BETAS,
    )

    num_train_steps = int(len(train_loader) / Config.BATCH_SIZE_TRAIN * Config.EPOCHS)
    scheduler = get_scheduler(Config, optimizer, num_train_steps)
    return model, optimizer, scheduler

In [10]:
def train_loop(fold):
    LOGGER.info(f"========== Fold: {fold} training ==========")

    # ======== DATA LOADER ==========
    train_loader, valid_loader = read_data_loader_from_disk(fold)
    valid_fold = pd.read_csv(os.path.join(Paths.DATA_LOADER_PATH, f"valid_{fold}.csv"))
    valid_labels = valid_fold["score"].values

    # ======== MODEL ==========
    model, optimizer, scheduler = get_model_optimizer_and_scheduler(train_loader)

    # ======= LOSS ==========
    # criterion = RMSELoss(reduction="mean") # nn.SmoothL1Loss(reduction='mean')
    criterion = nn.CrossEntropyLoss()
    softmax = nn.Softmax(dim=1)

    best_score = -np.inf
    # ====== ITERATE EPOCHS ========
    for epoch in range(Config.EPOCHS):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_loss = train_epoch(
            train_loader, model, criterion, optimizer, epoch, scheduler, device
        )

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(
            valid_loader, model, criterion, device
        )
        predictions = prediction_dict["predictions"]
        _, predictions = torch.max(softmax(torch.tensor(predictions)), dim=1)

        # ======= SCORING ==========
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "predictions": predictions},
                Paths.MODEL_OUTPUT_PATH
                + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
            )

    predictions = torch.load(
        Paths.MODEL_OUTPUT_PATH
        + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    valid_fold["pred_score"] = predictions

    del model, optimizer, scheduler, criterion, softmax
    torch.cuda.empty_cache()
    gc.collect()

    return valid_fold

In [11]:
def get_result(oof_df):
    labels = oof_df["score"].values
    preds = oof_df["pred_score"].values
    score = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}')

## 🏁 Start Training

In [12]:
if Config.TRAIN:
    oof_df = pd.DataFrame()

    for fold in range(Config.N_FOLDS):
        if fold in Config.TRAIN_FOLDS:
            _oof_df = train_loop(fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== Fold: {fold} result ==========")
            get_result(_oof_df)

    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    oof_df.to_csv(Paths.MODEL_OUTPUT_PATH + "/oof_df.csv", index=False)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/1730 [00:01<28:50,  1.00s/train_batch]

Epoch: [1][0/1730] Elapsed 0m 1s (remain 28m 51s) Loss: 2.0245 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:07<08:54,  3.20train_batch/s]

Epoch: [1][20/1730] Elapsed 0m 7s (remain 9m 45s) Loss: 1.5646 Grad: 269830.2500  LR: 0.00001988  


Train:   2%|▏         | 41/1730 [00:14<10:46,  2.61train_batch/s]

Epoch: [1][40/1730] Elapsed 0m 14s (remain 9m 53s) Loss: 1.5059 Grad: 116084.2031  LR: 0.00001956  


Train:   4%|▎         | 61/1730 [00:21<07:16,  3.82train_batch/s]

Epoch: [1][60/1730] Elapsed 0m 21s (remain 9m 35s) Loss: 1.4250 Grad: 95744.1797  LR: 0.00001903  


Train:   5%|▍         | 81/1730 [00:28<07:46,  3.54train_batch/s]

Epoch: [1][80/1730] Elapsed 0m 27s (remain 9m 29s) Loss: 1.4086 Grad: 44502.9805  LR: 0.00001831  


Train:   6%|▌         | 101/1730 [00:35<09:48,  2.77train_batch/s]

Epoch: [1][100/1730] Elapsed 0m 35s (remain 9m 30s) Loss: 1.3895 Grad: 110513.4688  LR: 0.00001742  


Train:   7%|▋         | 121/1730 [00:42<10:58,  2.44train_batch/s]

Epoch: [1][120/1730] Elapsed 0m 42s (remain 9m 27s) Loss: 1.3536 Grad: 228435.4531  LR: 0.00001637  


Train:   8%|▊         | 141/1730 [00:50<11:20,  2.34train_batch/s]

Epoch: [1][140/1730] Elapsed 0m 50s (remain 9m 29s) Loss: 1.3091 Grad: 81365.0938  LR: 0.00001519  


Train:   9%|▉         | 161/1730 [00:57<11:45,  2.22train_batch/s]

Epoch: [1][160/1730] Elapsed 0m 57s (remain 9m 24s) Loss: 1.2924 Grad: 52794.6719  LR: 0.00001389  


Train:  10%|█         | 181/1730 [01:06<11:19,  2.28train_batch/s]

Epoch: [1][180/1730] Elapsed 1m 5s (remain 9m 24s) Loss: 1.2621 Grad: 46812.6133  LR: 0.00001252  


Train:  12%|█▏        | 201/1730 [01:13<10:55,  2.33train_batch/s]

Epoch: [1][200/1730] Elapsed 1m 13s (remain 9m 21s) Loss: 1.2322 Grad: 53148.5508  LR: 0.00001109  


Train:  13%|█▎        | 221/1730 [01:22<09:16,  2.71train_batch/s]

Epoch: [1][220/1730] Elapsed 1m 22s (remain 9m 22s) Loss: 1.2017 Grad: 35743.9219  LR: 0.00000964  


Train:  14%|█▍        | 241/1730 [01:29<08:42,  2.85train_batch/s]

Epoch: [1][240/1730] Elapsed 1m 29s (remain 9m 15s) Loss: 1.1924 Grad: 49729.7227  LR: 0.00000819  


Train:  15%|█▌        | 261/1730 [01:36<08:28,  2.89train_batch/s]

Epoch: [1][260/1730] Elapsed 1m 36s (remain 9m 2s) Loss: 1.1758 Grad: 74444.5938  LR: 0.00000679  


Train:  16%|█▌        | 281/1730 [01:43<10:20,  2.34train_batch/s]

Epoch: [1][280/1730] Elapsed 1m 43s (remain 8m 54s) Loss: 1.1540 Grad: 45949.5938  LR: 0.00000545  


Train:  17%|█▋        | 301/1730 [01:51<08:51,  2.69train_batch/s]

Epoch: [1][300/1730] Elapsed 1m 51s (remain 8m 49s) Loss: 1.1453 Grad: 67832.1250  LR: 0.00000420  


Train:  19%|█▊        | 321/1730 [01:58<07:51,  2.99train_batch/s]

Epoch: [1][320/1730] Elapsed 1m 58s (remain 8m 39s) Loss: 1.1330 Grad: 87673.5078  LR: 0.00000308  


Train:  20%|█▉        | 341/1730 [02:06<08:34,  2.70train_batch/s]

Epoch: [1][340/1730] Elapsed 2m 6s (remain 8m 34s) Loss: 1.1162 Grad: 74222.8594  LR: 0.00000211  


Train:  21%|██        | 361/1730 [02:14<09:03,  2.52train_batch/s]

Epoch: [1][360/1730] Elapsed 2m 14s (remain 8m 29s) Loss: 1.1111 Grad: 46552.1602  LR: 0.00000130  


Train:  22%|██▏       | 381/1730 [02:22<08:25,  2.67train_batch/s]

Epoch: [1][380/1730] Elapsed 2m 22s (remain 8m 23s) Loss: 1.0971 Grad: 108509.2031  LR: 0.00000068  


Train:  23%|██▎       | 401/1730 [02:29<07:58,  2.78train_batch/s]

Epoch: [1][400/1730] Elapsed 2m 29s (remain 8m 15s) Loss: 1.0876 Grad: 79290.2656  LR: 0.00000025  


Train:  24%|██▍       | 421/1730 [02:37<09:33,  2.28train_batch/s]

Epoch: [1][420/1730] Elapsed 2m 37s (remain 8m 8s) Loss: 1.0808 Grad: 45566.2891  LR: 0.00000003  


Train:  25%|██▌       | 441/1730 [02:44<06:26,  3.33train_batch/s]

Epoch: [1][440/1730] Elapsed 2m 44s (remain 8m 0s) Loss: 1.0720 Grad: 90696.1797  LR: 0.00000002  


Train:  27%|██▋       | 461/1730 [02:51<07:06,  2.98train_batch/s]

Epoch: [1][460/1730] Elapsed 2m 51s (remain 7m 52s) Loss: 1.0632 Grad: 79573.2188  LR: 0.00000022  


Train:  28%|██▊       | 481/1730 [02:58<07:30,  2.77train_batch/s]

Epoch: [1][480/1730] Elapsed 2m 58s (remain 7m 43s) Loss: 1.0578 Grad: 53251.6172  LR: 0.00000063  


Train:  29%|██▉       | 501/1730 [03:05<07:52,  2.60train_batch/s]

Epoch: [1][500/1730] Elapsed 3m 5s (remain 7m 35s) Loss: 1.0511 Grad: 87892.6250  LR: 0.00000123  


Train:  30%|███       | 521/1730 [03:14<08:56,  2.25train_batch/s]

Epoch: [1][520/1730] Elapsed 3m 14s (remain 7m 31s) Loss: 1.0450 Grad: 57141.5430  LR: 0.00000202  


Train:  31%|███▏      | 541/1730 [03:22<08:13,  2.41train_batch/s]

Epoch: [1][540/1730] Elapsed 3m 22s (remain 7m 24s) Loss: 1.0431 Grad: 96327.6562  LR: 0.00000298  


Train:  32%|███▏      | 561/1730 [03:30<09:31,  2.04train_batch/s]

Epoch: [1][560/1730] Elapsed 3m 30s (remain 7m 19s) Loss: 1.0380 Grad: 46589.4961  LR: 0.00000409  


Train:  34%|███▎      | 581/1730 [03:37<06:59,  2.74train_batch/s]

Epoch: [1][580/1730] Elapsed 3m 37s (remain 7m 10s) Loss: 1.0312 Grad: 109939.2891  LR: 0.00000532  


Train:  35%|███▍      | 601/1730 [03:45<06:55,  2.72train_batch/s]

Epoch: [1][600/1730] Elapsed 3m 45s (remain 7m 3s) Loss: 1.0273 Grad: 63094.3398  LR: 0.00000665  


Train:  36%|███▌      | 621/1730 [03:52<05:44,  3.22train_batch/s]

Epoch: [1][620/1730] Elapsed 3m 52s (remain 6m 55s) Loss: 1.0241 Grad: 53048.1641  LR: 0.00000805  


Train:  37%|███▋      | 641/1730 [03:59<06:23,  2.84train_batch/s]

Epoch: [1][640/1730] Elapsed 3m 59s (remain 6m 47s) Loss: 1.0206 Grad: 53179.9492  LR: 0.00000949  


Train:  38%|███▊      | 661/1730 [04:07<06:09,  2.89train_batch/s]

Epoch: [1][660/1730] Elapsed 4m 7s (remain 6m 39s) Loss: 1.0212 Grad: 46656.3516  LR: 0.00001094  


Train:  39%|███▉      | 681/1730 [04:14<05:18,  3.29train_batch/s]

Epoch: [1][680/1730] Elapsed 4m 14s (remain 6m 32s) Loss: 1.0215 Grad: 62497.5312  LR: 0.00001238  


Train:  41%|████      | 701/1730 [04:21<06:46,  2.53train_batch/s]

Epoch: [1][700/1730] Elapsed 4m 21s (remain 6m 24s) Loss: 1.0193 Grad: 42291.3320  LR: 0.00001376  


Train:  42%|████▏     | 721/1730 [04:29<07:20,  2.29train_batch/s]

Epoch: [1][720/1730] Elapsed 4m 29s (remain 6m 17s) Loss: 1.0183 Grad: 102344.6719  LR: 0.00001506  


Train:  43%|████▎     | 741/1730 [04:38<06:39,  2.47train_batch/s]

Epoch: [1][740/1730] Elapsed 4m 38s (remain 6m 12s) Loss: 1.0148 Grad: 53453.2070  LR: 0.00001626  


Train:  44%|████▍     | 761/1730 [04:47<07:42,  2.10train_batch/s]

Epoch: [1][760/1730] Elapsed 4m 47s (remain 6m 5s) Loss: 1.0141 Grad: 91759.2422  LR: 0.00001732  


Train:  45%|████▌     | 781/1730 [04:54<05:33,  2.85train_batch/s]

Epoch: [1][780/1730] Elapsed 4m 54s (remain 5m 58s) Loss: 1.0159 Grad: 34200.3477  LR: 0.00001823  


Train:  46%|████▋     | 801/1730 [05:02<06:18,  2.45train_batch/s]

Epoch: [1][800/1730] Elapsed 5m 2s (remain 5m 50s) Loss: 1.0156 Grad: 43280.7188  LR: 0.00001897  


Train:  48%|████▊     | 822/1730 [05:10<05:24,  2.80train_batch/s]

Epoch: [1][820/1730] Elapsed 5m 9s (remain 5m 43s) Loss: 1.0157 Grad: 80149.0234  LR: 0.00001952  


Train:  49%|████▊     | 841/1730 [05:17<05:24,  2.74train_batch/s]

Epoch: [1][840/1730] Elapsed 5m 17s (remain 5m 35s) Loss: 1.0189 Grad: 39889.7578  LR: 0.00001986  


Train:  50%|████▉     | 861/1730 [05:23<05:41,  2.55train_batch/s]

Epoch: [1][860/1730] Elapsed 5m 23s (remain 5m 26s) Loss: 1.0172 Grad: 71554.8359  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [05:30<04:56,  2.86train_batch/s]

Epoch: [1][880/1730] Elapsed 5m 30s (remain 5m 18s) Loss: 1.0142 Grad: 30230.1934  LR: 0.00001992  


Train:  52%|█████▏    | 901/1730 [05:38<05:06,  2.70train_batch/s]

Epoch: [1][900/1730] Elapsed 5m 38s (remain 5m 11s) Loss: 1.0114 Grad: 65373.8086  LR: 0.00001964  


Train:  53%|█████▎    | 921/1730 [05:45<05:23,  2.50train_batch/s]

Epoch: [1][920/1730] Elapsed 5m 45s (remain 5m 3s) Loss: 1.0090 Grad: 50814.5156  LR: 0.00001915  


Train:  54%|█████▍    | 941/1730 [05:52<05:06,  2.58train_batch/s]

Epoch: [1][940/1730] Elapsed 5m 52s (remain 4m 55s) Loss: 1.0056 Grad: 45512.3633  LR: 0.00001847  


Train:  56%|█████▌    | 961/1730 [06:00<04:38,  2.76train_batch/s]

Epoch: [1][960/1730] Elapsed 6m 0s (remain 4m 48s) Loss: 1.0042 Grad: 44528.8984  LR: 0.00001761  


Train:  57%|█████▋    | 981/1730 [06:07<05:41,  2.19train_batch/s]

Epoch: [1][980/1730] Elapsed 6m 7s (remain 4m 40s) Loss: 1.0015 Grad: 53656.3281  LR: 0.00001659  


Train:  58%|█████▊    | 1001/1730 [06:14<03:31,  3.45train_batch/s]

Epoch: [1][1000/1730] Elapsed 6m 14s (remain 4m 32s) Loss: 0.9982 Grad: 50442.5781  LR: 0.00001543  


Train:  59%|█████▉    | 1021/1730 [06:22<04:28,  2.64train_batch/s]

Epoch: [1][1020/1730] Elapsed 6m 22s (remain 4m 25s) Loss: 0.9976 Grad: 61543.1211  LR: 0.00001416  


Train:  60%|██████    | 1041/1730 [06:28<02:50,  4.04train_batch/s]

Epoch: [1][1040/1730] Elapsed 6m 28s (remain 4m 16s) Loss: 0.9952 Grad: 132804.9688  LR: 0.00001280  


Train:  61%|██████▏   | 1061/1730 [06:36<04:46,  2.34train_batch/s]

Epoch: [1][1060/1730] Elapsed 6m 36s (remain 4m 10s) Loss: 0.9927 Grad: 41819.6055  LR: 0.00001138  


Train:  62%|██████▏   | 1081/1730 [06:43<03:51,  2.80train_batch/s]

Epoch: [1][1080/1730] Elapsed 6m 43s (remain 4m 2s) Loss: 0.9906 Grad: 105340.8672  LR: 0.00000993  


Train:  64%|██████▎   | 1101/1730 [06:51<03:51,  2.71train_batch/s]

Epoch: [1][1100/1730] Elapsed 6m 51s (remain 3m 54s) Loss: 0.9880 Grad: 120136.7578  LR: 0.00000848  


Train:  65%|██████▍   | 1121/1730 [06:57<03:52,  2.62train_batch/s]

Epoch: [1][1120/1730] Elapsed 6m 57s (remain 3m 47s) Loss: 0.9859 Grad: 111096.0859  LR: 0.00000706  


Train:  66%|██████▌   | 1141/1730 [07:05<03:42,  2.65train_batch/s]

Epoch: [1][1140/1730] Elapsed 7m 5s (remain 3m 39s) Loss: 0.9849 Grad: 94221.4688  LR: 0.00000571  


Train:  67%|██████▋   | 1161/1730 [07:12<03:07,  3.04train_batch/s]

Epoch: [1][1160/1730] Elapsed 7m 12s (remain 3m 31s) Loss: 0.9817 Grad: 60420.0820  LR: 0.00000444  


Train:  68%|██████▊   | 1181/1730 [07:19<03:58,  2.30train_batch/s]

Epoch: [1][1180/1730] Elapsed 7m 19s (remain 3m 24s) Loss: 0.9795 Grad: 103595.3203  LR: 0.00000330  


Train:  69%|██████▉   | 1201/1730 [07:26<02:58,  2.97train_batch/s]

Epoch: [1][1200/1730] Elapsed 7m 26s (remain 3m 16s) Loss: 0.9769 Grad: 67061.2422  LR: 0.00000229  


Train:  71%|███████   | 1221/1730 [07:35<03:22,  2.51train_batch/s]

Epoch: [1][1220/1730] Elapsed 7m 35s (remain 3m 9s) Loss: 0.9746 Grad: 69555.7578  LR: 0.00000145  


Train:  72%|███████▏  | 1241/1730 [07:42<03:28,  2.35train_batch/s]

Epoch: [1][1240/1730] Elapsed 7m 42s (remain 3m 2s) Loss: 0.9707 Grad: 33749.1211  LR: 0.00000079  


Train:  73%|███████▎  | 1261/1730 [07:49<03:05,  2.52train_batch/s]

Epoch: [1][1260/1730] Elapsed 7m 49s (remain 2m 54s) Loss: 0.9677 Grad: 38770.4219  LR: 0.00000032  


Train:  74%|███████▍  | 1281/1730 [07:57<03:12,  2.33train_batch/s]

Epoch: [1][1280/1730] Elapsed 7m 57s (remain 2m 47s) Loss: 0.9643 Grad: 94021.4062  LR: 0.00000006  


Train:  75%|███████▌  | 1301/1730 [08:06<02:43,  2.62train_batch/s]

Epoch: [1][1300/1730] Elapsed 8m 6s (remain 2m 40s) Loss: 0.9615 Grad: 44679.1172  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [08:13<02:14,  3.03train_batch/s]

Epoch: [1][1320/1730] Elapsed 8m 13s (remain 2m 32s) Loss: 0.9590 Grad: 76322.1328  LR: 0.00000016  


Train:  78%|███████▊  | 1341/1730 [08:21<03:06,  2.09train_batch/s]

Epoch: [1][1340/1730] Elapsed 8m 21s (remain 2m 25s) Loss: 0.9575 Grad: 89822.4219  LR: 0.00000053  


Train:  79%|███████▊  | 1361/1730 [08:29<02:38,  2.33train_batch/s]

Epoch: [1][1360/1730] Elapsed 8m 29s (remain 2m 18s) Loss: 0.9541 Grad: 48740.2227  LR: 0.00000110  


Train:  80%|███████▉  | 1381/1730 [08:36<02:03,  2.83train_batch/s]

Epoch: [1][1380/1730] Elapsed 8m 36s (remain 2m 10s) Loss: 0.9507 Grad: 54711.1953  LR: 0.00000185  


Train:  81%|████████  | 1401/1730 [08:43<01:49,  3.00train_batch/s]

Epoch: [1][1400/1730] Elapsed 8m 43s (remain 2m 2s) Loss: 0.9488 Grad: 65816.7344  LR: 0.00000278  


Train:  82%|████████▏ | 1422/1730 [08:51<01:34,  3.25train_batch/s]

Epoch: [1][1420/1730] Elapsed 8m 51s (remain 1m 55s) Loss: 0.9461 Grad: 24048.5117  LR: 0.00000385  


Train:  83%|████████▎ | 1441/1730 [08:57<01:37,  2.97train_batch/s]

Epoch: [1][1440/1730] Elapsed 8m 57s (remain 1m 47s) Loss: 0.9439 Grad: 91810.5078  LR: 0.00000506  


Train:  84%|████████▍ | 1461/1730 [09:05<01:31,  2.95train_batch/s]

Epoch: [1][1460/1730] Elapsed 9m 5s (remain 1m 40s) Loss: 0.9420 Grad: 85980.6562  LR: 0.00000638  


Train:  86%|████████▌ | 1481/1730 [09:13<01:35,  2.60train_batch/s]

Epoch: [1][1480/1730] Elapsed 9m 13s (remain 1m 32s) Loss: 0.9398 Grad: 121214.6953  LR: 0.00000776  


Train:  87%|████████▋ | 1501/1730 [09:20<01:29,  2.57train_batch/s]

Epoch: [1][1500/1730] Elapsed 9m 20s (remain 1m 25s) Loss: 0.9379 Grad: 94374.5625  LR: 0.00000920  


Train:  88%|████████▊ | 1521/1730 [09:29<01:43,  2.02train_batch/s]

Epoch: [1][1520/1730] Elapsed 9m 29s (remain 1m 18s) Loss: 0.9364 Grad: 135518.7344  LR: 0.00001065  


Train:  89%|████████▉ | 1541/1730 [09:36<01:08,  2.76train_batch/s]

Epoch: [1][1540/1730] Elapsed 9m 36s (remain 1m 10s) Loss: 0.9360 Grad: 56391.6367  LR: 0.00001209  


Train:  90%|█████████ | 1561/1730 [09:44<01:10,  2.41train_batch/s]

Epoch: [1][1560/1730] Elapsed 9m 44s (remain 1m 3s) Loss: 0.9355 Grad: 53567.1719  LR: 0.00001349  


Train:  91%|█████████▏| 1581/1730 [09:52<01:13,  2.02train_batch/s]

Epoch: [1][1580/1730] Elapsed 9m 52s (remain 0m 55s) Loss: 0.9354 Grad: 53878.8164  LR: 0.00001481  


Train:  93%|█████████▎| 1602/1730 [10:00<00:49,  2.57train_batch/s]

Epoch: [1][1600/1730] Elapsed 10m 0s (remain 0m 48s) Loss: 0.9337 Grad: 47090.1328  LR: 0.00001603  


Train:  94%|█████████▎| 1621/1730 [10:08<00:48,  2.26train_batch/s]

Epoch: [1][1620/1730] Elapsed 10m 8s (remain 0m 40s) Loss: 0.9332 Grad: 82850.2188  LR: 0.00001712  


Train:  95%|█████████▍| 1641/1730 [10:16<00:34,  2.56train_batch/s]

Epoch: [1][1640/1730] Elapsed 10m 16s (remain 0m 33s) Loss: 0.9321 Grad: 29029.9180  LR: 0.00001806  


Train:  96%|█████████▌| 1661/1730 [10:23<00:26,  2.57train_batch/s]

Epoch: [1][1660/1730] Elapsed 10m 23s (remain 0m 25s) Loss: 0.9333 Grad: 157629.5625  LR: 0.00001884  


Train:  97%|█████████▋| 1681/1730 [10:31<00:20,  2.37train_batch/s]

Epoch: [1][1680/1730] Elapsed 10m 31s (remain 0m 18s) Loss: 0.9346 Grad: 41847.5039  LR: 0.00001942  


Train:  98%|█████████▊| 1701/1730 [10:39<00:09,  3.11train_batch/s]

Epoch: [1][1700/1730] Elapsed 10m 39s (remain 0m 10s) Loss: 0.9333 Grad: 73605.0469  LR: 0.00001981  


Train:  99%|█████████▉| 1721/1730 [10:45<00:02,  3.05train_batch/s]

Epoch: [1][1720/1730] Elapsed 10m 45s (remain 0m 3s) Loss: 0.9334 Grad: 158303.1094  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [10:49<00:00,  2.66train_batch/s]


Epoch: [1][1729/1730] Elapsed 10m 49s (remain 0m 0s) Loss: 0.9334 Grad: 59072.8789  LR: 0.00002000  


Validation:   0%|          | 4/866 [00:00<01:23, 10.32valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 3m 23s) Loss: 0.7582 


Validation:   3%|▎         | 22/866 [00:01<01:07, 12.52valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 11s) Loss: 0.7949 


Validation:   5%|▍         | 42/866 [00:03<01:09, 11.85valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 9s) Loss: 0.8762 


Validation:   7%|▋         | 62/866 [00:05<01:00, 13.25valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 1m 5s) Loss: 0.8572 


Validation:   9%|▉         | 82/866 [00:06<01:07, 11.57valid_batch/s]

EVAL: [80/866] Elapsed 0m 6s (remain 1m 3s) Loss: 0.8287 


Validation:  12%|█▏        | 102/866 [00:08<00:51, 14.82valid_batch/s]

EVAL: [100/866] Elapsed 0m 8s (remain 1m 0s) Loss: 0.8140 


Validation:  14%|█▍        | 123/866 [00:09<00:55, 13.38valid_batch/s]

EVAL: [120/866] Elapsed 0m 9s (remain 0m 59s) Loss: 0.8327 


Validation:  17%|█▋        | 144/866 [00:11<00:51, 13.96valid_batch/s]

EVAL: [140/866] Elapsed 0m 11s (remain 0m 57s) Loss: 0.8359 


Validation:  19%|█▊        | 162/866 [00:12<00:56, 12.48valid_batch/s]

EVAL: [160/866] Elapsed 0m 12s (remain 0m 55s) Loss: 0.8370 


Validation:  21%|██        | 182/866 [00:14<00:51, 13.23valid_batch/s]

EVAL: [180/866] Elapsed 0m 14s (remain 0m 53s) Loss: 0.8352 


Validation:  23%|██▎       | 203/866 [00:15<00:51, 12.90valid_batch/s]

EVAL: [200/866] Elapsed 0m 15s (remain 0m 51s) Loss: 0.8155 


Validation:  26%|██▌       | 223/866 [00:17<00:55, 11.69valid_batch/s]

EVAL: [220/866] Elapsed 0m 17s (remain 0m 50s) Loss: 0.8347 


Validation:  28%|██▊       | 241/866 [00:18<00:56, 11.09valid_batch/s]

EVAL: [240/866] Elapsed 0m 18s (remain 0m 49s) Loss: 0.8399 


Validation:  30%|███       | 262/866 [00:20<00:46, 12.92valid_batch/s]

EVAL: [260/866] Elapsed 0m 20s (remain 0m 48s) Loss: 0.8500 


Validation:  33%|███▎      | 284/866 [00:22<00:40, 14.44valid_batch/s]

EVAL: [280/866] Elapsed 0m 22s (remain 0m 46s) Loss: 0.8542 


Validation:  35%|███▍      | 302/866 [00:23<00:38, 14.75valid_batch/s]

EVAL: [300/866] Elapsed 0m 23s (remain 0m 44s) Loss: 0.8554 


Validation:  37%|███▋      | 324/866 [00:25<00:42, 12.64valid_batch/s]

EVAL: [320/866] Elapsed 0m 25s (remain 0m 43s) Loss: 0.8542 


Validation:  39%|███▉      | 342/866 [00:27<00:43, 12.09valid_batch/s]

EVAL: [340/866] Elapsed 0m 27s (remain 0m 41s) Loss: 0.8476 


Validation:  42%|████▏     | 364/866 [00:28<00:34, 14.39valid_batch/s]

EVAL: [360/866] Elapsed 0m 28s (remain 0m 39s) Loss: 0.8446 


Validation:  44%|████▍     | 382/866 [00:30<00:39, 12.34valid_batch/s]

EVAL: [380/866] Elapsed 0m 29s (remain 0m 38s) Loss: 0.8420 


Validation:  46%|████▋     | 402/866 [00:31<00:34, 13.48valid_batch/s]

EVAL: [400/866] Elapsed 0m 31s (remain 0m 36s) Loss: 0.8374 


Validation:  49%|████▉     | 423/866 [00:33<00:37, 11.92valid_batch/s]

EVAL: [420/866] Elapsed 0m 33s (remain 0m 35s) Loss: 0.8423 


Validation:  51%|█████     | 442/866 [00:34<00:40, 10.39valid_batch/s]

EVAL: [440/866] Elapsed 0m 34s (remain 0m 33s) Loss: 0.8400 


Validation:  53%|█████▎    | 463/866 [00:36<00:32, 12.40valid_batch/s]

EVAL: [460/866] Elapsed 0m 36s (remain 0m 32s) Loss: 0.8372 


Validation:  56%|█████▌    | 482/866 [00:38<00:32, 11.88valid_batch/s]

EVAL: [480/866] Elapsed 0m 38s (remain 0m 30s) Loss: 0.8304 


Validation:  58%|█████▊    | 504/866 [00:39<00:24, 14.64valid_batch/s]

EVAL: [500/866] Elapsed 0m 39s (remain 0m 28s) Loss: 0.8284 


Validation:  60%|██████    | 523/866 [00:41<00:28, 12.13valid_batch/s]

EVAL: [520/866] Elapsed 0m 41s (remain 0m 27s) Loss: 0.8216 


Validation:  63%|██████▎   | 543/866 [00:43<00:25, 12.43valid_batch/s]

EVAL: [540/866] Elapsed 0m 42s (remain 0m 25s) Loss: 0.8198 


Validation:  65%|██████▌   | 563/866 [00:44<00:24, 12.28valid_batch/s]

EVAL: [560/866] Elapsed 0m 44s (remain 0m 24s) Loss: 0.8211 


Validation:  67%|██████▋   | 582/866 [00:46<00:23, 11.92valid_batch/s]

EVAL: [580/866] Elapsed 0m 46s (remain 0m 22s) Loss: 0.8195 


Validation:  70%|██████▉   | 602/866 [00:47<00:19, 13.52valid_batch/s]

EVAL: [600/866] Elapsed 0m 47s (remain 0m 21s) Loss: 0.8226 


Validation:  72%|███████▏  | 624/866 [00:49<00:17, 13.84valid_batch/s]

EVAL: [620/866] Elapsed 0m 49s (remain 0m 19s) Loss: 0.8241 


Validation:  74%|███████▍  | 642/866 [00:51<00:18, 12.35valid_batch/s]

EVAL: [640/866] Elapsed 0m 50s (remain 0m 17s) Loss: 0.8263 


Validation:  77%|███████▋  | 664/866 [00:52<00:15, 13.34valid_batch/s]

EVAL: [660/866] Elapsed 0m 52s (remain 0m 16s) Loss: 0.8231 


Validation:  79%|███████▉  | 682/866 [00:54<00:17, 10.49valid_batch/s]

EVAL: [680/866] Elapsed 0m 54s (remain 0m 14s) Loss: 0.8240 


Validation:  81%|████████  | 702/866 [00:56<00:18,  9.05valid_batch/s]

EVAL: [700/866] Elapsed 0m 56s (remain 0m 13s) Loss: 0.8232 


Validation:  83%|████████▎ | 722/866 [00:58<00:14,  9.94valid_batch/s]

EVAL: [720/866] Elapsed 0m 58s (remain 0m 11s) Loss: 0.8222 


Validation:  86%|████████▌ | 743/866 [00:59<00:09, 13.10valid_batch/s]

EVAL: [740/866] Elapsed 0m 59s (remain 0m 10s) Loss: 0.8229 


Validation:  88%|████████▊ | 763/866 [01:01<00:08, 12.43valid_batch/s]

EVAL: [760/866] Elapsed 1m 0s (remain 0m 8s) Loss: 0.8222 


Validation:  90%|█████████ | 782/866 [01:02<00:05, 14.19valid_batch/s]

EVAL: [780/866] Elapsed 1m 2s (remain 0m 6s) Loss: 0.8210 


Validation:  93%|█████████▎| 804/866 [01:04<00:04, 13.88valid_batch/s]

EVAL: [800/866] Elapsed 1m 3s (remain 0m 5s) Loss: 0.8251 


Validation:  95%|█████████▌| 823/866 [01:05<00:03, 13.10valid_batch/s]

EVAL: [820/866] Elapsed 1m 5s (remain 0m 3s) Loss: 0.8251 


Validation:  97%|█████████▋| 843/866 [01:07<00:01, 13.05valid_batch/s]

EVAL: [840/866] Elapsed 1m 7s (remain 0m 1s) Loss: 0.8234 


Validation: 100%|█████████▉| 863/866 [01:08<00:00, 17.47valid_batch/s]

EVAL: [860/866] Elapsed 1m 8s (remain 0m 0s) Loss: 0.8232 


Validation: 100%|██████████| 866/866 [01:08<00:00, 12.56valid_batch/s]
Epoch 1 - avg_train_loss: 0.9334  avg_val_loss: 0.8261  time: 719s
Epoch 1 - Score: 0.7923
Epoch 1 - Save Best Score: 0.7923 Model


EVAL: [865/866] Elapsed 1m 8s (remain 0m 0s) Loss: 0.8261 


Train:   0%|          | 1/1730 [00:00<10:49,  2.66train_batch/s]

Epoch: [2][0/1730] Elapsed 0m 0s (remain 10m 49s) Loss: 1.0894 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:07<12:18,  2.31train_batch/s]

Epoch: [2][20/1730] Elapsed 0m 7s (remain 10m 12s) Loss: 0.9136 Grad: 151572.5312  LR: 0.00001986  


Train:   2%|▏         | 41/1730 [00:14<09:12,  3.06train_batch/s]

Epoch: [2][40/1730] Elapsed 0m 14s (remain 9m 58s) Loss: 0.8732 Grad: 129642.6250  LR: 0.00001952  


Train:   4%|▎         | 61/1730 [00:22<09:10,  3.03train_batch/s]

Epoch: [2][60/1730] Elapsed 0m 22s (remain 10m 8s) Loss: 0.8381 Grad: 120190.5547  LR: 0.00001897  


Train:   5%|▍         | 81/1730 [00:29<10:43,  2.56train_batch/s]

Epoch: [2][80/1730] Elapsed 0m 29s (remain 10m 5s) Loss: 0.8360 Grad: 168301.1094  LR: 0.00001823  


Train:   6%|▌         | 101/1730 [00:37<09:09,  2.97train_batch/s]

Epoch: [2][100/1730] Elapsed 0m 37s (remain 10m 9s) Loss: 0.8439 Grad: 91656.3516  LR: 0.00001732  


Train:   7%|▋         | 121/1730 [00:45<12:19,  2.18train_batch/s]

Epoch: [2][120/1730] Elapsed 0m 45s (remain 10m 5s) Loss: 0.8287 Grad: 165917.9688  LR: 0.00001626  


Train:   8%|▊         | 141/1730 [00:52<09:43,  2.72train_batch/s]

Epoch: [2][140/1730] Elapsed 0m 52s (remain 9m 56s) Loss: 0.8246 Grad: 130840.6875  LR: 0.00001506  


Train:   9%|▉         | 162/1730 [01:00<09:13,  2.84train_batch/s]

Epoch: [2][160/1730] Elapsed 1m 0s (remain 9m 50s) Loss: 0.8190 Grad: 72428.0625  LR: 0.00001376  


Train:  10%|█         | 181/1730 [01:08<11:00,  2.34train_batch/s]

Epoch: [2][180/1730] Elapsed 1m 8s (remain 9m 45s) Loss: 0.8268 Grad: 104435.9609  LR: 0.00001238  


Train:  12%|█▏        | 201/1730 [01:17<10:57,  2.33train_batch/s]

Epoch: [2][200/1730] Elapsed 1m 17s (remain 9m 46s) Loss: 0.8197 Grad: 149745.5469  LR: 0.00001094  


Train:  13%|█▎        | 221/1730 [01:24<08:26,  2.98train_batch/s]

Epoch: [2][220/1730] Elapsed 1m 24s (remain 9m 35s) Loss: 0.8132 Grad: 212146.2656  LR: 0.00000949  


Train:  14%|█▍        | 242/1730 [01:31<09:00,  2.75train_batch/s]

Epoch: [2][240/1730] Elapsed 1m 31s (remain 9m 25s) Loss: 0.8061 Grad: 155519.1875  LR: 0.00000805  


Train:  15%|█▌        | 261/1730 [01:39<10:06,  2.42train_batch/s]

Epoch: [2][260/1730] Elapsed 1m 39s (remain 9m 19s) Loss: 0.8030 Grad: 184794.8906  LR: 0.00000665  


Train:  16%|█▌        | 281/1730 [01:46<09:53,  2.44train_batch/s]

Epoch: [2][280/1730] Elapsed 1m 46s (remain 9m 11s) Loss: 0.7993 Grad: 123060.5625  LR: 0.00000532  


Train:  17%|█▋        | 301/1730 [01:55<08:50,  2.70train_batch/s]

Epoch: [2][300/1730] Elapsed 1m 55s (remain 9m 8s) Loss: 0.8003 Grad: 103577.6016  LR: 0.00000409  


Train:  19%|█▊        | 321/1730 [02:03<09:14,  2.54train_batch/s]

Epoch: [2][320/1730] Elapsed 2m 3s (remain 9m 2s) Loss: 0.7966 Grad: 264968.2188  LR: 0.00000298  


Train:  20%|█▉        | 341/1730 [02:11<08:50,  2.62train_batch/s]

Epoch: [2][340/1730] Elapsed 2m 11s (remain 8m 56s) Loss: 0.7949 Grad: 125818.8672  LR: 0.00000202  


Train:  21%|██        | 361/1730 [02:19<08:44,  2.61train_batch/s]

Epoch: [2][360/1730] Elapsed 2m 19s (remain 8m 48s) Loss: 0.7935 Grad: 99008.3281  LR: 0.00000123  


Train:  22%|██▏       | 381/1730 [02:25<08:59,  2.50train_batch/s]

Epoch: [2][380/1730] Elapsed 2m 25s (remain 8m 36s) Loss: 0.7847 Grad: 181769.2656  LR: 0.00000063  


Train:  23%|██▎       | 401/1730 [02:33<07:29,  2.95train_batch/s]

Epoch: [2][400/1730] Elapsed 2m 33s (remain 8m 29s) Loss: 0.7826 Grad: nan  LR: 0.00000022  


Train:  24%|██▍       | 421/1730 [02:41<10:08,  2.15train_batch/s]

Epoch: [2][420/1730] Elapsed 2m 41s (remain 8m 21s) Loss: 0.7769 Grad: 49547.1289  LR: 0.00000002  


Train:  25%|██▌       | 441/1730 [02:48<08:20,  2.57train_batch/s]

Epoch: [2][440/1730] Elapsed 2m 48s (remain 8m 13s) Loss: 0.7739 Grad: 41930.7617  LR: 0.00000003  


Train:  27%|██▋       | 461/1730 [02:55<07:55,  2.67train_batch/s]

Epoch: [2][460/1730] Elapsed 2m 55s (remain 8m 3s) Loss: 0.7680 Grad: 42368.3906  LR: 0.00000025  


Train:  28%|██▊       | 481/1730 [03:03<09:08,  2.28train_batch/s]

Epoch: [2][480/1730] Elapsed 3m 3s (remain 7m 56s) Loss: 0.7678 Grad: 65729.6953  LR: 0.00000068  


Train:  29%|██▉       | 501/1730 [03:11<07:07,  2.87train_batch/s]

Epoch: [2][500/1730] Elapsed 3m 11s (remain 7m 50s) Loss: 0.7664 Grad: 34028.6016  LR: 0.00000130  


Train:  30%|███       | 521/1730 [03:18<06:40,  3.02train_batch/s]

Epoch: [2][520/1730] Elapsed 3m 18s (remain 7m 40s) Loss: 0.7631 Grad: 53988.9102  LR: 0.00000211  


Train:  31%|███▏      | 541/1730 [03:25<06:24,  3.10train_batch/s]

Epoch: [2][540/1730] Elapsed 3m 25s (remain 7m 31s) Loss: 0.7657 Grad: 35951.3359  LR: 0.00000308  


Train:  32%|███▏      | 562/1730 [03:33<06:22,  3.05train_batch/s]

Epoch: [2][560/1730] Elapsed 3m 33s (remain 7m 25s) Loss: 0.7628 Grad: 53694.6914  LR: 0.00000420  


Train:  34%|███▎      | 581/1730 [03:40<06:18,  3.03train_batch/s]

Epoch: [2][580/1730] Elapsed 3m 40s (remain 7m 16s) Loss: 0.7626 Grad: 63876.7383  LR: 0.00000545  


Train:  35%|███▍      | 601/1730 [03:48<07:59,  2.35train_batch/s]

Epoch: [2][600/1730] Elapsed 3m 48s (remain 7m 9s) Loss: 0.7603 Grad: 85160.4453  LR: 0.00000679  


Train:  36%|███▌      | 621/1730 [03:56<07:22,  2.51train_batch/s]

Epoch: [2][620/1730] Elapsed 3m 56s (remain 7m 2s) Loss: 0.7587 Grad: 56951.4688  LR: 0.00000819  


Train:  37%|███▋      | 641/1730 [04:03<07:45,  2.34train_batch/s]

Epoch: [2][640/1730] Elapsed 4m 3s (remain 6m 53s) Loss: 0.7597 Grad: 67658.8281  LR: 0.00000964  


Train:  38%|███▊      | 661/1730 [04:10<05:32,  3.21train_batch/s]

Epoch: [2][660/1730] Elapsed 4m 10s (remain 6m 45s) Loss: 0.7615 Grad: 71478.8906  LR: 0.00001109  


Train:  39%|███▉      | 681/1730 [04:18<08:04,  2.16train_batch/s]

Epoch: [2][680/1730] Elapsed 4m 18s (remain 6m 38s) Loss: 0.7636 Grad: 83187.4297  LR: 0.00001252  


Train:  41%|████      | 701/1730 [04:26<06:24,  2.68train_batch/s]

Epoch: [2][700/1730] Elapsed 4m 26s (remain 6m 31s) Loss: 0.7674 Grad: 112997.8672  LR: 0.00001389  


Train:  42%|████▏     | 721/1730 [04:34<08:05,  2.08train_batch/s]

Epoch: [2][720/1730] Elapsed 4m 34s (remain 6m 24s) Loss: 0.7681 Grad: 60603.2891  LR: 0.00001519  


Train:  43%|████▎     | 741/1730 [04:41<05:22,  3.07train_batch/s]

Epoch: [2][740/1730] Elapsed 4m 41s (remain 6m 16s) Loss: 0.7685 Grad: 77462.2656  LR: 0.00001637  


Train:  44%|████▍     | 761/1730 [04:49<07:10,  2.25train_batch/s]

Epoch: [2][760/1730] Elapsed 4m 49s (remain 6m 8s) Loss: 0.7695 Grad: 39894.3984  LR: 0.00001742  


Train:  45%|████▌     | 781/1730 [04:56<06:32,  2.42train_batch/s]

Epoch: [2][780/1730] Elapsed 4m 56s (remain 6m 0s) Loss: 0.7729 Grad: 142567.2344  LR: 0.00001831  


Train:  46%|████▋     | 801/1730 [05:04<05:16,  2.94train_batch/s]

Epoch: [2][800/1730] Elapsed 5m 4s (remain 5m 53s) Loss: 0.7754 Grad: 84081.5938  LR: 0.00001903  


Train:  47%|████▋     | 821/1730 [05:11<04:40,  3.24train_batch/s]

Epoch: [2][820/1730] Elapsed 5m 11s (remain 5m 44s) Loss: 0.7746 Grad: 89135.9609  LR: 0.00001956  


Train:  49%|████▊     | 841/1730 [05:18<05:06,  2.90train_batch/s]

Epoch: [2][840/1730] Elapsed 5m 18s (remain 5m 37s) Loss: 0.7750 Grad: 63265.3047  LR: 0.00001988  


Train:  50%|████▉     | 861/1730 [05:25<04:38,  3.12train_batch/s]

Epoch: [2][860/1730] Elapsed 5m 25s (remain 5m 28s) Loss: 0.7786 Grad: 46902.4883  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [05:33<05:18,  2.66train_batch/s]

Epoch: [2][880/1730] Elapsed 5m 33s (remain 5m 21s) Loss: 0.7811 Grad: 49374.9414  LR: 0.00001990  


Train:  52%|█████▏    | 901/1730 [05:40<04:34,  3.02train_batch/s]

Epoch: [2][900/1730] Elapsed 5m 40s (remain 5m 13s) Loss: 0.7859 Grad: 76077.5547  LR: 0.00001960  


Train:  53%|█████▎    | 921/1730 [05:48<04:32,  2.97train_batch/s]

Epoch: [2][920/1730] Elapsed 5m 48s (remain 5m 5s) Loss: 0.7881 Grad: 93227.8984  LR: 0.00001909  


Train:  54%|█████▍    | 941/1730 [05:55<03:59,  3.29train_batch/s]

Epoch: [2][940/1730] Elapsed 5m 55s (remain 4m 57s) Loss: 0.7865 Grad: 51227.8555  LR: 0.00001839  


Train:  56%|█████▌    | 961/1730 [06:02<05:16,  2.43train_batch/s]

Epoch: [2][960/1730] Elapsed 6m 2s (remain 4m 49s) Loss: 0.7843 Grad: 73397.5391  LR: 0.00001752  


Train:  57%|█████▋    | 981/1730 [06:10<05:11,  2.40train_batch/s]

Epoch: [2][980/1730] Elapsed 6m 10s (remain 4m 42s) Loss: 0.7862 Grad: 62585.9219  LR: 0.00001648  


Train:  58%|█████▊    | 1001/1730 [06:18<04:42,  2.58train_batch/s]

Epoch: [2][1000/1730] Elapsed 6m 18s (remain 4m 35s) Loss: 0.7872 Grad: 37681.1641  LR: 0.00001531  


Train:  59%|█████▉    | 1021/1730 [06:25<04:58,  2.38train_batch/s]

Epoch: [2][1020/1730] Elapsed 6m 25s (remain 4m 28s) Loss: 0.7884 Grad: 128791.6719  LR: 0.00001403  


Train:  60%|██████    | 1042/1730 [06:33<03:21,  3.42train_batch/s]

Epoch: [2][1040/1730] Elapsed 6m 33s (remain 4m 20s) Loss: 0.7910 Grad: 99089.6406  LR: 0.00001266  


Train:  61%|██████▏   | 1061/1730 [06:39<04:00,  2.78train_batch/s]

Epoch: [2][1060/1730] Elapsed 6m 39s (remain 4m 11s) Loss: 0.7885 Grad: 65502.8281  LR: 0.00001123  


Train:  62%|██████▏   | 1081/1730 [06:46<04:08,  2.62train_batch/s]

Epoch: [2][1080/1730] Elapsed 6m 46s (remain 4m 4s) Loss: 0.7885 Grad: 49543.5547  LR: 0.00000978  


Train:  64%|██████▎   | 1101/1730 [06:54<04:03,  2.58train_batch/s]

Epoch: [2][1100/1730] Elapsed 6m 54s (remain 3m 56s) Loss: 0.7875 Grad: 63135.3789  LR: 0.00000834  


Train:  65%|██████▍   | 1121/1730 [07:02<04:04,  2.49train_batch/s]

Epoch: [2][1120/1730] Elapsed 7m 2s (remain 3m 49s) Loss: 0.7880 Grad: 49984.1523  LR: 0.00000692  


Train:  66%|██████▌   | 1141/1730 [07:09<03:35,  2.73train_batch/s]

Epoch: [2][1140/1730] Elapsed 7m 9s (remain 3m 41s) Loss: 0.7887 Grad: 30953.4219  LR: 0.00000558  


Train:  67%|██████▋   | 1161/1730 [07:17<03:54,  2.42train_batch/s]

Epoch: [2][1160/1730] Elapsed 7m 17s (remain 3m 34s) Loss: 0.7887 Grad: 47988.0938  LR: 0.00000432  


Train:  68%|██████▊   | 1181/1730 [07:23<03:01,  3.03train_batch/s]

Epoch: [2][1180/1730] Elapsed 7m 23s (remain 3m 26s) Loss: 0.7893 Grad: 107874.6484  LR: 0.00000319  


Train:  69%|██████▉   | 1201/1730 [07:31<03:28,  2.53train_batch/s]

Epoch: [2][1200/1730] Elapsed 7m 31s (remain 3m 18s) Loss: 0.7895 Grad: 65262.6875  LR: 0.00000220  


Train:  71%|███████   | 1221/1730 [07:39<03:19,  2.55train_batch/s]

Epoch: [2][1220/1730] Elapsed 7m 39s (remain 3m 11s) Loss: 0.7897 Grad: 46792.1758  LR: 0.00000138  


Train:  72%|███████▏  | 1241/1730 [07:47<03:29,  2.34train_batch/s]

Epoch: [2][1240/1730] Elapsed 7m 47s (remain 3m 4s) Loss: 0.7886 Grad: 50038.4102  LR: 0.00000073  


Train:  73%|███████▎  | 1261/1730 [07:55<03:16,  2.39train_batch/s]

Epoch: [2][1260/1730] Elapsed 7m 55s (remain 2m 56s) Loss: 0.7885 Grad: 45015.9141  LR: 0.00000029  


Train:  74%|███████▍  | 1281/1730 [08:03<03:18,  2.26train_batch/s]

Epoch: [2][1280/1730] Elapsed 8m 3s (remain 2m 49s) Loss: 0.7868 Grad: 38322.3867  LR: 0.00000004  


Train:  75%|███████▌  | 1301/1730 [08:10<02:26,  2.92train_batch/s]

Epoch: [2][1300/1730] Elapsed 8m 10s (remain 2m 41s) Loss: 0.7853 Grad: 61761.1992  LR: 0.00000001  


Train:  76%|███████▋  | 1322/1730 [08:17<01:51,  3.65train_batch/s]

Epoch: [2][1320/1730] Elapsed 8m 17s (remain 2m 33s) Loss: 0.7840 Grad: 44171.7812  LR: 0.00000019  


Train:  78%|███████▊  | 1341/1730 [08:24<02:14,  2.90train_batch/s]

Epoch: [2][1340/1730] Elapsed 8m 24s (remain 2m 26s) Loss: 0.7834 Grad: 49028.5273  LR: 0.00000058  


Train:  79%|███████▊  | 1361/1730 [08:31<02:03,  2.99train_batch/s]

Epoch: [2][1360/1730] Elapsed 8m 31s (remain 2m 18s) Loss: 0.7817 Grad: 38674.5039  LR: 0.00000116  


Train:  80%|███████▉  | 1381/1730 [08:38<01:58,  2.94train_batch/s]

Epoch: [2][1380/1730] Elapsed 8m 38s (remain 2m 10s) Loss: 0.7785 Grad: 52056.6875  LR: 0.00000194  


Train:  81%|████████  | 1401/1730 [08:45<01:56,  2.82train_batch/s]

Epoch: [2][1400/1730] Elapsed 8m 45s (remain 2m 3s) Loss: 0.7770 Grad: 90586.5703  LR: 0.00000288  


Train:  82%|████████▏ | 1421/1730 [08:52<01:26,  3.57train_batch/s]

Epoch: [2][1420/1730] Elapsed 8m 52s (remain 1m 55s) Loss: 0.7788 Grad: 75100.7031  LR: 0.00000397  


Train:  83%|████████▎ | 1441/1730 [08:58<01:38,  2.93train_batch/s]

Epoch: [2][1440/1730] Elapsed 8m 58s (remain 1m 48s) Loss: 0.7799 Grad: 180483.0156  LR: 0.00000519  


Train:  84%|████████▍ | 1461/1730 [09:05<01:42,  2.62train_batch/s]

Epoch: [2][1460/1730] Elapsed 9m 5s (remain 1m 40s) Loss: 0.7809 Grad: 49074.9883  LR: 0.00000651  


Train:  86%|████████▌ | 1481/1730 [09:13<01:28,  2.80train_batch/s]

Epoch: [2][1480/1730] Elapsed 9m 13s (remain 1m 33s) Loss: 0.7821 Grad: 94747.1641  LR: 0.00000791  


Train:  87%|████████▋ | 1501/1730 [09:21<01:18,  2.90train_batch/s]

Epoch: [2][1500/1730] Elapsed 9m 21s (remain 1m 25s) Loss: 0.7819 Grad: 55024.2617  LR: 0.00000935  


Train:  88%|████████▊ | 1521/1730 [09:28<01:26,  2.41train_batch/s]

Epoch: [2][1520/1730] Elapsed 9m 27s (remain 1m 18s) Loss: 0.7811 Grad: 57378.1016  LR: 0.00001080  


Train:  89%|████████▉ | 1541/1730 [09:35<01:18,  2.41train_batch/s]

Epoch: [2][1540/1730] Elapsed 9m 35s (remain 1m 10s) Loss: 0.7811 Grad: 46409.6914  LR: 0.00001224  


Train:  90%|█████████ | 1561/1730 [09:43<01:04,  2.61train_batch/s]

Epoch: [2][1560/1730] Elapsed 9m 43s (remain 1m 3s) Loss: 0.7814 Grad: 49438.1016  LR: 0.00001362  


Train:  91%|█████████▏| 1581/1730 [09:50<01:02,  2.38train_batch/s]

Epoch: [2][1580/1730] Elapsed 9m 50s (remain 0m 55s) Loss: 0.7805 Grad: 65916.8750  LR: 0.00001494  


Train:  93%|█████████▎| 1601/1730 [09:57<00:45,  2.82train_batch/s]

Epoch: [2][1600/1730] Elapsed 9m 57s (remain 0m 48s) Loss: 0.7806 Grad: 68716.1562  LR: 0.00001615  


Train:  94%|█████████▎| 1621/1730 [10:05<00:40,  2.69train_batch/s]

Epoch: [2][1620/1730] Elapsed 10m 4s (remain 0m 40s) Loss: 0.7810 Grad: 101381.0391  LR: 0.00001722  


Train:  95%|█████████▍| 1641/1730 [10:12<00:32,  2.72train_batch/s]

Epoch: [2][1640/1730] Elapsed 10m 12s (remain 0m 33s) Loss: 0.7806 Grad: 76277.4219  LR: 0.00001815  


Train:  96%|█████████▌| 1661/1730 [10:19<00:20,  3.40train_batch/s]

Epoch: [2][1660/1730] Elapsed 10m 19s (remain 0m 25s) Loss: 0.7810 Grad: 102858.9844  LR: 0.00001890  


Train:  97%|█████████▋| 1681/1730 [10:27<00:19,  2.53train_batch/s]

Epoch: [2][1680/1730] Elapsed 10m 27s (remain 0m 18s) Loss: 0.7818 Grad: 87957.9844  LR: 0.00001947  


Train:  98%|█████████▊| 1701/1730 [10:35<00:12,  2.41train_batch/s]

Epoch: [2][1700/1730] Elapsed 10m 35s (remain 0m 10s) Loss: 0.7829 Grad: 105279.4531  LR: 0.00001984  


Train:  99%|█████████▉| 1721/1730 [10:42<00:03,  2.34train_batch/s]

Epoch: [2][1720/1730] Elapsed 10m 42s (remain 0m 3s) Loss: 0.7832 Grad: 46555.7266  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [10:46<00:00,  2.68train_batch/s]


Epoch: [2][1729/1730] Elapsed 10m 46s (remain 0m 0s) Loss: 0.7832 Grad: 35124.9922  LR: 0.00002000  


Validation:   0%|          | 4/866 [00:00<01:23, 10.27valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 3m 18s) Loss: 0.9146 


Validation:   3%|▎         | 22/866 [00:01<01:08, 12.33valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 11s) Loss: 0.8339 


Validation:   5%|▍         | 42/866 [00:03<01:07, 12.18valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 9s) Loss: 0.9625 


Validation:   7%|▋         | 62/866 [00:05<00:59, 13.48valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 1m 5s) Loss: 0.9258 


Validation:   9%|▉         | 82/866 [00:06<01:06, 11.82valid_batch/s]

EVAL: [80/866] Elapsed 0m 6s (remain 1m 3s) Loss: 0.8704 


Validation:  12%|█▏        | 102/866 [00:07<00:50, 15.19valid_batch/s]

EVAL: [100/866] Elapsed 0m 7s (remain 0m 59s) Loss: 0.8620 


Validation:  14%|█▍        | 123/866 [00:09<00:54, 13.71valid_batch/s]

EVAL: [120/866] Elapsed 0m 9s (remain 0m 58s) Loss: 0.8841 


Validation:  17%|█▋        | 144/866 [00:11<00:50, 14.43valid_batch/s]

EVAL: [140/866] Elapsed 0m 11s (remain 0m 56s) Loss: 0.8598 


Validation:  19%|█▉        | 163/866 [00:12<00:53, 13.16valid_batch/s]

EVAL: [160/866] Elapsed 0m 12s (remain 0m 54s) Loss: 0.8621 


Validation:  21%|██        | 182/866 [00:13<00:50, 13.66valid_batch/s]

EVAL: [180/866] Elapsed 0m 13s (remain 0m 52s) Loss: 0.8612 


Validation:  23%|██▎       | 203/866 [00:15<00:49, 13.26valid_batch/s]

EVAL: [200/866] Elapsed 0m 15s (remain 0m 50s) Loss: 0.8392 


Validation:  26%|██▌       | 223/866 [00:17<00:53, 11.95valid_batch/s]

EVAL: [220/866] Elapsed 0m 17s (remain 0m 49s) Loss: 0.8512 


Validation:  28%|██▊       | 241/866 [00:18<00:54, 11.41valid_batch/s]

EVAL: [240/866] Elapsed 0m 18s (remain 0m 48s) Loss: 0.8531 


Validation:  30%|███       | 262/866 [00:20<00:45, 13.29valid_batch/s]

EVAL: [260/866] Elapsed 0m 20s (remain 0m 47s) Loss: 0.8652 


Validation:  33%|███▎      | 284/866 [00:22<00:39, 14.89valid_batch/s]

EVAL: [280/866] Elapsed 0m 21s (remain 0m 45s) Loss: 0.8686 


Validation:  35%|███▍      | 302/866 [00:23<00:37, 15.14valid_batch/s]

EVAL: [300/866] Elapsed 0m 23s (remain 0m 43s) Loss: 0.8651 


Validation:  37%|███▋      | 324/866 [00:25<00:41, 13.06valid_batch/s]

EVAL: [320/866] Elapsed 0m 24s (remain 0m 42s) Loss: 0.8592 


Validation:  39%|███▉      | 342/866 [00:26<00:42, 12.41valid_batch/s]

EVAL: [340/866] Elapsed 0m 26s (remain 0m 40s) Loss: 0.8543 


Validation:  42%|████▏     | 364/866 [00:28<00:34, 14.73valid_batch/s]

EVAL: [360/866] Elapsed 0m 27s (remain 0m 38s) Loss: 0.8540 


Validation:  44%|████▍     | 382/866 [00:29<00:38, 12.59valid_batch/s]

EVAL: [380/866] Elapsed 0m 29s (remain 0m 37s) Loss: 0.8476 


Validation:  46%|████▋     | 402/866 [00:30<00:33, 13.86valid_batch/s]

EVAL: [400/866] Elapsed 0m 30s (remain 0m 35s) Loss: 0.8442 


Validation:  49%|████▉     | 423/866 [00:32<00:36, 12.27valid_batch/s]

EVAL: [420/866] Elapsed 0m 32s (remain 0m 34s) Loss: 0.8494 


Validation:  51%|█████     | 442/866 [00:34<00:39, 10.71valid_batch/s]

EVAL: [440/866] Elapsed 0m 34s (remain 0m 32s) Loss: 0.8448 


Validation:  53%|█████▎    | 463/866 [00:35<00:31, 12.72valid_batch/s]

EVAL: [460/866] Elapsed 0m 35s (remain 0m 31s) Loss: 0.8415 


Validation:  56%|█████▌    | 482/866 [00:37<00:31, 12.27valid_batch/s]

EVAL: [480/866] Elapsed 0m 37s (remain 0m 29s) Loss: 0.8376 


Validation:  58%|█████▊    | 504/866 [00:38<00:23, 15.12valid_batch/s]

EVAL: [500/866] Elapsed 0m 38s (remain 0m 28s) Loss: 0.8310 


Validation:  60%|██████    | 523/866 [00:40<00:27, 12.37valid_batch/s]

EVAL: [520/866] Elapsed 0m 40s (remain 0m 26s) Loss: 0.8248 


Validation:  63%|██████▎   | 543/866 [00:42<00:25, 12.45valid_batch/s]

EVAL: [540/866] Elapsed 0m 41s (remain 0m 25s) Loss: 0.8234 


Validation:  65%|██████▌   | 563/866 [00:43<00:24, 12.57valid_batch/s]

EVAL: [560/866] Elapsed 0m 43s (remain 0m 23s) Loss: 0.8236 


Validation:  67%|██████▋   | 582/866 [00:45<00:23, 12.21valid_batch/s]

EVAL: [580/866] Elapsed 0m 45s (remain 0m 22s) Loss: 0.8238 


Validation:  70%|██████▉   | 602/866 [00:46<00:18, 13.93valid_batch/s]

EVAL: [600/866] Elapsed 0m 46s (remain 0m 20s) Loss: 0.8264 


Validation:  72%|███████▏  | 623/866 [00:48<00:17, 14.07valid_batch/s]

EVAL: [620/866] Elapsed 0m 48s (remain 0m 18s) Loss: 0.8299 


Validation:  74%|███████▍  | 643/866 [00:49<00:17, 12.47valid_batch/s]

EVAL: [640/866] Elapsed 0m 49s (remain 0m 17s) Loss: 0.8314 


Validation:  77%|███████▋  | 663/866 [00:51<00:15, 13.06valid_batch/s]

EVAL: [660/866] Elapsed 0m 51s (remain 0m 15s) Loss: 0.8271 


Validation:  79%|███████▉  | 682/866 [00:53<00:16, 10.91valid_batch/s]

EVAL: [680/866] Elapsed 0m 52s (remain 0m 14s) Loss: 0.8299 


Validation:  81%|████████  | 702/866 [00:54<00:17,  9.33valid_batch/s]

EVAL: [700/866] Elapsed 0m 54s (remain 0m 12s) Loss: 0.8293 


Validation:  83%|████████▎ | 722/866 [00:56<00:14,  9.92valid_batch/s]

EVAL: [720/866] Elapsed 0m 56s (remain 0m 11s) Loss: 0.8276 


Validation:  86%|████████▌ | 743/866 [00:58<00:09, 12.84valid_batch/s]

EVAL: [740/866] Elapsed 0m 58s (remain 0m 9s) Loss: 0.8304 


Validation:  88%|████████▊ | 763/866 [00:59<00:08, 12.13valid_batch/s]

EVAL: [760/866] Elapsed 0m 59s (remain 0m 8s) Loss: 0.8294 


Validation:  90%|█████████ | 782/866 [01:01<00:06, 13.89valid_batch/s]

EVAL: [780/866] Elapsed 1m 0s (remain 0m 6s) Loss: 0.8283 


Validation:  93%|█████████▎| 804/866 [01:02<00:04, 14.15valid_batch/s]

EVAL: [800/866] Elapsed 1m 2s (remain 0m 5s) Loss: 0.8336 


Validation:  95%|█████████▌| 823/866 [01:04<00:03, 13.51valid_batch/s]

EVAL: [820/866] Elapsed 1m 4s (remain 0m 3s) Loss: 0.8343 


Validation:  97%|█████████▋| 843/866 [01:05<00:01, 13.44valid_batch/s]

EVAL: [840/866] Elapsed 1m 5s (remain 0m 1s) Loss: 0.8335 


Validation: 100%|█████████▉| 863/866 [01:07<00:00, 18.02valid_batch/s]

EVAL: [860/866] Elapsed 1m 6s (remain 0m 0s) Loss: 0.8299 


Validation: 100%|██████████| 866/866 [01:07<00:00, 12.84valid_batch/s]
Epoch 2 - avg_train_loss: 0.7832  avg_val_loss: 0.8356  time: 714s
Epoch 2 - Score: 0.8003
Epoch 2 - Save Best Score: 0.8003 Model


EVAL: [865/866] Elapsed 1m 7s (remain 0m 0s) Loss: 0.8356 


Score: 0.8003
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/1730 [00:00<17:10,  1.68train_batch/s]

Epoch: [1][0/1730] Elapsed 0m 0s (remain 17m 11s) Loss: 1.5604 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:07<10:32,  2.70train_batch/s]

Epoch: [1][20/1730] Elapsed 0m 7s (remain 9m 45s) Loss: 1.5948 Grad: 42557.0000  LR: 0.00001988  


Train:   2%|▏         | 41/1730 [00:14<13:21,  2.11train_batch/s]

Epoch: [1][40/1730] Elapsed 0m 14s (remain 10m 3s) Loss: 1.5375 Grad: 106167.0234  LR: 0.00001956  


Train:   4%|▎         | 61/1730 [00:22<11:33,  2.41train_batch/s]

Epoch: [1][60/1730] Elapsed 0m 22s (remain 10m 17s) Loss: 1.4778 Grad: 83279.3906  LR: 0.00001903  


Train:   5%|▍         | 81/1730 [00:29<09:35,  2.87train_batch/s]

Epoch: [1][80/1730] Elapsed 0m 29s (remain 10m 3s) Loss: 1.4304 Grad: 103569.4531  LR: 0.00001831  


Train:   6%|▌         | 101/1730 [00:35<07:27,  3.64train_batch/s]

Epoch: [1][100/1730] Elapsed 0m 35s (remain 9m 28s) Loss: 1.3628 Grad: 125335.0859  LR: 0.00001742  


Train:   7%|▋         | 121/1730 [00:42<10:59,  2.44train_batch/s]

Epoch: [1][120/1730] Elapsed 0m 42s (remain 9m 22s) Loss: 1.3179 Grad: 75598.7500  LR: 0.00001637  


Train:   8%|▊         | 141/1730 [00:49<08:09,  3.25train_batch/s]

Epoch: [1][140/1730] Elapsed 0m 49s (remain 9m 15s) Loss: 1.2883 Grad: 120830.9844  LR: 0.00001519  


Train:   9%|▉         | 162/1730 [00:57<09:15,  2.82train_batch/s]

Epoch: [1][160/1730] Elapsed 0m 57s (remain 9m 18s) Loss: 1.2499 Grad: 222989.7969  LR: 0.00001389  


Train:  10%|█         | 181/1730 [01:03<10:56,  2.36train_batch/s]

Epoch: [1][180/1730] Elapsed 1m 3s (remain 9m 4s) Loss: 1.2241 Grad: 279531.0938  LR: 0.00001252  


Train:  12%|█▏        | 201/1730 [01:09<06:36,  3.86train_batch/s]

Epoch: [1][200/1730] Elapsed 1m 9s (remain 8m 45s) Loss: 1.2144 Grad: 88391.7188  LR: 0.00001109  


Train:  13%|█▎        | 221/1730 [01:16<09:10,  2.74train_batch/s]

Epoch: [1][220/1730] Elapsed 1m 16s (remain 8m 40s) Loss: 1.1943 Grad: 318903.6875  LR: 0.00000964  


Train:  14%|█▍        | 241/1730 [01:23<09:57,  2.49train_batch/s]

Epoch: [1][240/1730] Elapsed 1m 23s (remain 8m 34s) Loss: 1.1772 Grad: 169503.2812  LR: 0.00000819  


Train:  15%|█▌        | 261/1730 [01:30<07:13,  3.39train_batch/s]

Epoch: [1][260/1730] Elapsed 1m 30s (remain 8m 28s) Loss: 1.1731 Grad: 68684.3438  LR: 0.00000679  


Train:  16%|█▌        | 281/1730 [01:36<09:15,  2.61train_batch/s]

Epoch: [1][280/1730] Elapsed 1m 36s (remain 8m 19s) Loss: 1.1566 Grad: 135760.2344  LR: 0.00000545  


Train:  17%|█▋        | 301/1730 [01:44<10:00,  2.38train_batch/s]

Epoch: [1][300/1730] Elapsed 1m 44s (remain 8m 17s) Loss: 1.1409 Grad: 214688.0781  LR: 0.00000420  


Train:  19%|█▊        | 321/1730 [01:52<08:54,  2.63train_batch/s]

Epoch: [1][320/1730] Elapsed 1m 52s (remain 8m 13s) Loss: 1.1242 Grad: 208088.4844  LR: 0.00000308  


Train:  20%|█▉        | 341/1730 [02:00<09:35,  2.41train_batch/s]

Epoch: [1][340/1730] Elapsed 2m 0s (remain 8m 9s) Loss: 1.1110 Grad: 100827.9844  LR: 0.00000211  


Train:  21%|██        | 361/1730 [02:07<08:58,  2.54train_batch/s]

Epoch: [1][360/1730] Elapsed 2m 7s (remain 8m 4s) Loss: 1.1016 Grad: 90957.1641  LR: 0.00000130  


Train:  22%|██▏       | 381/1730 [02:14<07:41,  2.92train_batch/s]

Epoch: [1][380/1730] Elapsed 2m 14s (remain 7m 57s) Loss: 1.0883 Grad: 95602.6406  LR: 0.00000068  


Train:  23%|██▎       | 401/1730 [02:21<08:37,  2.57train_batch/s]

Epoch: [1][400/1730] Elapsed 2m 21s (remain 7m 50s) Loss: 1.0778 Grad: 169561.3594  LR: 0.00000025  


Train:  24%|██▍       | 421/1730 [02:29<08:04,  2.70train_batch/s]

Epoch: [1][420/1730] Elapsed 2m 29s (remain 7m 43s) Loss: 1.0692 Grad: 178321.2812  LR: 0.00000003  


Train:  25%|██▌       | 441/1730 [02:36<08:52,  2.42train_batch/s]

Epoch: [1][440/1730] Elapsed 2m 36s (remain 7m 37s) Loss: 1.0632 Grad: 245392.6406  LR: 0.00000002  


Train:  27%|██▋       | 461/1730 [02:43<08:22,  2.53train_batch/s]

Epoch: [1][460/1730] Elapsed 2m 43s (remain 7m 30s) Loss: 1.0564 Grad: 74688.8438  LR: 0.00000022  


Train:  28%|██▊       | 481/1730 [02:50<08:48,  2.36train_batch/s]

Epoch: [1][480/1730] Elapsed 2m 50s (remain 7m 23s) Loss: 1.0479 Grad: 129517.0859  LR: 0.00000063  


Train:  29%|██▉       | 501/1730 [02:57<07:33,  2.71train_batch/s]

Epoch: [1][500/1730] Elapsed 2m 57s (remain 7m 16s) Loss: 1.0453 Grad: 223443.7344  LR: 0.00000123  


Train:  30%|███       | 521/1730 [03:04<07:31,  2.68train_batch/s]

Epoch: [1][520/1730] Elapsed 3m 4s (remain 7m 8s) Loss: 1.0404 Grad: 326977.6875  LR: 0.00000202  


Train:  31%|███▏      | 541/1730 [03:11<07:05,  2.80train_batch/s]

Epoch: [1][540/1730] Elapsed 3m 11s (remain 7m 0s) Loss: 1.0413 Grad: 98968.0703  LR: 0.00000298  


Train:  32%|███▏      | 561/1730 [03:17<07:12,  2.70train_batch/s]

Epoch: [1][560/1730] Elapsed 3m 17s (remain 6m 52s) Loss: 1.0356 Grad: 320356.5000  LR: 0.00000409  


Train:  34%|███▎      | 581/1730 [03:24<06:34,  2.91train_batch/s]

Epoch: [1][580/1730] Elapsed 3m 24s (remain 6m 44s) Loss: 1.0334 Grad: 66323.3047  LR: 0.00000532  


Train:  35%|███▍      | 601/1730 [03:32<08:32,  2.20train_batch/s]

Epoch: [1][600/1730] Elapsed 3m 32s (remain 6m 38s) Loss: 1.0325 Grad: 97274.9453  LR: 0.00000665  


Train:  36%|███▌      | 621/1730 [03:38<05:33,  3.32train_batch/s]

Epoch: [1][620/1730] Elapsed 3m 38s (remain 6m 30s) Loss: 1.0287 Grad: 134468.0469  LR: 0.00000805  


Train:  37%|███▋      | 641/1730 [03:45<07:00,  2.59train_batch/s]

Epoch: [1][640/1730] Elapsed 3m 45s (remain 6m 23s) Loss: 1.0283 Grad: 70173.5938  LR: 0.00000949  


Train:  38%|███▊      | 661/1730 [03:52<05:10,  3.44train_batch/s]

Epoch: [1][660/1730] Elapsed 3m 52s (remain 6m 15s) Loss: 1.0296 Grad: 122888.8281  LR: 0.00001094  


Train:  39%|███▉      | 681/1730 [03:58<06:04,  2.88train_batch/s]

Epoch: [1][680/1730] Elapsed 3m 58s (remain 6m 7s) Loss: 1.0284 Grad: 163211.7344  LR: 0.00001238  


Train:  41%|████      | 701/1730 [04:05<06:14,  2.75train_batch/s]

Epoch: [1][700/1730] Elapsed 4m 5s (remain 6m 0s) Loss: 1.0263 Grad: 100898.6719  LR: 0.00001376  


Train:  42%|████▏     | 721/1730 [04:11<05:00,  3.36train_batch/s]

Epoch: [1][720/1730] Elapsed 4m 11s (remain 5m 51s) Loss: 1.0260 Grad: 123673.2734  LR: 0.00001506  


Train:  43%|████▎     | 741/1730 [04:18<07:43,  2.13train_batch/s]

Epoch: [1][740/1730] Elapsed 4m 18s (remain 5m 45s) Loss: 1.0241 Grad: 186092.7656  LR: 0.00001626  


Train:  44%|████▍     | 761/1730 [04:25<05:16,  3.06train_batch/s]

Epoch: [1][760/1730] Elapsed 4m 25s (remain 5m 37s) Loss: 1.0248 Grad: 270241.6875  LR: 0.00001732  


Train:  45%|████▌     | 781/1730 [04:31<05:09,  3.06train_batch/s]

Epoch: [1][780/1730] Elapsed 4m 31s (remain 5m 29s) Loss: 1.0236 Grad: 158640.0312  LR: 0.00001823  


Train:  46%|████▋     | 801/1730 [04:38<05:33,  2.79train_batch/s]

Epoch: [1][800/1730] Elapsed 4m 38s (remain 5m 22s) Loss: 1.0233 Grad: 178241.8438  LR: 0.00001897  


Train:  47%|████▋     | 821/1730 [04:45<05:29,  2.76train_batch/s]

Epoch: [1][820/1730] Elapsed 4m 45s (remain 5m 15s) Loss: 1.0196 Grad: 126603.6953  LR: 0.00001952  


Train:  49%|████▊     | 841/1730 [04:52<05:17,  2.80train_batch/s]

Epoch: [1][840/1730] Elapsed 4m 52s (remain 5m 8s) Loss: 1.0198 Grad: 174867.6719  LR: 0.00001986  


Train:  50%|████▉     | 861/1730 [04:59<04:39,  3.10train_batch/s]

Epoch: [1][860/1730] Elapsed 4m 59s (remain 5m 2s) Loss: 1.0196 Grad: 100922.4062  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [05:06<04:35,  3.09train_batch/s]

Epoch: [1][880/1730] Elapsed 5m 6s (remain 4m 54s) Loss: 1.0197 Grad: 64314.8906  LR: 0.00001992  


Train:  52%|█████▏    | 901/1730 [05:12<03:25,  4.03train_batch/s]

Epoch: [1][900/1730] Elapsed 5m 12s (remain 4m 47s) Loss: 1.0209 Grad: 56454.5391  LR: 0.00001964  


Train:  53%|█████▎    | 921/1730 [05:19<04:23,  3.07train_batch/s]

Epoch: [1][920/1730] Elapsed 5m 19s (remain 4m 40s) Loss: 1.0237 Grad: 118135.0000  LR: 0.00001915  


Train:  54%|█████▍    | 942/1730 [05:26<03:39,  3.59train_batch/s]

Epoch: [1][940/1730] Elapsed 5m 26s (remain 4m 33s) Loss: 1.0241 Grad: 34188.7891  LR: 0.00001847  


Train:  56%|█████▌    | 961/1730 [05:33<04:58,  2.58train_batch/s]

Epoch: [1][960/1730] Elapsed 5m 33s (remain 4m 26s) Loss: 1.0255 Grad: 284004.2188  LR: 0.00001761  


Train:  57%|█████▋    | 981/1730 [05:40<03:46,  3.31train_batch/s]

Epoch: [1][980/1730] Elapsed 5m 40s (remain 4m 19s) Loss: 1.0236 Grad: 58325.6133  LR: 0.00001659  


Train:  58%|█████▊    | 1001/1730 [05:47<03:59,  3.05train_batch/s]

Epoch: [1][1000/1730] Elapsed 5m 47s (remain 4m 13s) Loss: 1.0249 Grad: 357691.6250  LR: 0.00001543  


Train:  59%|█████▉    | 1021/1730 [05:53<03:30,  3.37train_batch/s]

Epoch: [1][1020/1730] Elapsed 5m 53s (remain 4m 5s) Loss: 1.0229 Grad: 193732.5000  LR: 0.00001416  


Train:  60%|██████    | 1041/1730 [06:00<03:51,  2.98train_batch/s]

Epoch: [1][1040/1730] Elapsed 6m 0s (remain 3m 58s) Loss: 1.0237 Grad: 49443.3438  LR: 0.00001280  


Train:  61%|██████▏   | 1061/1730 [06:08<04:29,  2.48train_batch/s]

Epoch: [1][1060/1730] Elapsed 6m 8s (remain 3m 52s) Loss: 1.0218 Grad: 94559.1875  LR: 0.00001138  


Train:  62%|██████▏   | 1081/1730 [06:15<05:14,  2.06train_batch/s]

Epoch: [1][1080/1730] Elapsed 6m 15s (remain 3m 45s) Loss: 1.0205 Grad: 60279.9727  LR: 0.00000993  


Train:  64%|██████▎   | 1101/1730 [06:22<04:11,  2.50train_batch/s]

Epoch: [1][1100/1730] Elapsed 6m 22s (remain 3m 38s) Loss: 1.0166 Grad: 98354.3594  LR: 0.00000848  


Train:  65%|██████▍   | 1121/1730 [06:29<03:54,  2.60train_batch/s]

Epoch: [1][1120/1730] Elapsed 6m 29s (remain 3m 31s) Loss: 1.0136 Grad: 63689.2148  LR: 0.00000706  


Train:  66%|██████▌   | 1141/1730 [06:36<03:26,  2.85train_batch/s]

Epoch: [1][1140/1730] Elapsed 6m 36s (remain 3m 24s) Loss: 1.0113 Grad: 53230.7031  LR: 0.00000571  


Train:  67%|██████▋   | 1161/1730 [06:43<03:56,  2.40train_batch/s]

Epoch: [1][1160/1730] Elapsed 6m 43s (remain 3m 17s) Loss: 1.0090 Grad: 50056.4062  LR: 0.00000444  


Train:  68%|██████▊   | 1181/1730 [06:50<03:22,  2.71train_batch/s]

Epoch: [1][1180/1730] Elapsed 6m 50s (remain 3m 10s) Loss: 1.0083 Grad: 275365.2500  LR: 0.00000330  


Train:  69%|██████▉   | 1201/1730 [06:57<02:55,  3.01train_batch/s]

Epoch: [1][1200/1730] Elapsed 6m 57s (remain 3m 3s) Loss: 1.0059 Grad: 92722.3516  LR: 0.00000229  


Train:  71%|███████   | 1221/1730 [07:04<03:12,  2.65train_batch/s]

Epoch: [1][1220/1730] Elapsed 7m 4s (remain 2m 56s) Loss: 1.0020 Grad: 45671.7227  LR: 0.00000145  


Train:  72%|███████▏  | 1241/1730 [07:11<03:18,  2.47train_batch/s]

Epoch: [1][1240/1730] Elapsed 7m 11s (remain 2m 50s) Loss: 1.0000 Grad: 195326.6094  LR: 0.00000079  


Train:  73%|███████▎  | 1261/1730 [07:18<02:51,  2.74train_batch/s]

Epoch: [1][1260/1730] Elapsed 7m 18s (remain 2m 42s) Loss: 0.9966 Grad: 38151.9922  LR: 0.00000032  


Train:  74%|███████▍  | 1281/1730 [07:25<02:40,  2.79train_batch/s]

Epoch: [1][1280/1730] Elapsed 7m 25s (remain 2m 36s) Loss: 0.9934 Grad: 48224.8867  LR: 0.00000006  


Train:  75%|███████▌  | 1301/1730 [07:33<02:49,  2.53train_batch/s]

Epoch: [1][1300/1730] Elapsed 7m 32s (remain 2m 29s) Loss: 0.9914 Grad: 150692.4062  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [07:39<02:30,  2.71train_batch/s]

Epoch: [1][1320/1730] Elapsed 7m 39s (remain 2m 22s) Loss: 0.9888 Grad: 42466.7812  LR: 0.00000016  


Train:  78%|███████▊  | 1341/1730 [07:45<01:57,  3.32train_batch/s]

Epoch: [1][1340/1730] Elapsed 7m 45s (remain 2m 15s) Loss: 0.9857 Grad: 20897.8398  LR: 0.00000053  


Train:  79%|███████▊  | 1362/1730 [07:52<02:06,  2.91train_batch/s]

Epoch: [1][1360/1730] Elapsed 7m 52s (remain 2m 8s) Loss: 0.9830 Grad: 39194.0352  LR: 0.00000110  


Train:  80%|███████▉  | 1381/1730 [07:57<01:30,  3.84train_batch/s]

Epoch: [1][1380/1730] Elapsed 7m 57s (remain 2m 0s) Loss: 0.9808 Grad: 49049.4062  LR: 0.00000185  


Train:  81%|████████  | 1401/1730 [08:04<01:35,  3.44train_batch/s]

Epoch: [1][1400/1730] Elapsed 8m 4s (remain 1m 53s) Loss: 0.9782 Grad: 96089.8828  LR: 0.00000278  


Train:  82%|████████▏ | 1421/1730 [08:10<01:35,  3.25train_batch/s]

Epoch: [1][1420/1730] Elapsed 8m 10s (remain 1m 46s) Loss: 0.9752 Grad: 43701.2891  LR: 0.00000385  


Train:  83%|████████▎ | 1441/1730 [08:17<01:52,  2.57train_batch/s]

Epoch: [1][1440/1730] Elapsed 8m 17s (remain 1m 39s) Loss: 0.9740 Grad: 52519.7695  LR: 0.00000506  


Train:  84%|████████▍ | 1461/1730 [08:24<01:33,  2.87train_batch/s]

Epoch: [1][1460/1730] Elapsed 8m 24s (remain 1m 32s) Loss: 0.9729 Grad: 41816.1641  LR: 0.00000638  


Train:  86%|████████▌ | 1481/1730 [08:31<01:10,  3.52train_batch/s]

Epoch: [1][1480/1730] Elapsed 8m 31s (remain 1m 25s) Loss: 0.9717 Grad: 58122.6133  LR: 0.00000776  


Train:  87%|████████▋ | 1501/1730 [08:37<01:16,  3.01train_batch/s]

Epoch: [1][1500/1730] Elapsed 8m 37s (remain 1m 18s) Loss: 0.9707 Grad: 26072.1738  LR: 0.00000920  


Train:  88%|████████▊ | 1521/1730 [08:44<01:02,  3.34train_batch/s]

Epoch: [1][1520/1730] Elapsed 8m 44s (remain 1m 12s) Loss: 0.9700 Grad: 79877.9297  LR: 0.00001065  


Train:  89%|████████▉ | 1541/1730 [08:51<01:14,  2.54train_batch/s]

Epoch: [1][1540/1730] Elapsed 8m 51s (remain 1m 5s) Loss: 0.9692 Grad: 69582.7500  LR: 0.00001209  


Train:  90%|█████████ | 1561/1730 [08:58<00:59,  2.84train_batch/s]

Epoch: [1][1560/1730] Elapsed 8m 58s (remain 0m 58s) Loss: 0.9680 Grad: 109781.1562  LR: 0.00001349  


Train:  91%|█████████▏| 1581/1730 [09:05<01:06,  2.25train_batch/s]

Epoch: [1][1580/1730] Elapsed 9m 5s (remain 0m 51s) Loss: 0.9671 Grad: 123250.0547  LR: 0.00001481  


Train:  93%|█████████▎| 1601/1730 [09:11<00:49,  2.60train_batch/s]

Epoch: [1][1600/1730] Elapsed 9m 11s (remain 0m 44s) Loss: 0.9664 Grad: 40419.0391  LR: 0.00001603  


Train:  94%|█████████▎| 1621/1730 [09:18<00:39,  2.74train_batch/s]

Epoch: [1][1620/1730] Elapsed 9m 18s (remain 0m 37s) Loss: 0.9665 Grad: 114169.8672  LR: 0.00001712  


Train:  95%|█████████▍| 1642/1730 [09:24<00:21,  4.01train_batch/s]

Epoch: [1][1640/1730] Elapsed 9m 24s (remain 0m 30s) Loss: 0.9655 Grad: 49845.8750  LR: 0.00001806  


Train:  96%|█████████▌| 1661/1730 [09:31<00:27,  2.46train_batch/s]

Epoch: [1][1660/1730] Elapsed 9m 31s (remain 0m 23s) Loss: 0.9663 Grad: 135089.9219  LR: 0.00001884  


Train:  97%|█████████▋| 1681/1730 [09:39<00:17,  2.78train_batch/s]

Epoch: [1][1680/1730] Elapsed 9m 39s (remain 0m 16s) Loss: 0.9667 Grad: 54660.6328  LR: 0.00001942  


Train:  98%|█████████▊| 1701/1730 [09:45<00:07,  3.89train_batch/s]

Epoch: [1][1700/1730] Elapsed 9m 45s (remain 0m 9s) Loss: 0.9654 Grad: 59974.0742  LR: 0.00001981  


Train:  99%|█████████▉| 1721/1730 [09:52<00:03,  2.39train_batch/s]

Epoch: [1][1720/1730] Elapsed 9m 52s (remain 0m 3s) Loss: 0.9657 Grad: 101725.3359  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:55<00:00,  2.90train_batch/s]


Epoch: [1][1729/1730] Elapsed 9m 55s (remain 0m 0s) Loss: 0.9649 Grad: 61473.6719  LR: 0.00002000  


Validation:   0%|          | 1/866 [00:00<04:28,  3.22valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 4m 29s) Loss: 2.5650 


Validation:   3%|▎         | 23/866 [00:01<00:58, 14.33valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 11s) Loss: 0.9781 


Validation:   5%|▍         | 42/866 [00:03<01:01, 13.32valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 4s) Loss: 0.9193 


Validation:   7%|▋         | 63/866 [00:04<00:55, 14.57valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 1m 2s) Loss: 0.8845 


Validation:  10%|▉         | 83/866 [00:06<00:51, 15.25valid_batch/s]

EVAL: [80/866] Elapsed 0m 6s (remain 0m 59s) Loss: 0.8265 


Validation:  12%|█▏        | 101/866 [00:07<00:49, 15.37valid_batch/s]

EVAL: [100/866] Elapsed 0m 7s (remain 0m 57s) Loss: 0.8275 


Validation:  14%|█▍        | 122/866 [00:08<00:45, 16.41valid_batch/s]

EVAL: [120/866] Elapsed 0m 8s (remain 0m 54s) Loss: 0.8045 


Validation:  16%|█▋        | 142/866 [00:10<00:56, 12.89valid_batch/s]

EVAL: [140/866] Elapsed 0m 10s (remain 0m 53s) Loss: 0.8166 


Validation:  19%|█▊        | 162/866 [00:11<00:49, 14.30valid_batch/s]

EVAL: [160/866] Elapsed 0m 11s (remain 0m 51s) Loss: 0.8252 


Validation:  21%|██        | 183/866 [00:13<00:46, 14.62valid_batch/s]

EVAL: [180/866] Elapsed 0m 12s (remain 0m 49s) Loss: 0.8288 


Validation:  23%|██▎       | 203/866 [00:14<00:57, 11.63valid_batch/s]

EVAL: [200/866] Elapsed 0m 14s (remain 0m 48s) Loss: 0.8214 


Validation:  26%|██▌       | 223/866 [00:16<00:46, 13.69valid_batch/s]

EVAL: [220/866] Elapsed 0m 15s (remain 0m 46s) Loss: 0.8123 


Validation:  28%|██▊       | 241/866 [00:17<00:45, 13.87valid_batch/s]

EVAL: [240/866] Elapsed 0m 17s (remain 0m 45s) Loss: 0.8081 


Validation:  30%|███       | 262/866 [00:19<00:48, 12.41valid_batch/s]

EVAL: [260/866] Elapsed 0m 19s (remain 0m 44s) Loss: 0.8000 


Validation:  33%|███▎      | 282/866 [00:21<00:51, 11.39valid_batch/s]

EVAL: [280/866] Elapsed 0m 20s (remain 0m 43s) Loss: 0.8027 


Validation:  35%|███▍      | 303/866 [00:22<00:44, 12.72valid_batch/s]

EVAL: [300/866] Elapsed 0m 22s (remain 0m 42s) Loss: 0.8108 


Validation:  37%|███▋      | 322/866 [00:24<00:41, 12.97valid_batch/s]

EVAL: [320/866] Elapsed 0m 24s (remain 0m 40s) Loss: 0.8150 


Validation:  39%|███▉      | 342/866 [00:25<00:44, 11.72valid_batch/s]

EVAL: [340/866] Elapsed 0m 25s (remain 0m 39s) Loss: 0.8123 


Validation:  42%|████▏     | 363/866 [00:27<00:36, 13.78valid_batch/s]

EVAL: [360/866] Elapsed 0m 27s (remain 0m 38s) Loss: 0.8098 


Validation:  44%|████▍     | 382/866 [00:28<00:33, 14.47valid_batch/s]

EVAL: [380/866] Elapsed 0m 28s (remain 0m 36s) Loss: 0.8114 


Validation:  47%|████▋     | 404/866 [00:30<00:31, 14.55valid_batch/s]

EVAL: [400/866] Elapsed 0m 30s (remain 0m 34s) Loss: 0.8170 


Validation:  49%|████▉     | 423/866 [00:31<00:31, 14.10valid_batch/s]

EVAL: [420/866] Elapsed 0m 31s (remain 0m 33s) Loss: 0.8205 


Validation:  51%|█████     | 443/866 [00:32<00:30, 14.05valid_batch/s]

EVAL: [440/866] Elapsed 0m 32s (remain 0m 31s) Loss: 0.8172 


Validation:  53%|█████▎    | 463/866 [00:34<00:27, 14.68valid_batch/s]

EVAL: [460/866] Elapsed 0m 34s (remain 0m 30s) Loss: 0.8225 


Validation:  56%|█████▌    | 483/866 [00:35<00:24, 15.52valid_batch/s]

EVAL: [480/866] Elapsed 0m 35s (remain 0m 28s) Loss: 0.8239 


Validation:  58%|█████▊    | 503/866 [00:37<00:27, 13.08valid_batch/s]

EVAL: [500/866] Elapsed 0m 37s (remain 0m 27s) Loss: 0.8170 


Validation:  60%|██████    | 523/866 [00:38<00:27, 12.57valid_batch/s]

EVAL: [520/866] Elapsed 0m 38s (remain 0m 25s) Loss: 0.8169 


Validation:  63%|██████▎   | 542/866 [00:40<00:26, 12.10valid_batch/s]

EVAL: [540/866] Elapsed 0m 40s (remain 0m 24s) Loss: 0.8202 


Validation:  65%|██████▌   | 563/866 [00:41<00:19, 15.85valid_batch/s]

EVAL: [560/866] Elapsed 0m 41s (remain 0m 22s) Loss: 0.8221 


Validation:  67%|██████▋   | 583/866 [00:43<00:19, 14.84valid_batch/s]

EVAL: [580/866] Elapsed 0m 42s (remain 0m 21s) Loss: 0.8242 


Validation:  70%|██████▉   | 603/866 [00:44<00:18, 14.19valid_batch/s]

EVAL: [600/866] Elapsed 0m 44s (remain 0m 19s) Loss: 0.8264 


Validation:  72%|███████▏  | 622/866 [00:45<00:16, 14.83valid_batch/s]

EVAL: [620/866] Elapsed 0m 45s (remain 0m 18s) Loss: 0.8237 


Validation:  74%|███████▍  | 642/866 [00:47<00:17, 12.91valid_batch/s]

EVAL: [640/866] Elapsed 0m 47s (remain 0m 16s) Loss: 0.8285 


Validation:  77%|███████▋  | 663/866 [00:48<00:13, 14.75valid_batch/s]

EVAL: [660/866] Elapsed 0m 48s (remain 0m 15s) Loss: 0.8315 


Validation:  79%|███████▊  | 681/866 [00:50<00:12, 14.38valid_batch/s]

EVAL: [680/866] Elapsed 0m 50s (remain 0m 13s) Loss: 0.8325 


Validation:  81%|████████  | 702/866 [00:51<00:12, 13.41valid_batch/s]

EVAL: [700/866] Elapsed 0m 51s (remain 0m 12s) Loss: 0.8360 


Validation:  84%|████████▎ | 724/866 [00:53<00:09, 15.61valid_batch/s]

EVAL: [720/866] Elapsed 0m 53s (remain 0m 10s) Loss: 0.8374 


Validation:  86%|████████▌ | 743/866 [00:54<00:09, 13.44valid_batch/s]

EVAL: [740/866] Elapsed 0m 54s (remain 0m 9s) Loss: 0.8410 


Validation:  88%|████████▊ | 763/866 [00:55<00:06, 15.57valid_batch/s]

EVAL: [760/866] Elapsed 0m 55s (remain 0m 7s) Loss: 0.8405 


Validation:  90%|█████████ | 782/866 [00:57<00:06, 12.72valid_batch/s]

EVAL: [780/866] Elapsed 0m 57s (remain 0m 6s) Loss: 0.8407 


Validation:  93%|█████████▎| 803/866 [00:58<00:04, 13.02valid_batch/s]

EVAL: [800/866] Elapsed 0m 58s (remain 0m 4s) Loss: 0.8409 


Validation:  95%|█████████▍| 822/866 [01:00<00:03, 14.63valid_batch/s]

EVAL: [820/866] Elapsed 1m 0s (remain 0m 3s) Loss: 0.8375 


Validation:  97%|█████████▋| 843/866 [01:01<00:01, 17.86valid_batch/s]

EVAL: [840/866] Elapsed 1m 1s (remain 0m 1s) Loss: 0.8369 


Validation: 100%|█████████▉| 862/866 [01:02<00:00, 13.36valid_batch/s]

EVAL: [860/866] Elapsed 1m 2s (remain 0m 0s) Loss: 0.8392 


Validation: 100%|██████████| 866/866 [01:03<00:00, 13.67valid_batch/s]
Epoch 1 - avg_train_loss: 0.9649  avg_val_loss: 0.8421  time: 659s
Epoch 1 - Score: 0.7981
Epoch 1 - Save Best Score: 0.7981 Model


EVAL: [865/866] Elapsed 1m 3s (remain 0m 0s) Loss: 0.8421 


Train:   0%|          | 2/1730 [00:00<11:48,  2.44train_batch/s]

Epoch: [2][0/1730] Elapsed 0m 0s (remain 20m 58s) Loss: 1.1174 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:06<08:53,  3.20train_batch/s]

Epoch: [2][20/1730] Elapsed 0m 6s (remain 9m 1s) Loss: 0.8579 Grad: 71813.1172  LR: 0.00001986  


Train:   2%|▏         | 41/1730 [00:13<10:06,  2.79train_batch/s]

Epoch: [2][40/1730] Elapsed 0m 13s (remain 9m 1s) Loss: 0.8274 Grad: 146526.9844  LR: 0.00001952  


Train:   4%|▎         | 61/1730 [00:19<09:04,  3.07train_batch/s]

Epoch: [2][60/1730] Elapsed 0m 19s (remain 9m 3s) Loss: 0.8474 Grad: 88121.7734  LR: 0.00001897  


Train:   5%|▍         | 81/1730 [00:26<08:49,  3.11train_batch/s]

Epoch: [2][80/1730] Elapsed 0m 26s (remain 8m 55s) Loss: 0.8379 Grad: 110141.3438  LR: 0.00001823  


Train:   6%|▌         | 101/1730 [00:31<08:14,  3.30train_batch/s]

Epoch: [2][100/1730] Elapsed 0m 31s (remain 8m 33s) Loss: 0.8362 Grad: 292622.9062  LR: 0.00001732  


Train:   7%|▋         | 121/1730 [00:39<09:36,  2.79train_batch/s]

Epoch: [2][120/1730] Elapsed 0m 39s (remain 8m 40s) Loss: 0.8745 Grad: 267349.4062  LR: 0.00001626  


Train:   8%|▊         | 141/1730 [00:45<08:59,  2.95train_batch/s]

Epoch: [2][140/1730] Elapsed 0m 45s (remain 8m 33s) Loss: 0.8669 Grad: 245454.5625  LR: 0.00001506  


Train:   9%|▉         | 161/1730 [00:51<07:43,  3.38train_batch/s]

Epoch: [2][160/1730] Elapsed 0m 51s (remain 8m 26s) Loss: 0.8588 Grad: 100128.5000  LR: 0.00001376  


Train:  10%|█         | 181/1730 [00:59<09:21,  2.76train_batch/s]

Epoch: [2][180/1730] Elapsed 0m 59s (remain 8m 30s) Loss: 0.8555 Grad: 146896.6406  LR: 0.00001238  


Train:  12%|█▏        | 202/1730 [01:06<08:00,  3.18train_batch/s]

Epoch: [2][200/1730] Elapsed 1m 6s (remain 8m 25s) Loss: 0.8534 Grad: 168848.8906  LR: 0.00001094  


Train:  13%|█▎        | 221/1730 [01:12<07:31,  3.34train_batch/s]

Epoch: [2][220/1730] Elapsed 1m 12s (remain 8m 13s) Loss: 0.8541 Grad: 409119.6562  LR: 0.00000949  


Train:  14%|█▍        | 241/1730 [01:19<08:08,  3.05train_batch/s]

Epoch: [2][240/1730] Elapsed 1m 19s (remain 8m 13s) Loss: 0.8463 Grad: 143159.4375  LR: 0.00000805  


Train:  15%|█▌        | 262/1730 [01:26<07:16,  3.37train_batch/s]

Epoch: [2][260/1730] Elapsed 1m 26s (remain 8m 6s) Loss: 0.8421 Grad: 192149.5625  LR: 0.00000665  


Train:  16%|█▌        | 281/1730 [01:33<06:17,  3.84train_batch/s]

Epoch: [2][280/1730] Elapsed 1m 33s (remain 8m 0s) Loss: 0.8356 Grad: 61208.4805  LR: 0.00000532  


Train:  17%|█▋        | 301/1730 [01:40<08:32,  2.79train_batch/s]

Epoch: [2][300/1730] Elapsed 1m 40s (remain 7m 56s) Loss: 0.8299 Grad: 98214.8047  LR: 0.00000409  


Train:  19%|█▊        | 321/1730 [01:47<08:19,  2.82train_batch/s]

Epoch: [2][320/1730] Elapsed 1m 47s (remain 7m 52s) Loss: 0.8230 Grad: 139932.5781  LR: 0.00000298  


Train:  20%|█▉        | 341/1730 [01:54<08:15,  2.80train_batch/s]

Epoch: [2][340/1730] Elapsed 1m 54s (remain 7m 45s) Loss: 0.8175 Grad: 101231.9297  LR: 0.00000202  


Train:  21%|██        | 361/1730 [02:01<10:12,  2.24train_batch/s]

Epoch: [2][360/1730] Elapsed 2m 1s (remain 7m 42s) Loss: 0.8185 Grad: 110527.8516  LR: 0.00000123  


Train:  22%|██▏       | 381/1730 [02:08<06:28,  3.47train_batch/s]

Epoch: [2][380/1730] Elapsed 2m 8s (remain 7m 33s) Loss: 0.8143 Grad: 112217.8984  LR: 0.00000063  


Train:  23%|██▎       | 401/1730 [02:15<07:41,  2.88train_batch/s]

Epoch: [2][400/1730] Elapsed 2m 15s (remain 7m 27s) Loss: 0.8128 Grad: 345832.8750  LR: 0.00000022  


Train:  24%|██▍       | 421/1730 [02:22<07:45,  2.81train_batch/s]

Epoch: [2][420/1730] Elapsed 2m 22s (remain 7m 22s) Loss: 0.8085 Grad: 553261.6250  LR: 0.00000002  


Train:  25%|██▌       | 441/1730 [02:29<09:35,  2.24train_batch/s]

Epoch: [2][440/1730] Elapsed 2m 29s (remain 7m 17s) Loss: 0.8062 Grad: 655460.5000  LR: 0.00000003  


Train:  27%|██▋       | 461/1730 [02:37<07:50,  2.70train_batch/s]

Epoch: [2][460/1730] Elapsed 2m 37s (remain 7m 12s) Loss: 0.8018 Grad: 293424.5000  LR: 0.00000025  


Train:  28%|██▊       | 482/1730 [02:44<06:01,  3.46train_batch/s]

Epoch: [2][480/1730] Elapsed 2m 44s (remain 7m 6s) Loss: 0.8025 Grad: 124769.8672  LR: 0.00000068  


Train:  29%|██▉       | 501/1730 [02:51<06:06,  3.35train_batch/s]

Epoch: [2][500/1730] Elapsed 2m 51s (remain 6m 59s) Loss: 0.8025 Grad: 158022.5312  LR: 0.00000130  


Train:  30%|███       | 521/1730 [02:58<06:24,  3.15train_batch/s]

Epoch: [2][520/1730] Elapsed 2m 58s (remain 6m 54s) Loss: 0.8013 Grad: 81266.0000  LR: 0.00000211  


Train:  31%|███▏      | 541/1730 [03:04<06:15,  3.17train_batch/s]

Epoch: [2][540/1730] Elapsed 3m 4s (remain 6m 45s) Loss: 0.7968 Grad: 162376.6094  LR: 0.00000308  


Train:  32%|███▏      | 561/1730 [03:11<05:55,  3.29train_batch/s]

Epoch: [2][560/1730] Elapsed 3m 11s (remain 6m 38s) Loss: 0.7938 Grad: 112064.0312  LR: 0.00000420  


Train:  34%|███▎      | 581/1730 [03:18<07:31,  2.54train_batch/s]

Epoch: [2][580/1730] Elapsed 3m 18s (remain 6m 32s) Loss: 0.7923 Grad: 302508.4688  LR: 0.00000545  


Train:  35%|███▍      | 601/1730 [03:25<05:15,  3.57train_batch/s]

Epoch: [2][600/1730] Elapsed 3m 25s (remain 6m 25s) Loss: 0.7898 Grad: 104140.3594  LR: 0.00000679  


Train:  36%|███▌      | 621/1730 [03:33<06:10,  3.00train_batch/s]

Epoch: [2][620/1730] Elapsed 3m 33s (remain 6m 20s) Loss: 0.7924 Grad: 194427.5781  LR: 0.00000819  


Train:  37%|███▋      | 641/1730 [03:39<05:39,  3.21train_batch/s]

Epoch: [2][640/1730] Elapsed 3m 39s (remain 6m 13s) Loss: 0.7901 Grad: 65698.5312  LR: 0.00000964  


Train:  38%|███▊      | 661/1730 [03:46<06:03,  2.94train_batch/s]

Epoch: [2][660/1730] Elapsed 3m 46s (remain 6m 6s) Loss: 0.7861 Grad: 166020.0156  LR: 0.00001109  


Train:  39%|███▉      | 681/1730 [03:54<06:32,  2.67train_batch/s]

Epoch: [2][680/1730] Elapsed 3m 54s (remain 6m 0s) Loss: 0.7861 Grad: 293035.3125  LR: 0.00001252  


Train:  41%|████      | 701/1730 [04:00<07:45,  2.21train_batch/s]

Epoch: [2][700/1730] Elapsed 4m 0s (remain 5m 53s) Loss: 0.7871 Grad: 395163.5625  LR: 0.00001389  


Train:  42%|████▏     | 721/1730 [04:08<07:04,  2.37train_batch/s]

Epoch: [2][720/1730] Elapsed 4m 8s (remain 5m 47s) Loss: 0.7876 Grad: 141429.5312  LR: 0.00001519  


Train:  43%|████▎     | 741/1730 [04:15<06:08,  2.68train_batch/s]

Epoch: [2][740/1730] Elapsed 4m 15s (remain 5m 41s) Loss: 0.7883 Grad: 128067.4453  LR: 0.00001637  


Train:  44%|████▍     | 761/1730 [04:23<05:02,  3.20train_batch/s]

Epoch: [2][760/1730] Elapsed 4m 23s (remain 5m 35s) Loss: 0.7883 Grad: 115773.0156  LR: 0.00001742  


Train:  45%|████▌     | 781/1730 [04:30<05:18,  2.98train_batch/s]

Epoch: [2][780/1730] Elapsed 4m 30s (remain 5m 29s) Loss: 0.7898 Grad: 106209.6875  LR: 0.00001831  


Train:  46%|████▋     | 801/1730 [04:37<05:25,  2.86train_batch/s]

Epoch: [2][800/1730] Elapsed 4m 37s (remain 5m 21s) Loss: 0.7892 Grad: 133044.4062  LR: 0.00001903  


Train:  47%|████▋     | 821/1730 [04:44<06:26,  2.35train_batch/s]

Epoch: [2][820/1730] Elapsed 4m 44s (remain 5m 15s) Loss: 0.7926 Grad: 109666.2656  LR: 0.00001956  


Train:  49%|████▊     | 842/1730 [04:51<05:06,  2.90train_batch/s]

Epoch: [2][840/1730] Elapsed 4m 51s (remain 5m 8s) Loss: 0.7926 Grad: 387796.5312  LR: 0.00001988  


Train:  50%|████▉     | 861/1730 [04:58<04:10,  3.47train_batch/s]

Epoch: [2][860/1730] Elapsed 4m 58s (remain 5m 0s) Loss: 0.7936 Grad: 64519.7539  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [05:05<05:51,  2.41train_batch/s]

Epoch: [2][880/1730] Elapsed 5m 5s (remain 4m 54s) Loss: 0.7939 Grad: 189096.1406  LR: 0.00001990  


Train:  52%|█████▏    | 901/1730 [05:13<04:25,  3.12train_batch/s]

Epoch: [2][900/1730] Elapsed 5m 13s (remain 4m 48s) Loss: 0.7963 Grad: 117712.6562  LR: 0.00001960  


Train:  53%|█████▎    | 921/1730 [05:19<04:08,  3.26train_batch/s]

Epoch: [2][920/1730] Elapsed 5m 19s (remain 4m 40s) Loss: 0.7942 Grad: 122047.2969  LR: 0.00001909  


Train:  54%|█████▍    | 941/1730 [05:26<04:28,  2.93train_batch/s]

Epoch: [2][940/1730] Elapsed 5m 26s (remain 4m 33s) Loss: 0.7998 Grad: 301189.6250  LR: 0.00001839  


Train:  56%|█████▌    | 961/1730 [05:33<04:38,  2.77train_batch/s]

Epoch: [2][960/1730] Elapsed 5m 33s (remain 4m 27s) Loss: 0.8015 Grad: 206702.9062  LR: 0.00001752  


Train:  57%|█████▋    | 981/1730 [05:41<06:05,  2.05train_batch/s]

Epoch: [2][980/1730] Elapsed 5m 41s (remain 4m 20s) Loss: 0.8018 Grad: 347577.8438  LR: 0.00001648  


Train:  58%|█████▊    | 1001/1730 [05:48<04:34,  2.65train_batch/s]

Epoch: [2][1000/1730] Elapsed 5m 48s (remain 4m 13s) Loss: 0.8026 Grad: 134920.0781  LR: 0.00001531  


Train:  59%|█████▉    | 1021/1730 [05:54<03:48,  3.10train_batch/s]

Epoch: [2][1020/1730] Elapsed 5m 54s (remain 4m 6s) Loss: 0.8040 Grad: 128169.6172  LR: 0.00001403  


Train:  60%|██████    | 1041/1730 [06:02<04:11,  2.74train_batch/s]

Epoch: [2][1040/1730] Elapsed 6m 2s (remain 3m 59s) Loss: 0.8069 Grad: 122504.5156  LR: 0.00001266  


Train:  61%|██████▏   | 1061/1730 [06:09<04:50,  2.31train_batch/s]

Epoch: [2][1060/1730] Elapsed 6m 9s (remain 3m 53s) Loss: 0.8076 Grad: 79651.1719  LR: 0.00001123  


Train:  62%|██████▏   | 1081/1730 [06:15<03:00,  3.60train_batch/s]

Epoch: [2][1080/1730] Elapsed 6m 15s (remain 3m 45s) Loss: 0.8068 Grad: 113294.7266  LR: 0.00000978  


Train:  64%|██████▎   | 1101/1730 [06:22<04:13,  2.48train_batch/s]

Epoch: [2][1100/1730] Elapsed 6m 22s (remain 3m 38s) Loss: 0.8057 Grad: 532068.6875  LR: 0.00000834  


Train:  65%|██████▍   | 1121/1730 [06:29<03:16,  3.10train_batch/s]

Epoch: [2][1120/1730] Elapsed 6m 29s (remain 3m 31s) Loss: 0.8048 Grad: 119766.6328  LR: 0.00000692  


Train:  66%|██████▌   | 1141/1730 [06:35<02:47,  3.52train_batch/s]

Epoch: [2][1140/1730] Elapsed 6m 35s (remain 3m 24s) Loss: 0.8053 Grad: 109349.4766  LR: 0.00000558  


Train:  67%|██████▋   | 1161/1730 [06:41<02:16,  4.17train_batch/s]

Epoch: [2][1160/1730] Elapsed 6m 41s (remain 3m 16s) Loss: 0.8055 Grad: 122527.5547  LR: 0.00000432  


Train:  68%|██████▊   | 1182/1730 [06:49<02:20,  3.90train_batch/s]

Epoch: [2][1180/1730] Elapsed 6m 48s (remain 3m 10s) Loss: 0.8048 Grad: 150366.1094  LR: 0.00000319  


Train:  69%|██████▉   | 1201/1730 [06:55<03:14,  2.73train_batch/s]

Epoch: [2][1200/1730] Elapsed 6m 55s (remain 3m 3s) Loss: 0.8032 Grad: 384031.9062  LR: 0.00000220  


Train:  71%|███████   | 1221/1730 [07:02<02:57,  2.87train_batch/s]

Epoch: [2][1220/1730] Elapsed 7m 2s (remain 2m 55s) Loss: 0.8015 Grad: 210317.7812  LR: 0.00000138  


Train:  72%|███████▏  | 1241/1730 [07:09<03:40,  2.22train_batch/s]

Epoch: [2][1240/1730] Elapsed 7m 9s (remain 2m 49s) Loss: 0.8009 Grad: 136171.5000  LR: 0.00000073  


Train:  73%|███████▎  | 1261/1730 [07:15<02:42,  2.89train_batch/s]

Epoch: [2][1260/1730] Elapsed 7m 15s (remain 2m 42s) Loss: 0.8004 Grad: 287921.9688  LR: 0.00000029  


Train:  74%|███████▍  | 1281/1730 [07:21<01:48,  4.14train_batch/s]

Epoch: [2][1280/1730] Elapsed 7m 21s (remain 2m 34s) Loss: 0.8004 Grad: 80374.7891  LR: 0.00000004  


Train:  75%|███████▌  | 1302/1730 [07:28<02:20,  3.05train_batch/s]

Epoch: [2][1300/1730] Elapsed 7m 28s (remain 2m 27s) Loss: 0.8004 Grad: 150177.3750  LR: 0.00000001  


Train:  76%|███████▋  | 1322/1730 [07:35<02:07,  3.19train_batch/s]

Epoch: [2][1320/1730] Elapsed 7m 35s (remain 2m 21s) Loss: 0.7991 Grad: 233258.4688  LR: 0.00000019  


Train:  78%|███████▊  | 1341/1730 [07:42<02:00,  3.24train_batch/s]

Epoch: [2][1340/1730] Elapsed 7m 42s (remain 2m 14s) Loss: 0.7981 Grad: 97120.1875  LR: 0.00000058  


Train:  79%|███████▊  | 1362/1730 [07:49<02:04,  2.94train_batch/s]

Epoch: [2][1360/1730] Elapsed 7m 49s (remain 2m 7s) Loss: 0.7961 Grad: 325019.8750  LR: 0.00000116  


Train:  80%|███████▉  | 1381/1730 [07:55<01:52,  3.10train_batch/s]

Epoch: [2][1380/1730] Elapsed 7m 55s (remain 2m 0s) Loss: 0.7977 Grad: 116225.5234  LR: 0.00000194  


Train:  81%|████████  | 1401/1730 [08:02<02:23,  2.30train_batch/s]

Epoch: [2][1400/1730] Elapsed 8m 2s (remain 1m 53s) Loss: 0.7974 Grad: 221648.0625  LR: 0.00000288  


Train:  82%|████████▏ | 1421/1730 [08:09<01:37,  3.16train_batch/s]

Epoch: [2][1420/1730] Elapsed 8m 9s (remain 1m 46s) Loss: 0.7954 Grad: 69325.4531  LR: 0.00000397  


Train:  83%|████████▎ | 1441/1730 [08:16<01:31,  3.18train_batch/s]

Epoch: [2][1440/1730] Elapsed 8m 16s (remain 1m 39s) Loss: 0.7938 Grad: 125096.3750  LR: 0.00000519  


Train:  84%|████████▍ | 1461/1730 [08:22<01:25,  3.16train_batch/s]

Epoch: [2][1460/1730] Elapsed 8m 22s (remain 1m 32s) Loss: 0.7943 Grad: 276717.3438  LR: 0.00000651  


Train:  86%|████████▌ | 1481/1730 [08:28<01:15,  3.31train_batch/s]

Epoch: [2][1480/1730] Elapsed 8m 28s (remain 1m 25s) Loss: 0.7946 Grad: 216682.2500  LR: 0.00000791  


Train:  87%|████████▋ | 1501/1730 [08:36<01:28,  2.60train_batch/s]

Epoch: [2][1500/1730] Elapsed 8m 36s (remain 1m 18s) Loss: 0.7937 Grad: 146542.9844  LR: 0.00000935  


Train:  88%|████████▊ | 1521/1730 [08:43<01:19,  2.62train_batch/s]

Epoch: [2][1520/1730] Elapsed 8m 43s (remain 1m 11s) Loss: 0.7949 Grad: 209509.9531  LR: 0.00001080  


Train:  89%|████████▉ | 1541/1730 [08:50<01:12,  2.61train_batch/s]

Epoch: [2][1540/1730] Elapsed 8m 50s (remain 1m 5s) Loss: 0.7963 Grad: 317664.5938  LR: 0.00001224  


Train:  90%|█████████ | 1561/1730 [08:56<00:49,  3.44train_batch/s]

Epoch: [2][1560/1730] Elapsed 8m 56s (remain 0m 58s) Loss: 0.7965 Grad: 130004.9141  LR: 0.00001362  


Train:  91%|█████████▏| 1581/1730 [09:03<00:52,  2.83train_batch/s]

Epoch: [2][1580/1730] Elapsed 9m 3s (remain 0m 51s) Loss: 0.7949 Grad: 164655.1094  LR: 0.00001494  


Train:  93%|█████████▎| 1601/1730 [09:10<00:36,  3.55train_batch/s]

Epoch: [2][1600/1730] Elapsed 9m 10s (remain 0m 44s) Loss: 0.7946 Grad: 191440.7031  LR: 0.00001615  


Train:  94%|█████████▎| 1621/1730 [09:17<00:41,  2.63train_batch/s]

Epoch: [2][1620/1730] Elapsed 9m 17s (remain 0m 37s) Loss: 0.7956 Grad: 211792.9375  LR: 0.00001722  


Train:  95%|█████████▍| 1641/1730 [09:24<00:31,  2.81train_batch/s]

Epoch: [2][1640/1730] Elapsed 9m 24s (remain 0m 30s) Loss: 0.7955 Grad: 167759.3281  LR: 0.00001815  


Train:  96%|█████████▌| 1661/1730 [09:30<00:19,  3.55train_batch/s]

Epoch: [2][1660/1730] Elapsed 9m 30s (remain 0m 23s) Loss: 0.7969 Grad: 122962.9375  LR: 0.00001890  


Train:  97%|█████████▋| 1681/1730 [09:37<00:16,  2.92train_batch/s]

Epoch: [2][1680/1730] Elapsed 9m 37s (remain 0m 16s) Loss: 0.7965 Grad: 147507.0469  LR: 0.00001947  


Train:  98%|█████████▊| 1701/1730 [09:44<00:10,  2.88train_batch/s]

Epoch: [2][1700/1730] Elapsed 9m 44s (remain 0m 9s) Loss: 0.7959 Grad: 130827.8906  LR: 0.00001984  


Train: 100%|█████████▉| 1722/1730 [09:52<00:02,  3.19train_batch/s]

Epoch: [2][1720/1730] Elapsed 9m 52s (remain 0m 3s) Loss: 0.7968 Grad: 186229.2656  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:54<00:00,  2.91train_batch/s]


Epoch: [2][1729/1730] Elapsed 9m 54s (remain 0m 0s) Loss: 0.7974 Grad: 166722.6562  LR: 0.00002000  


Validation:   0%|          | 1/866 [00:00<04:27,  3.23valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 4m 28s) Loss: 1.0007 


Validation:   3%|▎         | 22/866 [00:01<00:57, 14.65valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 11s) Loss: 0.8165 


Validation:   5%|▍         | 42/866 [00:03<01:01, 13.32valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 4s) Loss: 0.7961 


Validation:   7%|▋         | 63/866 [00:04<00:54, 14.61valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 1m 2s) Loss: 0.8119 


Validation:  10%|▉         | 83/866 [00:06<00:51, 15.32valid_batch/s]

EVAL: [80/866] Elapsed 0m 6s (remain 0m 59s) Loss: 0.7862 


Validation:  12%|█▏        | 101/866 [00:07<00:49, 15.45valid_batch/s]

EVAL: [100/866] Elapsed 0m 7s (remain 0m 56s) Loss: 0.7926 


Validation:  14%|█▍        | 122/866 [00:08<00:45, 16.48valid_batch/s]

EVAL: [120/866] Elapsed 0m 8s (remain 0m 54s) Loss: 0.7834 


Validation:  16%|█▋        | 142/866 [00:10<00:55, 12.97valid_batch/s]

EVAL: [140/866] Elapsed 0m 10s (remain 0m 53s) Loss: 0.7856 


Validation:  19%|█▊        | 162/866 [00:11<00:49, 14.36valid_batch/s]

EVAL: [160/866] Elapsed 0m 11s (remain 0m 51s) Loss: 0.7853 


Validation:  21%|██        | 183/866 [00:13<00:46, 14.68valid_batch/s]

EVAL: [180/866] Elapsed 0m 12s (remain 0m 48s) Loss: 0.7901 


Validation:  23%|██▎       | 203/866 [00:14<00:56, 11.69valid_batch/s]

EVAL: [200/866] Elapsed 0m 14s (remain 0m 47s) Loss: 0.7865 


Validation:  26%|██▌       | 223/866 [00:16<00:46, 13.79valid_batch/s]

EVAL: [220/866] Elapsed 0m 15s (remain 0m 46s) Loss: 0.7753 


Validation:  28%|██▊       | 241/866 [00:17<00:44, 13.96valid_batch/s]

EVAL: [240/866] Elapsed 0m 17s (remain 0m 45s) Loss: 0.7823 


Validation:  30%|███       | 262/866 [00:19<00:48, 12.48valid_batch/s]

EVAL: [260/866] Elapsed 0m 19s (remain 0m 44s) Loss: 0.7743 


Validation:  33%|███▎      | 282/866 [00:20<00:51, 11.43valid_batch/s]

EVAL: [280/866] Elapsed 0m 20s (remain 0m 43s) Loss: 0.7824 


Validation:  35%|███▍      | 303/866 [00:22<00:44, 12.77valid_batch/s]

EVAL: [300/866] Elapsed 0m 22s (remain 0m 42s) Loss: 0.7858 


Validation:  37%|███▋      | 322/866 [00:24<00:41, 13.05valid_batch/s]

EVAL: [320/866] Elapsed 0m 23s (remain 0m 40s) Loss: 0.7805 


Validation:  39%|███▉      | 342/866 [00:25<00:44, 11.77valid_batch/s]

EVAL: [340/866] Elapsed 0m 25s (remain 0m 39s) Loss: 0.7810 


Validation:  42%|████▏     | 363/866 [00:27<00:36, 13.86valid_batch/s]

EVAL: [360/866] Elapsed 0m 27s (remain 0m 37s) Loss: 0.7841 


Validation:  44%|████▍     | 382/866 [00:28<00:33, 14.53valid_batch/s]

EVAL: [380/866] Elapsed 0m 28s (remain 0m 36s) Loss: 0.7872 


Validation:  47%|████▋     | 404/866 [00:30<00:31, 14.62valid_batch/s]

EVAL: [400/866] Elapsed 0m 29s (remain 0m 34s) Loss: 0.7942 


Validation:  49%|████▉     | 423/866 [00:31<00:31, 14.16valid_batch/s]

EVAL: [420/866] Elapsed 0m 31s (remain 0m 33s) Loss: 0.7943 


Validation:  51%|█████     | 443/866 [00:32<00:30, 14.07valid_batch/s]

EVAL: [440/866] Elapsed 0m 32s (remain 0m 31s) Loss: 0.7898 


Validation:  53%|█████▎    | 463/866 [00:34<00:27, 14.78valid_batch/s]

EVAL: [460/866] Elapsed 0m 34s (remain 0m 29s) Loss: 0.7980 


Validation:  56%|█████▌    | 483/866 [00:35<00:24, 15.58valid_batch/s]

EVAL: [480/866] Elapsed 0m 35s (remain 0m 28s) Loss: 0.7976 


Validation:  58%|█████▊    | 503/866 [00:37<00:27, 13.14valid_batch/s]

EVAL: [500/866] Elapsed 0m 36s (remain 0m 26s) Loss: 0.7949 


Validation:  60%|██████    | 523/866 [00:38<00:27, 12.63valid_batch/s]

EVAL: [520/866] Elapsed 0m 38s (remain 0m 25s) Loss: 0.7942 


Validation:  63%|██████▎   | 542/866 [00:40<00:26, 12.16valid_batch/s]

EVAL: [540/866] Elapsed 0m 40s (remain 0m 24s) Loss: 0.7990 


Validation:  65%|██████▌   | 563/866 [00:41<00:19, 15.93valid_batch/s]

EVAL: [560/866] Elapsed 0m 41s (remain 0m 22s) Loss: 0.8001 


Validation:  67%|██████▋   | 583/866 [00:42<00:18, 14.91valid_batch/s]

EVAL: [580/866] Elapsed 0m 42s (remain 0m 20s) Loss: 0.8042 


Validation:  70%|██████▉   | 603/866 [00:44<00:18, 14.26valid_batch/s]

EVAL: [600/866] Elapsed 0m 44s (remain 0m 19s) Loss: 0.8057 


Validation:  72%|███████▏  | 622/866 [00:45<00:16, 14.92valid_batch/s]

EVAL: [620/866] Elapsed 0m 45s (remain 0m 17s) Loss: 0.8056 


Validation:  74%|███████▍  | 642/866 [00:47<00:17, 12.94valid_batch/s]

EVAL: [640/866] Elapsed 0m 47s (remain 0m 16s) Loss: 0.8082 


Validation:  77%|███████▋  | 663/866 [00:48<00:13, 14.81valid_batch/s]

EVAL: [660/866] Elapsed 0m 48s (remain 0m 15s) Loss: 0.8090 


Validation:  79%|███████▊  | 681/866 [00:49<00:12, 14.45valid_batch/s]

EVAL: [680/866] Elapsed 0m 49s (remain 0m 13s) Loss: 0.8115 


Validation:  81%|████████  | 702/866 [00:51<00:12, 13.46valid_batch/s]

EVAL: [700/866] Elapsed 0m 51s (remain 0m 12s) Loss: 0.8147 


Validation:  84%|████████▎ | 724/866 [00:52<00:09, 15.67valid_batch/s]

EVAL: [720/866] Elapsed 0m 52s (remain 0m 10s) Loss: 0.8150 


Validation:  86%|████████▌ | 743/866 [00:54<00:09, 13.46valid_batch/s]

EVAL: [740/866] Elapsed 0m 54s (remain 0m 9s) Loss: 0.8144 


Validation:  88%|████████▊ | 763/866 [00:55<00:06, 15.61valid_batch/s]

EVAL: [760/866] Elapsed 0m 55s (remain 0m 7s) Loss: 0.8084 


Validation:  90%|█████████ | 782/866 [00:56<00:06, 12.80valid_batch/s]

EVAL: [780/866] Elapsed 0m 56s (remain 0m 6s) Loss: 0.8084 


Validation:  93%|█████████▎| 803/866 [00:58<00:04, 13.10valid_batch/s]

EVAL: [800/866] Elapsed 0m 58s (remain 0m 4s) Loss: 0.8092 


Validation:  95%|█████████▍| 822/866 [01:00<00:02, 14.73valid_batch/s]

EVAL: [820/866] Elapsed 0m 59s (remain 0m 3s) Loss: 0.8077 


Validation:  97%|█████████▋| 843/866 [01:01<00:01, 17.96valid_batch/s]

EVAL: [840/866] Elapsed 1m 1s (remain 0m 1s) Loss: 0.8050 


Validation: 100%|█████████▉| 862/866 [01:02<00:00, 13.43valid_batch/s]

EVAL: [860/866] Elapsed 1m 2s (remain 0m 0s) Loss: 0.8069 


Validation: 100%|██████████| 866/866 [01:03<00:00, 13.74valid_batch/s]
Epoch 2 - avg_train_loss: 0.7974  avg_val_loss: 0.8062  time: 658s
Epoch 2 - Score: 0.8067
Epoch 2 - Save Best Score: 0.8067 Model


EVAL: [865/866] Elapsed 1m 2s (remain 0m 0s) Loss: 0.8062 


Score: 0.8067
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/1730 [00:00<10:58,  2.63train_batch/s]

Epoch: [1][0/1730] Elapsed 0m 0s (remain 10m 59s) Loss: 2.1707 Grad: inf  LR: 0.00002000  


Train:   1%|▏         | 22/1730 [00:07<08:12,  3.47train_batch/s]

Epoch: [1][20/1730] Elapsed 0m 7s (remain 10m 31s) Loss: 1.6591 Grad: 40083.0352  LR: 0.00001988  


Train:   2%|▏         | 41/1730 [00:14<09:21,  3.01train_batch/s]

Epoch: [1][40/1730] Elapsed 0m 14s (remain 9m 52s) Loss: 1.5079 Grad: 125425.2656  LR: 0.00001956  


Train:   4%|▎         | 61/1730 [00:20<08:04,  3.44train_batch/s]

Epoch: [1][60/1730] Elapsed 0m 20s (remain 9m 29s) Loss: 1.3893 Grad: 99245.7578  LR: 0.00001903  


Train:   5%|▍         | 81/1730 [00:27<08:29,  3.24train_batch/s]

Epoch: [1][80/1730] Elapsed 0m 27s (remain 9m 21s) Loss: 1.3383 Grad: 197722.8906  LR: 0.00001831  


Train:   6%|▌         | 101/1730 [00:34<10:46,  2.52train_batch/s]

Epoch: [1][100/1730] Elapsed 0m 34s (remain 9m 20s) Loss: 1.2848 Grad: nan  LR: 0.00001742  


Train:   7%|▋         | 121/1730 [00:41<09:01,  2.97train_batch/s]

Epoch: [1][120/1730] Elapsed 0m 41s (remain 9m 8s) Loss: 1.2459 Grad: 128981.2344  LR: 0.00001637  


Train:   8%|▊         | 141/1730 [00:48<09:05,  2.91train_batch/s]

Epoch: [1][140/1730] Elapsed 0m 48s (remain 9m 5s) Loss: 1.2399 Grad: 59324.8516  LR: 0.00001519  


Train:   9%|▉         | 161/1730 [00:55<09:33,  2.73train_batch/s]

Epoch: [1][160/1730] Elapsed 0m 55s (remain 8m 59s) Loss: 1.2130 Grad: 27878.0859  LR: 0.00001389  


Train:  10%|█         | 181/1730 [01:01<08:20,  3.10train_batch/s]

Epoch: [1][180/1730] Elapsed 1m 1s (remain 8m 47s) Loss: 1.1918 Grad: 36195.3047  LR: 0.00001252  


Train:  12%|█▏        | 202/1730 [01:09<09:12,  2.77train_batch/s]

Epoch: [1][200/1730] Elapsed 1m 9s (remain 8m 47s) Loss: 1.1778 Grad: 58115.5742  LR: 0.00001109  


Train:  13%|█▎        | 221/1730 [01:15<07:23,  3.41train_batch/s]

Epoch: [1][220/1730] Elapsed 1m 15s (remain 8m 38s) Loss: 1.1576 Grad: 24513.1797  LR: 0.00000964  


Train:  14%|█▍        | 241/1730 [01:23<11:12,  2.21train_batch/s]

Epoch: [1][240/1730] Elapsed 1m 23s (remain 8m 36s) Loss: 1.1377 Grad: 17200.2637  LR: 0.00000819  


Train:  15%|█▌        | 261/1730 [01:29<08:40,  2.82train_batch/s]

Epoch: [1][260/1730] Elapsed 1m 29s (remain 8m 26s) Loss: 1.1240 Grad: 50600.7188  LR: 0.00000679  


Train:  16%|█▌        | 281/1730 [01:36<08:57,  2.70train_batch/s]

Epoch: [1][280/1730] Elapsed 1m 36s (remain 8m 18s) Loss: 1.1115 Grad: 101385.6094  LR: 0.00000545  


Train:  17%|█▋        | 301/1730 [01:44<10:02,  2.37train_batch/s]

Epoch: [1][300/1730] Elapsed 1m 44s (remain 8m 14s) Loss: 1.1141 Grad: 53584.4688  LR: 0.00000420  


Train:  19%|█▊        | 321/1730 [01:51<06:47,  3.45train_batch/s]

Epoch: [1][320/1730] Elapsed 1m 51s (remain 8m 8s) Loss: 1.1017 Grad: 25046.8516  LR: 0.00000308  


Train:  20%|█▉        | 341/1730 [01:59<09:20,  2.48train_batch/s]

Epoch: [1][340/1730] Elapsed 1m 59s (remain 8m 5s) Loss: 1.0871 Grad: 17860.5254  LR: 0.00000211  


Train:  21%|██        | 361/1730 [02:06<08:19,  2.74train_batch/s]

Epoch: [1][360/1730] Elapsed 2m 6s (remain 7m 59s) Loss: 1.0827 Grad: 52160.9570  LR: 0.00000130  


Train:  22%|██▏       | 381/1730 [02:13<07:01,  3.20train_batch/s]

Epoch: [1][380/1730] Elapsed 2m 13s (remain 7m 52s) Loss: 1.0702 Grad: 36290.1133  LR: 0.00000068  


Train:  23%|██▎       | 401/1730 [02:19<06:35,  3.36train_batch/s]

Epoch: [1][400/1730] Elapsed 2m 19s (remain 7m 41s) Loss: 1.0643 Grad: 33782.7734  LR: 0.00000025  


Train:  24%|██▍       | 421/1730 [02:25<05:38,  3.87train_batch/s]

Epoch: [1][420/1730] Elapsed 2m 25s (remain 7m 32s) Loss: 1.0510 Grad: 36123.5664  LR: 0.00000003  


Train:  25%|██▌       | 441/1730 [02:32<06:27,  3.32train_batch/s]

Epoch: [1][440/1730] Elapsed 2m 32s (remain 7m 24s) Loss: 1.0396 Grad: 27385.6152  LR: 0.00000002  


Train:  27%|██▋       | 461/1730 [02:39<07:47,  2.71train_batch/s]

Epoch: [1][460/1730] Elapsed 2m 39s (remain 7m 18s) Loss: 1.0320 Grad: 34757.1094  LR: 0.00000022  


Train:  28%|██▊       | 481/1730 [02:45<07:19,  2.84train_batch/s]

Epoch: [1][480/1730] Elapsed 2m 45s (remain 7m 10s) Loss: 1.0227 Grad: 41709.1328  LR: 0.00000063  


Train:  29%|██▉       | 501/1730 [02:52<07:18,  2.80train_batch/s]

Epoch: [1][500/1730] Elapsed 2m 52s (remain 7m 3s) Loss: 1.0175 Grad: 16217.0303  LR: 0.00000123  


Train:  30%|███       | 522/1730 [02:59<05:07,  3.93train_batch/s]

Epoch: [1][520/1730] Elapsed 2m 59s (remain 6m 56s) Loss: 1.0130 Grad: 51034.2148  LR: 0.00000202  


Train:  31%|███▏      | 541/1730 [03:05<07:46,  2.55train_batch/s]

Epoch: [1][540/1730] Elapsed 3m 5s (remain 6m 48s) Loss: 1.0075 Grad: 29953.5801  LR: 0.00000298  


Train:  32%|███▏      | 561/1730 [03:11<05:53,  3.30train_batch/s]

Epoch: [1][560/1730] Elapsed 3m 11s (remain 6m 39s) Loss: 1.0035 Grad: 76420.1016  LR: 0.00000409  


Train:  34%|███▎      | 581/1730 [03:18<07:21,  2.60train_batch/s]

Epoch: [1][580/1730] Elapsed 3m 18s (remain 6m 32s) Loss: 0.9989 Grad: 51627.9258  LR: 0.00000532  


Train:  35%|███▍      | 601/1730 [03:24<06:14,  3.02train_batch/s]

Epoch: [1][600/1730] Elapsed 3m 24s (remain 6m 24s) Loss: 0.9968 Grad: 37790.1133  LR: 0.00000665  


Train:  36%|███▌      | 621/1730 [03:31<05:47,  3.19train_batch/s]

Epoch: [1][620/1730] Elapsed 3m 31s (remain 6m 18s) Loss: 0.9945 Grad: 58811.0508  LR: 0.00000805  


Train:  37%|███▋      | 641/1730 [03:38<06:12,  2.92train_batch/s]

Epoch: [1][640/1730] Elapsed 3m 38s (remain 6m 11s) Loss: 0.9907 Grad: 55918.0703  LR: 0.00000949  


Train:  38%|███▊      | 662/1730 [03:46<05:47,  3.08train_batch/s]

Epoch: [1][660/1730] Elapsed 3m 46s (remain 6m 5s) Loss: 0.9874 Grad: 27732.3320  LR: 0.00001094  


Train:  39%|███▉      | 681/1730 [03:53<06:59,  2.50train_batch/s]

Epoch: [1][680/1730] Elapsed 3m 52s (remain 5m 58s) Loss: 0.9838 Grad: 34859.1797  LR: 0.00001238  


Train:  41%|████      | 701/1730 [03:59<05:38,  3.04train_batch/s]

Epoch: [1][700/1730] Elapsed 3m 59s (remain 5m 51s) Loss: 0.9838 Grad: 25793.2148  LR: 0.00001376  


Train:  42%|████▏     | 721/1730 [04:06<05:22,  3.13train_batch/s]

Epoch: [1][720/1730] Elapsed 4m 6s (remain 5m 44s) Loss: 0.9843 Grad: 31920.6504  LR: 0.00001506  


Train:  43%|████▎     | 742/1730 [04:14<05:11,  3.17train_batch/s]

Epoch: [1][740/1730] Elapsed 4m 13s (remain 5m 38s) Loss: 0.9843 Grad: 31662.9180  LR: 0.00001626  


Train:  44%|████▍     | 761/1730 [04:20<05:25,  2.97train_batch/s]

Epoch: [1][760/1730] Elapsed 4m 20s (remain 5m 31s) Loss: 0.9835 Grad: 31518.5371  LR: 0.00001732  


Train:  45%|████▌     | 781/1730 [04:26<05:35,  2.83train_batch/s]

Epoch: [1][780/1730] Elapsed 4m 26s (remain 5m 23s) Loss: 0.9811 Grad: 66143.0000  LR: 0.00001823  


Train:  46%|████▋     | 801/1730 [04:32<05:18,  2.92train_batch/s]

Epoch: [1][800/1730] Elapsed 4m 32s (remain 5m 16s) Loss: 0.9790 Grad: 86624.9453  LR: 0.00001897  


Train:  47%|████▋     | 821/1730 [04:40<06:07,  2.47train_batch/s]

Epoch: [1][820/1730] Elapsed 4m 40s (remain 5m 10s) Loss: 0.9800 Grad: 28955.1641  LR: 0.00001952  


Train:  49%|████▊     | 841/1730 [04:46<05:21,  2.76train_batch/s]

Epoch: [1][840/1730] Elapsed 4m 46s (remain 5m 2s) Loss: 0.9777 Grad: 64950.4492  LR: 0.00001986  


Train:  50%|████▉     | 861/1730 [04:53<05:16,  2.75train_batch/s]

Epoch: [1][860/1730] Elapsed 4m 53s (remain 4m 55s) Loss: 0.9834 Grad: 32694.3438  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [05:00<05:43,  2.47train_batch/s]

Epoch: [1][880/1730] Elapsed 5m 0s (remain 4m 49s) Loss: 0.9828 Grad: 36126.0820  LR: 0.00001992  


Train:  52%|█████▏    | 901/1730 [05:08<06:00,  2.30train_batch/s]

Epoch: [1][900/1730] Elapsed 5m 8s (remain 4m 43s) Loss: 0.9853 Grad: 74277.0625  LR: 0.00001964  


Train:  53%|█████▎    | 921/1730 [05:15<04:18,  3.13train_batch/s]

Epoch: [1][920/1730] Elapsed 5m 15s (remain 4m 36s) Loss: 0.9809 Grad: 40011.7656  LR: 0.00001915  


Train:  54%|█████▍    | 941/1730 [05:21<04:09,  3.17train_batch/s]

Epoch: [1][940/1730] Elapsed 5m 21s (remain 4m 29s) Loss: 0.9834 Grad: 41124.8281  LR: 0.00001847  


Train:  56%|█████▌    | 961/1730 [05:28<03:52,  3.30train_batch/s]

Epoch: [1][960/1730] Elapsed 5m 28s (remain 4m 23s) Loss: 0.9816 Grad: 41853.0898  LR: 0.00001761  


Train:  57%|█████▋    | 981/1730 [05:36<04:24,  2.83train_batch/s]

Epoch: [1][980/1730] Elapsed 5m 36s (remain 4m 17s) Loss: 0.9807 Grad: 22565.3633  LR: 0.00001659  


Train:  58%|█████▊    | 1001/1730 [05:42<03:50,  3.17train_batch/s]

Epoch: [1][1000/1730] Elapsed 5m 42s (remain 4m 9s) Loss: 0.9783 Grad: 25132.7383  LR: 0.00001543  


Train:  59%|█████▉    | 1021/1730 [05:49<03:40,  3.21train_batch/s]

Epoch: [1][1020/1730] Elapsed 5m 49s (remain 4m 2s) Loss: 0.9774 Grad: 35234.0117  LR: 0.00001416  


Train:  60%|██████    | 1041/1730 [05:55<03:15,  3.53train_batch/s]

Epoch: [1][1040/1730] Elapsed 5m 55s (remain 3m 55s) Loss: 0.9772 Grad: 14336.2959  LR: 0.00001280  


Train:  61%|██████▏   | 1061/1730 [06:01<04:20,  2.57train_batch/s]

Epoch: [1][1060/1730] Elapsed 6m 1s (remain 3m 48s) Loss: 0.9746 Grad: 37248.8477  LR: 0.00001138  


Train:  62%|██████▏   | 1081/1730 [06:08<03:48,  2.85train_batch/s]

Epoch: [1][1080/1730] Elapsed 6m 8s (remain 3m 41s) Loss: 0.9717 Grad: 25266.9434  LR: 0.00000993  


Train:  64%|██████▎   | 1101/1730 [06:16<03:51,  2.72train_batch/s]

Epoch: [1][1100/1730] Elapsed 6m 16s (remain 3m 34s) Loss: 0.9704 Grad: 79868.5625  LR: 0.00000848  


Train:  65%|██████▍   | 1122/1730 [06:22<02:37,  3.86train_batch/s]

Epoch: [1][1120/1730] Elapsed 6m 22s (remain 3m 27s) Loss: 0.9670 Grad: 21136.3848  LR: 0.00000706  


Train:  66%|██████▌   | 1141/1730 [06:29<03:40,  2.68train_batch/s]

Epoch: [1][1140/1730] Elapsed 6m 29s (remain 3m 20s) Loss: 0.9657 Grad: 72829.8984  LR: 0.00000571  


Train:  67%|██████▋   | 1161/1730 [06:35<03:16,  2.90train_batch/s]

Epoch: [1][1160/1730] Elapsed 6m 35s (remain 3m 13s) Loss: 0.9621 Grad: 52344.7734  LR: 0.00000444  


Train:  68%|██████▊   | 1181/1730 [06:43<03:48,  2.40train_batch/s]

Epoch: [1][1180/1730] Elapsed 6m 43s (remain 3m 7s) Loss: 0.9597 Grad: 27304.7578  LR: 0.00000330  


Train:  69%|██████▉   | 1201/1730 [06:49<02:12,  3.99train_batch/s]

Epoch: [1][1200/1730] Elapsed 6m 49s (remain 3m 0s) Loss: 0.9559 Grad: 16612.5273  LR: 0.00000229  


Train:  71%|███████   | 1221/1730 [06:55<03:01,  2.80train_batch/s]

Epoch: [1][1220/1730] Elapsed 6m 55s (remain 2m 53s) Loss: 0.9538 Grad: 18859.5137  LR: 0.00000145  


Train:  72%|███████▏  | 1241/1730 [07:02<03:06,  2.63train_batch/s]

Epoch: [1][1240/1730] Elapsed 7m 2s (remain 2m 46s) Loss: 0.9512 Grad: 23490.5566  LR: 0.00000079  


Train:  73%|███████▎  | 1261/1730 [07:09<02:44,  2.85train_batch/s]

Epoch: [1][1260/1730] Elapsed 7m 9s (remain 2m 39s) Loss: 0.9481 Grad: 21123.8496  LR: 0.00000032  


Train:  74%|███████▍  | 1281/1730 [07:17<03:17,  2.27train_batch/s]

Epoch: [1][1280/1730] Elapsed 7m 17s (remain 2m 33s) Loss: 0.9463 Grad: 22470.2461  LR: 0.00000006  


Train:  75%|███████▌  | 1301/1730 [07:24<02:49,  2.53train_batch/s]

Epoch: [1][1300/1730] Elapsed 7m 24s (remain 2m 26s) Loss: 0.9430 Grad: 10187.6025  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [07:31<02:31,  2.70train_batch/s]

Epoch: [1][1320/1730] Elapsed 7m 31s (remain 2m 19s) Loss: 0.9415 Grad: 28034.5254  LR: 0.00000016  


Train:  78%|███████▊  | 1341/1730 [07:39<02:34,  2.52train_batch/s]

Epoch: [1][1340/1730] Elapsed 7m 39s (remain 2m 13s) Loss: 0.9394 Grad: 43410.8945  LR: 0.00000053  


Train:  79%|███████▊  | 1361/1730 [07:45<01:52,  3.28train_batch/s]

Epoch: [1][1360/1730] Elapsed 7m 45s (remain 2m 6s) Loss: 0.9365 Grad: 16702.4590  LR: 0.00000110  


Train:  80%|███████▉  | 1381/1730 [07:52<01:52,  3.11train_batch/s]

Epoch: [1][1380/1730] Elapsed 7m 52s (remain 1m 59s) Loss: 0.9361 Grad: 31922.1387  LR: 0.00000185  


Train:  81%|████████  | 1401/1730 [07:58<01:38,  3.33train_batch/s]

Epoch: [1][1400/1730] Elapsed 7m 58s (remain 1m 52s) Loss: 0.9338 Grad: 38498.1992  LR: 0.00000278  


Train:  82%|████████▏ | 1421/1730 [08:05<01:54,  2.70train_batch/s]

Epoch: [1][1420/1730] Elapsed 8m 5s (remain 1m 45s) Loss: 0.9314 Grad: 44147.2227  LR: 0.00000385  


Train:  83%|████████▎ | 1442/1730 [08:12<01:29,  3.22train_batch/s]

Epoch: [1][1440/1730] Elapsed 8m 12s (remain 1m 38s) Loss: 0.9300 Grad: 41918.2070  LR: 0.00000506  


Train:  84%|████████▍ | 1461/1730 [08:17<01:20,  3.35train_batch/s]

Epoch: [1][1460/1730] Elapsed 8m 17s (remain 1m 31s) Loss: 0.9271 Grad: 21400.2852  LR: 0.00000638  


Train:  86%|████████▌ | 1481/1730 [08:24<01:36,  2.59train_batch/s]

Epoch: [1][1480/1730] Elapsed 8m 24s (remain 1m 24s) Loss: 0.9259 Grad: 50473.5352  LR: 0.00000776  


Train:  87%|████████▋ | 1501/1730 [08:31<01:30,  2.52train_batch/s]

Epoch: [1][1500/1730] Elapsed 8m 31s (remain 1m 17s) Loss: 0.9248 Grad: 103296.2812  LR: 0.00000920  


Train:  88%|████████▊ | 1521/1730 [08:38<01:22,  2.55train_batch/s]

Epoch: [1][1520/1730] Elapsed 8m 38s (remain 1m 11s) Loss: 0.9232 Grad: 29691.3848  LR: 0.00001065  


Train:  89%|████████▉ | 1541/1730 [08:45<01:07,  2.80train_batch/s]

Epoch: [1][1540/1730] Elapsed 8m 45s (remain 1m 4s) Loss: 0.9230 Grad: 28539.0293  LR: 0.00001209  


Train:  90%|█████████ | 1561/1730 [08:52<01:05,  2.59train_batch/s]

Epoch: [1][1560/1730] Elapsed 8m 52s (remain 0m 57s) Loss: 0.9236 Grad: 23190.9824  LR: 0.00001349  


Train:  91%|█████████▏| 1581/1730 [08:59<00:55,  2.71train_batch/s]

Epoch: [1][1580/1730] Elapsed 8m 59s (remain 0m 50s) Loss: 0.9223 Grad: 40890.8867  LR: 0.00001481  


Train:  93%|█████████▎| 1601/1730 [09:05<00:39,  3.24train_batch/s]

Epoch: [1][1600/1730] Elapsed 9m 5s (remain 0m 43s) Loss: 0.9232 Grad: 81161.5547  LR: 0.00001603  


Train:  94%|█████████▎| 1621/1730 [09:12<00:35,  3.10train_batch/s]

Epoch: [1][1620/1730] Elapsed 9m 12s (remain 0m 37s) Loss: 0.9239 Grad: 51805.5898  LR: 0.00001712  


Train:  95%|█████████▍| 1641/1730 [09:19<00:32,  2.78train_batch/s]

Epoch: [1][1640/1730] Elapsed 9m 19s (remain 0m 30s) Loss: 0.9233 Grad: 21459.0137  LR: 0.00001806  


Train:  96%|█████████▌| 1661/1730 [09:27<00:25,  2.72train_batch/s]

Epoch: [1][1660/1730] Elapsed 9m 27s (remain 0m 23s) Loss: 0.9234 Grad: 42487.8516  LR: 0.00001884  


Train:  97%|█████████▋| 1681/1730 [09:34<00:14,  3.33train_batch/s]

Epoch: [1][1680/1730] Elapsed 9m 34s (remain 0m 16s) Loss: 0.9219 Grad: 39332.2344  LR: 0.00001942  


Train:  98%|█████████▊| 1701/1730 [09:40<00:09,  2.98train_batch/s]

Epoch: [1][1700/1730] Elapsed 9m 40s (remain 0m 9s) Loss: 0.9230 Grad: 34562.4375  LR: 0.00001981  


Train:  99%|█████████▉| 1721/1730 [09:48<00:02,  3.41train_batch/s]

Epoch: [1][1720/1730] Elapsed 9m 48s (remain 0m 3s) Loss: 0.9227 Grad: 14704.3174  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:50<00:00,  2.93train_batch/s]


Epoch: [1][1729/1730] Elapsed 9m 50s (remain 0m 0s) Loss: 0.9224 Grad: 41386.6289  LR: 0.00002000  


Validation:   0%|          | 3/866 [00:00<01:33,  9.26valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 3m 28s) Loss: 0.4801 


Validation:   3%|▎         | 23/866 [00:01<00:52, 16.20valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 3s) Loss: 0.9584 


Validation:   5%|▍         | 43/866 [00:02<00:48, 17.00valid_batch/s]

EVAL: [40/866] Elapsed 0m 2s (remain 0m 57s) Loss: 0.8622 


Validation:   7%|▋         | 63/866 [00:04<00:44, 18.20valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 0m 52s) Loss: 0.8855 


Validation:  10%|▉         | 84/866 [00:05<00:41, 19.05valid_batch/s]

EVAL: [80/866] Elapsed 0m 5s (remain 0m 50s) Loss: 0.8983 


Validation:  12%|█▏        | 103/866 [00:06<00:51, 14.72valid_batch/s]

EVAL: [100/866] Elapsed 0m 6s (remain 0m 49s) Loss: 0.9447 


Validation:  14%|█▍        | 122/866 [00:08<01:03, 11.65valid_batch/s]

EVAL: [120/866] Elapsed 0m 7s (remain 0m 48s) Loss: 0.9350 


Validation:  16%|█▋        | 142/866 [00:09<00:51, 13.99valid_batch/s]

EVAL: [140/866] Elapsed 0m 9s (remain 0m 48s) Loss: 0.9144 


Validation:  19%|█▉        | 163/866 [00:10<00:49, 14.10valid_batch/s]

EVAL: [160/866] Elapsed 0m 10s (remain 0m 47s) Loss: 0.9138 


Validation:  21%|██        | 183/866 [00:12<00:46, 14.68valid_batch/s]

EVAL: [180/866] Elapsed 0m 12s (remain 0m 46s) Loss: 0.9313 


Validation:  23%|██▎       | 202/866 [00:13<00:48, 13.80valid_batch/s]

EVAL: [200/866] Elapsed 0m 13s (remain 0m 45s) Loss: 0.9397 


Validation:  26%|██▌       | 222/866 [00:15<00:41, 15.60valid_batch/s]

EVAL: [220/866] Elapsed 0m 15s (remain 0m 43s) Loss: 0.9456 


Validation:  28%|██▊       | 243/866 [00:16<00:38, 16.21valid_batch/s]

EVAL: [240/866] Elapsed 0m 16s (remain 0m 42s) Loss: 0.9391 


Validation:  30%|███       | 262/866 [00:17<00:50, 12.07valid_batch/s]

EVAL: [260/866] Elapsed 0m 17s (remain 0m 41s) Loss: 0.9434 


Validation:  33%|███▎      | 282/866 [00:19<00:43, 13.58valid_batch/s]

EVAL: [280/866] Elapsed 0m 19s (remain 0m 40s) Loss: 0.9450 


Validation:  35%|███▍      | 303/866 [00:20<00:44, 12.78valid_batch/s]

EVAL: [300/866] Elapsed 0m 20s (remain 0m 39s) Loss: 0.9521 


Validation:  37%|███▋      | 323/866 [00:22<00:38, 14.02valid_batch/s]

EVAL: [320/866] Elapsed 0m 22s (remain 0m 38s) Loss: 0.9493 


Validation:  40%|███▉      | 343/866 [00:24<00:33, 15.49valid_batch/s]

EVAL: [340/866] Elapsed 0m 23s (remain 0m 36s) Loss: 0.9521 


Validation:  42%|████▏     | 363/866 [00:25<00:44, 11.28valid_batch/s]

EVAL: [360/866] Elapsed 0m 25s (remain 0m 35s) Loss: 0.9640 


Validation:  44%|████▍     | 384/866 [00:27<00:29, 16.14valid_batch/s]

EVAL: [380/866] Elapsed 0m 27s (remain 0m 34s) Loss: 0.9718 


Validation:  47%|████▋     | 403/866 [00:28<00:29, 15.89valid_batch/s]

EVAL: [400/866] Elapsed 0m 28s (remain 0m 32s) Loss: 0.9650 


Validation:  49%|████▊     | 421/866 [00:29<00:25, 17.23valid_batch/s]

EVAL: [420/866] Elapsed 0m 29s (remain 0m 31s) Loss: 0.9595 


Validation:  51%|█████     | 442/866 [00:31<00:30, 13.99valid_batch/s]

EVAL: [440/866] Elapsed 0m 31s (remain 0m 30s) Loss: 0.9617 


Validation:  54%|█████▎    | 464/866 [00:32<00:27, 14.65valid_batch/s]

EVAL: [460/866] Elapsed 0m 32s (remain 0m 28s) Loss: 0.9597 


Validation:  56%|█████▌    | 483/866 [00:34<00:26, 14.25valid_batch/s]

EVAL: [480/866] Elapsed 0m 33s (remain 0m 27s) Loss: 0.9593 


Validation:  58%|█████▊    | 502/866 [00:35<00:30, 11.90valid_batch/s]

EVAL: [500/866] Elapsed 0m 35s (remain 0m 25s) Loss: 0.9598 


Validation:  60%|██████    | 523/866 [00:37<00:24, 13.82valid_batch/s]

EVAL: [520/866] Elapsed 0m 36s (remain 0m 24s) Loss: 0.9602 


Validation:  63%|██████▎   | 543/866 [00:38<00:23, 13.70valid_batch/s]

EVAL: [540/866] Elapsed 0m 38s (remain 0m 23s) Loss: 0.9579 


Validation:  65%|██████▌   | 563/866 [00:39<00:21, 14.21valid_batch/s]

EVAL: [560/866] Elapsed 0m 39s (remain 0m 21s) Loss: 0.9535 


Validation:  67%|██████▋   | 583/866 [00:41<00:22, 12.49valid_batch/s]

EVAL: [580/866] Elapsed 0m 41s (remain 0m 20s) Loss: 0.9629 


Validation:  70%|██████▉   | 604/866 [00:42<00:16, 15.72valid_batch/s]

EVAL: [600/866] Elapsed 0m 42s (remain 0m 18s) Loss: 0.9656 


Validation:  72%|███████▏  | 623/866 [00:44<00:18, 12.93valid_batch/s]

EVAL: [620/866] Elapsed 0m 44s (remain 0m 17s) Loss: 0.9588 


Validation:  74%|███████▍  | 643/866 [00:45<00:16, 13.47valid_batch/s]

EVAL: [640/866] Elapsed 0m 45s (remain 0m 16s) Loss: 0.9613 


Validation:  77%|███████▋  | 663/866 [00:47<00:14, 13.59valid_batch/s]

EVAL: [660/866] Elapsed 0m 47s (remain 0m 14s) Loss: 0.9629 


Validation:  79%|███████▉  | 683/866 [00:48<00:13, 13.08valid_batch/s]

EVAL: [680/866] Elapsed 0m 48s (remain 0m 13s) Loss: 0.9605 


Validation:  81%|████████  | 703/866 [00:50<00:10, 15.32valid_batch/s]

EVAL: [700/866] Elapsed 0m 50s (remain 0m 11s) Loss: 0.9647 


Validation:  83%|████████▎ | 722/866 [00:51<00:09, 14.58valid_batch/s]

EVAL: [720/866] Elapsed 0m 51s (remain 0m 10s) Loss: 0.9594 


Validation:  86%|████████▌ | 744/866 [00:53<00:07, 16.55valid_batch/s]

EVAL: [740/866] Elapsed 0m 52s (remain 0m 8s) Loss: 0.9621 


Validation:  88%|████████▊ | 763/866 [00:54<00:07, 12.99valid_batch/s]

EVAL: [760/866] Elapsed 0m 54s (remain 0m 7s) Loss: 0.9603 


Validation:  90%|█████████ | 782/866 [00:55<00:05, 14.58valid_batch/s]

EVAL: [780/866] Elapsed 0m 55s (remain 0m 6s) Loss: 0.9630 


Validation:  93%|█████████▎| 804/866 [00:57<00:03, 16.73valid_batch/s]

EVAL: [800/866] Elapsed 0m 57s (remain 0m 4s) Loss: 0.9628 


Validation:  95%|█████████▍| 822/866 [00:58<00:03, 12.52valid_batch/s]

EVAL: [820/866] Elapsed 0m 58s (remain 0m 3s) Loss: 0.9626 


Validation:  97%|█████████▋| 842/866 [01:00<00:01, 13.18valid_batch/s]

EVAL: [840/866] Elapsed 0m 59s (remain 0m 1s) Loss: 0.9617 


Validation: 100%|█████████▉| 862/866 [01:01<00:00, 12.83valid_batch/s]

EVAL: [860/866] Elapsed 1m 1s (remain 0m 0s) Loss: 0.9623 


Validation: 100%|██████████| 866/866 [01:02<00:00, 13.95valid_batch/s]
Epoch 1 - avg_train_loss: 0.9224  avg_val_loss: 0.9677  time: 653s
Epoch 1 - Score: 0.7546
Epoch 1 - Save Best Score: 0.7546 Model


EVAL: [865/866] Elapsed 1m 2s (remain 0m 0s) Loss: 0.9677 


Train:   0%|          | 1/1730 [00:00<12:20,  2.34train_batch/s]

Epoch: [2][0/1730] Elapsed 0m 0s (remain 12m 20s) Loss: 1.1052 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:07<10:10,  2.80train_batch/s]

Epoch: [2][20/1730] Elapsed 0m 7s (remain 9m 53s) Loss: 0.8537 Grad: 443973.9688  LR: 0.00001986  


Train:   2%|▏         | 41/1730 [00:14<08:47,  3.20train_batch/s]

Epoch: [2][40/1730] Elapsed 0m 14s (remain 9m 46s) Loss: 0.9037 Grad: 344446.2188  LR: 0.00001952  


Train:   4%|▎         | 61/1730 [00:21<11:06,  2.50train_batch/s]

Epoch: [2][60/1730] Elapsed 0m 21s (remain 9m 46s) Loss: 0.8811 Grad: 461578.7500  LR: 0.00001897  


Train:   5%|▍         | 81/1730 [00:29<11:58,  2.29train_batch/s]

Epoch: [2][80/1730] Elapsed 0m 29s (remain 9m 54s) Loss: 0.8859 Grad: 403362.2188  LR: 0.00001823  


Train:   6%|▌         | 101/1730 [00:36<09:16,  2.93train_batch/s]

Epoch: [2][100/1730] Elapsed 0m 36s (remain 9m 43s) Loss: 0.8496 Grad: 619489.7500  LR: 0.00001732  


Train:   7%|▋         | 121/1730 [00:43<09:18,  2.88train_batch/s]

Epoch: [2][120/1730] Elapsed 0m 43s (remain 9m 33s) Loss: 0.8322 Grad: 419664.6250  LR: 0.00001626  


Train:   8%|▊         | 141/1730 [00:49<09:23,  2.82train_batch/s]

Epoch: [2][140/1730] Elapsed 0m 49s (remain 9m 20s) Loss: 0.8203 Grad: 187668.2812  LR: 0.00001506  


Train:   9%|▉         | 161/1730 [00:56<08:32,  3.06train_batch/s]

Epoch: [2][160/1730] Elapsed 0m 56s (remain 9m 8s) Loss: 0.8170 Grad: 245275.6562  LR: 0.00001376  


Train:  10%|█         | 181/1730 [01:02<09:32,  2.71train_batch/s]

Epoch: [2][180/1730] Elapsed 1m 2s (remain 8m 58s) Loss: 0.8011 Grad: 132274.4531  LR: 0.00001238  


Train:  12%|█▏        | 201/1730 [01:10<11:13,  2.27train_batch/s]

Epoch: [2][200/1730] Elapsed 1m 10s (remain 8m 54s) Loss: 0.7947 Grad: 158354.6719  LR: 0.00001094  


Train:  13%|█▎        | 222/1730 [01:17<08:50,  2.84train_batch/s]

Epoch: [2][220/1730] Elapsed 1m 16s (remain 8m 44s) Loss: 0.7908 Grad: 170810.2812  LR: 0.00000949  


Train:  14%|█▍        | 241/1730 [01:22<08:39,  2.86train_batch/s]

Epoch: [2][240/1730] Elapsed 1m 22s (remain 8m 31s) Loss: 0.7794 Grad: 441415.8750  LR: 0.00000805  


Train:  15%|█▌        | 261/1730 [01:28<06:46,  3.61train_batch/s]

Epoch: [2][260/1730] Elapsed 1m 28s (remain 8m 20s) Loss: 0.7770 Grad: 408634.7500  LR: 0.00000665  


Train:  16%|█▋        | 282/1730 [01:36<06:50,  3.53train_batch/s]

Epoch: [2][280/1730] Elapsed 1m 36s (remain 8m 15s) Loss: 0.7782 Grad: 231937.2656  LR: 0.00000532  


Train:  17%|█▋        | 301/1730 [01:42<08:27,  2.81train_batch/s]

Epoch: [2][300/1730] Elapsed 1m 42s (remain 8m 5s) Loss: 0.7818 Grad: 350626.4062  LR: 0.00000409  


Train:  19%|█▊        | 321/1730 [01:49<09:23,  2.50train_batch/s]

Epoch: [2][320/1730] Elapsed 1m 49s (remain 7m 59s) Loss: 0.7804 Grad: 214660.2969  LR: 0.00000298  


Train:  20%|█▉        | 341/1730 [01:55<06:23,  3.62train_batch/s]

Epoch: [2][340/1730] Elapsed 1m 55s (remain 7m 48s) Loss: 0.7690 Grad: nan  LR: 0.00000202  


Train:  21%|██        | 361/1730 [02:01<06:33,  3.48train_batch/s]

Epoch: [2][360/1730] Elapsed 2m 1s (remain 7m 40s) Loss: 0.7682 Grad: 115087.3516  LR: 0.00000123  


Train:  22%|██▏       | 381/1730 [02:08<07:11,  3.13train_batch/s]

Epoch: [2][380/1730] Elapsed 2m 8s (remain 7m 35s) Loss: 0.7666 Grad: 121994.7188  LR: 0.00000063  


Train:  23%|██▎       | 401/1730 [02:15<07:58,  2.78train_batch/s]

Epoch: [2][400/1730] Elapsed 2m 15s (remain 7m 30s) Loss: 0.7628 Grad: 93042.4922  LR: 0.00000022  


Train:  24%|██▍       | 421/1730 [02:22<07:57,  2.74train_batch/s]

Epoch: [2][420/1730] Elapsed 2m 22s (remain 7m 22s) Loss: 0.7587 Grad: 180142.8281  LR: 0.00000002  


Train:  25%|██▌       | 441/1730 [02:29<08:03,  2.67train_batch/s]

Epoch: [2][440/1730] Elapsed 2m 29s (remain 7m 16s) Loss: 0.7571 Grad: 121183.1016  LR: 0.00000003  


Train:  27%|██▋       | 461/1730 [02:36<06:59,  3.02train_batch/s]

Epoch: [2][460/1730] Elapsed 2m 36s (remain 7m 9s) Loss: 0.7528 Grad: 87931.2969  LR: 0.00000025  


Train:  28%|██▊       | 481/1730 [02:43<07:31,  2.77train_batch/s]

Epoch: [2][480/1730] Elapsed 2m 43s (remain 7m 3s) Loss: 0.7525 Grad: 258676.6250  LR: 0.00000068  


Train:  29%|██▉       | 501/1730 [02:49<07:52,  2.60train_batch/s]

Epoch: [2][500/1730] Elapsed 2m 49s (remain 6m 56s) Loss: 0.7468 Grad: 61633.2109  LR: 0.00000130  


Train:  30%|███       | 522/1730 [02:57<06:11,  3.25train_batch/s]

Epoch: [2][520/1730] Elapsed 2m 56s (remain 6m 50s) Loss: 0.7498 Grad: 152928.6250  LR: 0.00000211  


Train:  31%|███▏      | 541/1730 [03:03<05:20,  3.71train_batch/s]

Epoch: [2][540/1730] Elapsed 3m 3s (remain 6m 43s) Loss: 0.7472 Grad: 94794.6094  LR: 0.00000308  


Train:  32%|███▏      | 561/1730 [03:10<06:24,  3.04train_batch/s]

Epoch: [2][560/1730] Elapsed 3m 10s (remain 6m 37s) Loss: 0.7476 Grad: 134264.5156  LR: 0.00000420  


Train:  34%|███▎      | 581/1730 [03:17<07:41,  2.49train_batch/s]

Epoch: [2][580/1730] Elapsed 3m 17s (remain 6m 31s) Loss: 0.7494 Grad: 175470.3594  LR: 0.00000545  


Train:  35%|███▍      | 601/1730 [03:23<07:19,  2.57train_batch/s]

Epoch: [2][600/1730] Elapsed 3m 23s (remain 6m 22s) Loss: 0.7473 Grad: 175464.5312  LR: 0.00000679  


Train:  36%|███▌      | 621/1730 [03:30<06:11,  2.99train_batch/s]

Epoch: [2][620/1730] Elapsed 3m 30s (remain 6m 15s) Loss: 0.7425 Grad: 90761.7344  LR: 0.00000819  


Train:  37%|███▋      | 641/1730 [03:36<06:14,  2.91train_batch/s]

Epoch: [2][640/1730] Elapsed 3m 36s (remain 6m 8s) Loss: 0.7428 Grad: 127518.6797  LR: 0.00000964  


Train:  38%|███▊      | 661/1730 [03:43<06:30,  2.74train_batch/s]

Epoch: [2][660/1730] Elapsed 3m 43s (remain 6m 1s) Loss: 0.7444 Grad: 209141.7344  LR: 0.00001109  


Train:  39%|███▉      | 682/1730 [03:50<06:00,  2.91train_batch/s]

Epoch: [2][680/1730] Elapsed 3m 50s (remain 5m 55s) Loss: 0.7409 Grad: 257058.1875  LR: 0.00001252  


Train:  41%|████      | 702/1730 [03:57<04:21,  3.93train_batch/s]

Epoch: [2][700/1730] Elapsed 3m 57s (remain 5m 48s) Loss: 0.7418 Grad: 64293.0547  LR: 0.00001389  


Train:  42%|████▏     | 722/1730 [04:04<04:53,  3.43train_batch/s]

Epoch: [2][720/1730] Elapsed 4m 4s (remain 5m 41s) Loss: 0.7398 Grad: 298119.9688  LR: 0.00001519  


Train:  43%|████▎     | 741/1730 [04:11<06:04,  2.71train_batch/s]

Epoch: [2][740/1730] Elapsed 4m 11s (remain 5m 35s) Loss: 0.7413 Grad: 76069.4297  LR: 0.00001637  


Train:  44%|████▍     | 761/1730 [04:18<05:37,  2.87train_batch/s]

Epoch: [2][760/1730] Elapsed 4m 18s (remain 5m 28s) Loss: 0.7414 Grad: 147394.8281  LR: 0.00001742  


Train:  45%|████▌     | 781/1730 [04:24<06:28,  2.44train_batch/s]

Epoch: [2][780/1730] Elapsed 4m 24s (remain 5m 21s) Loss: 0.7411 Grad: 191784.6250  LR: 0.00001831  


Train:  46%|████▋     | 801/1730 [04:32<04:47,  3.24train_batch/s]

Epoch: [2][800/1730] Elapsed 4m 32s (remain 5m 15s) Loss: 0.7428 Grad: 72900.9922  LR: 0.00001903  


Train:  47%|████▋     | 821/1730 [04:39<05:26,  2.79train_batch/s]

Epoch: [2][820/1730] Elapsed 4m 39s (remain 5m 9s) Loss: 0.7420 Grad: 139747.1875  LR: 0.00001956  


Train:  49%|████▊     | 841/1730 [04:45<04:25,  3.35train_batch/s]

Epoch: [2][840/1730] Elapsed 4m 45s (remain 5m 1s) Loss: 0.7420 Grad: 140626.1719  LR: 0.00001988  


Train:  50%|████▉     | 861/1730 [04:52<04:09,  3.48train_batch/s]

Epoch: [2][860/1730] Elapsed 4m 52s (remain 4m 54s) Loss: 0.7436 Grad: 111317.1406  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [04:58<04:20,  3.25train_batch/s]

Epoch: [2][880/1730] Elapsed 4m 58s (remain 4m 47s) Loss: 0.7426 Grad: 96575.8359  LR: 0.00001990  


Train:  52%|█████▏    | 901/1730 [05:05<04:49,  2.86train_batch/s]

Epoch: [2][900/1730] Elapsed 5m 5s (remain 4m 40s) Loss: 0.7439 Grad: 157407.1875  LR: 0.00001960  


Train:  53%|█████▎    | 921/1730 [05:11<03:57,  3.41train_batch/s]

Epoch: [2][920/1730] Elapsed 5m 11s (remain 4m 33s) Loss: 0.7449 Grad: 104606.3906  LR: 0.00001909  


Train:  54%|█████▍    | 941/1730 [05:18<04:54,  2.68train_batch/s]

Epoch: [2][940/1730] Elapsed 5m 18s (remain 4m 26s) Loss: 0.7451 Grad: 140641.0000  LR: 0.00001839  


Train:  56%|█████▌    | 961/1730 [05:26<05:53,  2.18train_batch/s]

Epoch: [2][960/1730] Elapsed 5m 26s (remain 4m 21s) Loss: 0.7433 Grad: 339078.5312  LR: 0.00001752  


Train:  57%|█████▋    | 981/1730 [05:32<03:58,  3.15train_batch/s]

Epoch: [2][980/1730] Elapsed 5m 32s (remain 4m 13s) Loss: 0.7437 Grad: 165064.9844  LR: 0.00001648  


Train:  58%|█████▊    | 1001/1730 [05:39<03:47,  3.21train_batch/s]

Epoch: [2][1000/1730] Elapsed 5m 39s (remain 4m 7s) Loss: 0.7439 Grad: 122874.4062  LR: 0.00001531  


Train:  59%|█████▉    | 1021/1730 [05:45<02:54,  4.07train_batch/s]

Epoch: [2][1020/1730] Elapsed 5m 45s (remain 4m 0s) Loss: 0.7419 Grad: 203515.7500  LR: 0.00001403  


Train:  60%|██████    | 1042/1730 [05:52<03:47,  3.02train_batch/s]

Epoch: [2][1040/1730] Elapsed 5m 52s (remain 3m 53s) Loss: 0.7421 Grad: 168716.7656  LR: 0.00001266  


Train:  61%|██████▏   | 1061/1730 [05:59<03:47,  2.94train_batch/s]

Epoch: [2][1060/1730] Elapsed 5m 59s (remain 3m 46s) Loss: 0.7417 Grad: 103129.8438  LR: 0.00001123  


Train:  62%|██████▏   | 1081/1730 [06:05<03:24,  3.17train_batch/s]

Epoch: [2][1080/1730] Elapsed 6m 5s (remain 3m 39s) Loss: 0.7430 Grad: 154628.4375  LR: 0.00000978  


Train:  64%|██████▎   | 1101/1730 [06:13<03:16,  3.20train_batch/s]

Epoch: [2][1100/1730] Elapsed 6m 13s (remain 3m 33s) Loss: 0.7419 Grad: 218563.2812  LR: 0.00000834  


Train:  65%|██████▍   | 1121/1730 [06:20<03:42,  2.73train_batch/s]

Epoch: [2][1120/1730] Elapsed 6m 20s (remain 3m 26s) Loss: 0.7412 Grad: 117022.2344  LR: 0.00000692  


Train:  66%|██████▌   | 1141/1730 [06:26<02:35,  3.79train_batch/s]

Epoch: [2][1140/1730] Elapsed 6m 26s (remain 3m 19s) Loss: 0.7387 Grad: 67620.6406  LR: 0.00000558  


Train:  67%|██████▋   | 1161/1730 [06:34<02:50,  3.34train_batch/s]

Epoch: [2][1160/1730] Elapsed 6m 34s (remain 3m 13s) Loss: 0.7407 Grad: 70243.9062  LR: 0.00000432  


Train:  68%|██████▊   | 1181/1730 [06:41<03:20,  2.74train_batch/s]

Epoch: [2][1180/1730] Elapsed 6m 41s (remain 3m 6s) Loss: 0.7400 Grad: 77787.0938  LR: 0.00000319  


Train:  69%|██████▉   | 1201/1730 [06:47<02:55,  3.02train_batch/s]

Epoch: [2][1200/1730] Elapsed 6m 47s (remain 2m 59s) Loss: 0.7398 Grad: 163637.5938  LR: 0.00000220  


Train:  71%|███████   | 1221/1730 [06:55<03:10,  2.67train_batch/s]

Epoch: [2][1220/1730] Elapsed 6m 54s (remain 2m 52s) Loss: 0.7407 Grad: 129281.0234  LR: 0.00000138  


Train:  72%|███████▏  | 1241/1730 [07:02<02:50,  2.87train_batch/s]

Epoch: [2][1240/1730] Elapsed 7m 2s (remain 2m 46s) Loss: 0.7401 Grad: 87447.6953  LR: 0.00000073  


Train:  73%|███████▎  | 1261/1730 [07:09<02:29,  3.15train_batch/s]

Epoch: [2][1260/1730] Elapsed 7m 9s (remain 2m 39s) Loss: 0.7382 Grad: 164132.2812  LR: 0.00000029  


Train:  74%|███████▍  | 1281/1730 [07:16<02:11,  3.41train_batch/s]

Epoch: [2][1280/1730] Elapsed 7m 16s (remain 2m 32s) Loss: 0.7375 Grad: 117002.5703  LR: 0.00000004  


Train:  75%|███████▌  | 1301/1730 [07:23<02:04,  3.44train_batch/s]

Epoch: [2][1300/1730] Elapsed 7m 23s (remain 2m 26s) Loss: 0.7374 Grad: 82929.3672  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [07:29<02:15,  3.03train_batch/s]

Epoch: [2][1320/1730] Elapsed 7m 29s (remain 2m 19s) Loss: 0.7372 Grad: 141737.4844  LR: 0.00000019  


Train:  78%|███████▊  | 1342/1730 [07:36<02:16,  2.84train_batch/s]

Epoch: [2][1340/1730] Elapsed 7m 36s (remain 2m 12s) Loss: 0.7361 Grad: 406375.2188  LR: 0.00000058  


Train:  79%|███████▊  | 1361/1730 [07:43<02:04,  2.96train_batch/s]

Epoch: [2][1360/1730] Elapsed 7m 43s (remain 2m 5s) Loss: 0.7365 Grad: 110134.6719  LR: 0.00000116  


Train:  80%|███████▉  | 1381/1730 [07:50<02:14,  2.60train_batch/s]

Epoch: [2][1380/1730] Elapsed 7m 50s (remain 1m 58s) Loss: 0.7354 Grad: 213572.2656  LR: 0.00000194  


Train:  81%|████████  | 1401/1730 [07:56<02:02,  2.69train_batch/s]

Epoch: [2][1400/1730] Elapsed 7m 56s (remain 1m 51s) Loss: 0.7337 Grad: 88794.0391  LR: 0.00000288  


Train:  82%|████████▏ | 1421/1730 [08:03<01:58,  2.60train_batch/s]

Epoch: [2][1420/1730] Elapsed 8m 3s (remain 1m 45s) Loss: 0.7335 Grad: 53078.7539  LR: 0.00000397  


Train:  83%|████████▎ | 1441/1730 [08:10<01:37,  2.96train_batch/s]

Epoch: [2][1440/1730] Elapsed 8m 10s (remain 1m 38s) Loss: 0.7318 Grad: 312312.8750  LR: 0.00000519  


Train:  84%|████████▍ | 1461/1730 [08:17<01:44,  2.57train_batch/s]

Epoch: [2][1460/1730] Elapsed 8m 17s (remain 1m 31s) Loss: 0.7317 Grad: 200849.1094  LR: 0.00000651  


Train:  86%|████████▌ | 1481/1730 [08:24<01:34,  2.64train_batch/s]

Epoch: [2][1480/1730] Elapsed 8m 24s (remain 1m 24s) Loss: 0.7302 Grad: 153231.8281  LR: 0.00000791  


Train:  87%|████████▋ | 1502/1730 [08:30<01:11,  3.20train_batch/s]

Epoch: [2][1500/1730] Elapsed 8m 30s (remain 1m 17s) Loss: 0.7292 Grad: 147603.5156  LR: 0.00000935  


Train:  88%|████████▊ | 1521/1730 [08:37<01:18,  2.67train_batch/s]

Epoch: [2][1520/1730] Elapsed 8m 37s (remain 1m 11s) Loss: 0.7294 Grad: 129602.4844  LR: 0.00001080  


Train:  89%|████████▉ | 1541/1730 [08:44<01:07,  2.82train_batch/s]

Epoch: [2][1540/1730] Elapsed 8m 44s (remain 1m 4s) Loss: 0.7282 Grad: 191987.8750  LR: 0.00001224  


Train:  90%|█████████ | 1561/1730 [08:50<00:52,  3.23train_batch/s]

Epoch: [2][1560/1730] Elapsed 8m 50s (remain 0m 57s) Loss: 0.7278 Grad: 249309.4062  LR: 0.00001362  


Train:  91%|█████████▏| 1581/1730 [08:57<00:52,  2.84train_batch/s]

Epoch: [2][1580/1730] Elapsed 8m 57s (remain 0m 50s) Loss: 0.7273 Grad: 96246.2031  LR: 0.00001494  


Train:  93%|█████████▎| 1601/1730 [09:04<00:50,  2.53train_batch/s]

Epoch: [2][1600/1730] Elapsed 9m 4s (remain 0m 43s) Loss: 0.7278 Grad: 233853.6562  LR: 0.00001615  


Train:  94%|█████████▎| 1621/1730 [09:12<00:35,  3.07train_batch/s]

Epoch: [2][1620/1730] Elapsed 9m 12s (remain 0m 37s) Loss: 0.7291 Grad: 220649.1250  LR: 0.00001722  


Train:  95%|█████████▍| 1641/1730 [09:19<00:30,  2.90train_batch/s]

Epoch: [2][1640/1730] Elapsed 9m 19s (remain 0m 30s) Loss: 0.7302 Grad: 161395.4844  LR: 0.00001815  


Train:  96%|█████████▌| 1661/1730 [09:27<00:26,  2.57train_batch/s]

Epoch: [2][1660/1730] Elapsed 9m 27s (remain 0m 23s) Loss: 0.7322 Grad: 275382.7188  LR: 0.00001890  


Train:  97%|█████████▋| 1681/1730 [09:33<00:15,  3.16train_batch/s]

Epoch: [2][1680/1730] Elapsed 9m 33s (remain 0m 16s) Loss: 0.7321 Grad: 242029.6719  LR: 0.00001947  


Train:  98%|█████████▊| 1701/1730 [09:39<00:07,  3.69train_batch/s]

Epoch: [2][1700/1730] Elapsed 9m 39s (remain 0m 9s) Loss: 0.7323 Grad: 212945.9062  LR: 0.00001984  


Train:  99%|█████████▉| 1721/1730 [09:46<00:02,  3.00train_batch/s]

Epoch: [2][1720/1730] Elapsed 9m 46s (remain 0m 3s) Loss: 0.7321 Grad: 131451.8906  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:49<00:00,  2.94train_batch/s]


Epoch: [2][1729/1730] Elapsed 9m 49s (remain 0m 0s) Loss: 0.7317 Grad: 311779.4688  LR: 0.00002000  


Validation:   0%|          | 3/866 [00:00<01:32,  9.29valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 3m 27s) Loss: 0.5141 


Validation:   3%|▎         | 23/866 [00:01<00:52, 16.21valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 3s) Loss: 0.7843 


Validation:   5%|▍         | 43/866 [00:02<00:48, 17.05valid_batch/s]

EVAL: [40/866] Elapsed 0m 2s (remain 0m 57s) Loss: 0.7276 


Validation:   7%|▋         | 63/866 [00:04<00:44, 18.24valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 0m 52s) Loss: 0.7870 


Validation:  10%|▉         | 84/866 [00:05<00:41, 18.97valid_batch/s]

EVAL: [80/866] Elapsed 0m 5s (remain 0m 50s) Loss: 0.7846 


Validation:  12%|█▏        | 103/866 [00:06<00:51, 14.72valid_batch/s]

EVAL: [100/866] Elapsed 0m 6s (remain 0m 49s) Loss: 0.7962 


Validation:  14%|█▍        | 122/866 [00:08<01:03, 11.65valid_batch/s]

EVAL: [120/866] Elapsed 0m 7s (remain 0m 48s) Loss: 0.8171 


Validation:  16%|█▋        | 142/866 [00:09<00:51, 14.01valid_batch/s]

EVAL: [140/866] Elapsed 0m 9s (remain 0m 48s) Loss: 0.8121 


Validation:  19%|█▉        | 163/866 [00:10<00:49, 14.10valid_batch/s]

EVAL: [160/866] Elapsed 0m 10s (remain 0m 47s) Loss: 0.8127 


Validation:  21%|██        | 183/866 [00:12<00:46, 14.66valid_batch/s]

EVAL: [180/866] Elapsed 0m 12s (remain 0m 46s) Loss: 0.8109 


Validation:  23%|██▎       | 202/866 [00:13<00:48, 13.81valid_batch/s]

EVAL: [200/866] Elapsed 0m 13s (remain 0m 45s) Loss: 0.8269 


Validation:  26%|██▌       | 222/866 [00:15<00:41, 15.60valid_batch/s]

EVAL: [220/866] Elapsed 0m 14s (remain 0m 43s) Loss: 0.8257 


Validation:  28%|██▊       | 243/866 [00:16<00:38, 16.22valid_batch/s]

EVAL: [240/866] Elapsed 0m 16s (remain 0m 42s) Loss: 0.8159 


Validation:  30%|███       | 262/866 [00:17<00:50, 12.07valid_batch/s]

EVAL: [260/866] Elapsed 0m 17s (remain 0m 41s) Loss: 0.8282 


Validation:  33%|███▎      | 282/866 [00:19<00:42, 13.61valid_batch/s]

EVAL: [280/866] Elapsed 0m 19s (remain 0m 40s) Loss: 0.8235 


Validation:  35%|███▍      | 303/866 [00:20<00:44, 12.79valid_batch/s]

EVAL: [300/866] Elapsed 0m 20s (remain 0m 39s) Loss: 0.8253 


Validation:  37%|███▋      | 323/866 [00:22<00:38, 14.02valid_batch/s]

EVAL: [320/866] Elapsed 0m 22s (remain 0m 38s) Loss: 0.8230 


Validation:  40%|███▉      | 343/866 [00:24<00:33, 15.49valid_batch/s]

EVAL: [340/866] Elapsed 0m 23s (remain 0m 36s) Loss: 0.8333 


Validation:  42%|████▏     | 363/866 [00:25<00:44, 11.31valid_batch/s]

EVAL: [360/866] Elapsed 0m 25s (remain 0m 35s) Loss: 0.8331 


Validation:  44%|████▍     | 384/866 [00:27<00:29, 16.17valid_batch/s]

EVAL: [380/866] Elapsed 0m 27s (remain 0m 34s) Loss: 0.8412 


Validation:  47%|████▋     | 403/866 [00:28<00:29, 15.87valid_batch/s]

EVAL: [400/866] Elapsed 0m 28s (remain 0m 32s) Loss: 0.8366 


Validation:  49%|████▊     | 421/866 [00:29<00:25, 17.22valid_batch/s]

EVAL: [420/866] Elapsed 0m 29s (remain 0m 31s) Loss: 0.8418 


Validation:  51%|█████     | 442/866 [00:31<00:30, 13.98valid_batch/s]

EVAL: [440/866] Elapsed 0m 31s (remain 0m 30s) Loss: 0.8427 


Validation:  54%|█████▎    | 464/866 [00:32<00:27, 14.67valid_batch/s]

EVAL: [460/866] Elapsed 0m 32s (remain 0m 28s) Loss: 0.8395 


Validation:  56%|█████▌    | 483/866 [00:34<00:26, 14.27valid_batch/s]

EVAL: [480/866] Elapsed 0m 33s (remain 0m 27s) Loss: 0.8340 


Validation:  58%|█████▊    | 502/866 [00:35<00:30, 11.89valid_batch/s]

EVAL: [500/866] Elapsed 0m 35s (remain 0m 25s) Loss: 0.8370 


Validation:  60%|██████    | 523/866 [00:37<00:24, 13.82valid_batch/s]

EVAL: [520/866] Elapsed 0m 36s (remain 0m 24s) Loss: 0.8352 


Validation:  63%|██████▎   | 543/866 [00:38<00:23, 13.71valid_batch/s]

EVAL: [540/866] Elapsed 0m 38s (remain 0m 23s) Loss: 0.8369 


Validation:  65%|██████▌   | 563/866 [00:39<00:21, 14.21valid_batch/s]

EVAL: [560/866] Elapsed 0m 39s (remain 0m 21s) Loss: 0.8289 


Validation:  67%|██████▋   | 583/866 [00:41<00:22, 12.51valid_batch/s]

EVAL: [580/866] Elapsed 0m 41s (remain 0m 20s) Loss: 0.8302 


Validation:  70%|██████▉   | 604/866 [00:42<00:16, 15.71valid_batch/s]

EVAL: [600/866] Elapsed 0m 42s (remain 0m 18s) Loss: 0.8370 


Validation:  72%|███████▏  | 623/866 [00:44<00:18, 12.94valid_batch/s]

EVAL: [620/866] Elapsed 0m 44s (remain 0m 17s) Loss: 0.8322 


Validation:  74%|███████▍  | 643/866 [00:45<00:16, 13.47valid_batch/s]

EVAL: [640/866] Elapsed 0m 45s (remain 0m 16s) Loss: 0.8347 


Validation:  77%|███████▋  | 663/866 [00:47<00:14, 13.59valid_batch/s]

EVAL: [660/866] Elapsed 0m 47s (remain 0m 14s) Loss: 0.8335 


Validation:  79%|███████▉  | 683/866 [00:48<00:13, 13.07valid_batch/s]

EVAL: [680/866] Elapsed 0m 48s (remain 0m 13s) Loss: 0.8316 


Validation:  81%|████████  | 703/866 [00:50<00:10, 15.36valid_batch/s]

EVAL: [700/866] Elapsed 0m 50s (remain 0m 11s) Loss: 0.8375 


Validation:  83%|████████▎ | 722/866 [00:51<00:09, 14.57valid_batch/s]

EVAL: [720/866] Elapsed 0m 51s (remain 0m 10s) Loss: 0.8346 


Validation:  86%|████████▌ | 744/866 [00:53<00:07, 16.56valid_batch/s]

EVAL: [740/866] Elapsed 0m 52s (remain 0m 8s) Loss: 0.8362 


Validation:  88%|████████▊ | 763/866 [00:54<00:07, 12.99valid_batch/s]

EVAL: [760/866] Elapsed 0m 54s (remain 0m 7s) Loss: 0.8362 


Validation:  90%|█████████ | 782/866 [00:55<00:05, 14.60valid_batch/s]

EVAL: [780/866] Elapsed 0m 55s (remain 0m 6s) Loss: 0.8343 


Validation:  93%|█████████▎| 804/866 [00:57<00:03, 16.28valid_batch/s]

EVAL: [800/866] Elapsed 0m 57s (remain 0m 4s) Loss: 0.8320 


Validation:  95%|█████████▍| 822/866 [00:58<00:03, 12.48valid_batch/s]

EVAL: [820/866] Elapsed 0m 58s (remain 0m 3s) Loss: 0.8289 


Validation:  97%|█████████▋| 842/866 [01:00<00:01, 13.19valid_batch/s]

EVAL: [840/866] Elapsed 0m 59s (remain 0m 1s) Loss: 0.8323 


Validation: 100%|█████████▉| 862/866 [01:01<00:00, 12.84valid_batch/s]

EVAL: [860/866] Elapsed 1m 1s (remain 0m 0s) Loss: 0.8344 


Validation: 100%|██████████| 866/866 [01:02<00:00, 13.96valid_batch/s]
Epoch 2 - avg_train_loss: 0.7317  avg_val_loss: 0.8369  time: 651s
Epoch 2 - Score: 0.7891
Epoch 2 - Save Best Score: 0.7891 Model


EVAL: [865/866] Elapsed 1m 2s (remain 0m 0s) Loss: 0.8369 


Score: 0.7891
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/1730 [00:00<14:52,  1.94train_batch/s]

Epoch: [1][0/1730] Elapsed 0m 0s (remain 14m 52s) Loss: 1.8462 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:07<09:37,  2.96train_batch/s]

Epoch: [1][20/1730] Elapsed 0m 7s (remain 9m 29s) Loss: 1.6326 Grad: 42152.9453  LR: 0.00001988  


Train:   2%|▏         | 41/1730 [00:13<09:23,  3.00train_batch/s]

Epoch: [1][40/1730] Elapsed 0m 13s (remain 9m 26s) Loss: 1.5133 Grad: 55627.6992  LR: 0.00001956  


Train:   4%|▎         | 61/1730 [00:20<08:42,  3.19train_batch/s]

Epoch: [1][60/1730] Elapsed 0m 19s (remain 9m 6s) Loss: 1.4514 Grad: 60762.0625  LR: 0.00001903  


Train:   5%|▍         | 81/1730 [00:27<10:00,  2.75train_batch/s]

Epoch: [1][80/1730] Elapsed 0m 27s (remain 9m 14s) Loss: 1.3758 Grad: 69135.9375  LR: 0.00001831  


Train:   6%|▌         | 101/1730 [00:34<08:08,  3.34train_batch/s]

Epoch: [1][100/1730] Elapsed 0m 34s (remain 9m 15s) Loss: 1.3229 Grad: 53483.4062  LR: 0.00001742  


Train:   7%|▋         | 121/1730 [00:40<08:49,  3.04train_batch/s]

Epoch: [1][120/1730] Elapsed 0m 40s (remain 9m 2s) Loss: 1.2996 Grad: 67317.8594  LR: 0.00001637  


Train:   8%|▊         | 141/1730 [00:46<08:38,  3.07train_batch/s]

Epoch: [1][140/1730] Elapsed 0m 46s (remain 8m 48s) Loss: 1.2616 Grad: 49573.2383  LR: 0.00001519  


Train:   9%|▉         | 161/1730 [00:53<10:24,  2.51train_batch/s]

Epoch: [1][160/1730] Elapsed 0m 53s (remain 8m 45s) Loss: 1.2391 Grad: 108942.6328  LR: 0.00001389  


Train:  10%|█         | 181/1730 [01:01<08:22,  3.08train_batch/s]

Epoch: [1][180/1730] Elapsed 1m 1s (remain 8m 45s) Loss: 1.2141 Grad: 42642.7031  LR: 0.00001252  


Train:  12%|█▏        | 201/1730 [01:07<06:52,  3.70train_batch/s]

Epoch: [1][200/1730] Elapsed 1m 7s (remain 8m 31s) Loss: 1.1872 Grad: 37912.7031  LR: 0.00001109  


Train:  13%|█▎        | 221/1730 [01:13<07:44,  3.25train_batch/s]

Epoch: [1][220/1730] Elapsed 1m 13s (remain 8m 23s) Loss: 1.1666 Grad: 63089.9922  LR: 0.00000964  


Train:  14%|█▍        | 241/1730 [01:19<07:07,  3.49train_batch/s]

Epoch: [1][240/1730] Elapsed 1m 19s (remain 8m 11s) Loss: 1.1478 Grad: 41721.7539  LR: 0.00000819  


Train:  15%|█▌        | 261/1730 [01:26<09:05,  2.69train_batch/s]

Epoch: [1][260/1730] Elapsed 1m 26s (remain 8m 5s) Loss: 1.1331 Grad: 60064.6094  LR: 0.00000679  


Train:  16%|█▌        | 281/1730 [01:33<07:00,  3.45train_batch/s]

Epoch: [1][280/1730] Elapsed 1m 33s (remain 7m 59s) Loss: 1.1212 Grad: 78538.7891  LR: 0.00000545  


Train:  17%|█▋        | 301/1730 [01:41<10:54,  2.18train_batch/s]

Epoch: [1][300/1730] Elapsed 1m 41s (remain 8m 0s) Loss: 1.1139 Grad: 83658.2109  LR: 0.00000420  


Train:  19%|█▊        | 322/1730 [01:48<07:42,  3.04train_batch/s]

Epoch: [1][320/1730] Elapsed 1m 48s (remain 7m 56s) Loss: 1.1039 Grad: 51659.4727  LR: 0.00000308  


Train:  20%|█▉        | 341/1730 [01:55<06:51,  3.38train_batch/s]

Epoch: [1][340/1730] Elapsed 1m 55s (remain 7m 50s) Loss: 1.0945 Grad: 60874.2383  LR: 0.00000211  


Train:  21%|██        | 362/1730 [02:01<06:11,  3.68train_batch/s]

Epoch: [1][360/1730] Elapsed 2m 1s (remain 7m 41s) Loss: 1.0803 Grad: 77294.3750  LR: 0.00000130  


Train:  22%|██▏       | 381/1730 [02:08<07:00,  3.21train_batch/s]

Epoch: [1][380/1730] Elapsed 2m 8s (remain 7m 35s) Loss: 1.0714 Grad: 64124.4766  LR: 0.00000068  


Train:  23%|██▎       | 401/1730 [02:15<06:44,  3.28train_batch/s]

Epoch: [1][400/1730] Elapsed 2m 15s (remain 7m 29s) Loss: 1.0680 Grad: 117454.6797  LR: 0.00000025  


Train:  24%|██▍       | 421/1730 [02:22<06:52,  3.18train_batch/s]

Epoch: [1][420/1730] Elapsed 2m 22s (remain 7m 22s) Loss: 1.0617 Grad: 63655.2812  LR: 0.00000003  


Train:  25%|██▌       | 441/1730 [02:29<07:54,  2.71train_batch/s]

Epoch: [1][440/1730] Elapsed 2m 29s (remain 7m 16s) Loss: 1.0555 Grad: 40774.2031  LR: 0.00000002  


Train:  27%|██▋       | 461/1730 [02:36<06:26,  3.29train_batch/s]

Epoch: [1][460/1730] Elapsed 2m 36s (remain 7m 10s) Loss: 1.0492 Grad: 41296.2305  LR: 0.00000022  


Train:  28%|██▊       | 481/1730 [02:43<09:37,  2.16train_batch/s]

Epoch: [1][480/1730] Elapsed 2m 43s (remain 7m 5s) Loss: 1.0409 Grad: 50602.6133  LR: 0.00000063  


Train:  29%|██▉       | 501/1730 [02:50<05:46,  3.55train_batch/s]

Epoch: [1][500/1730] Elapsed 2m 50s (remain 6m 58s) Loss: 1.0336 Grad: 90221.6172  LR: 0.00000123  


Train:  30%|███       | 521/1730 [02:57<07:48,  2.58train_batch/s]

Epoch: [1][520/1730] Elapsed 2m 57s (remain 6m 52s) Loss: 1.0291 Grad: 48836.0195  LR: 0.00000202  


Train:  31%|███▏      | 541/1730 [03:03<06:08,  3.22train_batch/s]

Epoch: [1][540/1730] Elapsed 3m 3s (remain 6m 43s) Loss: 1.0292 Grad: 57690.7578  LR: 0.00000298  


Train:  32%|███▏      | 561/1730 [03:11<08:00,  2.43train_batch/s]

Epoch: [1][560/1730] Elapsed 3m 11s (remain 6m 39s) Loss: 1.0293 Grad: 100175.2422  LR: 0.00000409  


Train:  34%|███▎      | 581/1730 [03:17<05:39,  3.38train_batch/s]

Epoch: [1][580/1730] Elapsed 3m 17s (remain 6m 30s) Loss: 1.0262 Grad: 33899.0703  LR: 0.00000532  


Train:  35%|███▍      | 601/1730 [03:24<06:37,  2.84train_batch/s]

Epoch: [1][600/1730] Elapsed 3m 24s (remain 6m 23s) Loss: 1.0240 Grad: 147864.4062  LR: 0.00000665  


Train:  36%|███▌      | 621/1730 [03:31<07:26,  2.48train_batch/s]

Epoch: [1][620/1730] Elapsed 3m 31s (remain 6m 16s) Loss: 1.0249 Grad: 30133.2344  LR: 0.00000805  


Train:  37%|███▋      | 642/1730 [03:38<05:03,  3.58train_batch/s]

Epoch: [1][640/1730] Elapsed 3m 38s (remain 6m 10s) Loss: 1.0245 Grad: 110004.8672  LR: 0.00000949  


Train:  38%|███▊      | 661/1730 [03:45<05:46,  3.09train_batch/s]

Epoch: [1][660/1730] Elapsed 3m 45s (remain 6m 4s) Loss: 1.0235 Grad: 83925.8438  LR: 0.00001094  


Train:  39%|███▉      | 681/1730 [03:52<06:49,  2.56train_batch/s]

Epoch: [1][680/1730] Elapsed 3m 52s (remain 5m 58s) Loss: 1.0229 Grad: 67858.9297  LR: 0.00001238  


Train:  41%|████      | 701/1730 [03:58<06:06,  2.81train_batch/s]

Epoch: [1][700/1730] Elapsed 3m 58s (remain 5m 50s) Loss: 1.0202 Grad: 77062.7969  LR: 0.00001376  


Train:  42%|████▏     | 721/1730 [04:05<04:59,  3.37train_batch/s]

Epoch: [1][720/1730] Elapsed 4m 5s (remain 5m 43s) Loss: 1.0171 Grad: 53030.0742  LR: 0.00001506  


Train:  43%|████▎     | 741/1730 [04:12<05:18,  3.11train_batch/s]

Epoch: [1][740/1730] Elapsed 4m 12s (remain 5m 36s) Loss: 1.0169 Grad: 107965.9453  LR: 0.00001626  


Train:  44%|████▍     | 761/1730 [04:18<05:48,  2.78train_batch/s]

Epoch: [1][760/1730] Elapsed 4m 18s (remain 5m 29s) Loss: 1.0184 Grad: 78471.0469  LR: 0.00001732  


Train:  45%|████▌     | 781/1730 [04:26<06:19,  2.50train_batch/s]

Epoch: [1][780/1730] Elapsed 4m 26s (remain 5m 23s) Loss: 1.0206 Grad: 127946.7109  LR: 0.00001823  


Train:  46%|████▋     | 801/1730 [04:32<05:31,  2.80train_batch/s]

Epoch: [1][800/1730] Elapsed 4m 32s (remain 5m 16s) Loss: 1.0199 Grad: 59047.1289  LR: 0.00001897  


Train:  47%|████▋     | 821/1730 [04:39<04:15,  3.56train_batch/s]

Epoch: [1][820/1730] Elapsed 4m 39s (remain 5m 9s) Loss: 1.0187 Grad: 42558.5391  LR: 0.00001952  


Train:  49%|████▊     | 841/1730 [04:46<04:27,  3.32train_batch/s]

Epoch: [1][840/1730] Elapsed 4m 46s (remain 5m 2s) Loss: 1.0169 Grad: 23763.1406  LR: 0.00001986  


Train:  50%|████▉     | 861/1730 [04:51<04:05,  3.54train_batch/s]

Epoch: [1][860/1730] Elapsed 4m 51s (remain 4m 54s) Loss: 1.0156 Grad: 47281.3984  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [04:58<04:35,  3.08train_batch/s]

Epoch: [1][880/1730] Elapsed 4m 58s (remain 4m 48s) Loss: 1.0160 Grad: 118506.3125  LR: 0.00001992  


Train:  52%|█████▏    | 901/1730 [05:05<04:59,  2.77train_batch/s]

Epoch: [1][900/1730] Elapsed 5m 5s (remain 4m 41s) Loss: 1.0161 Grad: 227372.7656  LR: 0.00001964  


Train:  53%|█████▎    | 921/1730 [05:13<04:09,  3.24train_batch/s]

Epoch: [1][920/1730] Elapsed 5m 13s (remain 4m 35s) Loss: 1.0150 Grad: 53161.3750  LR: 0.00001915  


Train:  54%|█████▍    | 941/1730 [05:20<03:23,  3.88train_batch/s]

Epoch: [1][940/1730] Elapsed 5m 20s (remain 4m 28s) Loss: 1.0143 Grad: 42968.5312  LR: 0.00001847  


Train:  56%|█████▌    | 961/1730 [05:27<04:19,  2.96train_batch/s]

Epoch: [1][960/1730] Elapsed 5m 27s (remain 4m 22s) Loss: 1.0128 Grad: 49171.0234  LR: 0.00001761  


Train:  57%|█████▋    | 981/1730 [05:34<03:30,  3.56train_batch/s]

Epoch: [1][980/1730] Elapsed 5m 34s (remain 4m 15s) Loss: 1.0109 Grad: 29581.5234  LR: 0.00001659  


Train:  58%|█████▊    | 1001/1730 [05:41<03:36,  3.36train_batch/s]

Epoch: [1][1000/1730] Elapsed 5m 41s (remain 4m 8s) Loss: 1.0094 Grad: 81669.8672  LR: 0.00001543  


Train:  59%|█████▉    | 1021/1730 [05:48<03:38,  3.25train_batch/s]

Epoch: [1][1020/1730] Elapsed 5m 48s (remain 4m 1s) Loss: 1.0085 Grad: 71777.1172  LR: 0.00001416  


Train:  60%|██████    | 1041/1730 [05:55<03:56,  2.92train_batch/s]

Epoch: [1][1040/1730] Elapsed 5m 55s (remain 3m 54s) Loss: 1.0045 Grad: 68162.6484  LR: 0.00001280  


Train:  61%|██████▏   | 1062/1730 [06:02<03:12,  3.48train_batch/s]

Epoch: [1][1060/1730] Elapsed 6m 2s (remain 3m 48s) Loss: 1.0047 Grad: 139591.1094  LR: 0.00001138  


Train:  63%|██████▎   | 1082/1730 [06:09<03:09,  3.42train_batch/s]

Epoch: [1][1080/1730] Elapsed 6m 9s (remain 3m 41s) Loss: 1.0024 Grad: 62363.1367  LR: 0.00000993  


Train:  64%|██████▎   | 1101/1730 [06:17<04:26,  2.36train_batch/s]

Epoch: [1][1100/1730] Elapsed 6m 17s (remain 3m 35s) Loss: 0.9985 Grad: 95016.7891  LR: 0.00000848  


Train:  65%|██████▍   | 1121/1730 [06:23<03:08,  3.23train_batch/s]

Epoch: [1][1120/1730] Elapsed 6m 23s (remain 3m 28s) Loss: 0.9977 Grad: 44706.2578  LR: 0.00000706  


Train:  66%|██████▌   | 1142/1730 [06:30<02:39,  3.70train_batch/s]

Epoch: [1][1140/1730] Elapsed 6m 30s (remain 3m 21s) Loss: 0.9968 Grad: 53333.9336  LR: 0.00000571  


Train:  67%|██████▋   | 1161/1730 [06:35<02:50,  3.33train_batch/s]

Epoch: [1][1160/1730] Elapsed 6m 35s (remain 3m 14s) Loss: 0.9936 Grad: 76409.6328  LR: 0.00000444  


Train:  68%|██████▊   | 1181/1730 [06:42<02:53,  3.16train_batch/s]

Epoch: [1][1180/1730] Elapsed 6m 42s (remain 3m 7s) Loss: 0.9921 Grad: 33326.7031  LR: 0.00000330  


Train:  69%|██████▉   | 1202/1730 [06:49<03:18,  2.66train_batch/s]

Epoch: [1][1200/1730] Elapsed 6m 49s (remain 3m 0s) Loss: 0.9919 Grad: 45365.8398  LR: 0.00000229  


Train:  71%|███████   | 1221/1730 [06:56<03:06,  2.73train_batch/s]

Epoch: [1][1220/1730] Elapsed 6m 56s (remain 2m 53s) Loss: 0.9895 Grad: 21276.5918  LR: 0.00000145  


Train:  72%|███████▏  | 1241/1730 [07:04<03:26,  2.37train_batch/s]

Epoch: [1][1240/1730] Elapsed 7m 4s (remain 2m 47s) Loss: 0.9877 Grad: 109525.7344  LR: 0.00000079  


Train:  73%|███████▎  | 1261/1730 [07:10<02:52,  2.71train_batch/s]

Epoch: [1][1260/1730] Elapsed 7m 10s (remain 2m 40s) Loss: 0.9844 Grad: 26134.9707  LR: 0.00000032  


Train:  74%|███████▍  | 1281/1730 [07:18<02:34,  2.91train_batch/s]

Epoch: [1][1280/1730] Elapsed 7m 18s (remain 2m 33s) Loss: 0.9818 Grad: 33293.7539  LR: 0.00000006  


Train:  75%|███████▌  | 1302/1730 [07:25<02:01,  3.52train_batch/s]

Epoch: [1][1300/1730] Elapsed 7m 25s (remain 2m 26s) Loss: 0.9793 Grad: 36685.2812  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [07:32<02:36,  2.62train_batch/s]

Epoch: [1][1320/1730] Elapsed 7m 32s (remain 2m 19s) Loss: 0.9767 Grad: 43325.1992  LR: 0.00000016  


Train:  78%|███████▊  | 1341/1730 [07:39<02:13,  2.91train_batch/s]

Epoch: [1][1340/1730] Elapsed 7m 39s (remain 2m 13s) Loss: 0.9748 Grad: 28013.5137  LR: 0.00000053  


Train:  79%|███████▊  | 1362/1730 [07:46<02:02,  2.99train_batch/s]

Epoch: [1][1360/1730] Elapsed 7m 46s (remain 2m 6s) Loss: 0.9740 Grad: 116704.2734  LR: 0.00000110  


Train:  80%|███████▉  | 1381/1730 [07:53<02:04,  2.81train_batch/s]

Epoch: [1][1380/1730] Elapsed 7m 53s (remain 1m 59s) Loss: 0.9706 Grad: 45071.2539  LR: 0.00000185  


Train:  81%|████████  | 1401/1730 [08:01<02:26,  2.24train_batch/s]

Epoch: [1][1400/1730] Elapsed 8m 1s (remain 1m 53s) Loss: 0.9694 Grad: 30938.6680  LR: 0.00000278  


Train:  82%|████████▏ | 1422/1730 [08:08<01:48,  2.83train_batch/s]

Epoch: [1][1420/1730] Elapsed 8m 8s (remain 1m 46s) Loss: 0.9680 Grad: 55833.2500  LR: 0.00000385  


Train:  83%|████████▎ | 1441/1730 [08:15<01:31,  3.15train_batch/s]

Epoch: [1][1440/1730] Elapsed 8m 15s (remain 1m 39s) Loss: 0.9661 Grad: 10036.5176  LR: 0.00000506  


Train:  84%|████████▍ | 1461/1730 [08:22<01:32,  2.90train_batch/s]

Epoch: [1][1460/1730] Elapsed 8m 22s (remain 1m 32s) Loss: 0.9644 Grad: 40994.9961  LR: 0.00000638  


Train:  86%|████████▌ | 1481/1730 [08:28<01:23,  2.96train_batch/s]

Epoch: [1][1480/1730] Elapsed 8m 28s (remain 1m 25s) Loss: 0.9638 Grad: 50059.9062  LR: 0.00000776  


Train:  87%|████████▋ | 1501/1730 [08:36<01:10,  3.26train_batch/s]

Epoch: [1][1500/1730] Elapsed 8m 36s (remain 1m 18s) Loss: 0.9628 Grad: 23297.1895  LR: 0.00000920  


Train:  88%|████████▊ | 1521/1730 [08:43<01:36,  2.17train_batch/s]

Epoch: [1][1520/1730] Elapsed 8m 43s (remain 1m 11s) Loss: 0.9617 Grad: 186156.2500  LR: 0.00001065  


Train:  89%|████████▉ | 1541/1730 [08:50<01:19,  2.38train_batch/s]

Epoch: [1][1540/1730] Elapsed 8m 50s (remain 1m 5s) Loss: 0.9603 Grad: 21889.0586  LR: 0.00001209  


Train:  90%|█████████ | 1561/1730 [08:57<00:51,  3.27train_batch/s]

Epoch: [1][1560/1730] Elapsed 8m 57s (remain 0m 58s) Loss: 0.9609 Grad: 61461.2383  LR: 0.00001349  


Train:  91%|█████████▏| 1581/1730 [09:04<00:56,  2.64train_batch/s]

Epoch: [1][1580/1730] Elapsed 9m 4s (remain 0m 51s) Loss: 0.9624 Grad: 26708.0332  LR: 0.00001481  


Train:  93%|█████████▎| 1601/1730 [09:12<00:52,  2.48train_batch/s]

Epoch: [1][1600/1730] Elapsed 9m 12s (remain 0m 44s) Loss: 0.9607 Grad: 32972.4336  LR: 0.00001603  


Train:  94%|█████████▎| 1621/1730 [09:19<00:35,  3.08train_batch/s]

Epoch: [1][1620/1730] Elapsed 9m 19s (remain 0m 37s) Loss: 0.9610 Grad: 17618.2617  LR: 0.00001712  


Train:  95%|█████████▍| 1641/1730 [09:25<00:26,  3.30train_batch/s]

Epoch: [1][1640/1730] Elapsed 9m 25s (remain 0m 30s) Loss: 0.9599 Grad: 48521.1602  LR: 0.00001806  


Train:  96%|█████████▌| 1662/1730 [09:31<00:17,  3.78train_batch/s]

Epoch: [1][1660/1730] Elapsed 9m 31s (remain 0m 23s) Loss: 0.9585 Grad: 19409.7676  LR: 0.00001884  


Train:  97%|█████████▋| 1681/1730 [09:37<00:16,  2.99train_batch/s]

Epoch: [1][1680/1730] Elapsed 9m 37s (remain 0m 16s) Loss: 0.9598 Grad: 55541.2344  LR: 0.00001942  


Train:  98%|█████████▊| 1701/1730 [09:44<00:08,  3.39train_batch/s]

Epoch: [1][1700/1730] Elapsed 9m 44s (remain 0m 9s) Loss: 0.9604 Grad: 18873.7266  LR: 0.00001981  


Train:  99%|█████████▉| 1721/1730 [09:52<00:02,  3.08train_batch/s]

Epoch: [1][1720/1730] Elapsed 9m 52s (remain 0m 3s) Loss: 0.9597 Grad: 22198.7344  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:55<00:00,  2.91train_batch/s]


Epoch: [1][1729/1730] Elapsed 9m 55s (remain 0m 0s) Loss: 0.9593 Grad: 33003.0273  LR: 0.00002000  


Validation:   0%|          | 3/866 [00:00<01:59,  7.22valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 4m 14s) Loss: 1.2868 


Validation:   3%|▎         | 22/866 [00:01<00:56, 14.86valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 7s) Loss: 0.9252 


Validation:   5%|▌         | 44/866 [00:03<00:58, 14.09valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 3s) Loss: 0.9218 


Validation:   7%|▋         | 62/866 [00:04<00:55, 14.38valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 0m 59s) Loss: 0.9333 


Validation:   9%|▉         | 82/866 [00:06<01:00, 12.96valid_batch/s]

EVAL: [80/866] Elapsed 0m 5s (remain 0m 56s) Loss: 0.8967 


Validation:  12%|█▏        | 102/866 [00:07<00:56, 13.64valid_batch/s]

EVAL: [100/866] Elapsed 0m 7s (remain 0m 56s) Loss: 0.8935 


Validation:  14%|█▍        | 123/866 [00:08<00:52, 14.14valid_batch/s]

EVAL: [120/866] Elapsed 0m 8s (remain 0m 54s) Loss: 0.8868 


Validation:  16%|█▋        | 142/866 [00:10<00:49, 14.50valid_batch/s]

EVAL: [140/866] Elapsed 0m 10s (remain 0m 52s) Loss: 0.8816 


Validation:  19%|█▉        | 164/866 [00:12<00:49, 14.09valid_batch/s]

EVAL: [160/866] Elapsed 0m 11s (remain 0m 52s) Loss: 0.8859 


Validation:  21%|██        | 183/866 [00:13<00:44, 15.19valid_batch/s]

EVAL: [180/866] Elapsed 0m 13s (remain 0m 50s) Loss: 0.8801 


Validation:  23%|██▎       | 203/866 [00:14<00:50, 13.01valid_batch/s]

EVAL: [200/866] Elapsed 0m 14s (remain 0m 48s) Loss: 0.8874 


Validation:  26%|██▌       | 224/866 [00:16<00:39, 16.23valid_batch/s]

EVAL: [220/866] Elapsed 0m 16s (remain 0m 47s) Loss: 0.8902 


Validation:  28%|██▊       | 242/866 [00:18<01:02,  9.97valid_batch/s]

EVAL: [240/866] Elapsed 0m 17s (remain 0m 46s) Loss: 0.8768 


Validation:  30%|███       | 262/866 [00:19<00:43, 13.90valid_batch/s]

EVAL: [260/866] Elapsed 0m 19s (remain 0m 44s) Loss: 0.8796 


Validation:  33%|███▎      | 282/866 [00:21<00:45, 12.79valid_batch/s]

EVAL: [280/866] Elapsed 0m 20s (remain 0m 43s) Loss: 0.8687 


Validation:  35%|███▍      | 303/866 [00:22<00:42, 13.14valid_batch/s]

EVAL: [300/866] Elapsed 0m 22s (remain 0m 42s) Loss: 0.8670 


Validation:  37%|███▋      | 322/866 [00:24<00:48, 11.31valid_batch/s]

EVAL: [320/866] Elapsed 0m 23s (remain 0m 40s) Loss: 0.8702 


Validation:  40%|███▉      | 343/866 [00:25<00:39, 13.18valid_batch/s]

EVAL: [340/866] Elapsed 0m 25s (remain 0m 39s) Loss: 0.8664 


Validation:  42%|████▏     | 364/866 [00:27<00:32, 15.24valid_batch/s]

EVAL: [360/866] Elapsed 0m 26s (remain 0m 37s) Loss: 0.8691 


Validation:  44%|████▍     | 382/866 [00:28<00:34, 13.84valid_batch/s]

EVAL: [380/866] Elapsed 0m 28s (remain 0m 36s) Loss: 0.8727 


Validation:  46%|████▋     | 402/866 [00:29<00:32, 14.23valid_batch/s]

EVAL: [400/866] Elapsed 0m 29s (remain 0m 34s) Loss: 0.8642 


Validation:  49%|████▉     | 424/866 [00:31<00:29, 15.09valid_batch/s]

EVAL: [420/866] Elapsed 0m 31s (remain 0m 33s) Loss: 0.8635 


Validation:  51%|█████     | 442/866 [00:32<00:31, 13.64valid_batch/s]

EVAL: [440/866] Elapsed 0m 32s (remain 0m 31s) Loss: 0.8659 


Validation:  53%|█████▎    | 462/866 [00:34<00:29, 13.55valid_batch/s]

EVAL: [460/866] Elapsed 0m 34s (remain 0m 29s) Loss: 0.8618 


Validation:  56%|█████▌    | 483/866 [00:35<00:25, 14.75valid_batch/s]

EVAL: [480/866] Elapsed 0m 35s (remain 0m 28s) Loss: 0.8622 


Validation:  58%|█████▊    | 503/866 [00:37<00:28, 12.65valid_batch/s]

EVAL: [500/866] Elapsed 0m 37s (remain 0m 26s) Loss: 0.8592 


Validation:  60%|██████    | 522/866 [00:38<00:24, 14.08valid_batch/s]

EVAL: [520/866] Elapsed 0m 38s (remain 0m 25s) Loss: 0.8600 


Validation:  63%|██████▎   | 542/866 [00:39<00:25, 12.78valid_batch/s]

EVAL: [540/866] Elapsed 0m 39s (remain 0m 23s) Loss: 0.8563 


Validation:  65%|██████▌   | 563/866 [00:41<00:24, 12.20valid_batch/s]

EVAL: [560/866] Elapsed 0m 41s (remain 0m 22s) Loss: 0.8540 


Validation:  67%|██████▋   | 583/866 [00:43<00:22, 12.47valid_batch/s]

EVAL: [580/866] Elapsed 0m 42s (remain 0m 21s) Loss: 0.8534 


Validation:  70%|██████▉   | 604/866 [00:44<00:18, 14.37valid_batch/s]

EVAL: [600/866] Elapsed 0m 44s (remain 0m 19s) Loss: 0.8524 


Validation:  72%|███████▏  | 622/866 [00:45<00:14, 16.43valid_batch/s]

EVAL: [620/866] Elapsed 0m 45s (remain 0m 17s) Loss: 0.8578 


Validation:  74%|███████▍  | 643/866 [00:46<00:14, 15.41valid_batch/s]

EVAL: [640/866] Elapsed 0m 46s (remain 0m 16s) Loss: 0.8597 


Validation:  76%|███████▋  | 662/866 [00:48<00:13, 14.97valid_batch/s]

EVAL: [660/866] Elapsed 0m 48s (remain 0m 14s) Loss: 0.8555 


Validation:  79%|███████▉  | 683/866 [00:49<00:12, 14.72valid_batch/s]

EVAL: [680/866] Elapsed 0m 49s (remain 0m 13s) Loss: 0.8550 


Validation:  81%|████████  | 701/866 [00:51<00:12, 13.31valid_batch/s]

EVAL: [700/866] Elapsed 0m 51s (remain 0m 12s) Loss: 0.8542 


Validation:  84%|████████▎ | 724/866 [00:52<00:09, 14.81valid_batch/s]

EVAL: [720/866] Elapsed 0m 52s (remain 0m 10s) Loss: 0.8619 


Validation:  86%|████████▌ | 742/866 [00:54<00:10, 12.38valid_batch/s]

EVAL: [740/866] Elapsed 0m 54s (remain 0m 9s) Loss: 0.8633 


Validation:  88%|████████▊ | 763/866 [00:55<00:06, 15.88valid_batch/s]

EVAL: [760/866] Elapsed 0m 55s (remain 0m 7s) Loss: 0.8605 


Validation:  90%|█████████ | 783/866 [00:57<00:06, 13.73valid_batch/s]

EVAL: [780/866] Elapsed 0m 57s (remain 0m 6s) Loss: 0.8586 


Validation:  93%|█████████▎| 802/866 [00:58<00:03, 16.41valid_batch/s]

EVAL: [800/866] Elapsed 0m 58s (remain 0m 4s) Loss: 0.8631 


Validation:  95%|█████████▌| 823/866 [01:00<00:02, 16.20valid_batch/s]

EVAL: [820/866] Elapsed 0m 59s (remain 0m 3s) Loss: 0.8614 


Validation:  97%|█████████▋| 843/866 [01:01<00:01, 13.13valid_batch/s]

EVAL: [840/866] Elapsed 1m 1s (remain 0m 1s) Loss: 0.8647 


Validation: 100%|█████████▉| 862/866 [01:03<00:00, 14.12valid_batch/s]

EVAL: [860/866] Elapsed 1m 3s (remain 0m 0s) Loss: 0.8674 


Validation: 100%|██████████| 866/866 [01:03<00:00, 13.65valid_batch/s]
Epoch 1 - avg_train_loss: 0.9593  avg_val_loss: 0.8686  time: 659s
Epoch 1 - Score: 0.7892
Epoch 1 - Save Best Score: 0.7892 Model


EVAL: [865/866] Elapsed 1m 3s (remain 0m 0s) Loss: 0.8686 


Train:   0%|          | 1/1730 [00:00<20:26,  1.41train_batch/s]

Epoch: [2][0/1730] Elapsed 0m 0s (remain 20m 26s) Loss: 0.7796 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:07<08:28,  3.36train_batch/s]

Epoch: [2][20/1730] Elapsed 0m 7s (remain 9m 38s) Loss: 1.2065 Grad: 127268.9219  LR: 0.00001986  


Train:   2%|▏         | 41/1730 [00:13<09:24,  2.99train_batch/s]

Epoch: [2][40/1730] Elapsed 0m 13s (remain 9m 4s) Loss: 1.0522 Grad: 183887.0312  LR: 0.00001952  


Train:   4%|▎         | 61/1730 [00:19<08:47,  3.16train_batch/s]

Epoch: [2][60/1730] Elapsed 0m 19s (remain 8m 52s) Loss: 0.9529 Grad: 180121.5469  LR: 0.00001897  


Train:   5%|▍         | 81/1730 [00:25<08:57,  3.07train_batch/s]

Epoch: [2][80/1730] Elapsed 0m 25s (remain 8m 44s) Loss: 0.9083 Grad: 255250.4531  LR: 0.00001823  


Train:   6%|▌         | 101/1730 [00:33<11:52,  2.29train_batch/s]

Epoch: [2][100/1730] Elapsed 0m 33s (remain 8m 55s) Loss: 0.8950 Grad: 142943.3125  LR: 0.00001732  


Train:   7%|▋         | 121/1730 [00:39<07:35,  3.53train_batch/s]

Epoch: [2][120/1730] Elapsed 0m 39s (remain 8m 43s) Loss: 0.8740 Grad: 92385.9609  LR: 0.00001626  


Train:   8%|▊         | 141/1730 [00:46<08:45,  3.03train_batch/s]

Epoch: [2][140/1730] Elapsed 0m 46s (remain 8m 41s) Loss: 0.8662 Grad: 270998.5938  LR: 0.00001506  


Train:   9%|▉         | 161/1730 [00:53<08:32,  3.06train_batch/s]

Epoch: [2][160/1730] Elapsed 0m 53s (remain 8m 38s) Loss: 0.8612 Grad: 98964.2891  LR: 0.00001376  


Train:  11%|█         | 182/1730 [01:00<08:11,  3.15train_batch/s]

Epoch: [2][180/1730] Elapsed 1m 0s (remain 8m 37s) Loss: 0.8566 Grad: 147313.5000  LR: 0.00001238  


Train:  12%|█▏        | 201/1730 [01:06<07:12,  3.53train_batch/s]

Epoch: [2][200/1730] Elapsed 1m 6s (remain 8m 28s) Loss: 0.8481 Grad: 71355.2344  LR: 0.00001094  


Train:  13%|█▎        | 221/1730 [01:12<07:43,  3.26train_batch/s]

Epoch: [2][220/1730] Elapsed 1m 12s (remain 8m 17s) Loss: 0.8358 Grad: 76311.1406  LR: 0.00000949  


Train:  14%|█▍        | 241/1730 [01:20<09:12,  2.69train_batch/s]

Epoch: [2][240/1730] Elapsed 1m 20s (remain 8m 16s) Loss: 0.8280 Grad: 86452.2891  LR: 0.00000805  


Train:  15%|█▌        | 261/1730 [01:26<08:31,  2.87train_batch/s]

Epoch: [2][260/1730] Elapsed 1m 26s (remain 8m 6s) Loss: 0.8148 Grad: 279350.4062  LR: 0.00000665  


Train:  16%|█▌        | 281/1730 [01:33<09:49,  2.46train_batch/s]

Epoch: [2][280/1730] Elapsed 1m 33s (remain 8m 0s) Loss: 0.8153 Grad: 105795.4609  LR: 0.00000532  


Train:  17%|█▋        | 301/1730 [01:39<07:45,  3.07train_batch/s]

Epoch: [2][300/1730] Elapsed 1m 39s (remain 7m 53s) Loss: 0.8140 Grad: 131390.6562  LR: 0.00000409  


Train:  19%|█▊        | 321/1730 [01:45<08:42,  2.70train_batch/s]

Epoch: [2][320/1730] Elapsed 1m 45s (remain 7m 44s) Loss: 0.8092 Grad: 148616.0469  LR: 0.00000298  


Train:  20%|█▉        | 341/1730 [01:53<11:59,  1.93train_batch/s]

Epoch: [2][340/1730] Elapsed 1m 53s (remain 7m 42s) Loss: 0.8173 Grad: 160817.0938  LR: 0.00000202  


Train:  21%|██        | 361/1730 [02:00<07:33,  3.02train_batch/s]

Epoch: [2][360/1730] Elapsed 2m 0s (remain 7m 35s) Loss: 0.8162 Grad: 84382.7500  LR: 0.00000123  


Train:  22%|██▏       | 382/1730 [02:07<07:42,  2.91train_batch/s]

Epoch: [2][380/1730] Elapsed 2m 7s (remain 7m 29s) Loss: 0.8144 Grad: 215222.1094  LR: 0.00000063  


Train:  23%|██▎       | 401/1730 [02:13<07:36,  2.91train_batch/s]

Epoch: [2][400/1730] Elapsed 2m 13s (remain 7m 23s) Loss: 0.8103 Grad: 63064.2422  LR: 0.00000022  


Train:  24%|██▍       | 421/1730 [02:22<08:57,  2.43train_batch/s]

Epoch: [2][420/1730] Elapsed 2m 22s (remain 7m 22s) Loss: 0.8090 Grad: 94394.0547  LR: 0.00000002  


Train:  25%|██▌       | 441/1730 [02:29<06:45,  3.18train_batch/s]

Epoch: [2][440/1730] Elapsed 2m 29s (remain 7m 18s) Loss: 0.8079 Grad: 105456.1641  LR: 0.00000003  


Train:  27%|██▋       | 462/1730 [02:38<06:22,  3.31train_batch/s]

Epoch: [2][460/1730] Elapsed 2m 38s (remain 7m 15s) Loss: 0.8100 Grad: 202082.9531  LR: 0.00000025  


Train:  28%|██▊       | 481/1730 [02:45<07:56,  2.62train_batch/s]

Epoch: [2][480/1730] Elapsed 2m 45s (remain 7m 8s) Loss: 0.8035 Grad: 116970.2188  LR: 0.00000068  


Train:  29%|██▉       | 501/1730 [02:52<07:31,  2.72train_batch/s]

Epoch: [2][500/1730] Elapsed 2m 52s (remain 7m 2s) Loss: 0.7991 Grad: 190668.7500  LR: 0.00000130  


Train:  30%|███       | 522/1730 [02:59<06:27,  3.12train_batch/s]

Epoch: [2][520/1730] Elapsed 2m 58s (remain 6m 54s) Loss: 0.7986 Grad: 193317.9375  LR: 0.00000211  


Train:  31%|███▏      | 541/1730 [03:04<05:56,  3.33train_batch/s]

Epoch: [2][540/1730] Elapsed 3m 4s (remain 6m 45s) Loss: 0.7945 Grad: 257895.1562  LR: 0.00000308  


Train:  32%|███▏      | 561/1730 [03:11<05:28,  3.56train_batch/s]

Epoch: [2][560/1730] Elapsed 3m 11s (remain 6m 38s) Loss: 0.7956 Grad: 84947.8594  LR: 0.00000420  


Train:  34%|███▎      | 581/1730 [03:17<07:10,  2.67train_batch/s]

Epoch: [2][580/1730] Elapsed 3m 17s (remain 6m 31s) Loss: 0.7936 Grad: 289565.1250  LR: 0.00000545  


Train:  35%|███▍      | 601/1730 [03:24<07:31,  2.50train_batch/s]

Epoch: [2][600/1730] Elapsed 3m 24s (remain 6m 24s) Loss: 0.7911 Grad: 350017.8125  LR: 0.00000679  


Train:  36%|███▌      | 621/1730 [03:31<06:02,  3.06train_batch/s]

Epoch: [2][620/1730] Elapsed 3m 31s (remain 6m 17s) Loss: 0.7894 Grad: 289879.3750  LR: 0.00000819  


Train:  37%|███▋      | 641/1730 [03:37<07:22,  2.46train_batch/s]

Epoch: [2][640/1730] Elapsed 3m 37s (remain 6m 10s) Loss: 0.7862 Grad: 223159.6406  LR: 0.00000964  


Train:  38%|███▊      | 661/1730 [03:45<07:51,  2.27train_batch/s]

Epoch: [2][660/1730] Elapsed 3m 45s (remain 6m 3s) Loss: 0.7902 Grad: 220242.7656  LR: 0.00001109  


Train:  39%|███▉      | 681/1730 [03:52<05:41,  3.07train_batch/s]

Epoch: [2][680/1730] Elapsed 3m 52s (remain 5m 57s) Loss: 0.7894 Grad: 276678.4688  LR: 0.00001252  


Train:  41%|████      | 701/1730 [03:59<06:13,  2.75train_batch/s]

Epoch: [2][700/1730] Elapsed 3m 59s (remain 5m 50s) Loss: 0.7892 Grad: 156522.1562  LR: 0.00001389  


Train:  42%|████▏     | 721/1730 [04:06<06:18,  2.67train_batch/s]

Epoch: [2][720/1730] Elapsed 4m 6s (remain 5m 44s) Loss: 0.7902 Grad: 141291.1250  LR: 0.00001519  


Train:  43%|████▎     | 741/1730 [04:12<05:36,  2.94train_batch/s]

Epoch: [2][740/1730] Elapsed 4m 12s (remain 5m 37s) Loss: 0.7916 Grad: 201193.0312  LR: 0.00001637  


Train:  44%|████▍     | 761/1730 [04:20<05:32,  2.91train_batch/s]

Epoch: [2][760/1730] Elapsed 4m 20s (remain 5m 31s) Loss: 0.7907 Grad: 142574.7500  LR: 0.00001742  


Train:  45%|████▌     | 781/1730 [04:27<06:13,  2.54train_batch/s]

Epoch: [2][780/1730] Elapsed 4m 27s (remain 5m 24s) Loss: 0.7895 Grad: 134792.3438  LR: 0.00001831  


Train:  46%|████▋     | 801/1730 [04:34<04:56,  3.13train_batch/s]

Epoch: [2][800/1730] Elapsed 4m 34s (remain 5m 17s) Loss: 0.7919 Grad: 299503.7500  LR: 0.00001903  


Train:  47%|████▋     | 821/1730 [04:41<04:51,  3.12train_batch/s]

Epoch: [2][820/1730] Elapsed 4m 41s (remain 5m 11s) Loss: 0.7905 Grad: 135701.6875  LR: 0.00001956  


Train:  49%|████▊     | 841/1730 [04:47<05:02,  2.94train_batch/s]

Epoch: [2][840/1730] Elapsed 4m 47s (remain 5m 4s) Loss: 0.7893 Grad: 106183.7500  LR: 0.00001988  


Train:  50%|████▉     | 861/1730 [04:55<04:51,  2.98train_batch/s]

Epoch: [2][860/1730] Elapsed 4m 55s (remain 4m 57s) Loss: 0.7917 Grad: 434168.4688  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [05:01<04:46,  2.97train_batch/s]

Epoch: [2][880/1730] Elapsed 5m 1s (remain 4m 50s) Loss: 0.7929 Grad: 199193.2656  LR: 0.00001990  


Train:  52%|█████▏    | 901/1730 [05:08<04:29,  3.08train_batch/s]

Epoch: [2][900/1730] Elapsed 5m 8s (remain 4m 44s) Loss: 0.7926 Grad: 129688.9219  LR: 0.00001960  


Train:  53%|█████▎    | 921/1730 [05:15<05:08,  2.62train_batch/s]

Epoch: [2][920/1730] Elapsed 5m 15s (remain 4m 36s) Loss: 0.7919 Grad: 172725.2812  LR: 0.00001909  


Train:  54%|█████▍    | 942/1730 [05:22<04:36,  2.85train_batch/s]

Epoch: [2][940/1730] Elapsed 5m 22s (remain 4m 30s) Loss: 0.7900 Grad: 97282.7891  LR: 0.00001839  


Train:  56%|█████▌    | 961/1730 [05:29<05:17,  2.42train_batch/s]

Epoch: [2][960/1730] Elapsed 5m 29s (remain 4m 23s) Loss: 0.7895 Grad: 249872.2031  LR: 0.00001752  


Train:  57%|█████▋    | 981/1730 [05:36<05:30,  2.26train_batch/s]

Epoch: [2][980/1730] Elapsed 5m 36s (remain 4m 17s) Loss: 0.7921 Grad: 182178.6562  LR: 0.00001648  


Train:  58%|█████▊    | 1001/1730 [05:44<04:00,  3.04train_batch/s]

Epoch: [2][1000/1730] Elapsed 5m 44s (remain 4m 10s) Loss: 0.7936 Grad: 106642.6953  LR: 0.00001531  


Train:  59%|█████▉    | 1021/1730 [05:51<03:56,  3.00train_batch/s]

Epoch: [2][1020/1730] Elapsed 5m 51s (remain 4m 3s) Loss: 0.7938 Grad: 129149.2422  LR: 0.00001403  


Train:  60%|██████    | 1041/1730 [05:58<03:30,  3.28train_batch/s]

Epoch: [2][1040/1730] Elapsed 5m 58s (remain 3m 57s) Loss: 0.7927 Grad: 129317.7812  LR: 0.00001266  


Train:  61%|██████▏   | 1061/1730 [06:05<04:21,  2.56train_batch/s]

Epoch: [2][1060/1730] Elapsed 6m 5s (remain 3m 50s) Loss: 0.7922 Grad: 467879.3125  LR: 0.00001123  


Train:  62%|██████▏   | 1081/1730 [06:11<02:53,  3.73train_batch/s]

Epoch: [2][1080/1730] Elapsed 6m 11s (remain 3m 43s) Loss: 0.7925 Grad: 122481.4297  LR: 0.00000978  


Train:  64%|██████▎   | 1101/1730 [06:18<04:13,  2.49train_batch/s]

Epoch: [2][1100/1730] Elapsed 6m 18s (remain 3m 36s) Loss: 0.7924 Grad: 595446.9375  LR: 0.00000834  


Train:  65%|██████▍   | 1121/1730 [06:24<02:57,  3.42train_batch/s]

Epoch: [2][1120/1730] Elapsed 6m 24s (remain 3m 28s) Loss: 0.7917 Grad: 86408.6094  LR: 0.00000692  


Train:  66%|██████▌   | 1141/1730 [06:30<03:17,  2.98train_batch/s]

Epoch: [2][1140/1730] Elapsed 6m 30s (remain 3m 21s) Loss: 0.7913 Grad: 215638.8750  LR: 0.00000558  


Train:  67%|██████▋   | 1161/1730 [06:35<02:26,  3.89train_batch/s]

Epoch: [2][1160/1730] Elapsed 6m 35s (remain 3m 14s) Loss: 0.7928 Grad: 265585.5312  LR: 0.00000432  


Train:  68%|██████▊   | 1181/1730 [06:42<03:47,  2.41train_batch/s]

Epoch: [2][1180/1730] Elapsed 6m 42s (remain 3m 7s) Loss: 0.7922 Grad: 174178.4219  LR: 0.00000319  


Train:  69%|██████▉   | 1201/1730 [06:50<03:42,  2.38train_batch/s]

Epoch: [2][1200/1730] Elapsed 6m 50s (remain 3m 0s) Loss: 0.7918 Grad: 127022.8672  LR: 0.00000220  


Train:  71%|███████   | 1221/1730 [06:57<02:34,  3.29train_batch/s]

Epoch: [2][1220/1730] Elapsed 6m 57s (remain 2m 54s) Loss: 0.7921 Grad: 99663.7812  LR: 0.00000138  


Train:  72%|███████▏  | 1241/1730 [07:05<02:42,  3.00train_batch/s]

Epoch: [2][1240/1730] Elapsed 7m 5s (remain 2m 47s) Loss: 0.7914 Grad: 115376.4922  LR: 0.00000073  


Train:  73%|███████▎  | 1261/1730 [07:12<02:44,  2.85train_batch/s]

Epoch: [2][1260/1730] Elapsed 7m 12s (remain 2m 40s) Loss: 0.7904 Grad: 55008.9062  LR: 0.00000029  


Train:  74%|███████▍  | 1281/1730 [07:19<02:40,  2.79train_batch/s]

Epoch: [2][1280/1730] Elapsed 7m 19s (remain 2m 33s) Loss: 0.7898 Grad: 152121.7188  LR: 0.00000004  


Train:  75%|███████▌  | 1301/1730 [07:25<02:23,  2.99train_batch/s]

Epoch: [2][1300/1730] Elapsed 7m 25s (remain 2m 27s) Loss: 0.7885 Grad: 153995.8594  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [07:32<02:08,  3.18train_batch/s]

Epoch: [2][1320/1730] Elapsed 7m 32s (remain 2m 20s) Loss: 0.7874 Grad: 95376.0156  LR: 0.00000019  


Train:  78%|███████▊  | 1341/1730 [07:39<02:21,  2.75train_batch/s]

Epoch: [2][1340/1730] Elapsed 7m 39s (remain 2m 13s) Loss: 0.7864 Grad: 144622.8438  LR: 0.00000058  


Train:  79%|███████▊  | 1361/1730 [07:46<02:07,  2.89train_batch/s]

Epoch: [2][1360/1730] Elapsed 7m 46s (remain 2m 6s) Loss: 0.7854 Grad: 280397.7812  LR: 0.00000116  


Train:  80%|███████▉  | 1381/1730 [07:52<01:43,  3.38train_batch/s]

Epoch: [2][1380/1730] Elapsed 7m 52s (remain 1m 59s) Loss: 0.7842 Grad: 43650.1523  LR: 0.00000194  


Train:  81%|████████  | 1401/1730 [07:59<02:09,  2.54train_batch/s]

Epoch: [2][1400/1730] Elapsed 7m 59s (remain 1m 52s) Loss: 0.7825 Grad: 239831.2969  LR: 0.00000288  


Train:  82%|████████▏ | 1421/1730 [08:05<01:29,  3.46train_batch/s]

Epoch: [2][1420/1730] Elapsed 8m 5s (remain 1m 45s) Loss: 0.7836 Grad: 161981.2031  LR: 0.00000397  


Train:  83%|████████▎ | 1441/1730 [08:12<02:00,  2.40train_batch/s]

Epoch: [2][1440/1730] Elapsed 8m 12s (remain 1m 38s) Loss: 0.7831 Grad: 95389.2266  LR: 0.00000519  


Train:  84%|████████▍ | 1461/1730 [08:19<01:54,  2.35train_batch/s]

Epoch: [2][1460/1730] Elapsed 8m 19s (remain 1m 31s) Loss: 0.7838 Grad: 196303.0625  LR: 0.00000651  


Train:  86%|████████▌ | 1481/1730 [08:25<01:13,  3.40train_batch/s]

Epoch: [2][1480/1730] Elapsed 8m 25s (remain 1m 25s) Loss: 0.7834 Grad: 129041.7188  LR: 0.00000791  


Train:  87%|████████▋ | 1501/1730 [08:32<01:26,  2.64train_batch/s]

Epoch: [2][1500/1730] Elapsed 8m 32s (remain 1m 18s) Loss: 0.7833 Grad: 164950.1562  LR: 0.00000935  


Train:  88%|████████▊ | 1521/1730 [08:39<01:07,  3.12train_batch/s]

Epoch: [2][1520/1730] Elapsed 8m 39s (remain 1m 11s) Loss: 0.7834 Grad: 89397.5938  LR: 0.00001080  


Train:  89%|████████▉ | 1541/1730 [08:46<01:00,  3.13train_batch/s]

Epoch: [2][1540/1730] Elapsed 8m 46s (remain 1m 4s) Loss: 0.7839 Grad: 117555.9219  LR: 0.00001224  


Train:  90%|█████████ | 1562/1730 [08:53<01:00,  2.78train_batch/s]

Epoch: [2][1560/1730] Elapsed 8m 53s (remain 0m 57s) Loss: 0.7836 Grad: 64699.1406  LR: 0.00001362  


Train:  91%|█████████▏| 1582/1730 [09:00<00:46,  3.17train_batch/s]

Epoch: [2][1580/1730] Elapsed 8m 59s (remain 0m 50s) Loss: 0.7840 Grad: 120188.8594  LR: 0.00001494  


Train:  93%|█████████▎| 1601/1730 [09:05<00:34,  3.69train_batch/s]

Epoch: [2][1600/1730] Elapsed 9m 5s (remain 0m 43s) Loss: 0.7829 Grad: 127070.9141  LR: 0.00001615  


Train:  94%|█████████▎| 1621/1730 [09:12<00:36,  2.98train_batch/s]

Epoch: [2][1620/1730] Elapsed 9m 12s (remain 0m 37s) Loss: 0.7848 Grad: 115775.2109  LR: 0.00001722  


Train:  95%|█████████▍| 1641/1730 [09:19<00:37,  2.39train_batch/s]

Epoch: [2][1640/1730] Elapsed 9m 19s (remain 0m 30s) Loss: 0.7858 Grad: 295834.1250  LR: 0.00001815  


Train:  96%|█████████▌| 1661/1730 [09:27<00:23,  2.95train_batch/s]

Epoch: [2][1660/1730] Elapsed 9m 27s (remain 0m 23s) Loss: 0.7850 Grad: 81856.5312  LR: 0.00001890  


Train:  97%|█████████▋| 1681/1730 [09:34<00:23,  2.06train_batch/s]

Epoch: [2][1680/1730] Elapsed 9m 34s (remain 0m 16s) Loss: 0.7851 Grad: 63334.2695  LR: 0.00001947  


Train:  98%|█████████▊| 1701/1730 [09:40<00:08,  3.51train_batch/s]

Epoch: [2][1700/1730] Elapsed 9m 40s (remain 0m 9s) Loss: 0.7843 Grad: 199365.2969  LR: 0.00001984  


Train:  99%|█████████▉| 1721/1730 [09:48<00:03,  2.51train_batch/s]

Epoch: [2][1720/1730] Elapsed 9m 48s (remain 0m 3s) Loss: 0.7836 Grad: 141849.4375  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:51<00:00,  2.92train_batch/s]


Epoch: [2][1729/1730] Elapsed 9m 51s (remain 0m 0s) Loss: 0.7839 Grad: 93189.0234  LR: 0.00002000  


Validation:   0%|          | 3/866 [00:00<01:58,  7.31valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 4m 8s) Loss: 1.5520 


Validation:   3%|▎         | 22/866 [00:01<00:56, 14.94valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 7s) Loss: 0.7730 


Validation:   5%|▌         | 44/866 [00:03<00:57, 14.41valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 1s) Loss: 0.7967 


Validation:   7%|▋         | 63/866 [00:04<00:53, 15.07valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 0m 58s) Loss: 0.8751 


Validation:   9%|▉         | 81/866 [00:05<00:53, 14.65valid_batch/s]

EVAL: [80/866] Elapsed 0m 5s (remain 0m 55s) Loss: 0.8322 


Validation:  12%|█▏        | 102/866 [00:07<00:55, 13.71valid_batch/s]

EVAL: [100/866] Elapsed 0m 7s (remain 0m 55s) Loss: 0.8647 


Validation:  14%|█▍        | 123/866 [00:08<00:52, 14.17valid_batch/s]

EVAL: [120/866] Elapsed 0m 8s (remain 0m 53s) Loss: 0.8434 


Validation:  17%|█▋        | 143/866 [00:10<00:55, 12.93valid_batch/s]

EVAL: [140/866] Elapsed 0m 10s (remain 0m 51s) Loss: 0.8216 


Validation:  19%|█▉        | 163/866 [00:11<00:45, 15.46valid_batch/s]

EVAL: [160/866] Elapsed 0m 11s (remain 0m 51s) Loss: 0.8314 


Validation:  21%|██        | 183/866 [00:13<00:43, 15.55valid_batch/s]

EVAL: [180/866] Elapsed 0m 12s (remain 0m 49s) Loss: 0.8213 


Validation:  23%|██▎       | 203/866 [00:14<00:49, 13.39valid_batch/s]

EVAL: [200/866] Elapsed 0m 14s (remain 0m 47s) Loss: 0.8289 


Validation:  26%|██▌       | 224/866 [00:15<00:37, 17.06valid_batch/s]

EVAL: [220/866] Elapsed 0m 15s (remain 0m 46s) Loss: 0.8351 


Validation:  28%|██▊       | 242/866 [00:17<01:00, 10.39valid_batch/s]

EVAL: [240/866] Elapsed 0m 17s (remain 0m 45s) Loss: 0.8382 


Validation:  30%|███       | 262/866 [00:19<00:42, 14.27valid_batch/s]

EVAL: [260/866] Elapsed 0m 18s (remain 0m 43s) Loss: 0.8440 


Validation:  33%|███▎      | 282/866 [00:20<00:45, 12.82valid_batch/s]

EVAL: [280/866] Elapsed 0m 20s (remain 0m 42s) Loss: 0.8387 


Validation:  35%|███▍      | 303/866 [00:22<00:42, 13.17valid_batch/s]

EVAL: [300/866] Elapsed 0m 22s (remain 0m 41s) Loss: 0.8349 


Validation:  37%|███▋      | 322/866 [00:23<00:47, 11.34valid_batch/s]

EVAL: [320/866] Elapsed 0m 23s (remain 0m 39s) Loss: 0.8438 


Validation:  40%|███▉      | 343/866 [00:25<00:39, 13.20valid_batch/s]

EVAL: [340/866] Elapsed 0m 25s (remain 0m 38s) Loss: 0.8338 


Validation:  42%|████▏     | 363/866 [00:26<00:32, 15.39valid_batch/s]

EVAL: [360/866] Elapsed 0m 26s (remain 0m 36s) Loss: 0.8283 


Validation:  44%|████▍     | 382/866 [00:27<00:34, 14.00valid_batch/s]

EVAL: [380/866] Elapsed 0m 27s (remain 0m 35s) Loss: 0.8317 


Validation:  46%|████▋     | 402/866 [00:29<00:32, 14.25valid_batch/s]

EVAL: [400/866] Elapsed 0m 29s (remain 0m 33s) Loss: 0.8253 


Validation:  49%|████▉     | 424/866 [00:30<00:29, 15.09valid_batch/s]

EVAL: [420/866] Elapsed 0m 30s (remain 0m 32s) Loss: 0.8278 


Validation:  51%|█████     | 442/866 [00:32<00:30, 13.69valid_batch/s]

EVAL: [440/866] Elapsed 0m 32s (remain 0m 30s) Loss: 0.8355 


Validation:  53%|█████▎    | 462/866 [00:33<00:29, 13.53valid_batch/s]

EVAL: [460/866] Elapsed 0m 33s (remain 0m 29s) Loss: 0.8319 


Validation:  56%|█████▌    | 483/866 [00:35<00:25, 14.77valid_batch/s]

EVAL: [480/866] Elapsed 0m 34s (remain 0m 27s) Loss: 0.8347 


Validation:  58%|█████▊    | 503/866 [00:36<00:28, 12.65valid_batch/s]

EVAL: [500/866] Elapsed 0m 36s (remain 0m 26s) Loss: 0.8355 


Validation:  60%|██████    | 522/866 [00:37<00:24, 14.08valid_batch/s]

EVAL: [520/866] Elapsed 0m 37s (remain 0m 25s) Loss: 0.8414 


Validation:  63%|██████▎   | 542/866 [00:39<00:25, 12.80valid_batch/s]

EVAL: [540/866] Elapsed 0m 39s (remain 0m 23s) Loss: 0.8415 


Validation:  65%|██████▌   | 563/866 [00:40<00:24, 12.22valid_batch/s]

EVAL: [560/866] Elapsed 0m 40s (remain 0m 22s) Loss: 0.8421 


Validation:  67%|██████▋   | 583/866 [00:42<00:22, 12.47valid_batch/s]

EVAL: [580/866] Elapsed 0m 42s (remain 0m 20s) Loss: 0.8440 


Validation:  70%|██████▉   | 604/866 [00:43<00:18, 14.42valid_batch/s]

EVAL: [600/866] Elapsed 0m 43s (remain 0m 19s) Loss: 0.8393 


Validation:  72%|███████▏  | 625/866 [00:45<00:14, 16.90valid_batch/s]

EVAL: [620/866] Elapsed 0m 45s (remain 0m 17s) Loss: 0.8423 


Validation:  74%|███████▍  | 643/866 [00:46<00:14, 15.46valid_batch/s]

EVAL: [640/866] Elapsed 0m 46s (remain 0m 16s) Loss: 0.8448 


Validation:  76%|███████▋  | 662/866 [00:47<00:13, 15.01valid_batch/s]

EVAL: [660/866] Elapsed 0m 47s (remain 0m 14s) Loss: 0.8376 


Validation:  79%|███████▉  | 683/866 [00:49<00:12, 14.73valid_batch/s]

EVAL: [680/866] Elapsed 0m 49s (remain 0m 13s) Loss: 0.8365 


Validation:  81%|████████  | 701/866 [00:50<00:12, 13.29valid_batch/s]

EVAL: [700/866] Elapsed 0m 50s (remain 0m 11s) Loss: 0.8337 


Validation:  84%|████████▎ | 724/866 [00:52<00:09, 14.84valid_batch/s]

EVAL: [720/866] Elapsed 0m 52s (remain 0m 10s) Loss: 0.8397 


Validation:  86%|████████▌ | 742/866 [00:53<00:09, 12.40valid_batch/s]

EVAL: [740/866] Elapsed 0m 53s (remain 0m 9s) Loss: 0.8419 


Validation:  88%|████████▊ | 763/866 [00:55<00:06, 15.91valid_batch/s]

EVAL: [760/866] Elapsed 0m 55s (remain 0m 7s) Loss: 0.8407 


Validation:  90%|█████████ | 783/866 [00:56<00:06, 13.76valid_batch/s]

EVAL: [780/866] Elapsed 0m 56s (remain 0m 6s) Loss: 0.8380 


Validation:  93%|█████████▎| 802/866 [00:58<00:03, 16.44valid_batch/s]

EVAL: [800/866] Elapsed 0m 58s (remain 0m 4s) Loss: 0.8427 


Validation:  95%|█████████▌| 823/866 [00:59<00:02, 16.23valid_batch/s]

EVAL: [820/866] Elapsed 0m 59s (remain 0m 3s) Loss: 0.8418 


Validation:  97%|█████████▋| 843/866 [01:01<00:01, 13.15valid_batch/s]

EVAL: [840/866] Elapsed 1m 0s (remain 0m 1s) Loss: 0.8445 


Validation: 100%|█████████▉| 862/866 [01:02<00:00, 14.11valid_batch/s]

EVAL: [860/866] Elapsed 1m 2s (remain 0m 0s) Loss: 0.8465 


Validation: 100%|██████████| 866/866 [01:02<00:00, 13.78valid_batch/s]
Epoch 2 - avg_train_loss: 0.7839  avg_val_loss: 0.8476  time: 655s
Epoch 2 - Score: 0.7912
Epoch 2 - Save Best Score: 0.7912 Model


EVAL: [865/866] Elapsed 1m 2s (remain 0m 0s) Loss: 0.8476 


Score: 0.7912
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/1730 [00:00<14:01,  2.06train_batch/s]

Epoch: [1][0/1730] Elapsed 0m 0s (remain 14m 1s) Loss: 1.5472 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:08<11:10,  2.55train_batch/s]

Epoch: [1][20/1730] Elapsed 0m 8s (remain 11m 3s) Loss: 1.6041 Grad: 73545.6484  LR: 0.00001988  


Train:   2%|▏         | 41/1730 [00:14<07:35,  3.71train_batch/s]

Epoch: [1][40/1730] Elapsed 0m 14s (remain 9m 45s) Loss: 1.4900 Grad: 92856.8594  LR: 0.00001956  


Train:   4%|▎         | 61/1730 [00:21<08:32,  3.26train_batch/s]

Epoch: [1][60/1730] Elapsed 0m 21s (remain 9m 36s) Loss: 1.4110 Grad: 48508.6055  LR: 0.00001903  


Train:   5%|▍         | 81/1730 [00:27<08:55,  3.08train_batch/s]

Epoch: [1][80/1730] Elapsed 0m 27s (remain 9m 23s) Loss: 1.3559 Grad: 85456.4219  LR: 0.00001831  


Train:   6%|▌         | 101/1730 [00:34<08:21,  3.25train_batch/s]

Epoch: [1][100/1730] Elapsed 0m 34s (remain 9m 16s) Loss: 1.3027 Grad: 36710.5625  LR: 0.00001742  


Train:   7%|▋         | 121/1730 [00:41<08:19,  3.22train_batch/s]

Epoch: [1][120/1730] Elapsed 0m 41s (remain 9m 13s) Loss: 1.2526 Grad: 43951.1680  LR: 0.00001637  


Train:   8%|▊         | 142/1730 [00:48<09:29,  2.79train_batch/s]

Epoch: [1][140/1730] Elapsed 0m 48s (remain 9m 2s) Loss: 1.2129 Grad: 31596.3594  LR: 0.00001519  


Train:   9%|▉         | 161/1730 [00:54<09:48,  2.67train_batch/s]

Epoch: [1][160/1730] Elapsed 0m 54s (remain 8m 49s) Loss: 1.1741 Grad: 42048.5234  LR: 0.00001389  


Train:  10%|█         | 181/1730 [01:00<08:13,  3.14train_batch/s]

Epoch: [1][180/1730] Elapsed 1m 0s (remain 8m 35s) Loss: 1.1608 Grad: 122358.7422  LR: 0.00001252  


Train:  12%|█▏        | 201/1730 [01:07<08:41,  2.93train_batch/s]

Epoch: [1][200/1730] Elapsed 1m 7s (remain 8m 32s) Loss: 1.1418 Grad: 65791.8984  LR: 0.00001109  


Train:  13%|█▎        | 221/1730 [01:14<11:03,  2.27train_batch/s]

Epoch: [1][220/1730] Elapsed 1m 14s (remain 8m 28s) Loss: 1.1277 Grad: 47301.6250  LR: 0.00000964  


Train:  14%|█▍        | 241/1730 [01:21<07:21,  3.38train_batch/s]

Epoch: [1][240/1730] Elapsed 1m 21s (remain 8m 23s) Loss: 1.1145 Grad: 31437.3359  LR: 0.00000819  


Train:  15%|█▌        | 261/1730 [01:27<06:52,  3.56train_batch/s]

Epoch: [1][260/1730] Elapsed 1m 27s (remain 8m 12s) Loss: 1.0982 Grad: 107107.3516  LR: 0.00000679  


Train:  16%|█▌        | 281/1730 [01:34<08:21,  2.89train_batch/s]

Epoch: [1][280/1730] Elapsed 1m 33s (remain 8m 4s) Loss: 1.0960 Grad: 90762.3203  LR: 0.00000545  


Train:  17%|█▋        | 302/1730 [01:40<06:51,  3.47train_batch/s]

Epoch: [1][300/1730] Elapsed 1m 40s (remain 7m 58s) Loss: 1.0779 Grad: 34864.8477  LR: 0.00000420  


Train:  19%|█▊        | 321/1730 [01:47<08:46,  2.68train_batch/s]

Epoch: [1][320/1730] Elapsed 1m 47s (remain 7m 50s) Loss: 1.0693 Grad: 72759.4609  LR: 0.00000308  


Train:  20%|█▉        | 341/1730 [01:53<08:22,  2.76train_batch/s]

Epoch: [1][340/1730] Elapsed 1m 53s (remain 7m 43s) Loss: 1.0578 Grad: 78247.8594  LR: 0.00000211  


Train:  21%|██        | 362/1730 [02:00<06:47,  3.36train_batch/s]

Epoch: [1][360/1730] Elapsed 2m 0s (remain 7m 36s) Loss: 1.0428 Grad: 188347.4688  LR: 0.00000130  


Train:  22%|██▏       | 381/1730 [02:08<09:15,  2.43train_batch/s]

Epoch: [1][380/1730] Elapsed 2m 8s (remain 7m 34s) Loss: 1.0361 Grad: 34662.6953  LR: 0.00000068  


Train:  23%|██▎       | 401/1730 [02:15<06:35,  3.36train_batch/s]

Epoch: [1][400/1730] Elapsed 2m 15s (remain 7m 28s) Loss: 1.0363 Grad: 28728.0156  LR: 0.00000025  


Train:  24%|██▍       | 421/1730 [02:21<05:52,  3.72train_batch/s]

Epoch: [1][420/1730] Elapsed 2m 21s (remain 7m 19s) Loss: 1.0283 Grad: 43053.4062  LR: 0.00000003  


Train:  25%|██▌       | 441/1730 [02:27<06:30,  3.30train_batch/s]

Epoch: [1][440/1730] Elapsed 2m 27s (remain 7m 12s) Loss: 1.0227 Grad: 17543.6035  LR: 0.00000002  


Train:  27%|██▋       | 461/1730 [02:34<06:21,  3.33train_batch/s]

Epoch: [1][460/1730] Elapsed 2m 34s (remain 7m 4s) Loss: 1.0176 Grad: 29522.6465  LR: 0.00000022  


Train:  28%|██▊       | 481/1730 [02:41<08:30,  2.45train_batch/s]

Epoch: [1][480/1730] Elapsed 2m 41s (remain 6m 58s) Loss: 1.0108 Grad: 28851.3516  LR: 0.00000063  


Train:  29%|██▉       | 501/1730 [02:48<08:10,  2.50train_batch/s]

Epoch: [1][500/1730] Elapsed 2m 48s (remain 6m 53s) Loss: 1.0052 Grad: 23537.0898  LR: 0.00000123  


Train:  30%|███       | 521/1730 [02:54<07:04,  2.85train_batch/s]

Epoch: [1][520/1730] Elapsed 2m 54s (remain 6m 44s) Loss: 0.9978 Grad: nan  LR: 0.00000202  


Train:  31%|███▏      | 541/1730 [03:01<05:31,  3.58train_batch/s]

Epoch: [1][540/1730] Elapsed 3m 1s (remain 6m 38s) Loss: 0.9918 Grad: 20170.2305  LR: 0.00000298  


Train:  32%|███▏      | 561/1730 [03:07<07:12,  2.70train_batch/s]

Epoch: [1][560/1730] Elapsed 3m 7s (remain 6m 31s) Loss: 0.9880 Grad: 28855.6758  LR: 0.00000409  


Train:  34%|███▎      | 581/1730 [03:15<06:56,  2.76train_batch/s]

Epoch: [1][580/1730] Elapsed 3m 15s (remain 6m 25s) Loss: 0.9860 Grad: 9719.8320  LR: 0.00000532  


Train:  35%|███▍      | 601/1730 [03:21<06:42,  2.81train_batch/s]

Epoch: [1][600/1730] Elapsed 3m 21s (remain 6m 18s) Loss: 0.9840 Grad: 40219.8008  LR: 0.00000665  


Train:  36%|███▌      | 621/1730 [03:27<06:47,  2.72train_batch/s]

Epoch: [1][620/1730] Elapsed 3m 27s (remain 6m 11s) Loss: 0.9829 Grad: 100255.0156  LR: 0.00000805  


Train:  37%|███▋      | 641/1730 [03:35<07:35,  2.39train_batch/s]

Epoch: [1][640/1730] Elapsed 3m 35s (remain 6m 6s) Loss: 0.9852 Grad: 21980.5312  LR: 0.00000949  


Train:  38%|███▊      | 661/1730 [03:42<06:28,  2.75train_batch/s]

Epoch: [1][660/1730] Elapsed 3m 42s (remain 5m 59s) Loss: 0.9812 Grad: 16794.1836  LR: 0.00001094  


Train:  39%|███▉      | 682/1730 [03:49<05:04,  3.44train_batch/s]

Epoch: [1][680/1730] Elapsed 3m 49s (remain 5m 53s) Loss: 0.9788 Grad: 23605.2402  LR: 0.00001238  


Train:  41%|████      | 701/1730 [03:56<06:58,  2.46train_batch/s]

Epoch: [1][700/1730] Elapsed 3m 56s (remain 5m 47s) Loss: 0.9797 Grad: 16264.4395  LR: 0.00001376  


Train:  42%|████▏     | 721/1730 [04:02<05:44,  2.93train_batch/s]

Epoch: [1][720/1730] Elapsed 4m 2s (remain 5m 39s) Loss: 0.9812 Grad: 20473.5117  LR: 0.00001506  


Train:  43%|████▎     | 741/1730 [04:10<07:17,  2.26train_batch/s]

Epoch: [1][740/1730] Elapsed 4m 10s (remain 5m 33s) Loss: 0.9812 Grad: 26380.3047  LR: 0.00001626  


Train:  44%|████▍     | 761/1730 [04:18<06:44,  2.40train_batch/s]

Epoch: [1][760/1730] Elapsed 4m 18s (remain 5m 28s) Loss: 0.9833 Grad: 5021.6201  LR: 0.00001732  


Train:  45%|████▌     | 781/1730 [04:25<05:55,  2.67train_batch/s]

Epoch: [1][780/1730] Elapsed 4m 25s (remain 5m 22s) Loss: 0.9854 Grad: 6163.8369  LR: 0.00001823  


Train:  46%|████▋     | 802/1730 [04:31<04:38,  3.34train_batch/s]

Epoch: [1][800/1730] Elapsed 4m 31s (remain 5m 15s) Loss: 0.9871 Grad: 10459.4707  LR: 0.00001897  


Train:  47%|████▋     | 821/1730 [04:38<05:08,  2.94train_batch/s]

Epoch: [1][820/1730] Elapsed 4m 38s (remain 5m 7s) Loss: 0.9914 Grad: 10575.0615  LR: 0.00001952  


Train:  49%|████▊     | 841/1730 [04:44<04:14,  3.49train_batch/s]

Epoch: [1][840/1730] Elapsed 4m 44s (remain 5m 1s) Loss: 0.9960 Grad: 11916.2666  LR: 0.00001986  


Train:  50%|████▉     | 861/1730 [04:51<05:23,  2.69train_batch/s]

Epoch: [1][860/1730] Elapsed 4m 51s (remain 4m 54s) Loss: 0.9982 Grad: 8220.8525  LR: 0.00002000  


Train:  51%|█████     | 881/1730 [04:57<05:29,  2.58train_batch/s]

Epoch: [1][880/1730] Elapsed 4m 57s (remain 4m 46s) Loss: 0.9967 Grad: 10166.3896  LR: 0.00001992  


Train:  52%|█████▏    | 901/1730 [05:03<04:05,  3.38train_batch/s]

Epoch: [1][900/1730] Elapsed 5m 3s (remain 4m 39s) Loss: 0.9967 Grad: 9090.7627  LR: 0.00001964  


Train:  53%|█████▎    | 921/1730 [05:11<06:16,  2.15train_batch/s]

Epoch: [1][920/1730] Elapsed 5m 11s (remain 4m 33s) Loss: 0.9981 Grad: 11543.2227  LR: 0.00001915  


Train:  54%|█████▍    | 942/1730 [05:18<03:23,  3.87train_batch/s]

Epoch: [1][940/1730] Elapsed 5m 17s (remain 4m 26s) Loss: 0.9991 Grad: 8597.9717  LR: 0.00001847  


Train:  56%|█████▌    | 961/1730 [05:24<04:03,  3.16train_batch/s]

Epoch: [1][960/1730] Elapsed 5m 24s (remain 4m 19s) Loss: 0.9974 Grad: 4403.6387  LR: 0.00001761  


Train:  57%|█████▋    | 981/1730 [05:31<04:30,  2.77train_batch/s]

Epoch: [1][980/1730] Elapsed 5m 31s (remain 4m 12s) Loss: 0.9977 Grad: 5876.9536  LR: 0.00001659  


Train:  58%|█████▊    | 1001/1730 [05:38<04:05,  2.97train_batch/s]

Epoch: [1][1000/1730] Elapsed 5m 38s (remain 4m 6s) Loss: 0.9968 Grad: 10075.2676  LR: 0.00001543  


Train:  59%|█████▉    | 1021/1730 [05:44<03:54,  3.02train_batch/s]

Epoch: [1][1020/1730] Elapsed 5m 44s (remain 3m 59s) Loss: 0.9940 Grad: 9491.7568  LR: 0.00001416  


Train:  60%|██████    | 1041/1730 [05:51<03:55,  2.92train_batch/s]

Epoch: [1][1040/1730] Elapsed 5m 51s (remain 3m 52s) Loss: 0.9928 Grad: 2343.6440  LR: 0.00001280  


Train:  61%|██████▏   | 1061/1730 [05:58<04:30,  2.47train_batch/s]

Epoch: [1][1060/1730] Elapsed 5m 58s (remain 3m 46s) Loss: 0.9884 Grad: 7493.6880  LR: 0.00001138  


Train:  62%|██████▏   | 1081/1730 [06:04<03:06,  3.49train_batch/s]

Epoch: [1][1080/1730] Elapsed 6m 4s (remain 3m 39s) Loss: 0.9863 Grad: 12359.6270  LR: 0.00000993  


Train:  64%|██████▎   | 1101/1730 [06:11<02:34,  4.08train_batch/s]

Epoch: [1][1100/1730] Elapsed 6m 11s (remain 3m 32s) Loss: 0.9836 Grad: 7192.1230  LR: 0.00000848  


Train:  65%|██████▍   | 1121/1730 [06:18<03:13,  3.14train_batch/s]

Epoch: [1][1120/1730] Elapsed 6m 18s (remain 3m 25s) Loss: 0.9808 Grad: 10222.9863  LR: 0.00000706  


Train:  66%|██████▌   | 1141/1730 [06:24<03:01,  3.25train_batch/s]

Epoch: [1][1140/1730] Elapsed 6m 24s (remain 3m 18s) Loss: 0.9782 Grad: 6174.2217  LR: 0.00000571  


Train:  67%|██████▋   | 1161/1730 [06:32<03:14,  2.92train_batch/s]

Epoch: [1][1160/1730] Elapsed 6m 32s (remain 3m 12s) Loss: 0.9761 Grad: 5271.4028  LR: 0.00000444  


Train:  68%|██████▊   | 1181/1730 [06:38<02:56,  3.12train_batch/s]

Epoch: [1][1180/1730] Elapsed 6m 38s (remain 3m 5s) Loss: 0.9724 Grad: 10012.3477  LR: 0.00000330  


Train:  69%|██████▉   | 1201/1730 [06:45<03:55,  2.24train_batch/s]

Epoch: [1][1200/1730] Elapsed 6m 45s (remain 2m 58s) Loss: 0.9694 Grad: 5948.3589  LR: 0.00000229  


Train:  71%|███████   | 1221/1730 [06:53<03:20,  2.54train_batch/s]

Epoch: [1][1220/1730] Elapsed 6m 53s (remain 2m 52s) Loss: 0.9661 Grad: 11276.0625  LR: 0.00000145  


Train:  72%|███████▏  | 1242/1730 [07:00<02:03,  3.95train_batch/s]

Epoch: [1][1240/1730] Elapsed 7m 0s (remain 2m 45s) Loss: 0.9635 Grad: 6735.7383  LR: 0.00000079  


Train:  73%|███████▎  | 1261/1730 [07:06<02:38,  2.96train_batch/s]

Epoch: [1][1260/1730] Elapsed 7m 6s (remain 2m 38s) Loss: 0.9591 Grad: 8835.6143  LR: 0.00000032  


Train:  74%|███████▍  | 1281/1730 [07:13<02:04,  3.61train_batch/s]

Epoch: [1][1280/1730] Elapsed 7m 13s (remain 2m 31s) Loss: 0.9559 Grad: 7501.4263  LR: 0.00000006  


Train:  75%|███████▌  | 1301/1730 [07:20<02:34,  2.79train_batch/s]

Epoch: [1][1300/1730] Elapsed 7m 20s (remain 2m 25s) Loss: 0.9530 Grad: 7467.9253  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [07:27<02:18,  2.96train_batch/s]

Epoch: [1][1320/1730] Elapsed 7m 27s (remain 2m 18s) Loss: 0.9503 Grad: 6713.9229  LR: 0.00000016  


Train:  78%|███████▊  | 1341/1730 [07:34<02:16,  2.85train_batch/s]

Epoch: [1][1340/1730] Elapsed 7m 34s (remain 2m 11s) Loss: 0.9493 Grad: 8092.9668  LR: 0.00000053  


Train:  79%|███████▊  | 1362/1730 [07:40<01:29,  4.11train_batch/s]

Epoch: [1][1360/1730] Elapsed 7m 40s (remain 2m 4s) Loss: 0.9475 Grad: 6139.0210  LR: 0.00000110  


Train:  80%|███████▉  | 1382/1730 [07:47<01:59,  2.90train_batch/s]

Epoch: [1][1380/1730] Elapsed 7m 47s (remain 1m 58s) Loss: 0.9443 Grad: 7522.4102  LR: 0.00000185  


Train:  81%|████████  | 1401/1730 [07:54<02:27,  2.23train_batch/s]

Epoch: [1][1400/1730] Elapsed 7m 54s (remain 1m 51s) Loss: 0.9436 Grad: 11851.0674  LR: 0.00000278  


Train:  82%|████████▏ | 1421/1730 [08:00<01:36,  3.20train_batch/s]

Epoch: [1][1420/1730] Elapsed 8m 0s (remain 1m 44s) Loss: 0.9435 Grad: 7343.6665  LR: 0.00000385  


Train:  83%|████████▎ | 1441/1730 [08:06<01:20,  3.58train_batch/s]

Epoch: [1][1440/1730] Elapsed 8m 6s (remain 1m 37s) Loss: 0.9415 Grad: 4095.9980  LR: 0.00000506  


Train:  84%|████████▍ | 1461/1730 [08:13<01:29,  3.00train_batch/s]

Epoch: [1][1460/1730] Elapsed 8m 13s (remain 1m 30s) Loss: 0.9391 Grad: 8349.6895  LR: 0.00000638  


Train:  86%|████████▌ | 1481/1730 [08:20<01:19,  3.11train_batch/s]

Epoch: [1][1480/1730] Elapsed 8m 20s (remain 1m 24s) Loss: 0.9373 Grad: 4772.1655  LR: 0.00000776  


Train:  87%|████████▋ | 1501/1730 [08:27<01:31,  2.51train_batch/s]

Epoch: [1][1500/1730] Elapsed 8m 27s (remain 1m 17s) Loss: 0.9368 Grad: 8629.0596  LR: 0.00000920  


Train:  88%|████████▊ | 1521/1730 [08:35<01:24,  2.47train_batch/s]

Epoch: [1][1520/1730] Elapsed 8m 35s (remain 1m 10s) Loss: 0.9361 Grad: 12789.2900  LR: 0.00001065  


Train:  89%|████████▉ | 1541/1730 [08:41<00:53,  3.51train_batch/s]

Epoch: [1][1540/1730] Elapsed 8m 41s (remain 1m 3s) Loss: 0.9349 Grad: 8812.1484  LR: 0.00001209  


Train:  90%|█████████ | 1561/1730 [08:48<01:11,  2.35train_batch/s]

Epoch: [1][1560/1730] Elapsed 8m 48s (remain 0m 57s) Loss: 0.9345 Grad: 9328.9375  LR: 0.00001349  


Train:  91%|█████████▏| 1581/1730 [08:56<00:48,  3.06train_batch/s]

Epoch: [1][1580/1730] Elapsed 8m 56s (remain 0m 50s) Loss: 0.9322 Grad: 7897.5205  LR: 0.00001481  


Train:  93%|█████████▎| 1601/1730 [09:02<00:47,  2.71train_batch/s]

Epoch: [1][1600/1730] Elapsed 9m 2s (remain 0m 43s) Loss: 0.9306 Grad: 10431.3965  LR: 0.00001603  


Train:  94%|█████████▎| 1621/1730 [09:09<00:36,  3.02train_batch/s]

Epoch: [1][1620/1730] Elapsed 9m 9s (remain 0m 36s) Loss: 0.9303 Grad: 5181.5068  LR: 0.00001712  


Train:  95%|█████████▍| 1641/1730 [09:15<00:37,  2.37train_batch/s]

Epoch: [1][1640/1730] Elapsed 9m 15s (remain 0m 30s) Loss: 0.9292 Grad: 5942.7085  LR: 0.00001806  


Train:  96%|█████████▌| 1661/1730 [09:22<00:19,  3.58train_batch/s]

Epoch: [1][1660/1730] Elapsed 9m 22s (remain 0m 23s) Loss: 0.9279 Grad: 2940.6653  LR: 0.00001884  


Train:  97%|█████████▋| 1681/1730 [09:29<00:19,  2.55train_batch/s]

Epoch: [1][1680/1730] Elapsed 9m 29s (remain 0m 16s) Loss: 0.9266 Grad: 11755.8643  LR: 0.00001942  


Train:  98%|█████████▊| 1701/1730 [09:37<00:11,  2.55train_batch/s]

Epoch: [1][1700/1730] Elapsed 9m 37s (remain 0m 9s) Loss: 0.9250 Grad: 5058.6470  LR: 0.00001981  


Train:  99%|█████████▉| 1721/1730 [09:43<00:03,  2.78train_batch/s]

Epoch: [1][1720/1730] Elapsed 9m 43s (remain 0m 3s) Loss: 0.9246 Grad: 5284.0127  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:46<00:00,  2.95train_batch/s]


Epoch: [1][1729/1730] Elapsed 9m 46s (remain 0m 0s) Loss: 0.9239 Grad: 8454.3711  LR: 0.00002000  


Validation:   0%|          | 1/866 [00:00<03:50,  3.75valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 3m 51s) Loss: 1.1195 


Validation:   3%|▎         | 23/866 [00:02<01:09, 12.14valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 16s) Loss: 0.5981 


Validation:   5%|▍         | 43/866 [00:03<00:53, 15.31valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 7s) Loss: 0.7475 


Validation:   7%|▋         | 63/866 [00:05<00:56, 14.24valid_batch/s]

EVAL: [60/866] Elapsed 0m 4s (remain 1m 4s) Loss: 0.7800 


Validation:  10%|▉         | 83/866 [00:06<00:56, 13.85valid_batch/s]

EVAL: [80/866] Elapsed 0m 6s (remain 1m 2s) Loss: 0.8261 


Validation:  12%|█▏        | 102/866 [00:08<00:50, 15.14valid_batch/s]

EVAL: [100/866] Elapsed 0m 8s (remain 1m 0s) Loss: 0.8174 


Validation:  14%|█▍        | 124/866 [00:09<00:54, 13.69valid_batch/s]

EVAL: [120/866] Elapsed 0m 9s (remain 0m 58s) Loss: 0.8289 


Validation:  17%|█▋        | 143/866 [00:10<00:48, 14.94valid_batch/s]

EVAL: [140/866] Elapsed 0m 10s (remain 0m 54s) Loss: 0.8044 


Validation:  19%|█▉        | 163/866 [00:12<00:48, 14.60valid_batch/s]

EVAL: [160/866] Elapsed 0m 12s (remain 0m 52s) Loss: 0.8091 


Validation:  21%|██        | 183/866 [00:13<00:45, 15.14valid_batch/s]

EVAL: [180/866] Elapsed 0m 13s (remain 0m 50s) Loss: 0.8193 


Validation:  23%|██▎       | 203/866 [00:14<00:47, 13.96valid_batch/s]

EVAL: [200/866] Elapsed 0m 14s (remain 0m 49s) Loss: 0.8214 


Validation:  26%|██▌       | 224/866 [00:16<00:42, 14.98valid_batch/s]

EVAL: [220/866] Elapsed 0m 16s (remain 0m 47s) Loss: 0.8230 


Validation:  28%|██▊       | 243/866 [00:17<00:45, 13.81valid_batch/s]

EVAL: [240/866] Elapsed 0m 17s (remain 0m 45s) Loss: 0.8329 


Validation:  30%|███       | 264/866 [00:19<00:41, 14.34valid_batch/s]

EVAL: [260/866] Elapsed 0m 18s (remain 0m 44s) Loss: 0.8327 


Validation:  33%|███▎      | 283/866 [00:20<00:34, 16.76valid_batch/s]

EVAL: [280/866] Elapsed 0m 20s (remain 0m 42s) Loss: 0.8404 


Validation:  35%|███▍      | 303/866 [00:22<00:50, 11.12valid_batch/s]

EVAL: [300/866] Elapsed 0m 21s (remain 0m 41s) Loss: 0.8405 


Validation:  37%|███▋      | 323/866 [00:23<00:43, 12.45valid_batch/s]

EVAL: [320/866] Elapsed 0m 23s (remain 0m 40s) Loss: 0.8415 


Validation:  40%|███▉      | 343/866 [00:25<00:40, 12.86valid_batch/s]

EVAL: [340/866] Elapsed 0m 25s (remain 0m 38s) Loss: 0.8434 


Validation:  42%|████▏     | 363/866 [00:26<00:36, 13.60valid_batch/s]

EVAL: [360/866] Elapsed 0m 26s (remain 0m 37s) Loss: 0.8454 


Validation:  44%|████▍     | 382/866 [00:28<00:37, 12.98valid_batch/s]

EVAL: [380/866] Elapsed 0m 28s (remain 0m 36s) Loss: 0.8407 


Validation:  46%|████▋     | 402/866 [00:30<00:38, 12.03valid_batch/s]

EVAL: [400/866] Elapsed 0m 29s (remain 0m 34s) Loss: 0.8401 


Validation:  49%|████▉     | 424/866 [00:31<00:27, 15.85valid_batch/s]

EVAL: [420/866] Elapsed 0m 31s (remain 0m 33s) Loss: 0.8381 


Validation:  51%|█████     | 443/866 [00:32<00:28, 15.06valid_batch/s]

EVAL: [440/866] Elapsed 0m 32s (remain 0m 31s) Loss: 0.8393 


Validation:  53%|█████▎    | 463/866 [00:34<00:33, 12.06valid_batch/s]

EVAL: [460/866] Elapsed 0m 34s (remain 0m 29s) Loss: 0.8431 


Validation:  56%|█████▌    | 482/866 [00:35<00:25, 15.25valid_batch/s]

EVAL: [480/866] Elapsed 0m 35s (remain 0m 28s) Loss: 0.8446 


Validation:  58%|█████▊    | 502/866 [00:37<00:27, 13.43valid_batch/s]

EVAL: [500/866] Elapsed 0m 37s (remain 0m 27s) Loss: 0.8482 


Validation:  60%|██████    | 523/866 [00:38<00:24, 13.81valid_batch/s]

EVAL: [520/866] Elapsed 0m 38s (remain 0m 25s) Loss: 0.8488 


Validation:  63%|██████▎   | 543/866 [00:40<00:21, 15.30valid_batch/s]

EVAL: [540/866] Elapsed 0m 40s (remain 0m 24s) Loss: 0.8446 


Validation:  65%|██████▌   | 563/866 [00:41<00:19, 15.82valid_batch/s]

EVAL: [560/866] Elapsed 0m 41s (remain 0m 22s) Loss: 0.8443 


Validation:  67%|██████▋   | 582/866 [00:43<00:22, 12.70valid_batch/s]

EVAL: [580/866] Elapsed 0m 43s (remain 0m 21s) Loss: 0.8511 


Validation:  70%|██████▉   | 603/866 [00:44<00:17, 15.29valid_batch/s]

EVAL: [600/866] Elapsed 0m 44s (remain 0m 19s) Loss: 0.8518 


Validation:  72%|███████▏  | 623/866 [00:46<00:16, 14.85valid_batch/s]

EVAL: [620/866] Elapsed 0m 45s (remain 0m 18s) Loss: 0.8531 


Validation:  74%|███████▍  | 643/866 [00:47<00:13, 16.77valid_batch/s]

EVAL: [640/866] Elapsed 0m 47s (remain 0m 16s) Loss: 0.8522 


Validation:  77%|███████▋  | 664/866 [00:48<00:12, 15.87valid_batch/s]

EVAL: [660/866] Elapsed 0m 48s (remain 0m 15s) Loss: 0.8497 


Validation:  79%|███████▉  | 684/866 [00:50<00:12, 14.81valid_batch/s]

EVAL: [680/866] Elapsed 0m 50s (remain 0m 13s) Loss: 0.8509 


Validation:  81%|████████  | 702/866 [00:51<00:11, 14.58valid_batch/s]

EVAL: [700/866] Elapsed 0m 51s (remain 0m 12s) Loss: 0.8480 


Validation:  83%|████████▎ | 722/866 [00:52<00:10, 14.38valid_batch/s]

EVAL: [720/866] Elapsed 0m 52s (remain 0m 10s) Loss: 0.8487 


Validation:  86%|████████▌ | 743/866 [00:54<00:09, 13.55valid_batch/s]

EVAL: [740/866] Elapsed 0m 54s (remain 0m 9s) Loss: 0.8544 


Validation:  88%|████████▊ | 761/866 [00:55<00:07, 13.94valid_batch/s]

EVAL: [760/866] Elapsed 0m 55s (remain 0m 7s) Loss: 0.8535 


Validation:  91%|█████████ | 784/866 [00:57<00:05, 14.50valid_batch/s]

EVAL: [780/866] Elapsed 0m 57s (remain 0m 6s) Loss: 0.8551 


Validation:  93%|█████████▎| 804/866 [00:59<00:04, 13.22valid_batch/s]

EVAL: [800/866] Elapsed 0m 59s (remain 0m 4s) Loss: 0.8497 


Validation:  95%|█████████▌| 824/866 [01:00<00:02, 16.84valid_batch/s]

EVAL: [820/866] Elapsed 1m 0s (remain 0m 3s) Loss: 0.8506 


Validation:  97%|█████████▋| 843/866 [01:01<00:01, 14.15valid_batch/s]

EVAL: [840/866] Elapsed 1m 1s (remain 0m 1s) Loss: 0.8503 


Validation: 100%|█████████▉| 863/866 [01:03<00:00, 10.77valid_batch/s]

EVAL: [860/866] Elapsed 1m 3s (remain 0m 0s) Loss: 0.8518 


Validation: 100%|██████████| 866/866 [01:03<00:00, 13.56valid_batch/s]
Epoch 1 - avg_train_loss: 0.9239  avg_val_loss: 0.8512  time: 651s
Epoch 1 - Score: 0.8049
Epoch 1 - Save Best Score: 0.8049 Model


EVAL: [865/866] Elapsed 1m 3s (remain 0m 0s) Loss: 0.8512 


Train:   0%|          | 1/1730 [00:00<11:48,  2.44train_batch/s]

Epoch: [2][0/1730] Elapsed 0m 0s (remain 11m 48s) Loss: 0.6776 Grad: inf  LR: 0.00002000  


Train:   1%|          | 21/1730 [00:06<09:10,  3.10train_batch/s]

Epoch: [2][20/1730] Elapsed 0m 6s (remain 8m 20s) Loss: 1.0560 Grad: 234143.5156  LR: 0.00001986  


Train:   2%|▏         | 41/1730 [00:13<09:31,  2.96train_batch/s]

Epoch: [2][40/1730] Elapsed 0m 13s (remain 9m 16s) Loss: 1.0111 Grad: 168709.4844  LR: 0.00001952  


Train:   4%|▎         | 61/1730 [00:20<09:20,  2.98train_batch/s]

Epoch: [2][60/1730] Elapsed 0m 20s (remain 9m 28s) Loss: 0.9670 Grad: 88256.5547  LR: 0.00001897  


Train:   5%|▍         | 81/1730 [00:27<10:54,  2.52train_batch/s]

Epoch: [2][80/1730] Elapsed 0m 27s (remain 9m 21s) Loss: 0.9192 Grad: 78709.0859  LR: 0.00001823  


Train:   6%|▌         | 101/1730 [00:34<08:06,  3.35train_batch/s]

Epoch: [2][100/1730] Elapsed 0m 34s (remain 9m 11s) Loss: 0.9052 Grad: 86898.2734  LR: 0.00001732  


Train:   7%|▋         | 121/1730 [00:40<08:16,  3.24train_batch/s]

Epoch: [2][120/1730] Elapsed 0m 40s (remain 9m 2s) Loss: 0.8594 Grad: 36906.0898  LR: 0.00001626  


Train:   8%|▊         | 142/1730 [00:47<06:49,  3.88train_batch/s]

Epoch: [2][140/1730] Elapsed 0m 47s (remain 8m 51s) Loss: 0.8566 Grad: 129421.5000  LR: 0.00001506  


Train:   9%|▉         | 161/1730 [00:53<11:45,  2.23train_batch/s]

Epoch: [2][160/1730] Elapsed 0m 53s (remain 8m 43s) Loss: 0.8513 Grad: 189908.8125  LR: 0.00001376  


Train:  10%|█         | 181/1730 [01:01<08:27,  3.05train_batch/s]

Epoch: [2][180/1730] Elapsed 1m 1s (remain 8m 42s) Loss: 0.8471 Grad: 69128.1250  LR: 0.00001238  


Train:  12%|█▏        | 201/1730 [01:07<08:03,  3.16train_batch/s]

Epoch: [2][200/1730] Elapsed 1m 7s (remain 8m 36s) Loss: 0.8370 Grad: 176476.7812  LR: 0.00001094  


Train:  13%|█▎        | 221/1730 [01:14<07:32,  3.33train_batch/s]

Epoch: [2][220/1730] Elapsed 1m 14s (remain 8m 26s) Loss: 0.8272 Grad: 36543.0938  LR: 0.00000949  


Train:  14%|█▍        | 241/1730 [01:20<07:07,  3.49train_batch/s]

Epoch: [2][240/1730] Elapsed 1m 20s (remain 8m 17s) Loss: 0.8135 Grad: 137425.5156  LR: 0.00000805  


Train:  15%|█▌        | 261/1730 [01:27<11:49,  2.07train_batch/s]

Epoch: [2][260/1730] Elapsed 1m 27s (remain 8m 14s) Loss: 0.8092 Grad: 109175.4375  LR: 0.00000665  


Train:  16%|█▌        | 281/1730 [01:34<09:54,  2.44train_batch/s]

Epoch: [2][280/1730] Elapsed 1m 34s (remain 8m 9s) Loss: 0.8079 Grad: 133737.0000  LR: 0.00000532  


Train:  17%|█▋        | 301/1730 [01:41<08:56,  2.66train_batch/s]

Epoch: [2][300/1730] Elapsed 1m 41s (remain 8m 3s) Loss: 0.8003 Grad: 64976.3750  LR: 0.00000409  


Train:  19%|█▊        | 321/1730 [01:48<07:51,  2.99train_batch/s]

Epoch: [2][320/1730] Elapsed 1m 48s (remain 7m 58s) Loss: 0.7933 Grad: 41194.9258  LR: 0.00000298  


Train:  20%|█▉        | 341/1730 [01:55<07:19,  3.16train_batch/s]

Epoch: [2][340/1730] Elapsed 1m 55s (remain 7m 49s) Loss: 0.7870 Grad: 80200.5625  LR: 0.00000202  


Train:  21%|██        | 361/1730 [02:01<09:12,  2.48train_batch/s]

Epoch: [2][360/1730] Elapsed 2m 1s (remain 7m 41s) Loss: 0.7821 Grad: 130621.7422  LR: 0.00000123  


Train:  22%|██▏       | 381/1730 [02:08<06:52,  3.27train_batch/s]

Epoch: [2][380/1730] Elapsed 2m 8s (remain 7m 33s) Loss: 0.7797 Grad: 72705.4141  LR: 0.00000063  


Train:  23%|██▎       | 401/1730 [02:15<07:32,  2.94train_batch/s]

Epoch: [2][400/1730] Elapsed 2m 14s (remain 7m 27s) Loss: 0.7739 Grad: 206494.7188  LR: 0.00000022  


Train:  24%|██▍       | 421/1730 [02:21<07:07,  3.06train_batch/s]

Epoch: [2][420/1730] Elapsed 2m 21s (remain 7m 18s) Loss: 0.7697 Grad: 89243.4531  LR: 0.00000002  


Train:  25%|██▌       | 441/1730 [02:28<07:18,  2.94train_batch/s]

Epoch: [2][440/1730] Elapsed 2m 28s (remain 7m 12s) Loss: 0.7681 Grad: 117790.9922  LR: 0.00000003  


Train:  27%|██▋       | 461/1730 [02:34<07:32,  2.81train_batch/s]

Epoch: [2][460/1730] Elapsed 2m 34s (remain 7m 5s) Loss: 0.7625 Grad: 72251.5469  LR: 0.00000025  


Train:  28%|██▊       | 482/1730 [02:41<07:15,  2.87train_batch/s]

Epoch: [2][480/1730] Elapsed 2m 41s (remain 6m 59s) Loss: 0.7585 Grad: 118192.0000  LR: 0.00000068  


Train:  29%|██▉       | 501/1730 [02:48<07:15,  2.82train_batch/s]

Epoch: [2][500/1730] Elapsed 2m 48s (remain 6m 53s) Loss: 0.7559 Grad: 69418.5469  LR: 0.00000130  


Train:  30%|███       | 522/1730 [02:55<07:03,  2.85train_batch/s]

Epoch: [2][520/1730] Elapsed 2m 55s (remain 6m 47s) Loss: 0.7540 Grad: 49156.4727  LR: 0.00000211  


Train:  31%|███▏      | 541/1730 [03:03<05:50,  3.39train_batch/s]

Epoch: [2][540/1730] Elapsed 3m 3s (remain 6m 42s) Loss: 0.7531 Grad: 134032.5000  LR: 0.00000308  


Train:  32%|███▏      | 561/1730 [03:09<07:13,  2.69train_batch/s]

Epoch: [2][560/1730] Elapsed 3m 9s (remain 6m 34s) Loss: 0.7504 Grad: 473001.3125  LR: 0.00000420  


Train:  34%|███▎      | 581/1730 [03:16<07:48,  2.45train_batch/s]

Epoch: [2][580/1730] Elapsed 3m 16s (remain 6m 29s) Loss: 0.7479 Grad: 113360.9922  LR: 0.00000545  


Train:  35%|███▍      | 601/1730 [03:24<06:44,  2.79train_batch/s]

Epoch: [2][600/1730] Elapsed 3m 24s (remain 6m 23s) Loss: 0.7482 Grad: 91698.6250  LR: 0.00000679  


Train:  36%|███▌      | 621/1730 [03:30<05:41,  3.25train_batch/s]

Epoch: [2][620/1730] Elapsed 3m 30s (remain 6m 16s) Loss: 0.7469 Grad: 50600.5625  LR: 0.00000819  


Train:  37%|███▋      | 641/1730 [03:37<05:24,  3.35train_batch/s]

Epoch: [2][640/1730] Elapsed 3m 37s (remain 6m 10s) Loss: 0.7484 Grad: 32881.8242  LR: 0.00000964  


Train:  38%|███▊      | 661/1730 [03:44<07:13,  2.47train_batch/s]

Epoch: [2][660/1730] Elapsed 3m 44s (remain 6m 2s) Loss: 0.7470 Grad: 93216.5938  LR: 0.00001109  


Train:  39%|███▉      | 681/1730 [03:50<05:37,  3.11train_batch/s]

Epoch: [2][680/1730] Elapsed 3m 50s (remain 5m 55s) Loss: 0.7412 Grad: 51016.1914  LR: 0.00001252  


Train:  41%|████      | 701/1730 [03:57<06:47,  2.53train_batch/s]

Epoch: [2][700/1730] Elapsed 3m 57s (remain 5m 48s) Loss: 0.7416 Grad: 215542.9062  LR: 0.00001389  


Train:  42%|████▏     | 721/1730 [04:04<04:22,  3.84train_batch/s]

Epoch: [2][720/1730] Elapsed 4m 4s (remain 5m 42s) Loss: 0.7418 Grad: 139858.3750  LR: 0.00001519  


Train:  43%|████▎     | 741/1730 [04:10<04:02,  4.09train_batch/s]

Epoch: [2][740/1730] Elapsed 4m 10s (remain 5m 34s) Loss: 0.7388 Grad: 50735.5391  LR: 0.00001637  


Train:  44%|████▍     | 761/1730 [04:16<04:27,  3.62train_batch/s]

Epoch: [2][760/1730] Elapsed 4m 16s (remain 5m 27s) Loss: 0.7397 Grad: 185875.4219  LR: 0.00001742  


Train:  45%|████▌     | 781/1730 [04:23<06:14,  2.54train_batch/s]

Epoch: [2][780/1730] Elapsed 4m 23s (remain 5m 20s) Loss: 0.7391 Grad: 167840.6562  LR: 0.00001831  


Train:  46%|████▋     | 801/1730 [04:30<05:34,  2.78train_batch/s]

Epoch: [2][800/1730] Elapsed 4m 30s (remain 5m 13s) Loss: 0.7413 Grad: 65594.6172  LR: 0.00001903  


Train:  47%|████▋     | 821/1730 [04:36<04:25,  3.43train_batch/s]

Epoch: [2][820/1730] Elapsed 4m 36s (remain 5m 6s) Loss: 0.7386 Grad: 87837.9531  LR: 0.00001956  


Train:  49%|████▊     | 841/1730 [04:43<05:06,  2.90train_batch/s]

Epoch: [2][840/1730] Elapsed 4m 43s (remain 4m 59s) Loss: 0.7395 Grad: 41515.7891  LR: 0.00001988  


Train:  50%|████▉     | 861/1730 [04:50<04:32,  3.19train_batch/s]

Epoch: [2][860/1730] Elapsed 4m 50s (remain 4m 53s) Loss: 0.7363 Grad: 83781.7422  LR: 0.00002000  


Train:  51%|█████     | 882/1730 [04:57<05:01,  2.82train_batch/s]

Epoch: [2][880/1730] Elapsed 4m 57s (remain 4m 46s) Loss: 0.7349 Grad: 202644.8750  LR: 0.00001990  


Train:  52%|█████▏    | 901/1730 [05:03<04:19,  3.19train_batch/s]

Epoch: [2][900/1730] Elapsed 5m 3s (remain 4m 39s) Loss: 0.7352 Grad: 149391.2500  LR: 0.00001960  


Train:  53%|█████▎    | 921/1730 [05:10<05:41,  2.37train_batch/s]

Epoch: [2][920/1730] Elapsed 5m 10s (remain 4m 33s) Loss: 0.7350 Grad: 164987.8594  LR: 0.00001909  


Train:  54%|█████▍    | 941/1730 [05:17<04:20,  3.03train_batch/s]

Epoch: [2][940/1730] Elapsed 5m 17s (remain 4m 25s) Loss: 0.7344 Grad: 160451.5312  LR: 0.00001839  


Train:  56%|█████▌    | 962/1730 [05:23<03:39,  3.49train_batch/s]

Epoch: [2][960/1730] Elapsed 5m 23s (remain 4m 18s) Loss: 0.7340 Grad: 110203.9062  LR: 0.00001752  


Train:  57%|█████▋    | 982/1730 [05:30<03:15,  3.82train_batch/s]

Epoch: [2][980/1730] Elapsed 5m 30s (remain 4m 12s) Loss: 0.7334 Grad: 82348.5391  LR: 0.00001648  


Train:  58%|█████▊    | 1001/1730 [05:37<04:17,  2.83train_batch/s]

Epoch: [2][1000/1730] Elapsed 5m 37s (remain 4m 5s) Loss: 0.7346 Grad: 89422.3516  LR: 0.00001531  


Train:  59%|█████▉    | 1021/1730 [05:44<03:37,  3.26train_batch/s]

Epoch: [2][1020/1730] Elapsed 5m 44s (remain 3m 59s) Loss: 0.7342 Grad: 53228.5547  LR: 0.00001403  


Train:  60%|██████    | 1041/1730 [05:50<03:07,  3.68train_batch/s]

Epoch: [2][1040/1730] Elapsed 5m 50s (remain 3m 52s) Loss: 0.7325 Grad: 115117.8750  LR: 0.00001266  


Train:  61%|██████▏   | 1061/1730 [05:57<03:53,  2.86train_batch/s]

Epoch: [2][1060/1730] Elapsed 5m 57s (remain 3m 45s) Loss: 0.7326 Grad: 191251.9219  LR: 0.00001123  


Train:  62%|██████▏   | 1081/1730 [06:04<03:19,  3.25train_batch/s]

Epoch: [2][1080/1730] Elapsed 6m 4s (remain 3m 38s) Loss: 0.7302 Grad: 89798.7266  LR: 0.00000978  


Train:  64%|██████▎   | 1101/1730 [06:11<03:56,  2.66train_batch/s]

Epoch: [2][1100/1730] Elapsed 6m 11s (remain 3m 32s) Loss: 0.7282 Grad: 172879.7031  LR: 0.00000834  


Train:  65%|██████▍   | 1121/1730 [06:18<04:12,  2.41train_batch/s]

Epoch: [2][1120/1730] Elapsed 6m 18s (remain 3m 25s) Loss: 0.7267 Grad: 184327.5156  LR: 0.00000692  


Train:  66%|██████▌   | 1141/1730 [06:25<03:42,  2.65train_batch/s]

Epoch: [2][1140/1730] Elapsed 6m 25s (remain 3m 18s) Loss: 0.7247 Grad: 97790.6562  LR: 0.00000558  


Train:  67%|██████▋   | 1162/1730 [06:32<02:36,  3.62train_batch/s]

Epoch: [2][1160/1730] Elapsed 6m 32s (remain 3m 12s) Loss: 0.7243 Grad: 54147.0703  LR: 0.00000432  


Train:  68%|██████▊   | 1181/1730 [06:39<03:36,  2.53train_batch/s]

Epoch: [2][1180/1730] Elapsed 6m 39s (remain 3m 5s) Loss: 0.7228 Grad: 122348.3984  LR: 0.00000319  


Train:  69%|██████▉   | 1201/1730 [06:47<03:43,  2.36train_batch/s]

Epoch: [2][1200/1730] Elapsed 6m 47s (remain 2m 59s) Loss: 0.7226 Grad: 147905.9688  LR: 0.00000220  


Train:  71%|███████   | 1221/1730 [06:54<03:07,  2.71train_batch/s]

Epoch: [2][1220/1730] Elapsed 6m 54s (remain 2m 52s) Loss: 0.7238 Grad: 108367.5078  LR: 0.00000138  


Train:  72%|███████▏  | 1241/1730 [07:02<03:12,  2.54train_batch/s]

Epoch: [2][1240/1730] Elapsed 7m 2s (remain 2m 46s) Loss: 0.7243 Grad: 169043.9531  LR: 0.00000073  


Train:  73%|███████▎  | 1261/1730 [07:09<03:28,  2.24train_batch/s]

Epoch: [2][1260/1730] Elapsed 7m 9s (remain 2m 39s) Loss: 0.7260 Grad: 161437.0156  LR: 0.00000029  


Train:  74%|███████▍  | 1281/1730 [07:16<03:13,  2.32train_batch/s]

Epoch: [2][1280/1730] Elapsed 7m 16s (remain 2m 33s) Loss: 0.7253 Grad: 192412.1719  LR: 0.00000004  


Train:  75%|███████▌  | 1301/1730 [07:23<02:23,  2.99train_batch/s]

Epoch: [2][1300/1730] Elapsed 7m 23s (remain 2m 26s) Loss: 0.7250 Grad: 56682.3828  LR: 0.00000001  


Train:  76%|███████▋  | 1321/1730 [07:30<02:23,  2.85train_batch/s]

Epoch: [2][1320/1730] Elapsed 7m 30s (remain 2m 19s) Loss: 0.7260 Grad: 89650.1250  LR: 0.00000019  


Train:  78%|███████▊  | 1341/1730 [07:37<02:09,  3.01train_batch/s]

Epoch: [2][1340/1730] Elapsed 7m 37s (remain 2m 12s) Loss: 0.7260 Grad: 90985.4844  LR: 0.00000058  


Train:  79%|███████▊  | 1361/1730 [07:43<02:05,  2.95train_batch/s]

Epoch: [2][1360/1730] Elapsed 7m 43s (remain 2m 5s) Loss: 0.7251 Grad: 80933.1875  LR: 0.00000116  


Train:  80%|███████▉  | 1381/1730 [07:51<02:20,  2.49train_batch/s]

Epoch: [2][1380/1730] Elapsed 7m 51s (remain 1m 59s) Loss: 0.7244 Grad: 252878.1562  LR: 0.00000194  


Train:  81%|████████  | 1401/1730 [07:57<02:11,  2.51train_batch/s]

Epoch: [2][1400/1730] Elapsed 7m 57s (remain 1m 52s) Loss: 0.7244 Grad: 209094.6875  LR: 0.00000288  


Train:  82%|████████▏ | 1421/1730 [08:04<02:22,  2.17train_batch/s]

Epoch: [2][1420/1730] Elapsed 8m 4s (remain 1m 45s) Loss: 0.7249 Grad: 200305.6094  LR: 0.00000397  


Train:  83%|████████▎ | 1441/1730 [08:11<01:43,  2.80train_batch/s]

Epoch: [2][1440/1730] Elapsed 8m 11s (remain 1m 38s) Loss: 0.7256 Grad: 122006.9922  LR: 0.00000519  


Train:  84%|████████▍ | 1461/1730 [08:18<01:46,  2.53train_batch/s]

Epoch: [2][1460/1730] Elapsed 8m 18s (remain 1m 31s) Loss: 0.7255 Grad: 124043.8984  LR: 0.00000651  


Train:  86%|████████▌ | 1481/1730 [08:24<01:28,  2.82train_batch/s]

Epoch: [2][1480/1730] Elapsed 8m 24s (remain 1m 24s) Loss: 0.7244 Grad: 133077.1719  LR: 0.00000791  


Train:  87%|████████▋ | 1501/1730 [08:31<01:23,  2.75train_batch/s]

Epoch: [2][1500/1730] Elapsed 8m 31s (remain 1m 18s) Loss: 0.7238 Grad: 88234.7812  LR: 0.00000935  


Train:  88%|████████▊ | 1521/1730 [08:38<01:06,  3.16train_batch/s]

Epoch: [2][1520/1730] Elapsed 8m 37s (remain 1m 11s) Loss: 0.7226 Grad: 79678.0938  LR: 0.00001080  


Train:  89%|████████▉ | 1541/1730 [08:44<01:12,  2.60train_batch/s]

Epoch: [2][1540/1730] Elapsed 8m 44s (remain 1m 4s) Loss: 0.7225 Grad: 142180.7656  LR: 0.00001224  


Train:  90%|█████████ | 1561/1730 [08:51<01:04,  2.63train_batch/s]

Epoch: [2][1560/1730] Elapsed 8m 51s (remain 0m 57s) Loss: 0.7229 Grad: 154571.7969  LR: 0.00001362  


Train:  91%|█████████▏| 1581/1730 [08:59<00:52,  2.86train_batch/s]

Epoch: [2][1580/1730] Elapsed 8m 59s (remain 0m 50s) Loss: 0.7237 Grad: 162985.9219  LR: 0.00001494  


Train:  93%|█████████▎| 1601/1730 [09:05<00:40,  3.22train_batch/s]

Epoch: [2][1600/1730] Elapsed 9m 5s (remain 0m 43s) Loss: 0.7234 Grad: 182357.4219  LR: 0.00001615  


Train:  94%|█████████▎| 1621/1730 [09:13<00:37,  2.91train_batch/s]

Epoch: [2][1620/1730] Elapsed 9m 13s (remain 0m 37s) Loss: 0.7237 Grad: 127590.5156  LR: 0.00001722  


Train:  95%|█████████▍| 1641/1730 [09:19<00:33,  2.64train_batch/s]

Epoch: [2][1640/1730] Elapsed 9m 19s (remain 0m 30s) Loss: 0.7242 Grad: 113455.2891  LR: 0.00001815  


Train:  96%|█████████▌| 1661/1730 [09:27<00:26,  2.64train_batch/s]

Epoch: [2][1660/1730] Elapsed 9m 27s (remain 0m 23s) Loss: 0.7244 Grad: 165607.8438  LR: 0.00001890  


Train:  97%|█████████▋| 1682/1730 [09:34<00:12,  3.84train_batch/s]

Epoch: [2][1680/1730] Elapsed 9m 34s (remain 0m 16s) Loss: 0.7232 Grad: 116941.5000  LR: 0.00001947  


Train:  98%|█████████▊| 1701/1730 [09:40<00:10,  2.77train_batch/s]

Epoch: [2][1700/1730] Elapsed 9m 40s (remain 0m 9s) Loss: 0.7227 Grad: 166352.7969  LR: 0.00001984  


Train:  99%|█████████▉| 1721/1730 [09:47<00:03,  2.52train_batch/s]

Epoch: [2][1720/1730] Elapsed 9m 47s (remain 0m 3s) Loss: 0.7220 Grad: 90896.1484  LR: 0.00001999  


Train: 100%|██████████| 1730/1730 [09:50<00:00,  2.93train_batch/s]


Epoch: [2][1729/1730] Elapsed 9m 50s (remain 0m 0s) Loss: 0.7224 Grad: 236415.7812  LR: 0.00002000  


Validation:   0%|          | 1/866 [00:00<03:57,  3.64valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 3m 58s) Loss: 1.0908 


Validation:   3%|▎         | 23/866 [00:02<01:13, 11.48valid_batch/s]

EVAL: [20/866] Elapsed 0m 1s (remain 1m 19s) Loss: 0.5895 


Validation:   5%|▍         | 42/866 [00:03<00:57, 14.22valid_batch/s]

EVAL: [40/866] Elapsed 0m 3s (remain 1m 9s) Loss: 0.7223 


Validation:   7%|▋         | 63/866 [00:05<00:58, 13.80valid_batch/s]

EVAL: [60/866] Elapsed 0m 5s (remain 1m 7s) Loss: 0.7526 


Validation:  10%|▉         | 83/866 [00:06<00:59, 13.21valid_batch/s]

EVAL: [80/866] Elapsed 0m 6s (remain 1m 4s) Loss: 0.7926 


Validation:  12%|█▏        | 102/866 [00:08<00:53, 14.27valid_batch/s]

EVAL: [100/866] Elapsed 0m 8s (remain 1m 3s) Loss: 0.7655 


Validation:  14%|█▍        | 123/866 [00:09<00:56, 13.26valid_batch/s]

EVAL: [120/866] Elapsed 0m 9s (remain 1m 0s) Loss: 0.7786 


Validation:  16%|█▋        | 142/866 [00:11<00:46, 15.55valid_batch/s]

EVAL: [140/866] Elapsed 0m 11s (remain 0m 57s) Loss: 0.7463 


Validation:  19%|█▉        | 163/866 [00:12<00:50, 13.95valid_batch/s]

EVAL: [160/866] Elapsed 0m 12s (remain 0m 55s) Loss: 0.7413 


Validation:  21%|██        | 183/866 [00:14<00:46, 14.64valid_batch/s]

EVAL: [180/866] Elapsed 0m 13s (remain 0m 52s) Loss: 0.7645 


Validation:  23%|██▎       | 203/866 [00:15<00:47, 13.90valid_batch/s]

EVAL: [200/866] Elapsed 0m 15s (remain 0m 51s) Loss: 0.7580 


Validation:  26%|██▌       | 224/866 [00:16<00:43, 14.90valid_batch/s]

EVAL: [220/866] Elapsed 0m 16s (remain 0m 49s) Loss: 0.7569 


Validation:  28%|██▊       | 243/866 [00:18<00:45, 13.78valid_batch/s]

EVAL: [240/866] Elapsed 0m 18s (remain 0m 47s) Loss: 0.7685 


Validation:  30%|███       | 264/866 [00:19<00:42, 14.33valid_batch/s]

EVAL: [260/866] Elapsed 0m 19s (remain 0m 45s) Loss: 0.7711 


Validation:  33%|███▎      | 283/866 [00:21<00:34, 16.73valid_batch/s]

EVAL: [280/866] Elapsed 0m 20s (remain 0m 43s) Loss: 0.7777 


Validation:  35%|███▍      | 303/866 [00:22<00:50, 11.09valid_batch/s]

EVAL: [300/866] Elapsed 0m 22s (remain 0m 42s) Loss: 0.7770 


Validation:  37%|███▋      | 323/866 [00:24<00:43, 12.37valid_batch/s]

EVAL: [320/866] Elapsed 0m 24s (remain 0m 41s) Loss: 0.7775 


Validation:  40%|███▉      | 343/866 [00:25<00:40, 12.82valid_batch/s]

EVAL: [340/866] Elapsed 0m 25s (remain 0m 39s) Loss: 0.7739 


Validation:  42%|████▏     | 363/866 [00:27<00:37, 13.59valid_batch/s]

EVAL: [360/866] Elapsed 0m 27s (remain 0m 38s) Loss: 0.7754 


Validation:  44%|████▍     | 382/866 [00:29<00:37, 12.97valid_batch/s]

EVAL: [380/866] Elapsed 0m 28s (remain 0m 36s) Loss: 0.7714 


Validation:  46%|████▋     | 402/866 [00:30<00:38, 12.05valid_batch/s]

EVAL: [400/866] Elapsed 0m 30s (remain 0m 35s) Loss: 0.7723 


Validation:  49%|████▉     | 424/866 [00:32<00:27, 15.82valid_batch/s]

EVAL: [420/866] Elapsed 0m 31s (remain 0m 33s) Loss: 0.7681 


Validation:  51%|█████     | 443/866 [00:33<00:28, 15.07valid_batch/s]

EVAL: [440/866] Elapsed 0m 33s (remain 0m 32s) Loss: 0.7696 


Validation:  53%|█████▎    | 463/866 [00:34<00:33, 12.04valid_batch/s]

EVAL: [460/866] Elapsed 0m 34s (remain 0m 30s) Loss: 0.7712 


Validation:  56%|█████▌    | 482/866 [00:36<00:25, 15.15valid_batch/s]

EVAL: [480/866] Elapsed 0m 36s (remain 0m 29s) Loss: 0.7742 


Validation:  58%|█████▊    | 502/866 [00:37<00:27, 13.40valid_batch/s]

EVAL: [500/866] Elapsed 0m 37s (remain 0m 27s) Loss: 0.7788 


Validation:  60%|██████    | 523/866 [00:39<00:24, 13.80valid_batch/s]

EVAL: [520/866] Elapsed 0m 39s (remain 0m 26s) Loss: 0.7819 


Validation:  63%|██████▎   | 543/866 [00:40<00:21, 15.28valid_batch/s]

EVAL: [540/866] Elapsed 0m 40s (remain 0m 24s) Loss: 0.7777 


Validation:  65%|██████▌   | 563/866 [00:42<00:19, 15.78valid_batch/s]

EVAL: [560/866] Elapsed 0m 42s (remain 0m 22s) Loss: 0.7764 


Validation:  67%|██████▋   | 582/866 [00:43<00:22, 12.68valid_batch/s]

EVAL: [580/866] Elapsed 0m 43s (remain 0m 21s) Loss: 0.7827 


Validation:  70%|██████▉   | 603/866 [00:45<00:17, 15.27valid_batch/s]

EVAL: [600/866] Elapsed 0m 45s (remain 0m 19s) Loss: 0.7822 


Validation:  72%|███████▏  | 623/866 [00:46<00:16, 14.83valid_batch/s]

EVAL: [620/866] Elapsed 0m 46s (remain 0m 18s) Loss: 0.7839 


Validation:  74%|███████▍  | 643/866 [00:48<00:13, 16.71valid_batch/s]

EVAL: [640/866] Elapsed 0m 47s (remain 0m 16s) Loss: 0.7836 


Validation:  77%|███████▋  | 663/866 [00:49<00:13, 15.04valid_batch/s]

EVAL: [660/866] Elapsed 0m 49s (remain 0m 15s) Loss: 0.7791 


Validation:  79%|███████▉  | 683/866 [00:50<00:12, 15.05valid_batch/s]

EVAL: [680/866] Elapsed 0m 50s (remain 0m 13s) Loss: 0.7807 


Validation:  81%|████████  | 702/866 [00:52<00:11, 13.98valid_batch/s]

EVAL: [700/866] Elapsed 0m 52s (remain 0m 12s) Loss: 0.7769 


Validation:  83%|████████▎ | 722/866 [00:53<00:10, 14.32valid_batch/s]

EVAL: [720/866] Elapsed 0m 53s (remain 0m 10s) Loss: 0.7754 


Validation:  86%|████████▌ | 743/866 [00:55<00:09, 13.54valid_batch/s]

EVAL: [740/866] Elapsed 0m 55s (remain 0m 9s) Loss: 0.7834 


Validation:  88%|████████▊ | 761/866 [00:56<00:07, 13.92valid_batch/s]

EVAL: [760/866] Elapsed 0m 56s (remain 0m 7s) Loss: 0.7818 


Validation:  91%|█████████ | 784/866 [00:58<00:05, 14.49valid_batch/s]

EVAL: [780/866] Elapsed 0m 58s (remain 0m 6s) Loss: 0.7831 


Validation:  93%|█████████▎| 804/866 [01:00<00:04, 12.87valid_batch/s]

EVAL: [800/866] Elapsed 0m 59s (remain 0m 4s) Loss: 0.7782 


Validation:  95%|█████████▌| 824/866 [01:01<00:02, 16.76valid_batch/s]

EVAL: [820/866] Elapsed 1m 1s (remain 0m 3s) Loss: 0.7805 


Validation:  97%|█████████▋| 842/866 [01:02<00:01, 13.28valid_batch/s]

EVAL: [840/866] Elapsed 1m 2s (remain 0m 1s) Loss: 0.7792 


Validation: 100%|█████████▉| 862/866 [01:04<00:00, 11.42valid_batch/s]

EVAL: [860/866] Elapsed 1m 4s (remain 0m 0s) Loss: 0.7809 


Validation: 100%|██████████| 866/866 [01:04<00:00, 13.37valid_batch/s]
Epoch 2 - avg_train_loss: 0.7224  avg_val_loss: 0.7799  time: 655s
Epoch 2 - Score: 0.8159
Epoch 2 - Save Best Score: 0.8159 Model


EVAL: [865/866] Elapsed 1m 4s (remain 0m 0s) Loss: 0.7799 


Score: 0.8159
Score: 0.8009
