# 🏋️ Model Training

## ⚙️ Setup 

### 📚 Importing Libraries

Importing from packages

In [1]:
import os
import gc
import pandas as pd
import time
import numpy as np
from sklearn.metrics import cohen_kappa_score
import torch
from torch.optim import AdamW
from torch import nn
from transformers import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

In [2]:
os.chdir("../")

Importing user defined packages

In [3]:
from lib.config import Config
from lib.paths import Paths
from lib.model.deberta import CustomModel
from lib.model.epoch_functions import train_epoch, valid_epoch
from lib.model.utils import get_score
from lib.utils.utils import get_logger, seed_everything
from lib.data import read_data_loader_from_disk

In [None]:
seed_everything(Config.RANDOM_SEED)

## 📖 Definitions

### 🌎 Global Variables

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
LOGGER = get_logger(Paths.MODEL_OUTPUT_PATH)

### 🛠️ Functions

In [6]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in model.named_parameters() if "model" not in n],
            "lr": decoder_lr,
            "weight_decay": 0.0,
        },
    ]

    return optimizer_parameters

In [7]:
def get_scheduler(cfg: Config, optimizer, num_train_steps):
    if cfg.SCHEDULER == "linear":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
        )
    
    if cfg.SCHEDULER == "cosine":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
            num_cycles=cfg.NUM_CYCLES,
        )

In [9]:
def get_model_optimizer_and_scheduler(train_loader):
    model = CustomModel(Config, config_path=None, pretrained=True)
    torch.save(model.config, Paths.MODEL_OUTPUT_PATH + "/config.pth")
    model.to(device)

    optimizer = AdamW(
        get_optimizer_params(
            model,
            encoder_lr=Config.ENCODER_LR,
            decoder_lr=Config.DECODER_LR,
            weight_decay=Config.WEIGHT_DECAY,
        ),
        lr=Config.ENCODER_LR,
        eps=Config.EPS,
        betas=Config.BETAS,
    )

    num_train_steps = int(len(train_loader) / Config.BATCH_SIZE_TRAIN * Config.EPOCHS)
    scheduler = get_scheduler(Config, optimizer, num_train_steps)
    return model, optimizer, scheduler

In [10]:
def train_loop(fold):
    LOGGER.info(f"========== Fold: {fold} training ==========")

    # ======== DATA LOADER ==========
    train_loader, valid_loader = read_data_loader_from_disk(fold)
    valid_fold = pd.read_csv(os.path.join(Paths.DATA_LOADER_PATH, f"valid_{fold}.csv"))
    valid_labels = valid_fold["score"].values

    # ======== MODEL ==========
    model, optimizer, scheduler = get_model_optimizer_and_scheduler(train_loader)

    # ======= LOSS ==========
    # criterion = RMSELoss(reduction="mean") # nn.SmoothL1Loss(reduction='mean')
    criterion = nn.CrossEntropyLoss()
    softmax = nn.Softmax(dim=1)

    best_score = -np.inf
    # ====== ITERATE EPOCHS ========
    for epoch in range(Config.EPOCHS):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_loss = train_epoch(
            train_loader, model, criterion, optimizer, epoch, scheduler, device
        )

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(
            valid_loader, model, criterion, device
        )
        predictions = prediction_dict["predictions"]
        _, predictions = torch.max(softmax(torch.tensor(predictions)), dim=1)

        # ======= SCORING ==========
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "predictions": predictions},
                Paths.MODEL_OUTPUT_PATH
                + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
            )

    predictions = torch.load(
        Paths.MODEL_OUTPUT_PATH
        + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    valid_fold["pred_score"] = predictions

    del model, optimizer, scheduler, criterion, softmax
    torch.cuda.empty_cache()
    gc.collect()

    return valid_fold

In [11]:
def get_result(oof_df):
    labels = oof_df["score"].values
    preds = oof_df["pred_score"].values
    score = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}')

## 🏁 Start Training

In [12]:
if Config.TRAIN:
    oof_df = pd.DataFrame()

    for fold in range(Config.N_FOLDS):
        if fold in Config.TRAIN_FOLDS:
            _oof_df = train_loop(fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== Fold: {fold} result ==========")
            get_result(_oof_df)

    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    oof_df.to_csv(Paths.MODEL_OUTPUT_PATH + "/oof_df.csv", index=False)



Train:   0%|          | 1/865 [00:01<14:43,  1.02s/train_batch]

Epoch: [1][0/865] Elapsed 0m 1s (remain 14m 43s) Loss: 1.8662 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:08<04:57,  2.84train_batch/s]

Epoch: [1][20/865] Elapsed 0m 8s (remain 5m 31s) Loss: 1.5912 Grad: 69460.8594  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:15<04:48,  2.85train_batch/s]

Epoch: [1][40/865] Elapsed 0m 15s (remain 5m 5s) Loss: 1.4742 Grad: 41744.3594  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:23<05:12,  2.57train_batch/s]

Epoch: [1][60/865] Elapsed 0m 23s (remain 5m 7s) Loss: 1.4102 Grad: 39965.0938  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:31<05:07,  2.55train_batch/s]

Epoch: [1][80/865] Elapsed 0m 31s (remain 5m 0s) Loss: 1.3699 Grad: 83880.5078  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:38<04:50,  2.63train_batch/s]

Epoch: [1][100/865] Elapsed 0m 38s (remain 4m 51s) Loss: 1.3440 Grad: 72980.7578  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:46<04:42,  2.63train_batch/s]

Epoch: [1][120/865] Elapsed 0m 46s (remain 4m 42s) Loss: 1.3199 Grad: 47509.3047  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:53<04:35,  2.62train_batch/s]

Epoch: [1][140/865] Elapsed 0m 53s (remain 4m 34s) Loss: 1.3012 Grad: 51424.1758  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [01:01<04:46,  2.46train_batch/s]

Epoch: [1][160/865] Elapsed 1m 1s (remain 4m 27s) Loss: 1.2888 Grad: 92212.8906  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:09<04:37,  2.46train_batch/s]

Epoch: [1][180/865] Elapsed 1m 9s (remain 4m 21s) Loss: 1.2749 Grad: 181365.8906  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:17<04:25,  2.51train_batch/s]

Epoch: [1][200/865] Elapsed 1m 17s (remain 4m 15s) Loss: 1.2613 Grad: 58119.1055  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:25<04:17,  2.50train_batch/s]

Epoch: [1][220/865] Elapsed 1m 25s (remain 4m 8s) Loss: 1.2436 Grad: 60661.3047  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:33<04:09,  2.50train_batch/s]

Epoch: [1][240/865] Elapsed 1m 33s (remain 4m 0s) Loss: 1.2207 Grad: 80420.8047  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:40<04:01,  2.50train_batch/s]

Epoch: [1][260/865] Elapsed 1m 40s (remain 3m 53s) Loss: 1.2165 Grad: 81006.8438  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:48<03:53,  2.50train_batch/s]

Epoch: [1][280/865] Elapsed 1m 48s (remain 3m 46s) Loss: 1.1972 Grad: 63770.4648  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:56<03:46,  2.49train_batch/s]

Epoch: [1][300/865] Elapsed 1m 56s (remain 3m 38s) Loss: 1.1746 Grad: 90465.4375  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [02:04<03:40,  2.46train_batch/s]

Epoch: [1][320/865] Elapsed 2m 4s (remain 3m 31s) Loss: 1.1593 Grad: 105485.3438  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:12<03:32,  2.46train_batch/s]

Epoch: [1][340/865] Elapsed 2m 12s (remain 3m 23s) Loss: 1.1482 Grad: 56695.3867  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:20<03:24,  2.46train_batch/s]

Epoch: [1][360/865] Elapsed 2m 20s (remain 3m 16s) Loss: 1.1357 Grad: 77973.9297  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:28<03:15,  2.48train_batch/s]

Epoch: [1][380/865] Elapsed 2m 28s (remain 3m 8s) Loss: 1.1246 Grad: 187706.9531  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:36<03:08,  2.47train_batch/s]

Epoch: [1][400/865] Elapsed 2m 36s (remain 3m 1s) Loss: 1.1171 Grad: 92302.0391  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:44<03:00,  2.46train_batch/s]

Epoch: [1][420/865] Elapsed 2m 44s (remain 2m 53s) Loss: 1.1128 Grad: 143604.1250  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:52<02:52,  2.46train_batch/s]

Epoch: [1][440/865] Elapsed 2m 52s (remain 2m 46s) Loss: 1.1033 Grad: 69100.6172  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [03:00<02:44,  2.46train_batch/s]

Epoch: [1][460/865] Elapsed 3m 0s (remain 2m 38s) Loss: 1.0991 Grad: 59975.2383  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [03:08<02:37,  2.44train_batch/s]

Epoch: [1][480/865] Elapsed 3m 8s (remain 2m 30s) Loss: 1.0975 Grad: 106451.8438  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [03:16<02:29,  2.43train_batch/s]

Epoch: [1][500/865] Elapsed 3m 16s (remain 2m 23s) Loss: 1.0928 Grad: 59285.2734  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:25<02:21,  2.43train_batch/s]

Epoch: [1][520/865] Elapsed 3m 25s (remain 2m 15s) Loss: 1.0884 Grad: 54376.6719  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:33<02:13,  2.43train_batch/s]

Epoch: [1][540/865] Elapsed 3m 33s (remain 2m 7s) Loss: 1.0833 Grad: 101763.1016  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:41<02:05,  2.43train_batch/s]

Epoch: [1][560/865] Elapsed 3m 41s (remain 1m 59s) Loss: 1.0759 Grad: 77974.2031  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:49<01:57,  2.42train_batch/s]

Epoch: [1][580/865] Elapsed 3m 49s (remain 1m 52s) Loss: 1.0663 Grad: 83232.5078  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:57<01:47,  2.46train_batch/s]

Epoch: [1][600/865] Elapsed 3m 57s (remain 1m 44s) Loss: 1.0622 Grad: 151139.1875  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [04:05<01:39,  2.45train_batch/s]

Epoch: [1][620/865] Elapsed 4m 5s (remain 1m 36s) Loss: 1.0586 Grad: 184638.4531  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [04:13<01:31,  2.44train_batch/s]

Epoch: [1][640/865] Elapsed 4m 13s (remain 1m 28s) Loss: 1.0591 Grad: 148010.1719  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [04:21<01:23,  2.44train_batch/s]

Epoch: [1][660/865] Elapsed 4m 21s (remain 1m 20s) Loss: 1.0559 Grad: 57337.9219  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:29<01:15,  2.45train_batch/s]

Epoch: [1][680/865] Elapsed 4m 29s (remain 1m 12s) Loss: 1.0570 Grad: 112604.0859  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:37<01:07,  2.45train_batch/s]

Epoch: [1][700/865] Elapsed 4m 37s (remain 1m 5s) Loss: 1.0506 Grad: 154641.2969  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:45<00:59,  2.43train_batch/s]

Epoch: [1][720/865] Elapsed 4m 45s (remain 0m 57s) Loss: 1.0484 Grad: 116957.6328  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:54<00:50,  2.43train_batch/s]

Epoch: [1][740/865] Elapsed 4m 54s (remain 0m 49s) Loss: 1.0427 Grad: 66119.3203  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [05:02<00:42,  2.44train_batch/s]

Epoch: [1][760/865] Elapsed 5m 2s (remain 0m 41s) Loss: 1.0373 Grad: 77577.7344  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [05:10<00:34,  2.44train_batch/s]

Epoch: [1][780/865] Elapsed 5m 10s (remain 0m 33s) Loss: 1.0318 Grad: 69870.4531  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [05:18<00:26,  2.45train_batch/s]

Epoch: [1][800/865] Elapsed 5m 18s (remain 0m 25s) Loss: 1.0271 Grad: 142315.1250  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [05:26<00:17,  2.45train_batch/s]

Epoch: [1][820/865] Elapsed 5m 26s (remain 0m 17s) Loss: 1.0247 Grad: 141779.9531  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [05:34<00:09,  2.46train_batch/s]

Epoch: [1][840/865] Elapsed 5m 34s (remain 0m 9s) Loss: 1.0225 Grad: 73400.2812  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:42<00:01,  2.50train_batch/s]

Epoch: [1][860/865] Elapsed 5m 42s (remain 0m 1s) Loss: 1.0204 Grad: 130843.4375  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:44<00:00,  2.51train_batch/s]


Epoch: [1][864/865] Elapsed 5m 44s (remain 0m 0s) Loss: 1.0195 Grad: 112420.3281  LR: 0.00002000  


Validation:   0%|          | 2/433 [00:00<01:09,  6.20valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 43s) Loss: 0.6736 


Validation:   5%|▌         | 22/433 [00:02<00:44,  9.26valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 46s) Loss: 1.0864 


Validation:  10%|▉         | 42/433 [00:04<00:42,  9.29valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 43s) Loss: 1.0272 


Validation:  14%|█▍        | 62/433 [00:06<00:39,  9.38valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 40s) Loss: 1.0071 


Validation:  19%|█▉        | 82/433 [00:08<00:37,  9.30valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 38s) Loss: 1.0134 


Validation:  24%|██▎       | 102/433 [00:11<00:35,  9.43valid_batch/s]

EVAL: [100/433] Elapsed 0m 10s (remain 0m 36s) Loss: 0.9920 


Validation:  28%|██▊       | 122/433 [00:13<00:33,  9.30valid_batch/s]

EVAL: [120/433] Elapsed 0m 13s (remain 0m 33s) Loss: 1.0060 


Validation:  33%|███▎      | 142/433 [00:15<00:31,  9.28valid_batch/s]

EVAL: [140/433] Elapsed 0m 15s (remain 0m 31s) Loss: 1.0258 


Validation:  37%|███▋      | 162/433 [00:17<00:29,  9.31valid_batch/s]

EVAL: [160/433] Elapsed 0m 17s (remain 0m 29s) Loss: 1.0197 


Validation:  42%|████▏     | 182/433 [00:19<00:27,  9.27valid_batch/s]

EVAL: [180/433] Elapsed 0m 19s (remain 0m 27s) Loss: 1.0116 


Validation:  47%|████▋     | 202/433 [00:21<00:24,  9.34valid_batch/s]

EVAL: [200/433] Elapsed 0m 21s (remain 0m 25s) Loss: 0.9992 


Validation:  52%|█████▏    | 223/433 [00:24<00:21,  9.61valid_batch/s]

EVAL: [220/433] Elapsed 0m 23s (remain 0m 22s) Loss: 1.0046 


Validation:  56%|█████▌    | 242/433 [00:26<00:20,  9.29valid_batch/s]

EVAL: [240/433] Elapsed 0m 25s (remain 0m 20s) Loss: 1.0029 


Validation:  61%|██████    | 262/433 [00:28<00:18,  9.35valid_batch/s]

EVAL: [260/433] Elapsed 0m 28s (remain 0m 18s) Loss: 0.9896 


Validation:  65%|██████▌   | 282/433 [00:30<00:16,  9.38valid_batch/s]

EVAL: [280/433] Elapsed 0m 30s (remain 0m 16s) Loss: 0.9845 


Validation:  70%|██████▉   | 302/433 [00:32<00:14,  9.27valid_batch/s]

EVAL: [300/433] Elapsed 0m 32s (remain 0m 14s) Loss: 0.9911 


Validation:  74%|███████▍  | 322/433 [00:34<00:11,  9.29valid_batch/s]

EVAL: [320/433] Elapsed 0m 34s (remain 0m 12s) Loss: 0.9943 


Validation:  79%|███████▉  | 342/433 [00:36<00:09,  9.29valid_batch/s]

EVAL: [340/433] Elapsed 0m 36s (remain 0m 9s) Loss: 0.9918 


Validation:  84%|████████▎ | 362/433 [00:38<00:07,  9.21valid_batch/s]

EVAL: [360/433] Elapsed 0m 38s (remain 0m 7s) Loss: 0.9885 


Validation:  88%|████████▊ | 382/433 [00:41<00:05,  9.22valid_batch/s]

EVAL: [380/433] Elapsed 0m 40s (remain 0m 5s) Loss: 0.9884 


Validation:  93%|█████████▎| 402/433 [00:43<00:03,  9.23valid_batch/s]

EVAL: [400/433] Elapsed 0m 43s (remain 0m 3s) Loss: 0.9906 


Validation:  97%|█████████▋| 422/433 [00:45<00:01,  9.23valid_batch/s]

EVAL: [420/433] Elapsed 0m 45s (remain 0m 1s) Loss: 0.9924 


Validation: 100%|██████████| 433/433 [00:46<00:00,  9.30valid_batch/s]
Epoch 1 - avg_train_loss: 1.0195  avg_val_loss: 0.9945  time: 391s
Epoch 1 - Score: 0.7098
Epoch 1 - Save Best Score: 0.7098 Model


EVAL: [432/433] Elapsed 0m 46s (remain 0m 0s) Loss: 0.9945 


Train:   0%|          | 1/865 [00:00<07:36,  1.89train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 20s) Loss: 0.7859 Grad: 715144.5625  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:08<05:43,  2.45train_batch/s]

Epoch: [2][20/865] Elapsed 0m 8s (remain 5m 42s) Loss: 0.8958 Grad: 229220.9375  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:16<05:36,  2.45train_batch/s]

Epoch: [2][40/865] Elapsed 0m 16s (remain 5m 32s) Loss: 0.8934 Grad: 339202.8125  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:24<05:28,  2.45train_batch/s]

Epoch: [2][60/865] Elapsed 0m 24s (remain 5m 24s) Loss: 0.8829 Grad: 279259.4688  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:32<05:19,  2.45train_batch/s]

Epoch: [2][80/865] Elapsed 0m 32s (remain 5m 16s) Loss: 0.8513 Grad: 181563.3125  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:40<05:11,  2.45train_batch/s]

Epoch: [2][100/865] Elapsed 0m 40s (remain 5m 7s) Loss: 0.8426 Grad: 145584.6250  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:48<05:03,  2.45train_batch/s]

Epoch: [2][120/865] Elapsed 0m 48s (remain 4m 59s) Loss: 0.8321 Grad: 201529.1875  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:56<04:55,  2.45train_batch/s]

Epoch: [2][140/865] Elapsed 0m 56s (remain 4m 51s) Loss: 0.8166 Grad: 209013.1875  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [01:04<04:47,  2.45train_batch/s]

Epoch: [2][160/865] Elapsed 1m 4s (remain 4m 43s) Loss: 0.8162 Grad: 304120.9375  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:12<04:38,  2.45train_batch/s]

Epoch: [2][180/865] Elapsed 1m 12s (remain 4m 35s) Loss: 0.8197 Grad: 145558.4375  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:20<04:30,  2.46train_batch/s]

Epoch: [2][200/865] Elapsed 1m 20s (remain 4m 27s) Loss: 0.8154 Grad: 147368.0625  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:28<04:22,  2.45train_batch/s]

Epoch: [2][220/865] Elapsed 1m 28s (remain 4m 19s) Loss: 0.8220 Grad: 135719.7188  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:37<04:15,  2.45train_batch/s]

Epoch: [2][240/865] Elapsed 1m 37s (remain 4m 11s) Loss: 0.8246 Grad: 182499.8594  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:45<04:10,  2.41train_batch/s]

Epoch: [2][260/865] Elapsed 1m 45s (remain 4m 3s) Loss: 0.8296 Grad: 219479.2031  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:53<04:01,  2.42train_batch/s]

Epoch: [2][280/865] Elapsed 1m 53s (remain 3m 55s) Loss: 0.8308 Grad: 200998.3594  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [02:01<03:51,  2.43train_batch/s]

Epoch: [2][300/865] Elapsed 2m 1s (remain 3m 47s) Loss: 0.8229 Grad: 141006.2969  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [02:09<03:41,  2.46train_batch/s]

Epoch: [2][320/865] Elapsed 2m 9s (remain 3m 39s) Loss: 0.8230 Grad: 122986.6641  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:17<03:34,  2.45train_batch/s]

Epoch: [2][340/865] Elapsed 2m 17s (remain 3m 31s) Loss: 0.8188 Grad: 146824.8125  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:25<03:25,  2.45train_batch/s]

Epoch: [2][360/865] Elapsed 2m 25s (remain 3m 23s) Loss: 0.8152 Grad: 120022.6719  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:33<03:14,  2.49train_batch/s]

Epoch: [2][380/865] Elapsed 2m 33s (remain 3m 15s) Loss: 0.8129 Grad: 107764.5234  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:41<03:07,  2.48train_batch/s]

Epoch: [2][400/865] Elapsed 2m 41s (remain 3m 6s) Loss: 0.8112 Grad: 133464.9531  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:49<03:00,  2.46train_batch/s]

Epoch: [2][420/865] Elapsed 2m 49s (remain 2m 58s) Loss: 0.8072 Grad: 182588.2656  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:57<02:52,  2.46train_batch/s]

Epoch: [2][440/865] Elapsed 2m 57s (remain 2m 50s) Loss: 0.8097 Grad: 197223.1719  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [03:05<02:43,  2.47train_batch/s]

Epoch: [2][460/865] Elapsed 3m 5s (remain 2m 42s) Loss: 0.8102 Grad: 231568.0938  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [03:13<02:35,  2.47train_batch/s]

Epoch: [2][480/865] Elapsed 3m 13s (remain 2m 34s) Loss: 0.8122 Grad: 202924.7812  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [03:21<02:27,  2.47train_batch/s]

Epoch: [2][500/865] Elapsed 3m 21s (remain 2m 26s) Loss: 0.8151 Grad: 127147.0625  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:29<02:19,  2.46train_batch/s]

Epoch: [2][520/865] Elapsed 3m 29s (remain 2m 18s) Loss: 0.8145 Grad: 160983.5312  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:37<02:10,  2.47train_batch/s]

Epoch: [2][540/865] Elapsed 3m 37s (remain 2m 10s) Loss: 0.8142 Grad: 166309.0156  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:45<02:03,  2.46train_batch/s]

Epoch: [2][560/865] Elapsed 3m 45s (remain 2m 2s) Loss: 0.8138 Grad: 131834.6094  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:53<01:55,  2.46train_batch/s]

Epoch: [2][580/865] Elapsed 3m 53s (remain 1m 54s) Loss: 0.8123 Grad: 144334.3750  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [04:01<01:47,  2.45train_batch/s]

Epoch: [2][600/865] Elapsed 4m 1s (remain 1m 46s) Loss: 0.8106 Grad: 86603.8047  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [04:09<01:38,  2.48train_batch/s]

Epoch: [2][620/865] Elapsed 4m 9s (remain 1m 38s) Loss: 0.8128 Grad: 276097.1875  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [04:17<01:30,  2.48train_batch/s]

Epoch: [2][640/865] Elapsed 4m 17s (remain 1m 29s) Loss: 0.8132 Grad: 123514.5938  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [04:25<01:23,  2.45train_batch/s]

Epoch: [2][660/865] Elapsed 4m 25s (remain 1m 21s) Loss: 0.8170 Grad: 303458.5938  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:33<01:14,  2.45train_batch/s]

Epoch: [2][680/865] Elapsed 4m 33s (remain 1m 13s) Loss: 0.8158 Grad: 154001.6406  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:41<01:06,  2.46train_batch/s]

Epoch: [2][700/865] Elapsed 4m 41s (remain 1m 5s) Loss: 0.8141 Grad: 114182.3047  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:49<00:58,  2.48train_batch/s]

Epoch: [2][720/865] Elapsed 4m 49s (remain 0m 57s) Loss: 0.8151 Grad: 322141.1562  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:57<00:50,  2.48train_batch/s]

Epoch: [2][740/865] Elapsed 4m 57s (remain 0m 49s) Loss: 0.8174 Grad: 199449.0000  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [05:05<00:41,  2.48train_batch/s]

Epoch: [2][760/865] Elapsed 5m 5s (remain 0m 41s) Loss: 0.8190 Grad: 207907.8906  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [05:13<00:33,  2.48train_batch/s]

Epoch: [2][780/865] Elapsed 5m 13s (remain 0m 33s) Loss: 0.8184 Grad: 196189.9062  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [05:21<00:26,  2.45train_batch/s]

Epoch: [2][800/865] Elapsed 5m 21s (remain 0m 25s) Loss: 0.8171 Grad: 281381.5625  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [05:29<00:17,  2.45train_batch/s]

Epoch: [2][820/865] Elapsed 5m 29s (remain 0m 17s) Loss: 0.8170 Grad: 299858.9688  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [05:37<00:09,  2.45train_batch/s]

Epoch: [2][840/865] Elapsed 5m 37s (remain 0m 9s) Loss: 0.8168 Grad: 218648.8438  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:45<00:01,  2.45train_batch/s]

Epoch: [2][860/865] Elapsed 5m 45s (remain 0m 1s) Loss: 0.8184 Grad: 139031.4062  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:47<00:00,  2.49train_batch/s]


Epoch: [2][864/865] Elapsed 5m 47s (remain 0m 0s) Loss: 0.8187 Grad: 209259.5625  LR: 0.00001998  


Validation:   0%|          | 2/433 [00:00<01:08,  6.32valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 39s) Loss: 0.5932 


Validation:   5%|▌         | 22/433 [00:02<00:44,  9.22valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 47s) Loss: 0.9524 


Validation:  10%|▉         | 42/433 [00:04<00:42,  9.21valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 43s) Loss: 0.9068 


Validation:  14%|█▍        | 62/433 [00:06<00:39,  9.32valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 41s) Loss: 0.8906 


Validation:  19%|█▉        | 82/433 [00:09<00:38,  9.16valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 38s) Loss: 0.8863 


Validation:  24%|██▎       | 102/433 [00:11<00:35,  9.34valid_batch/s]

EVAL: [100/433] Elapsed 0m 11s (remain 0m 36s) Loss: 0.8747 


Validation:  28%|██▊       | 122/433 [00:13<00:33,  9.21valid_batch/s]

EVAL: [120/433] Elapsed 0m 13s (remain 0m 34s) Loss: 0.8836 


Validation:  33%|███▎      | 142/433 [00:15<00:31,  9.19valid_batch/s]

EVAL: [140/433] Elapsed 0m 15s (remain 0m 31s) Loss: 0.8886 


Validation:  37%|███▋      | 162/433 [00:17<00:29,  9.21valid_batch/s]

EVAL: [160/433] Elapsed 0m 17s (remain 0m 29s) Loss: 0.8837 


Validation:  42%|████▏     | 182/433 [00:19<00:27,  9.23valid_batch/s]

EVAL: [180/433] Elapsed 0m 19s (remain 0m 27s) Loss: 0.8828 


Validation:  47%|████▋     | 202/433 [00:22<00:25,  9.24valid_batch/s]

EVAL: [200/433] Elapsed 0m 21s (remain 0m 25s) Loss: 0.8738 


Validation:  51%|█████     | 221/433 [00:24<00:22,  9.26valid_batch/s]

EVAL: [220/433] Elapsed 0m 24s (remain 0m 23s) Loss: 0.8781 


Validation:  56%|█████▌    | 242/433 [00:26<00:20,  9.28valid_batch/s]

EVAL: [240/433] Elapsed 0m 26s (remain 0m 20s) Loss: 0.8744 


Validation:  61%|██████    | 262/433 [00:28<00:18,  9.28valid_batch/s]

EVAL: [260/433] Elapsed 0m 28s (remain 0m 18s) Loss: 0.8643 


Validation:  65%|██████▌   | 282/433 [00:30<00:16,  9.32valid_batch/s]

EVAL: [280/433] Elapsed 0m 30s (remain 0m 16s) Loss: 0.8638 


Validation:  70%|██████▉   | 302/433 [00:32<00:14,  9.31valid_batch/s]

EVAL: [300/433] Elapsed 0m 32s (remain 0m 14s) Loss: 0.8677 


Validation:  74%|███████▍  | 322/433 [00:34<00:11,  9.26valid_batch/s]

EVAL: [320/433] Elapsed 0m 34s (remain 0m 12s) Loss: 0.8683 


Validation:  79%|███████▉  | 342/433 [00:37<00:09,  9.29valid_batch/s]

EVAL: [340/433] Elapsed 0m 36s (remain 0m 9s) Loss: 0.8663 


Validation:  84%|████████▎ | 362/433 [00:39<00:07,  9.25valid_batch/s]

EVAL: [360/433] Elapsed 0m 39s (remain 0m 7s) Loss: 0.8651 


Validation:  88%|████████▊ | 382/433 [00:41<00:05,  9.35valid_batch/s]

EVAL: [380/433] Elapsed 0m 41s (remain 0m 5s) Loss: 0.8638 


Validation:  93%|█████████▎| 402/433 [00:43<00:03,  9.31valid_batch/s]

EVAL: [400/433] Elapsed 0m 43s (remain 0m 3s) Loss: 0.8665 


Validation:  97%|█████████▋| 422/433 [00:45<00:01,  9.32valid_batch/s]

EVAL: [420/433] Elapsed 0m 45s (remain 0m 1s) Loss: 0.8664 


Validation: 100%|██████████| 433/433 [00:46<00:00,  9.25valid_batch/s]
Epoch 2 - avg_train_loss: 0.8187  avg_val_loss: 0.8706  time: 394s
Epoch 2 - Score: 0.7523
Epoch 2 - Save Best Score: 0.7523 Model


EVAL: [432/433] Elapsed 0m 46s (remain 0m 0s) Loss: 0.8706 


Score: 0.7523
Train:   0%|          | 1/865 [00:00<07:35,  1.90train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 7m 36s) Loss: 1.8657 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:08<05:40,  2.48train_batch/s]

Epoch: [1][20/865] Elapsed 0m 8s (remain 5m 38s) Loss: 1.5568 Grad: 64225.2266  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:16<05:32,  2.48train_batch/s]

Epoch: [1][40/865] Elapsed 0m 16s (remain 5m 29s) Loss: 1.4989 Grad: 39392.2305  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:24<05:27,  2.46train_batch/s]

Epoch: [1][60/865] Elapsed 0m 24s (remain 5m 21s) Loss: 1.4060 Grad: 64988.8008  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:32<05:19,  2.46train_batch/s]

Epoch: [1][80/865] Elapsed 0m 32s (remain 5m 13s) Loss: 1.3476 Grad: 92394.5781  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:40<05:10,  2.46train_batch/s]

Epoch: [1][100/865] Elapsed 0m 40s (remain 5m 6s) Loss: 1.3092 Grad: 68607.3281  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:48<05:03,  2.45train_batch/s]

Epoch: [1][120/865] Elapsed 0m 48s (remain 4m 58s) Loss: 1.2854 Grad: 88101.0547  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:56<04:55,  2.45train_batch/s]

Epoch: [1][140/865] Elapsed 0m 56s (remain 4m 50s) Loss: 1.2729 Grad: 64448.2031  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [01:04<04:46,  2.45train_batch/s]

Epoch: [1][160/865] Elapsed 1m 4s (remain 4m 42s) Loss: 1.2560 Grad: 75836.1719  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:12<04:39,  2.45train_batch/s]

Epoch: [1][180/865] Elapsed 1m 12s (remain 4m 34s) Loss: 1.2409 Grad: 133254.2656  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:20<04:31,  2.45train_batch/s]

Epoch: [1][200/865] Elapsed 1m 20s (remain 4m 26s) Loss: 1.2196 Grad: 73795.5156  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:28<04:26,  2.42train_batch/s]

Epoch: [1][220/865] Elapsed 1m 28s (remain 4m 18s) Loss: 1.2126 Grad: 73374.3516  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:36<04:13,  2.46train_batch/s]

Epoch: [1][240/865] Elapsed 1m 36s (remain 4m 10s) Loss: 1.2090 Grad: 120176.2422  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:44<04:06,  2.45train_batch/s]

Epoch: [1][260/865] Elapsed 1m 44s (remain 4m 2s) Loss: 1.1944 Grad: 43996.3906  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:53<04:00,  2.43train_batch/s]

Epoch: [1][280/865] Elapsed 1m 53s (remain 3m 54s) Loss: 1.1781 Grad: 78308.0156  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [02:01<03:52,  2.43train_batch/s]

Epoch: [1][300/865] Elapsed 2m 1s (remain 3m 47s) Loss: 1.1623 Grad: 47546.0820  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [02:09<03:42,  2.44train_batch/s]

Epoch: [1][320/865] Elapsed 2m 9s (remain 3m 39s) Loss: 1.1458 Grad: 170130.4688  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:17<03:33,  2.45train_batch/s]

Epoch: [1][340/865] Elapsed 2m 17s (remain 3m 31s) Loss: 1.1314 Grad: 98836.8438  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:25<03:25,  2.46train_batch/s]

Epoch: [1][360/865] Elapsed 2m 25s (remain 3m 22s) Loss: 1.1233 Grad: 146181.3594  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:33<03:17,  2.46train_batch/s]

Epoch: [1][380/865] Elapsed 2m 33s (remain 3m 14s) Loss: 1.1136 Grad: 78836.9375  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:41<03:08,  2.46train_batch/s]

Epoch: [1][400/865] Elapsed 2m 41s (remain 3m 6s) Loss: 1.1055 Grad: 150734.4062  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:49<02:59,  2.48train_batch/s]

Epoch: [1][420/865] Elapsed 2m 49s (remain 2m 58s) Loss: 1.1003 Grad: 194311.0625  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:57<02:51,  2.48train_batch/s]

Epoch: [1][440/865] Elapsed 2m 57s (remain 2m 50s) Loss: 1.0962 Grad: 60714.5391  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [03:05<02:45,  2.44train_batch/s]

Epoch: [1][460/865] Elapsed 3m 5s (remain 2m 42s) Loss: 1.0880 Grad: 53653.4102  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [03:13<02:36,  2.45train_batch/s]

Epoch: [1][480/865] Elapsed 3m 13s (remain 2m 34s) Loss: 1.0796 Grad: 90384.1719  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [03:21<02:27,  2.46train_batch/s]

Epoch: [1][500/865] Elapsed 3m 21s (remain 2m 26s) Loss: 1.0745 Grad: 69978.8359  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:29<02:19,  2.47train_batch/s]

Epoch: [1][520/865] Elapsed 3m 29s (remain 2m 18s) Loss: 1.0678 Grad: 96604.7109  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:37<02:10,  2.48train_batch/s]

Epoch: [1][540/865] Elapsed 3m 37s (remain 2m 10s) Loss: 1.0602 Grad: 77545.8750  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:45<02:02,  2.48train_batch/s]

Epoch: [1][560/865] Elapsed 3m 45s (remain 2m 2s) Loss: 1.0550 Grad: 70582.0547  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:53<01:54,  2.48train_batch/s]

Epoch: [1][580/865] Elapsed 3m 53s (remain 1m 54s) Loss: 1.0460 Grad: 94638.1562  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [04:01<01:46,  2.48train_batch/s]

Epoch: [1][600/865] Elapsed 4m 1s (remain 1m 45s) Loss: 1.0406 Grad: 166584.7188  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [04:09<01:39,  2.46train_batch/s]

Epoch: [1][620/865] Elapsed 4m 9s (remain 1m 37s) Loss: 1.0347 Grad: 178186.9688  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [04:17<01:31,  2.45train_batch/s]

Epoch: [1][640/865] Elapsed 4m 17s (remain 1m 29s) Loss: 1.0325 Grad: 141400.4375  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [04:25<01:22,  2.48train_batch/s]

Epoch: [1][660/865] Elapsed 4m 25s (remain 1m 21s) Loss: 1.0314 Grad: 103550.7109  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:33<01:15,  2.45train_batch/s]

Epoch: [1][680/865] Elapsed 4m 33s (remain 1m 13s) Loss: 1.0296 Grad: 77194.6094  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:41<01:07,  2.44train_batch/s]

Epoch: [1][700/865] Elapsed 4m 41s (remain 1m 5s) Loss: 1.0242 Grad: 146220.4062  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:49<00:58,  2.45train_batch/s]

Epoch: [1][720/865] Elapsed 4m 49s (remain 0m 57s) Loss: 1.0193 Grad: 82757.3672  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:57<00:50,  2.48train_batch/s]

Epoch: [1][740/865] Elapsed 4m 57s (remain 0m 49s) Loss: 1.0150 Grad: 83156.0391  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [05:05<00:41,  2.48train_batch/s]

Epoch: [1][760/865] Elapsed 5m 5s (remain 0m 41s) Loss: 1.0122 Grad: 78339.4766  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [05:13<00:33,  2.48train_batch/s]

Epoch: [1][780/865] Elapsed 5m 13s (remain 0m 33s) Loss: 1.0078 Grad: 57546.2656  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [05:21<00:26,  2.45train_batch/s]

Epoch: [1][800/865] Elapsed 5m 21s (remain 0m 25s) Loss: 1.0055 Grad: 127723.1484  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [05:29<00:18,  2.43train_batch/s]

Epoch: [1][820/865] Elapsed 5m 29s (remain 0m 17s) Loss: 1.0008 Grad: 104842.7188  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [05:37<00:09,  2.44train_batch/s]

Epoch: [1][840/865] Elapsed 5m 37s (remain 0m 9s) Loss: 0.9989 Grad: 98131.1719  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:45<00:01,  2.45train_batch/s]

Epoch: [1][860/865] Elapsed 5m 45s (remain 0m 1s) Loss: 0.9971 Grad: 175712.5781  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:47<00:00,  2.49train_batch/s]


Epoch: [1][864/865] Elapsed 5m 47s (remain 0m 0s) Loss: 0.9962 Grad: 62647.0430  LR: 0.00002000  


Validation:   0%|          | 2/433 [00:00<01:13,  5.87valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 52s) Loss: 1.1609 


Validation:   5%|▌         | 22/433 [00:02<00:44,  9.22valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 47s) Loss: 0.7941 


Validation:  10%|▉         | 42/433 [00:04<00:41,  9.44valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 43s) Loss: 0.7432 


Validation:  14%|█▍        | 62/433 [00:06<00:40,  9.27valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 41s) Loss: 0.7480 


Validation:  19%|█▉        | 83/433 [00:09<00:35,  9.73valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 38s) Loss: 0.7661 


Validation:  24%|██▎       | 102/433 [00:11<00:35,  9.25valid_batch/s]

EVAL: [100/433] Elapsed 0m 11s (remain 0m 36s) Loss: 0.7843 


Validation:  28%|██▊       | 122/433 [00:13<00:33,  9.18valid_batch/s]

EVAL: [120/433] Elapsed 0m 13s (remain 0m 34s) Loss: 0.7698 


Validation:  33%|███▎      | 143/433 [00:15<00:29,  9.96valid_batch/s]

EVAL: [140/433] Elapsed 0m 15s (remain 0m 31s) Loss: 0.7779 


Validation:  37%|███▋      | 162/433 [00:17<00:29,  9.23valid_batch/s]

EVAL: [160/433] Elapsed 0m 17s (remain 0m 29s) Loss: 0.7897 


Validation:  42%|████▏     | 182/433 [00:19<00:27,  9.22valid_batch/s]

EVAL: [180/433] Elapsed 0m 19s (remain 0m 27s) Loss: 0.7989 


Validation:  47%|████▋     | 202/433 [00:21<00:25,  9.16valid_batch/s]

EVAL: [200/433] Elapsed 0m 21s (remain 0m 25s) Loss: 0.8001 


Validation:  51%|█████▏    | 222/433 [00:24<00:22,  9.44valid_batch/s]

EVAL: [220/433] Elapsed 0m 23s (remain 0m 23s) Loss: 0.7993 


Validation:  56%|█████▌    | 242/433 [00:26<00:20,  9.23valid_batch/s]

EVAL: [240/433] Elapsed 0m 26s (remain 0m 20s) Loss: 0.8057 


Validation:  61%|██████    | 262/433 [00:28<00:18,  9.23valid_batch/s]

EVAL: [260/433] Elapsed 0m 28s (remain 0m 18s) Loss: 0.8069 


Validation:  65%|██████▌   | 282/433 [00:30<00:16,  9.29valid_batch/s]

EVAL: [280/433] Elapsed 0m 30s (remain 0m 16s) Loss: 0.8121 


Validation:  70%|██████▉   | 302/433 [00:32<00:14,  9.32valid_batch/s]

EVAL: [300/433] Elapsed 0m 32s (remain 0m 14s) Loss: 0.8153 


Validation:  74%|███████▍  | 322/433 [00:34<00:11,  9.31valid_batch/s]

EVAL: [320/433] Elapsed 0m 34s (remain 0m 12s) Loss: 0.8154 


Validation:  79%|███████▉  | 342/433 [00:37<00:09,  9.31valid_batch/s]

EVAL: [340/433] Elapsed 0m 36s (remain 0m 9s) Loss: 0.8194 


Validation:  84%|████████▎ | 362/433 [00:39<00:07,  9.46valid_batch/s]

EVAL: [360/433] Elapsed 0m 39s (remain 0m 7s) Loss: 0.8213 


Validation:  88%|████████▊ | 382/433 [00:41<00:05,  9.58valid_batch/s]

EVAL: [380/433] Elapsed 0m 41s (remain 0m 5s) Loss: 0.8161 


Validation:  93%|█████████▎| 402/433 [00:43<00:03,  9.29valid_batch/s]

EVAL: [400/433] Elapsed 0m 43s (remain 0m 3s) Loss: 0.8177 


Validation:  98%|█████████▊| 423/433 [00:45<00:00, 10.12valid_batch/s]

EVAL: [420/433] Elapsed 0m 45s (remain 0m 1s) Loss: 0.8148 


Validation: 100%|██████████| 433/433 [00:46<00:00,  9.29valid_batch/s]
Epoch 1 - avg_train_loss: 0.9962  avg_val_loss: 0.8230  time: 394s
Epoch 1 - Score: 0.7936
Epoch 1 - Save Best Score: 0.7936 Model


EVAL: [432/433] Elapsed 0m 46s (remain 0m 0s) Loss: 0.8230 


Train:   0%|          | 1/865 [00:00<07:34,  1.90train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 34s) Loss: 0.9856 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:08<05:43,  2.46train_batch/s]

Epoch: [2][20/865] Elapsed 0m 8s (remain 5m 41s) Loss: 0.8635 Grad: 340313.5000  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:16<05:33,  2.47train_batch/s]

Epoch: [2][40/865] Elapsed 0m 16s (remain 5m 31s) Loss: 0.8360 Grad: 226121.5625  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:24<05:25,  2.47train_batch/s]

Epoch: [2][60/865] Elapsed 0m 24s (remain 5m 22s) Loss: 0.8457 Grad: 160230.4531  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:32<05:20,  2.45train_batch/s]

Epoch: [2][80/865] Elapsed 0m 32s (remain 5m 15s) Loss: 0.8210 Grad: 197631.2812  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:40<05:08,  2.48train_batch/s]

Epoch: [2][100/865] Elapsed 0m 40s (remain 5m 6s) Loss: 0.8084 Grad: 166600.2188  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:48<05:03,  2.45train_batch/s]

Epoch: [2][120/865] Elapsed 0m 48s (remain 4m 58s) Loss: 0.8030 Grad: 157311.4375  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:56<04:51,  2.48train_batch/s]

Epoch: [2][140/865] Elapsed 0m 56s (remain 4m 50s) Loss: 0.7937 Grad: 322347.9062  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [01:04<04:47,  2.45train_batch/s]

Epoch: [2][160/865] Elapsed 1m 4s (remain 4m 42s) Loss: 0.7920 Grad: 206274.3594  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:12<04:39,  2.45train_batch/s]

Epoch: [2][180/865] Elapsed 1m 12s (remain 4m 34s) Loss: 0.7912 Grad: 129212.1250  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:20<04:30,  2.45train_batch/s]

Epoch: [2][200/865] Elapsed 1m 20s (remain 4m 26s) Loss: 0.7981 Grad: 182979.3125  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:28<04:23,  2.45train_batch/s]

Epoch: [2][220/865] Elapsed 1m 28s (remain 4m 18s) Loss: 0.7984 Grad: 198218.1406  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:36<04:17,  2.42train_batch/s]

Epoch: [2][240/865] Elapsed 1m 36s (remain 4m 10s) Loss: 0.8007 Grad: 124614.6406  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:45<04:08,  2.43train_batch/s]

Epoch: [2][260/865] Elapsed 1m 44s (remain 4m 2s) Loss: 0.7990 Grad: 271157.1875  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:53<03:59,  2.44train_batch/s]

Epoch: [2][280/865] Elapsed 1m 53s (remain 3m 55s) Loss: 0.7962 Grad: 180105.2031  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [02:01<03:50,  2.45train_batch/s]

Epoch: [2][300/865] Elapsed 2m 1s (remain 3m 47s) Loss: 0.7964 Grad: 116042.4609  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [02:09<03:42,  2.44train_batch/s]

Epoch: [2][320/865] Elapsed 2m 9s (remain 3m 39s) Loss: 0.7964 Grad: 128634.8281  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:17<03:33,  2.45train_batch/s]

Epoch: [2][340/865] Elapsed 2m 17s (remain 3m 30s) Loss: 0.7971 Grad: 161715.0625  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:25<03:23,  2.47train_batch/s]

Epoch: [2][360/865] Elapsed 2m 25s (remain 3m 22s) Loss: 0.7943 Grad: 306183.1875  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:33<03:15,  2.47train_batch/s]

Epoch: [2][380/865] Elapsed 2m 33s (remain 3m 14s) Loss: 0.7928 Grad: 179262.2812  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:41<03:07,  2.48train_batch/s]

Epoch: [2][400/865] Elapsed 2m 41s (remain 3m 6s) Loss: 0.7937 Grad: 235249.2500  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:49<02:59,  2.48train_batch/s]

Epoch: [2][420/865] Elapsed 2m 49s (remain 2m 58s) Loss: 0.7906 Grad: 156863.3125  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:57<02:53,  2.45train_batch/s]

Epoch: [2][440/865] Elapsed 2m 57s (remain 2m 50s) Loss: 0.7919 Grad: 148962.3438  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [03:05<02:43,  2.47train_batch/s]

Epoch: [2][460/865] Elapsed 3m 5s (remain 2m 42s) Loss: 0.7903 Grad: 189269.6875  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [03:13<02:30,  2.55train_batch/s]

Epoch: [2][480/865] Elapsed 3m 13s (remain 2m 34s) Loss: 0.7915 Grad: 290458.1875  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [03:20<02:28,  2.45train_batch/s]

Epoch: [2][500/865] Elapsed 3m 20s (remain 2m 26s) Loss: 0.7936 Grad: 191797.2031  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:29<02:20,  2.46train_batch/s]

Epoch: [2][520/865] Elapsed 3m 29s (remain 2m 18s) Loss: 0.7940 Grad: 324426.0312  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:37<02:11,  2.46train_batch/s]

Epoch: [2][540/865] Elapsed 3m 37s (remain 2m 9s) Loss: 0.7936 Grad: 117636.1016  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:45<02:03,  2.46train_batch/s]

Epoch: [2][560/865] Elapsed 3m 45s (remain 2m 1s) Loss: 0.7918 Grad: 176104.8750  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:53<01:55,  2.46train_batch/s]

Epoch: [2][580/865] Elapsed 3m 53s (remain 1m 53s) Loss: 0.7898 Grad: 145930.1719  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [04:01<01:47,  2.46train_batch/s]

Epoch: [2][600/865] Elapsed 4m 1s (remain 1m 45s) Loss: 0.7912 Grad: 119221.1562  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [04:09<01:39,  2.46train_batch/s]

Epoch: [2][620/865] Elapsed 4m 9s (remain 1m 37s) Loss: 0.7929 Grad: 184374.3281  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [04:17<01:32,  2.43train_batch/s]

Epoch: [2][640/865] Elapsed 4m 17s (remain 1m 29s) Loss: 0.7932 Grad: 199326.5312  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [04:25<01:22,  2.48train_batch/s]

Epoch: [2][660/865] Elapsed 4m 25s (remain 1m 21s) Loss: 0.7936 Grad: 138890.0469  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:33<01:14,  2.46train_batch/s]

Epoch: [2][680/865] Elapsed 4m 33s (remain 1m 13s) Loss: 0.7975 Grad: 118678.2734  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:41<01:07,  2.44train_batch/s]

Epoch: [2][700/865] Elapsed 4m 41s (remain 1m 5s) Loss: 0.8007 Grad: 132132.0938  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:49<00:58,  2.46train_batch/s]

Epoch: [2][720/865] Elapsed 4m 49s (remain 0m 57s) Loss: 0.8021 Grad: 86715.0156  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:57<00:50,  2.46train_batch/s]

Epoch: [2][740/865] Elapsed 4m 57s (remain 0m 49s) Loss: 0.8022 Grad: 102253.0078  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [05:05<00:43,  2.41train_batch/s]

Epoch: [2][760/865] Elapsed 5m 5s (remain 0m 41s) Loss: 0.8015 Grad: 91174.6875  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [05:13<00:34,  2.46train_batch/s]

Epoch: [2][780/865] Elapsed 5m 13s (remain 0m 33s) Loss: 0.8017 Grad: 157931.5938  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [05:21<00:24,  2.58train_batch/s]

Epoch: [2][800/865] Elapsed 5m 21s (remain 0m 25s) Loss: 0.8015 Grad: 150124.3594  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [05:29<00:17,  2.52train_batch/s]

Epoch: [2][820/865] Elapsed 5m 29s (remain 0m 17s) Loss: 0.8039 Grad: 121612.8594  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [05:37<00:10,  2.39train_batch/s]

Epoch: [2][840/865] Elapsed 5m 37s (remain 0m 9s) Loss: 0.8063 Grad: 142768.2656  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:45<00:01,  2.47train_batch/s]

Epoch: [2][860/865] Elapsed 5m 45s (remain 0m 1s) Loss: 0.8078 Grad: 152231.2656  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:46<00:00,  2.49train_batch/s]


Epoch: [2][864/865] Elapsed 5m 46s (remain 0m 0s) Loss: 0.8080 Grad: 107628.9297  LR: 0.00001998  


Validation:   0%|          | 2/433 [00:00<01:11,  6.00valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 48s) Loss: 1.0449 


Validation:   5%|▌         | 22/433 [00:02<00:45,  9.07valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 48s) Loss: 0.8218 


Validation:  10%|▉         | 42/433 [00:04<00:41,  9.53valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 43s) Loss: 0.7925 


Validation:  14%|█▍        | 62/433 [00:06<00:40,  9.15valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 41s) Loss: 0.7978 


Validation:  19%|█▉        | 83/433 [00:09<00:35,  9.74valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 38s) Loss: 0.8113 


Validation:  24%|██▎       | 102/433 [00:11<00:35,  9.33valid_batch/s]

EVAL: [100/433] Elapsed 0m 11s (remain 0m 36s) Loss: 0.8347 


Validation:  28%|██▊       | 122/433 [00:13<00:33,  9.22valid_batch/s]

EVAL: [120/433] Elapsed 0m 13s (remain 0m 34s) Loss: 0.8245 


Validation:  33%|███▎      | 143/433 [00:15<00:28, 10.01valid_batch/s]

EVAL: [140/433] Elapsed 0m 15s (remain 0m 31s) Loss: 0.8296 


Validation:  37%|███▋      | 162/433 [00:17<00:29,  9.24valid_batch/s]

EVAL: [160/433] Elapsed 0m 17s (remain 0m 29s) Loss: 0.8370 


Validation:  42%|████▏     | 182/433 [00:19<00:27,  9.24valid_batch/s]

EVAL: [180/433] Elapsed 0m 19s (remain 0m 27s) Loss: 0.8465 


Validation:  47%|████▋     | 202/433 [00:21<00:25,  9.22valid_batch/s]

EVAL: [200/433] Elapsed 0m 21s (remain 0m 25s) Loss: 0.8428 


Validation:  51%|█████▏    | 222/433 [00:24<00:21,  9.60valid_batch/s]

EVAL: [220/433] Elapsed 0m 23s (remain 0m 22s) Loss: 0.8403 


Validation:  56%|█████▌    | 242/433 [00:26<00:20,  9.34valid_batch/s]

EVAL: [240/433] Elapsed 0m 26s (remain 0m 20s) Loss: 0.8487 


Validation:  61%|██████    | 262/433 [00:28<00:18,  9.28valid_batch/s]

EVAL: [260/433] Elapsed 0m 28s (remain 0m 18s) Loss: 0.8442 


Validation:  65%|██████▌   | 282/433 [00:30<00:16,  9.35valid_batch/s]

EVAL: [280/433] Elapsed 0m 30s (remain 0m 16s) Loss: 0.8512 


Validation:  70%|██████▉   | 302/433 [00:32<00:14,  9.33valid_batch/s]

EVAL: [300/433] Elapsed 0m 32s (remain 0m 14s) Loss: 0.8555 


Validation:  74%|███████▍  | 322/433 [00:34<00:11,  9.29valid_batch/s]

EVAL: [320/433] Elapsed 0m 34s (remain 0m 12s) Loss: 0.8561 


Validation:  79%|███████▉  | 342/433 [00:36<00:09,  9.32valid_batch/s]

EVAL: [340/433] Elapsed 0m 36s (remain 0m 9s) Loss: 0.8580 


Validation:  84%|████████▎ | 362/433 [00:39<00:07,  9.42valid_batch/s]

EVAL: [360/433] Elapsed 0m 38s (remain 0m 7s) Loss: 0.8612 


Validation:  88%|████████▊ | 382/433 [00:41<00:05,  9.60valid_batch/s]

EVAL: [380/433] Elapsed 0m 40s (remain 0m 5s) Loss: 0.8525 


Validation:  93%|█████████▎| 402/433 [00:43<00:03,  9.27valid_batch/s]

EVAL: [400/433] Elapsed 0m 43s (remain 0m 3s) Loss: 0.8537 


Validation:  98%|█████████▊| 423/433 [00:45<00:00, 10.04valid_batch/s]

EVAL: [420/433] Elapsed 0m 45s (remain 0m 1s) Loss: 0.8510 


Validation: 100%|██████████| 433/433 [00:46<00:00,  9.31valid_batch/s]
Epoch 2 - avg_train_loss: 0.8080  avg_val_loss: 0.8567  time: 393s
Epoch 2 - Score: 0.7865


EVAL: [432/433] Elapsed 0m 46s (remain 0m 0s) Loss: 0.8567 


Score: 0.7936
Train:   0%|          | 1/865 [00:00<07:36,  1.89train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 7m 36s) Loss: 2.4937 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:08<05:41,  2.47train_batch/s]

Epoch: [1][20/865] Elapsed 0m 8s (remain 5m 39s) Loss: 1.7077 Grad: 50649.0039  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:16<05:31,  2.49train_batch/s]

Epoch: [1][40/865] Elapsed 0m 16s (remain 5m 29s) Loss: 1.5523 Grad: 47148.1406  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:24<05:29,  2.44train_batch/s]

Epoch: [1][60/865] Elapsed 0m 24s (remain 5m 22s) Loss: 1.4623 Grad: 28865.0840  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:32<05:17,  2.47train_batch/s]

Epoch: [1][80/865] Elapsed 0m 32s (remain 5m 14s) Loss: 1.3764 Grad: 31816.4375  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:40<05:09,  2.47train_batch/s]

Epoch: [1][100/865] Elapsed 0m 40s (remain 5m 6s) Loss: 1.3294 Grad: 24822.3281  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:48<05:03,  2.45train_batch/s]

Epoch: [1][120/865] Elapsed 0m 48s (remain 4m 58s) Loss: 1.2953 Grad: 16697.7344  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:56<04:53,  2.46train_batch/s]

Epoch: [1][140/865] Elapsed 0m 56s (remain 4m 50s) Loss: 1.2715 Grad: 39715.8242  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [01:04<04:46,  2.46train_batch/s]

Epoch: [1][160/865] Elapsed 1m 4s (remain 4m 42s) Loss: 1.2511 Grad: 59019.4766  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:12<04:35,  2.48train_batch/s]

Epoch: [1][180/865] Elapsed 1m 12s (remain 4m 34s) Loss: 1.2256 Grad: 56044.6602  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:20<04:27,  2.48train_batch/s]

Epoch: [1][200/865] Elapsed 1m 20s (remain 4m 25s) Loss: 1.2003 Grad: 102050.8828  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:28<04:19,  2.48train_batch/s]

Epoch: [1][220/865] Elapsed 1m 28s (remain 4m 17s) Loss: 1.1924 Grad: 46110.6953  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:36<04:11,  2.48train_batch/s]

Epoch: [1][240/865] Elapsed 1m 36s (remain 4m 9s) Loss: 1.1819 Grad: 37998.2891  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:44<04:05,  2.46train_batch/s]

Epoch: [1][260/865] Elapsed 1m 44s (remain 4m 1s) Loss: 1.1615 Grad: 36519.9258  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:52<03:55,  2.48train_batch/s]

Epoch: [1][280/865] Elapsed 1m 52s (remain 3m 53s) Loss: 1.1467 Grad: 120542.7734  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [02:00<03:47,  2.47train_batch/s]

Epoch: [1][300/865] Elapsed 2m 0s (remain 3m 45s) Loss: 1.1362 Grad: 57743.1562  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [02:08<03:38,  2.49train_batch/s]

Epoch: [1][320/865] Elapsed 2m 8s (remain 3m 37s) Loss: 1.1218 Grad: 38359.4023  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:16<03:31,  2.48train_batch/s]

Epoch: [1][340/865] Elapsed 2m 16s (remain 3m 29s) Loss: 1.1082 Grad: 48754.2305  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:24<03:25,  2.46train_batch/s]

Epoch: [1][360/865] Elapsed 2m 24s (remain 3m 21s) Loss: 1.1005 Grad: 43632.8086  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:32<03:17,  2.45train_batch/s]

Epoch: [1][380/865] Elapsed 2m 32s (remain 3m 13s) Loss: 1.0904 Grad: 39448.0156  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:40<03:09,  2.45train_batch/s]

Epoch: [1][400/865] Elapsed 2m 40s (remain 3m 5s) Loss: 1.0796 Grad: 69426.6328  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:48<03:01,  2.45train_batch/s]

Epoch: [1][420/865] Elapsed 2m 48s (remain 2m 57s) Loss: 1.0742 Grad: 66184.5469  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:56<02:53,  2.44train_batch/s]

Epoch: [1][440/865] Elapsed 2m 56s (remain 2m 49s) Loss: 1.0728 Grad: 94189.4453  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [03:04<02:44,  2.46train_batch/s]

Epoch: [1][460/865] Elapsed 3m 4s (remain 2m 41s) Loss: 1.0715 Grad: 57827.9922  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [03:12<02:35,  2.47train_batch/s]

Epoch: [1][480/865] Elapsed 3m 12s (remain 2m 33s) Loss: 1.0661 Grad: 50764.1328  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [03:20<02:27,  2.48train_batch/s]

Epoch: [1][500/865] Elapsed 3m 20s (remain 2m 25s) Loss: 1.0587 Grad: 47976.1953  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:28<02:19,  2.47train_batch/s]

Epoch: [1][520/865] Elapsed 3m 28s (remain 2m 17s) Loss: 1.0506 Grad: 23906.4551  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:36<02:11,  2.46train_batch/s]

Epoch: [1][540/865] Elapsed 3m 36s (remain 2m 9s) Loss: 1.0471 Grad: 31033.0156  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:44<02:03,  2.46train_batch/s]

Epoch: [1][560/865] Elapsed 3m 44s (remain 2m 1s) Loss: 1.0399 Grad: 50068.6055  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:52<01:55,  2.47train_batch/s]

Epoch: [1][580/865] Elapsed 3m 52s (remain 1m 53s) Loss: 1.0360 Grad: 54869.0938  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [04:00<01:47,  2.46train_batch/s]

Epoch: [1][600/865] Elapsed 4m 0s (remain 1m 45s) Loss: 1.0314 Grad: 31316.5000  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [04:08<01:37,  2.50train_batch/s]

Epoch: [1][620/865] Elapsed 4m 8s (remain 1m 37s) Loss: 1.0254 Grad: 70937.5469  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [04:16<01:25,  2.63train_batch/s]

Epoch: [1][640/865] Elapsed 4m 16s (remain 1m 29s) Loss: 1.0214 Grad: 51548.1289  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [04:23<01:17,  2.64train_batch/s]

Epoch: [1][660/865] Elapsed 4m 23s (remain 1m 21s) Loss: 1.0204 Grad: 42365.5547  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:31<01:09,  2.65train_batch/s]

Epoch: [1][680/865] Elapsed 4m 31s (remain 1m 13s) Loss: 1.0191 Grad: 60906.3438  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:38<01:02,  2.64train_batch/s]

Epoch: [1][700/865] Elapsed 4m 38s (remain 1m 5s) Loss: 1.0184 Grad: 44211.6680  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:45<00:54,  2.64train_batch/s]

Epoch: [1][720/865] Elapsed 4m 45s (remain 0m 57s) Loss: 1.0153 Grad: 49994.5391  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:53<00:46,  2.64train_batch/s]

Epoch: [1][740/865] Elapsed 4m 53s (remain 0m 49s) Loss: 1.0106 Grad: 34546.3164  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [05:01<00:42,  2.46train_batch/s]

Epoch: [1][760/865] Elapsed 5m 1s (remain 0m 41s) Loss: 1.0054 Grad: 27702.9375  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [05:09<00:33,  2.48train_batch/s]

Epoch: [1][780/865] Elapsed 5m 9s (remain 0m 33s) Loss: 1.0011 Grad: 34180.9414  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [05:17<00:25,  2.47train_batch/s]

Epoch: [1][800/865] Elapsed 5m 17s (remain 0m 25s) Loss: 0.9967 Grad: 38638.0469  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [05:25<00:17,  2.48train_batch/s]

Epoch: [1][820/865] Elapsed 5m 25s (remain 0m 17s) Loss: 0.9940 Grad: 45519.3711  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [05:33<00:09,  2.47train_batch/s]

Epoch: [1][840/865] Elapsed 5m 33s (remain 0m 9s) Loss: 0.9935 Grad: 44877.0977  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:41<00:01,  2.47train_batch/s]

Epoch: [1][860/865] Elapsed 5m 41s (remain 0m 1s) Loss: 0.9925 Grad: 31074.4785  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:42<00:00,  2.52train_batch/s]


Epoch: [1][864/865] Elapsed 5m 42s (remain 0m 0s) Loss: 0.9932 Grad: 57138.8711  LR: 0.00002000  


Validation:   0%|          | 2/433 [00:00<01:10,  6.15valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 43s) Loss: 0.8994 


Validation:   5%|▌         | 22/433 [00:02<00:42,  9.74valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 46s) Loss: 1.0196 


Validation:   9%|▉         | 41/433 [00:04<00:38, 10.12valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 42s) Loss: 0.9776 


Validation:  14%|█▍        | 62/433 [00:06<00:40,  9.27valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 40s) Loss: 1.0003 


Validation:  19%|█▉        | 82/433 [00:08<00:37,  9.25valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 38s) Loss: 0.9942 


Validation:  24%|██▎       | 102/433 [00:10<00:35,  9.21valid_batch/s]

EVAL: [100/433] Elapsed 0m 10s (remain 0m 35s) Loss: 0.9985 


Validation:  28%|██▊       | 122/433 [00:13<00:32,  9.44valid_batch/s]

EVAL: [120/433] Elapsed 0m 13s (remain 0m 33s) Loss: 0.9955 


Validation:  33%|███▎      | 142/433 [00:15<00:31,  9.22valid_batch/s]

EVAL: [140/433] Elapsed 0m 15s (remain 0m 31s) Loss: 0.9947 


Validation:  37%|███▋      | 162/433 [00:17<00:29,  9.11valid_batch/s]

EVAL: [160/433] Elapsed 0m 17s (remain 0m 29s) Loss: 0.9977 


Validation:  42%|████▏     | 182/433 [00:19<00:27,  9.21valid_batch/s]

EVAL: [180/433] Elapsed 0m 19s (remain 0m 27s) Loss: 0.9981 


Validation:  46%|████▋     | 201/433 [00:21<00:25,  9.21valid_batch/s]

EVAL: [200/433] Elapsed 0m 21s (remain 0m 25s) Loss: 1.0040 


Validation:  51%|█████▏    | 222/433 [00:24<00:22,  9.21valid_batch/s]

EVAL: [220/433] Elapsed 0m 23s (remain 0m 22s) Loss: 1.0013 


Validation:  56%|█████▌    | 242/433 [00:26<00:20,  9.35valid_batch/s]

EVAL: [240/433] Elapsed 0m 26s (remain 0m 20s) Loss: 0.9972 


Validation:  61%|██████    | 262/433 [00:28<00:18,  9.23valid_batch/s]

EVAL: [260/433] Elapsed 0m 28s (remain 0m 18s) Loss: 0.9967 


Validation:  65%|██████▌   | 282/433 [00:30<00:16,  9.28valid_batch/s]

EVAL: [280/433] Elapsed 0m 30s (remain 0m 16s) Loss: 0.9923 


Validation:  70%|██████▉   | 302/433 [00:32<00:14,  9.23valid_batch/s]

EVAL: [300/433] Elapsed 0m 32s (remain 0m 14s) Loss: 0.9960 


Validation:  74%|███████▍  | 322/433 [00:34<00:12,  9.17valid_batch/s]

EVAL: [320/433] Elapsed 0m 34s (remain 0m 12s) Loss: 0.9925 


Validation:  79%|███████▉  | 342/433 [00:36<00:09,  9.17valid_batch/s]

EVAL: [340/433] Elapsed 0m 36s (remain 0m 9s) Loss: 0.9878 


Validation:  84%|████████▎ | 362/433 [00:39<00:07,  9.22valid_batch/s]

EVAL: [360/433] Elapsed 0m 39s (remain 0m 7s) Loss: 0.9875 


Validation:  88%|████████▊ | 382/433 [00:41<00:05,  9.10valid_batch/s]

EVAL: [380/433] Elapsed 0m 41s (remain 0m 5s) Loss: 0.9914 


Validation:  93%|█████████▎| 402/433 [00:43<00:03,  9.12valid_batch/s]

EVAL: [400/433] Elapsed 0m 43s (remain 0m 3s) Loss: 0.9852 


Validation:  97%|█████████▋| 422/433 [00:45<00:01,  9.07valid_batch/s]

EVAL: [420/433] Elapsed 0m 45s (remain 0m 1s) Loss: 0.9882 


Validation: 100%|██████████| 433/433 [00:46<00:00,  9.24valid_batch/s]
Epoch 1 - avg_train_loss: 0.9932  avg_val_loss: 0.9913  time: 390s
Epoch 1 - Score: 0.7618
Epoch 1 - Save Best Score: 0.7618 Model


EVAL: [432/433] Elapsed 0m 46s (remain 0m 0s) Loss: 0.9913 


Train:   0%|          | 1/865 [00:00<07:37,  1.89train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 37s) Loss: 1.0078 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:08<05:41,  2.47train_batch/s]

Epoch: [2][20/865] Elapsed 0m 8s (remain 5m 40s) Loss: 0.9452 Grad: 330108.5625  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:16<05:23,  2.55train_batch/s]

Epoch: [2][40/865] Elapsed 0m 16s (remain 5m 27s) Loss: 0.8912 Grad: 213766.4219  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:24<05:13,  2.56train_batch/s]

Epoch: [2][60/865] Elapsed 0m 24s (remain 5m 16s) Loss: 0.8888 Grad: 123506.5469  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:31<05:08,  2.54train_batch/s]

Epoch: [2][80/865] Elapsed 0m 31s (remain 5m 7s) Loss: 0.8615 Grad: 118795.5312  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:39<04:59,  2.55train_batch/s]

Epoch: [2][100/865] Elapsed 0m 39s (remain 4m 58s) Loss: 0.8458 Grad: 146835.7031  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:47<04:55,  2.52train_batch/s]

Epoch: [2][120/865] Elapsed 0m 47s (remain 4m 50s) Loss: 0.8425 Grad: 169658.4688  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:55<04:43,  2.55train_batch/s]

Epoch: [2][140/865] Elapsed 0m 55s (remain 4m 42s) Loss: 0.8399 Grad: 148007.7344  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [01:02<04:38,  2.52train_batch/s]

Epoch: [2][160/865] Elapsed 1m 2s (remain 4m 34s) Loss: 0.8379 Grad: 130439.5547  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:10<04:29,  2.53train_batch/s]

Epoch: [2][180/865] Elapsed 1m 10s (remain 4m 26s) Loss: 0.8273 Grad: 216383.1719  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:18<04:18,  2.56train_batch/s]

Epoch: [2][200/865] Elapsed 1m 18s (remain 4m 18s) Loss: 0.8283 Grad: 350896.9688  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:26<04:14,  2.53train_batch/s]

Epoch: [2][220/865] Elapsed 1m 26s (remain 4m 11s) Loss: 0.8225 Grad: 147313.7656  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:33<04:06,  2.54train_batch/s]

Epoch: [2][240/865] Elapsed 1m 33s (remain 4m 3s) Loss: 0.8159 Grad: 206657.7188  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:41<03:55,  2.56train_batch/s]

Epoch: [2][260/865] Elapsed 1m 41s (remain 3m 55s) Loss: 0.8128 Grad: 307694.7188  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:49<03:48,  2.56train_batch/s]

Epoch: [2][280/865] Elapsed 1m 49s (remain 3m 47s) Loss: 0.8119 Grad: 204283.0312  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:57<03:43,  2.52train_batch/s]

Epoch: [2][300/865] Elapsed 1m 57s (remain 3m 39s) Loss: 0.8069 Grad: 110758.0391  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [02:04<03:32,  2.56train_batch/s]

Epoch: [2][320/865] Elapsed 2m 4s (remain 3m 31s) Loss: 0.8052 Grad: 222990.4219  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:12<03:28,  2.52train_batch/s]

Epoch: [2][340/865] Elapsed 2m 12s (remain 3m 23s) Loss: 0.8012 Grad: 163923.0781  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:20<03:19,  2.52train_batch/s]

Epoch: [2][360/865] Elapsed 2m 20s (remain 3m 16s) Loss: 0.7941 Grad: 99728.0625  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:28<03:10,  2.54train_batch/s]

Epoch: [2][380/865] Elapsed 2m 28s (remain 3m 8s) Loss: 0.7941 Grad: 189837.1250  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:35<02:56,  2.62train_batch/s]

Epoch: [2][400/865] Elapsed 2m 35s (remain 3m 0s) Loss: 0.7946 Grad: 74792.4375  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:43<02:40,  2.76train_batch/s]

Epoch: [2][420/865] Elapsed 2m 43s (remain 2m 51s) Loss: 0.7967 Grad: 95812.3047  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:50<02:41,  2.63train_batch/s]

Epoch: [2][440/865] Elapsed 2m 50s (remain 2m 44s) Loss: 0.7964 Grad: 61578.4531  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:57<02:27,  2.74train_batch/s]

Epoch: [2][460/865] Elapsed 2m 57s (remain 2m 35s) Loss: 0.8008 Grad: 59549.0859  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [03:04<02:18,  2.77train_batch/s]

Epoch: [2][480/865] Elapsed 3m 4s (remain 2m 27s) Loss: 0.8010 Grad: 154645.4688  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [03:12<02:13,  2.74train_batch/s]

Epoch: [2][500/865] Elapsed 3m 12s (remain 2m 19s) Loss: 0.8052 Grad: 122914.1797  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:19<02:03,  2.78train_batch/s]

Epoch: [2][520/865] Elapsed 3m 19s (remain 2m 11s) Loss: 0.8069 Grad: 73757.3516  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:26<01:56,  2.77train_batch/s]

Epoch: [2][540/865] Elapsed 3m 26s (remain 2m 3s) Loss: 0.8076 Grad: 67839.7188  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:33<01:49,  2.77train_batch/s]

Epoch: [2][560/865] Elapsed 3m 33s (remain 1m 55s) Loss: 0.8076 Grad: 39733.8086  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:40<01:42,  2.78train_batch/s]

Epoch: [2][580/865] Elapsed 3m 40s (remain 1m 47s) Loss: 0.8072 Grad: 75769.7734  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:47<01:35,  2.77train_batch/s]

Epoch: [2][600/865] Elapsed 3m 47s (remain 1m 40s) Loss: 0.8095 Grad: 61470.9727  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:54<01:27,  2.77train_batch/s]

Epoch: [2][620/865] Elapsed 3m 54s (remain 1m 32s) Loss: 0.8090 Grad: 96809.7578  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [04:02<01:20,  2.77train_batch/s]

Epoch: [2][640/865] Elapsed 4m 1s (remain 1m 24s) Loss: 0.8075 Grad: 59278.7969  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [04:09<01:13,  2.78train_batch/s]

Epoch: [2][660/865] Elapsed 4m 9s (remain 1m 16s) Loss: 0.8060 Grad: 59809.1211  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:16<01:06,  2.77train_batch/s]

Epoch: [2][680/865] Elapsed 4m 16s (remain 1m 9s) Loss: 0.8085 Grad: 231670.8750  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:23<00:59,  2.78train_batch/s]

Epoch: [2][700/865] Elapsed 4m 23s (remain 1m 1s) Loss: 0.8084 Grad: 71031.6797  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:30<00:51,  2.77train_batch/s]

Epoch: [2][720/865] Elapsed 4m 30s (remain 0m 54s) Loss: 0.8076 Grad: 92149.0469  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:37<00:44,  2.78train_batch/s]

Epoch: [2][740/865] Elapsed 4m 37s (remain 0m 46s) Loss: 0.8057 Grad: 78358.3672  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:44<00:37,  2.77train_batch/s]

Epoch: [2][760/865] Elapsed 4m 44s (remain 0m 38s) Loss: 0.8036 Grad: 262562.5625  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:51<00:30,  2.77train_batch/s]

Epoch: [2][780/865] Elapsed 4m 51s (remain 0m 31s) Loss: 0.8021 Grad: 59548.5938  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:58<00:23,  2.78train_batch/s]

Epoch: [2][800/865] Elapsed 4m 58s (remain 0m 23s) Loss: 0.8004 Grad: 97298.9297  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [05:05<00:15,  2.78train_batch/s]

Epoch: [2][820/865] Elapsed 5m 5s (remain 0m 16s) Loss: 0.7989 Grad: 118144.8516  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [05:13<00:08,  2.78train_batch/s]

Epoch: [2][840/865] Elapsed 5m 13s (remain 0m 8s) Loss: 0.7977 Grad: 101079.5000  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:20<00:01,  2.78train_batch/s]

Epoch: [2][860/865] Elapsed 5m 20s (remain 0m 1s) Loss: 0.7987 Grad: 78312.4922  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:21<00:00,  2.69train_batch/s]


Epoch: [2][864/865] Elapsed 5m 21s (remain 0m 0s) Loss: 0.7982 Grad: 91518.0703  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:54,  7.89valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 32s) Loss: 0.6549 


Validation:   5%|▌         | 23/433 [00:02<00:38, 10.60valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 0.6191 


Validation:  10%|▉         | 43/433 [00:04<00:35, 10.97valid_batch/s]

EVAL: [40/433] Elapsed 0m 3s (remain 0m 37s) Loss: 0.7143 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.43valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 35s) Loss: 0.7981 


Validation:  19%|█▉        | 83/433 [00:07<00:33, 10.42valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 33s) Loss: 0.7936 


Validation:  24%|██▍       | 103/433 [00:09<00:31, 10.44valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.8404 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.55valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 29s) Loss: 0.8480 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.42valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 27s) Loss: 0.8619 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.40valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.8682 


Validation:  42%|████▏     | 183/433 [00:17<00:24, 10.41valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.8841 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.60valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.8980 


Validation:  52%|█████▏    | 223/433 [00:21<00:20, 10.43valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.8975 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.53valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.8967 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.52valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.9002 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.53valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8936 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.46valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.9035 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.52valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.8959 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.42valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.8982 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.43valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.8986 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.44valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.9035 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.47valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.8997 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.41valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.8980 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.46valid_batch/s]
Epoch 2 - avg_train_loss: 0.7982  avg_val_loss: 0.9080  time: 363s
Epoch 2 - Score: 0.7499


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.9080 


Score: 0.7618
Train:   0%|          | 1/865 [00:00<06:49,  2.11train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 6m 49s) Loss: 1.8134 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:03,  2.78train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 2s) Loss: 1.5935 Grad: 43242.9219  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:56,  2.78train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 53s) Loss: 1.5097 Grad: 29385.3477  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:49,  2.78train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 46s) Loss: 1.4142 Grad: 39763.3086  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:42,  2.78train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 38s) Loss: 1.3756 Grad: 59675.6992  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:34,  2.78train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 31s) Loss: 1.3459 Grad: 45000.1719  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:43<04:27,  2.78train_batch/s]

Epoch: [1][120/865] Elapsed 0m 43s (remain 4m 24s) Loss: 1.3340 Grad: 52835.1680  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:50<04:20,  2.77train_batch/s]

Epoch: [1][140/865] Elapsed 0m 50s (remain 4m 17s) Loss: 1.3126 Grad: 53896.6250  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:57<04:14,  2.77train_batch/s]

Epoch: [1][160/865] Elapsed 0m 57s (remain 4m 10s) Loss: 1.2886 Grad: 93329.8828  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:06,  2.77train_batch/s]

Epoch: [1][180/865] Elapsed 1m 4s (remain 4m 3s) Loss: 1.2700 Grad: 59699.2422  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:59,  2.78train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 56s) Loss: 1.2559 Grad: 58329.3594  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:52,  2.78train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 48s) Loss: 1.2386 Grad: 73792.1328  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:44,  2.77train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 41s) Loss: 1.2293 Grad: 65925.1953  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:37,  2.77train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 34s) Loss: 1.2152 Grad: 58216.1133  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:30,  2.77train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 27s) Loss: 1.1953 Grad: 144438.1250  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:23,  2.77train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 20s) Loss: 1.1808 Grad: 58115.6523  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:54<03:16,  2.77train_batch/s]

Epoch: [1][320/865] Elapsed 1m 54s (remain 3m 13s) Loss: 1.1699 Grad: 63536.2109  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:01<03:09,  2.77train_batch/s]

Epoch: [1][340/865] Elapsed 2m 1s (remain 3m 6s) Loss: 1.1607 Grad: 65521.6172  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:08<03:06,  2.70train_batch/s]

Epoch: [1][360/865] Elapsed 2m 8s (remain 2m 59s) Loss: 1.1475 Grad: 69694.9375  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:15<02:54,  2.77train_batch/s]

Epoch: [1][380/865] Elapsed 2m 15s (remain 2m 52s) Loss: 1.1390 Grad: 177397.7188  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:22<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 22s (remain 2m 45s) Loss: 1.1271 Grad: 73346.2109  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:29<02:39,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 29s (remain 2m 37s) Loss: 1.1254 Grad: 177521.4844  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:36<02:32,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 36s (remain 2m 30s) Loss: 1.1205 Grad: 92027.3672  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:43<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 43s (remain 2m 23s) Loss: 1.1135 Grad: 73759.5625  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 50s (remain 2m 16s) Loss: 1.1073 Grad: 76059.4141  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:58<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 58s (remain 2m 9s) Loss: 1.0996 Grad: 83167.0625  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:05<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 5s (remain 2m 2s) Loss: 1.0919 Grad: 104642.1719  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:12<01:56,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 12s (remain 1m 55s) Loss: 1.0841 Grad: 70116.3594  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:19<01:48,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 19s (remain 1m 47s) Loss: 1.0745 Grad: 100178.9219  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:26<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 26s (remain 1m 40s) Loss: 1.0674 Grad: 75275.3047  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:33<01:34,  2.80train_batch/s]

Epoch: [1][600/865] Elapsed 3m 33s (remain 1m 33s) Loss: 1.0614 Grad: 50831.7148  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:40<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 40s (remain 1m 26s) Loss: 1.0555 Grad: 78776.5156  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:47<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 47s (remain 1m 19s) Loss: 1.0531 Grad: 97377.3750  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:54<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 54s (remain 1m 12s) Loss: 1.0498 Grad: 215887.9219  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:01<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 1s (remain 1m 5s) Loss: 1.0486 Grad: 49769.8008  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:08<00:58,  2.80train_batch/s]

Epoch: [1][700/865] Elapsed 4m 8s (remain 0m 58s) Loss: 1.0455 Grad: 74250.0703  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:15<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 15s (remain 0m 51s) Loss: 1.0410 Grad: 65384.2891  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:22<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 22s (remain 0m 43s) Loss: 1.0363 Grad: 73064.9062  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:29<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 29s (remain 0m 36s) Loss: 1.0327 Grad: 132181.3438  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:37<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 37s (remain 0m 29s) Loss: 1.0295 Grad: 116167.3750  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:44<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 44s (remain 0m 22s) Loss: 1.0240 Grad: 181424.4688  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:51<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 51s (remain 0m 15s) Loss: 1.0185 Grad: 88340.5625  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:58<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 58s (remain 0m 8s) Loss: 1.0159 Grad: 108043.0781  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:05<00:01,  2.79train_batch/s]

Epoch: [1][860/865] Elapsed 5m 5s (remain 0m 1s) Loss: 1.0138 Grad: 109120.0391  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:06<00:00,  2.82train_batch/s]


Epoch: [1][864/865] Elapsed 5m 6s (remain 0m 0s) Loss: 1.0135 Grad: 105487.3984  LR: 0.00002000  


Validation:   1%|          | 3/433 [00:00<00:54,  7.82valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 35s) Loss: 1.5162 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.49valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 0.8271 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.47valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.9001 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.47valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.8947 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.47valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.9005 


Validation:  24%|██▍       | 103/433 [00:09<00:30, 10.71valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 32s) Loss: 0.8915 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.47valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 30s) Loss: 0.9000 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.60valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 0.8980 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.47valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.9129 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.48valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.9080 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.48valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.9001 


Validation:  52%|█████▏    | 223/433 [00:21<00:20, 10.48valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.9025 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.54valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.9081 


Validation:  61%|██████    | 263/433 [00:25<00:15, 10.75valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.9086 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.54valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.9089 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.50valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.9077 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.54valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.9096 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.75valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.9053 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.49valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.9039 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.46valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.9043 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.58valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.9042 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.68valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.9022 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.50valid_batch/s]
Epoch 1 - avg_train_loss: 1.0135  avg_val_loss: 0.9006  time: 348s
Epoch 1 - Score: 0.7676
Epoch 1 - Save Best Score: 0.7676 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.9006 


Train:   0%|          | 1/865 [00:00<06:57,  2.07train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 44s) Loss: 0.8759 Grad: 640162.1250  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 0.9321 Grad: 206787.5156  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 0.8704 Grad: 237117.6406  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.8695 Grad: 135565.6875  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.8623 Grad: 191137.4375  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8500 Grad: 111895.9609  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8252 Grad: 154298.4531  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8279 Grad: 165537.3438  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 0.8245 Grad: 183725.4531  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 0.8190 Grad: 164961.7188  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.8218 Grad: 112940.1094  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8201 Grad: 170835.1406  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8277 Grad: 104256.0859  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8273 Grad: 245267.1562  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8203 Grad: 147329.3750  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8186 Grad: 151626.8438  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8135 Grad: 228684.6250  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8074 Grad: 248196.9219  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.8033 Grad: 145480.4844  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.7994 Grad: 168958.8906  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 0.8007 Grad: 138114.6406  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 37s) Loss: 0.7976 Grad: 118182.5703  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.8004 Grad: 98380.5547  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:43<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 43s (remain 2m 22s) Loss: 0.7983 Grad: 85451.2656  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 0.7999 Grad: 151208.3906  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.7993 Grad: 249501.9531  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.7986 Grad: 103039.6094  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:55,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.7975 Grad: 141997.8281  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:49,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.7987 Grad: 323659.6875  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.7977 Grad: 144263.2500  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.7989 Grad: 260589.7188  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.7984 Grad: 271821.7188  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8004 Grad: 176567.6875  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8014 Grad: 284032.6250  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.8036 Grad: 155647.6719  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.8053 Grad: 242944.4062  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.8048 Grad: 121333.8281  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.8049 Grad: 247031.3438  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:29<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 29s (remain 0m 36s) Loss: 0.8056 Grad: 212061.6406  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 0.8046 Grad: 315864.0938  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 0.8025 Grad: 149446.5781  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.8010 Grad: 138330.7656  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8019 Grad: 314763.8125  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.81train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8026 Grad: 177460.9844  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8025 Grad: 301935.8438  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:55,  7.81valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 35s) Loss: 0.9693 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.48valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 0.7173 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.46valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.7780 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.48valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.7987 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.48valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.7992 


Validation:  24%|██▍       | 103/433 [00:09<00:30, 10.73valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 32s) Loss: 0.7982 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.46valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 30s) Loss: 0.7950 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.59valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 0.7822 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.47valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.7957 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.48valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.7916 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.46valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.7898 


Validation:  52%|█████▏    | 223/433 [00:21<00:20, 10.47valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.7941 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.54valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.7887 


Validation:  61%|██████    | 263/433 [00:25<00:15, 10.76valid_batch/s]

EVAL: [260/433] Elapsed 0m 25s (remain 0m 16s) Loss: 0.7931 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.55valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.7937 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.51valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.7949 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.55valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.7995 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.76valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.7948 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.50valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.7997 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.44valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.7989 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.59valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.7986 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.70valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.8010 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.49valid_batch/s]
Epoch 2 - avg_train_loss: 0.8025  avg_val_loss: 0.8022  time: 347s
Epoch 2 - Score: 0.8042
Epoch 2 - Save Best Score: 0.8042 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.8022 


Score: 0.8042
Train:   0%|          | 1/865 [00:00<06:48,  2.12train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 6m 48s) Loss: 2.0566 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 0s) Loss: 1.6177 Grad: 81361.2500  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.4878 Grad: 47839.0938  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 1.4281 Grad: 78381.4609  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.80train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.3770 Grad: 79957.6094  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.3459 Grad: 53720.7852  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.3038 Grad: 62236.6523  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.2799 Grad: 50866.0586  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 1.2589 Grad: 77666.5234  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [1][180/865] Elapsed 1m 3s (remain 4m 1s) Loss: 1.2417 Grad: 79973.4844  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:58,  2.79train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.2190 Grad: 91866.9844  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.2117 Grad: 87019.1094  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:46,  2.76train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.1965 Grad: 99896.1406  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:45,  2.68train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 34s) Loss: 1.1768 Grad: 115953.9844  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:31,  2.76train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 27s) Loss: 1.1655 Grad: 112162.5391  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:24,  2.75train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 20s) Loss: 1.1524 Grad: 59589.5664  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:16,  2.77train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 13s) Loss: 1.1379 Grad: 48345.4297  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:01<03:23,  2.58train_batch/s]

Epoch: [1][340/865] Elapsed 2m 1s (remain 3m 6s) Loss: 1.1318 Grad: 185609.7500  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:09<03:16,  2.57train_batch/s]

Epoch: [1][360/865] Elapsed 2m 8s (remain 3m 0s) Loss: 1.1195 Grad: 135361.3125  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:16<03:07,  2.58train_batch/s]

Epoch: [1][380/865] Elapsed 2m 16s (remain 2m 53s) Loss: 1.1095 Grad: 89095.2188  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:24<03:04,  2.51train_batch/s]

Epoch: [1][400/865] Elapsed 2m 24s (remain 2m 47s) Loss: 1.0983 Grad: 193856.8750  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:32<02:53,  2.57train_batch/s]

Epoch: [1][420/865] Elapsed 2m 32s (remain 2m 40s) Loss: 1.0964 Grad: 79250.5156  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:39<02:47,  2.53train_batch/s]

Epoch: [1][440/865] Elapsed 2m 39s (remain 2m 33s) Loss: 1.0909 Grad: 83366.5859  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:47<02:40,  2.52train_batch/s]

Epoch: [1][460/865] Elapsed 2m 47s (remain 2m 26s) Loss: 1.0830 Grad: 86657.2500  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:55<02:28,  2.58train_batch/s]

Epoch: [1][480/865] Elapsed 2m 55s (remain 2m 20s) Loss: 1.0817 Grad: 106375.5156  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [03:03<02:18,  2.62train_batch/s]

Epoch: [1][500/865] Elapsed 3m 3s (remain 2m 12s) Loss: 1.0757 Grad: 95427.6953  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:10<02:06,  2.72train_batch/s]

Epoch: [1][520/865] Elapsed 3m 10s (remain 2m 5s) Loss: 1.0681 Grad: 78708.8828  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:17<01:57,  2.77train_batch/s]

Epoch: [1][540/865] Elapsed 3m 17s (remain 1m 58s) Loss: 1.0599 Grad: 101896.3828  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:24<01:54,  2.66train_batch/s]

Epoch: [1][560/865] Elapsed 3m 24s (remain 1m 51s) Loss: 1.0530 Grad: 67947.7188  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:32<01:53,  2.51train_batch/s]

Epoch: [1][580/865] Elapsed 3m 32s (remain 1m 43s) Loss: 1.0474 Grad: 57217.0352  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:40<01:43,  2.55train_batch/s]

Epoch: [1][600/865] Elapsed 3m 40s (remain 1m 36s) Loss: 1.0425 Grad: 66189.0156  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:48<01:35,  2.56train_batch/s]

Epoch: [1][620/865] Elapsed 3m 48s (remain 1m 29s) Loss: 1.0357 Grad: 74044.6172  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:55<01:26,  2.58train_batch/s]

Epoch: [1][640/865] Elapsed 3m 55s (remain 1m 22s) Loss: 1.0336 Grad: 96774.0078  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [04:03<01:18,  2.59train_batch/s]

Epoch: [1][660/865] Elapsed 4m 3s (remain 1m 15s) Loss: 1.0318 Grad: 82526.6797  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:11<01:11,  2.57train_batch/s]

Epoch: [1][680/865] Elapsed 4m 11s (remain 1m 7s) Loss: 1.0282 Grad: 98597.0312  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:18<01:04,  2.54train_batch/s]

Epoch: [1][700/865] Elapsed 4m 18s (remain 1m 0s) Loss: 1.0229 Grad: 69842.0391  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:26<00:55,  2.57train_batch/s]

Epoch: [1][720/865] Elapsed 4m 26s (remain 0m 53s) Loss: 1.0178 Grad: 96546.6562  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:34<00:47,  2.59train_batch/s]

Epoch: [1][740/865] Elapsed 4m 34s (remain 0m 45s) Loss: 1.0137 Grad: 67966.4062  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:41<00:40,  2.56train_batch/s]

Epoch: [1][760/865] Elapsed 4m 41s (remain 0m 38s) Loss: 1.0103 Grad: 113672.1250  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:49<00:32,  2.58train_batch/s]

Epoch: [1][780/865] Elapsed 4m 49s (remain 0m 31s) Loss: 1.0075 Grad: 145598.3125  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:57<00:24,  2.57train_batch/s]

Epoch: [1][800/865] Elapsed 4m 57s (remain 0m 23s) Loss: 1.0051 Grad: 212550.6094  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [05:04<00:17,  2.53train_batch/s]

Epoch: [1][820/865] Elapsed 5m 4s (remain 0m 16s) Loss: 1.0021 Grad: 156866.8906  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [05:12<00:09,  2.57train_batch/s]

Epoch: [1][840/865] Elapsed 5m 12s (remain 0m 8s) Loss: 1.0016 Grad: 70957.2422  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:20<00:01,  2.58train_batch/s]

Epoch: [1][860/865] Elapsed 5m 20s (remain 0m 1s) Loss: 1.0006 Grad: 139230.5000  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:21<00:00,  2.69train_batch/s]


Epoch: [1][864/865] Elapsed 5m 21s (remain 0m 0s) Loss: 0.9999 Grad: 260076.4062  LR: 0.00002000  


Validation:   0%|          | 2/433 [00:00<01:07,  6.35valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 41s) Loss: 2.1982 


Validation:   5%|▌         | 22/433 [00:02<00:42,  9.67valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 45s) Loss: 1.2257 


Validation:  10%|▉         | 42/433 [00:04<00:40,  9.70valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 41s) Loss: 1.3037 


Validation:  14%|█▍        | 62/433 [00:06<00:38,  9.75valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 39s) Loss: 1.2905 


Validation:  19%|█▉        | 82/433 [00:08<00:35,  9.84valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 36s) Loss: 1.2440 


Validation:  24%|██▎       | 102/433 [00:10<00:33,  9.97valid_batch/s]

EVAL: [100/433] Elapsed 0m 10s (remain 0m 34s) Loss: 1.2579 


Validation:  28%|██▊       | 123/433 [00:12<00:30, 10.14valid_batch/s]

EVAL: [120/433] Elapsed 0m 12s (remain 0m 32s) Loss: 1.2656 


Validation:  33%|███▎      | 143/433 [00:14<00:27, 10.38valid_batch/s]

EVAL: [140/433] Elapsed 0m 14s (remain 0m 29s) Loss: 1.2627 


Validation:  37%|███▋      | 162/433 [00:16<00:27,  9.72valid_batch/s]

EVAL: [160/433] Elapsed 0m 16s (remain 0m 27s) Loss: 1.2670 


Validation:  42%|████▏     | 182/433 [00:18<00:25,  9.69valid_batch/s]

EVAL: [180/433] Elapsed 0m 18s (remain 0m 25s) Loss: 1.2884 


Validation:  47%|████▋     | 202/433 [00:20<00:23,  9.67valid_batch/s]

EVAL: [200/433] Elapsed 0m 20s (remain 0m 23s) Loss: 1.2887 


Validation:  52%|█████▏    | 223/433 [00:22<00:19, 10.56valid_batch/s]

EVAL: [220/433] Elapsed 0m 22s (remain 0m 21s) Loss: 1.3001 


Validation:  56%|█████▌    | 243/433 [00:24<00:18, 10.42valid_batch/s]

EVAL: [240/433] Elapsed 0m 24s (remain 0m 19s) Loss: 1.2822 


Validation:  61%|██████    | 263/433 [00:26<00:16, 10.44valid_batch/s]

EVAL: [260/433] Elapsed 0m 26s (remain 0m 17s) Loss: 1.2918 


Validation:  65%|██████▌   | 282/433 [00:28<00:15,  9.76valid_batch/s]

EVAL: [280/433] Elapsed 0m 28s (remain 0m 15s) Loss: 1.2976 


Validation:  70%|██████▉   | 302/433 [00:30<00:13,  9.56valid_batch/s]

EVAL: [300/433] Elapsed 0m 30s (remain 0m 13s) Loss: 1.3019 


Validation:  74%|███████▍  | 321/433 [00:32<00:10, 10.35valid_batch/s]

EVAL: [320/433] Elapsed 0m 32s (remain 0m 11s) Loss: 1.3017 


Validation:  79%|███████▉  | 342/433 [00:34<00:09,  9.68valid_batch/s]

EVAL: [340/433] Elapsed 0m 34s (remain 0m 9s) Loss: 1.2945 


Validation:  84%|████████▎ | 362/433 [00:36<00:07,  9.68valid_batch/s]

EVAL: [360/433] Elapsed 0m 36s (remain 0m 7s) Loss: 1.2867 


Validation:  88%|████████▊ | 382/433 [00:38<00:05,  9.71valid_batch/s]

EVAL: [380/433] Elapsed 0m 38s (remain 0m 5s) Loss: 1.3014 


Validation:  93%|█████████▎| 402/433 [00:40<00:03,  9.68valid_batch/s]

EVAL: [400/433] Elapsed 0m 40s (remain 0m 3s) Loss: 1.3053 


Validation:  97%|█████████▋| 422/433 [00:42<00:01,  9.69valid_batch/s]

EVAL: [420/433] Elapsed 0m 42s (remain 0m 1s) Loss: 1.3094 


Validation: 100%|██████████| 433/433 [00:43<00:00,  9.84valid_batch/s]
Epoch 1 - avg_train_loss: 0.9999  avg_val_loss: 1.3105  time: 366s
Epoch 1 - Score: 0.6163
Epoch 1 - Save Best Score: 0.6163 Model


EVAL: [432/433] Elapsed 0m 43s (remain 0m 0s) Loss: 1.3105 


Train:   0%|          | 1/865 [00:00<07:10,  2.01train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 10s) Loss: 1.5794 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:10,  2.72train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 12s) Loss: 1.0257 Grad: 114507.9141  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:15<05:01,  2.73train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 5m 1s) Loss: 0.9186 Grad: 164246.0938  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:22<04:52,  2.75train_batch/s]

Epoch: [2][60/865] Elapsed 0m 22s (remain 4m 52s) Loss: 0.8764 Grad: 146426.4219  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:29<04:45,  2.75train_batch/s]

Epoch: [2][80/865] Elapsed 0m 29s (remain 4m 44s) Loss: 0.8510 Grad: 97677.4062  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:36<04:56,  2.58train_batch/s]

Epoch: [2][100/865] Elapsed 0m 36s (remain 4m 39s) Loss: 0.8448 Grad: 243661.3750  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:44<04:49,  2.57train_batch/s]

Epoch: [2][120/865] Elapsed 0m 44s (remain 4m 34s) Loss: 0.8543 Grad: 165452.4219  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:52<04:42,  2.56train_batch/s]

Epoch: [2][140/865] Elapsed 0m 52s (remain 4m 28s) Loss: 0.8500 Grad: 254332.8125  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:59<04:33,  2.57train_batch/s]

Epoch: [2][160/865] Elapsed 0m 59s (remain 4m 22s) Loss: 0.8366 Grad: 155275.9219  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:07<04:16,  2.66train_batch/s]

Epoch: [2][180/865] Elapsed 1m 7s (remain 4m 14s) Loss: 0.8327 Grad: 120002.1094  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:14<04:01,  2.75train_batch/s]

Epoch: [2][200/865] Elapsed 1m 14s (remain 4m 6s) Loss: 0.8262 Grad: 105985.6953  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:22<04:00,  2.68train_batch/s]

Epoch: [2][220/865] Elapsed 1m 22s (remain 3m 58s) Loss: 0.8222 Grad: 123310.7031  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:29<03:53,  2.67train_batch/s]

Epoch: [2][240/865] Elapsed 1m 29s (remain 3m 51s) Loss: 0.8296 Grad: 53928.1836  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:36<03:45,  2.67train_batch/s]

Epoch: [2][260/865] Elapsed 1m 36s (remain 3m 43s) Loss: 0.8346 Grad: 138699.7656  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:44<03:46,  2.58train_batch/s]

Epoch: [2][280/865] Elapsed 1m 44s (remain 3m 36s) Loss: 0.8378 Grad: 73311.8516  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:52<03:38,  2.58train_batch/s]

Epoch: [2][300/865] Elapsed 1m 51s (remain 3m 29s) Loss: 0.8380 Grad: 74513.7188  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:59<03:28,  2.61train_batch/s]

Epoch: [2][320/865] Elapsed 1m 59s (remain 3m 22s) Loss: 0.8331 Grad: 57321.8711  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:07<03:24,  2.57train_batch/s]

Epoch: [2][340/865] Elapsed 2m 7s (remain 3m 15s) Loss: 0.8278 Grad: 195441.7031  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:14<03:14,  2.59train_batch/s]

Epoch: [2][360/865] Elapsed 2m 14s (remain 3m 8s) Loss: 0.8274 Grad: 98605.5312  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:22<03:08,  2.57train_batch/s]

Epoch: [2][380/865] Elapsed 2m 22s (remain 3m 1s) Loss: 0.8257 Grad: 139472.4844  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:30<02:57,  2.62train_batch/s]

Epoch: [2][400/865] Elapsed 2m 30s (remain 2m 53s) Loss: 0.8265 Grad: 90220.4141  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:37<02:41,  2.75train_batch/s]

Epoch: [2][420/865] Elapsed 2m 37s (remain 2m 46s) Loss: 0.8245 Grad: 102917.1406  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:44<02:34,  2.75train_batch/s]

Epoch: [2][440/865] Elapsed 2m 44s (remain 2m 38s) Loss: 0.8265 Grad: 118127.3125  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:52<02:27,  2.74train_batch/s]

Epoch: [2][460/865] Elapsed 2m 52s (remain 2m 30s) Loss: 0.8266 Grad: 81444.3984  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:59<02:28,  2.59train_batch/s]

Epoch: [2][480/865] Elapsed 2m 59s (remain 2m 23s) Loss: 0.8246 Grad: 107674.2500  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [03:07<02:16,  2.67train_batch/s]

Epoch: [2][500/865] Elapsed 3m 7s (remain 2m 15s) Loss: 0.8239 Grad: 83999.5000  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:14<02:03,  2.78train_batch/s]

Epoch: [2][520/865] Elapsed 3m 14s (remain 2m 8s) Loss: 0.8249 Grad: 100906.8672  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:21<01:57,  2.77train_batch/s]

Epoch: [2][540/865] Elapsed 3m 21s (remain 2m 0s) Loss: 0.8246 Grad: 100104.8906  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:28<01:54,  2.66train_batch/s]

Epoch: [2][560/865] Elapsed 3m 28s (remain 1m 53s) Loss: 0.8255 Grad: 138254.4688  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:36<01:43,  2.75train_batch/s]

Epoch: [2][580/865] Elapsed 3m 36s (remain 1m 45s) Loss: 0.8238 Grad: 59746.5352  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:43<01:35,  2.77train_batch/s]

Epoch: [2][600/865] Elapsed 3m 43s (remain 1m 38s) Loss: 0.8241 Grad: 73526.2422  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:50<01:27,  2.78train_batch/s]

Epoch: [2][620/865] Elapsed 3m 50s (remain 1m 30s) Loss: 0.8238 Grad: 96506.7891  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:57<01:20,  2.77train_batch/s]

Epoch: [2][640/865] Elapsed 3m 57s (remain 1m 22s) Loss: 0.8290 Grad: 102655.2734  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [04:04<01:13,  2.78train_batch/s]

Epoch: [2][660/865] Elapsed 4m 4s (remain 1m 15s) Loss: 0.8298 Grad: 136956.0938  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:11<01:06,  2.77train_batch/s]

Epoch: [2][680/865] Elapsed 4m 11s (remain 1m 7s) Loss: 0.8317 Grad: 64358.9297  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:18<00:58,  2.78train_batch/s]

Epoch: [2][700/865] Elapsed 4m 18s (remain 1m 0s) Loss: 0.8306 Grad: 188993.7188  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:25<00:51,  2.77train_batch/s]

Epoch: [2][720/865] Elapsed 4m 25s (remain 0m 53s) Loss: 0.8309 Grad: 91223.1406  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:33<00:44,  2.77train_batch/s]

Epoch: [2][740/865] Elapsed 4m 33s (remain 0m 45s) Loss: 0.8298 Grad: 78101.9844  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:40<00:37,  2.78train_batch/s]

Epoch: [2][760/865] Elapsed 4m 40s (remain 0m 38s) Loss: 0.8292 Grad: 86705.8281  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:47<00:30,  2.77train_batch/s]

Epoch: [2][780/865] Elapsed 4m 47s (remain 0m 30s) Loss: 0.8288 Grad: 57155.6602  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:54<00:23,  2.77train_batch/s]

Epoch: [2][800/865] Elapsed 4m 54s (remain 0m 23s) Loss: 0.8270 Grad: 100843.2891  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [05:01<00:15,  2.77train_batch/s]

Epoch: [2][820/865] Elapsed 5m 1s (remain 0m 16s) Loss: 0.8282 Grad: 65350.4141  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [05:08<00:08,  2.78train_batch/s]

Epoch: [2][840/865] Elapsed 5m 8s (remain 0m 8s) Loss: 0.8306 Grad: 110415.0000  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:15<00:01,  2.77train_batch/s]

Epoch: [2][860/865] Elapsed 5m 15s (remain 0m 1s) Loss: 0.8318 Grad: 96293.3750  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:17<00:00,  2.73train_batch/s]


Epoch: [2][864/865] Elapsed 5m 17s (remain 0m 0s) Loss: 0.8320 Grad: 99038.2812  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:55,  7.75valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 36s) Loss: 0.8188 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.35valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 42s) Loss: 0.8862 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.40valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.8959 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.46valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.8814 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.52valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.8585 


Validation:  24%|██▍       | 103/433 [00:09<00:30, 10.74valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 32s) Loss: 0.8586 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.46valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 30s) Loss: 0.8654 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.46valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 0.8748 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.49valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.8716 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.42valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.8679 


Validation:  47%|████▋     | 203/433 [00:19<00:22, 10.42valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.8674 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.76valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.8643 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.43valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.8665 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.56valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.8667 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.52valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8670 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.43valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.8723 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.72valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.8736 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.43valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.8707 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.44valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.8629 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.48valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.8670 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.43valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.8658 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.41valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.8632 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.45valid_batch/s]
Epoch 2 - avg_train_loss: 0.8320  avg_val_loss: 0.8636  time: 359s
Epoch 2 - Score: 0.7880
Epoch 2 - Save Best Score: 0.7880 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.8636 


Score: 0.7880
Score: 0.7799
