# 🏋️ Model Training

## ⚙️ Setup 

### 📚 Importing Libraries

Importing from packages

In [1]:
import os
import gc
import pandas as pd
import time
import numpy as np
import torch
from torch.optim import AdamW
from torch import nn
from transformers import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("../")

Importing user defined packages

In [3]:
from lib.config import Config
from lib.paths import Paths
from lib.model.deberta import CustomModel
from lib.model.epoch_functions import train_epoch, valid_epoch
from lib.model.utils import get_score
from lib.utils.utils import get_logger, seed_everything
from lib.data import read_data_loader_from_disk

In [4]:
seed_everything(Config.RANDOM_SEED)

## 📖 Definitions

### 🌎 Global Variables

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
LOGGER = get_logger(Paths.MODEL_OUTPUT_PATH)

### 🛠️ Functions

In [7]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in model.named_parameters() if "model" not in n],
            "lr": decoder_lr,
            "weight_decay": 0.0,
        },
    ]

    return optimizer_parameters

In [8]:
def get_scheduler(cfg: Config, optimizer, num_train_steps):
    if cfg.SCHEDULER == "linear":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
        )
    
    if cfg.SCHEDULER == "cosine":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
            num_cycles=cfg.NUM_CYCLES,
        )

In [9]:
def get_model_optimizer_and_scheduler(train_loader):
    model = CustomModel(Config, config_path=None, pretrained=True)
    torch.save(model.config, Paths.MODEL_OUTPUT_PATH + "/config.pth")
    model.to(device)

    optimizer = AdamW(
        get_optimizer_params(
            model,
            encoder_lr=Config.ENCODER_LR,
            decoder_lr=Config.DECODER_LR,
            weight_decay=Config.WEIGHT_DECAY,
        ),
        lr=Config.ENCODER_LR,
        eps=Config.EPS,
        betas=Config.BETAS,
    )

    num_train_steps = int(len(train_loader) / Config.BATCH_SIZE_TRAIN * Config.EPOCHS)
    scheduler = get_scheduler(Config, optimizer, num_train_steps)
    return model, optimizer, scheduler

In [10]:
def train_loop(fold):
    LOGGER.info(f"========== Fold: {fold} training ==========")

    # ======== DATA LOADER ==========
    train_loader, valid_loader = read_data_loader_from_disk(fold)
    valid_fold = pd.read_csv(os.path.join(Paths.DATA_LOADER_PATH, f"valid_{fold}.csv"))
    valid_labels = valid_fold["score"].values

    # ======== MODEL ==========
    model, optimizer, scheduler = get_model_optimizer_and_scheduler(train_loader)

    # ======= LOSS ==========
    # criterion = RMSELoss(reduction="mean") # nn.SmoothL1Loss(reduction='mean')
    criterion = nn.CrossEntropyLoss()
    softmax = nn.Softmax(dim=1)

    best_score = -np.inf
    # ====== ITERATE EPOCHS ========
    for epoch in range(Config.EPOCHS):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_loss = train_epoch(
            train_loader, model, criterion, optimizer, epoch, scheduler, device
        )

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(
            valid_loader, model, criterion, device
        )
        predictions = prediction_dict["predictions"]
        _, predictions = torch.max(softmax(torch.tensor(predictions)), dim=1)

        # ======= SCORING ==========
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "predictions": predictions},
                Paths.MODEL_OUTPUT_PATH
                + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
            )

    predictions = torch.load(
        Paths.MODEL_OUTPUT_PATH
        + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    valid_fold["pred_score"] = predictions

    del model, optimizer, scheduler, criterion, softmax
    torch.cuda.empty_cache()
    gc.collect()

    return valid_fold

In [11]:
def get_result(oof_df):
    labels = oof_df["score"].values
    preds = oof_df["pred_score"].values
    score = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}')

## 🏁 Start Training

In [12]:
if Config.TRAIN:
    oof_df = pd.DataFrame()

    for fold in range(Config.N_FOLDS):
        if fold in Config.TRAIN_FOLDS:
            _oof_df = train_loop(fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== Fold: {fold} result ==========")
            get_result(_oof_df)

    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    oof_df.to_csv(Paths.MODEL_OUTPUT_PATH + "/oof_df.csv", index=False)



Train:   0%|          | 1/865 [00:00<13:49,  1.04train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 13m 50s) Loss: 1.6131 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:08<05:18,  2.65train_batch/s]

Epoch: [1][20/865] Elapsed 0m 8s (remain 5m 36s) Loss: 1.5265 Grad: 59890.9727  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:15<04:49,  2.85train_batch/s]

Epoch: [1][40/865] Elapsed 0m 15s (remain 5m 11s) Loss: 1.4615 Grad: 529824.6875  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:22<04:42,  2.85train_batch/s]

Epoch: [1][60/865] Elapsed 0m 22s (remain 4m 55s) Loss: 1.4019 Grad: 64941.8203  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:29<04:35,  2.85train_batch/s]

Epoch: [1][80/865] Elapsed 0m 29s (remain 4m 44s) Loss: 1.3386 Grad: 73672.7500  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:36<04:29,  2.84train_batch/s]

Epoch: [1][100/865] Elapsed 0m 36s (remain 4m 34s) Loss: 1.3036 Grad: 45694.9648  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:43<04:22,  2.83train_batch/s]

Epoch: [1][120/865] Elapsed 0m 43s (remain 4m 25s) Loss: 1.2810 Grad: 81239.7969  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:50<04:15,  2.83train_batch/s]

Epoch: [1][140/865] Elapsed 0m 50s (remain 4m 17s) Loss: 1.2601 Grad: 64185.5742  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:57<04:09,  2.82train_batch/s]

Epoch: [1][160/865] Elapsed 0m 57s (remain 4m 10s) Loss: 1.2420 Grad: 48958.6445  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:02,  2.82train_batch/s]

Epoch: [1][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 1.2245 Grad: 44717.0781  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:55,  2.82train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 55s) Loss: 1.2201 Grad: 176241.7188  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:48,  2.81train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.2041 Grad: 106563.7266  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:42,  2.81train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.1956 Grad: 64564.3633  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:34,  2.81train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.1754 Grad: 68185.0781  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:28,  2.80train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.1590 Grad: 60382.9141  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:21,  2.80train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.1435 Grad: 57578.8398  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.80train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.1363 Grad: 124774.3125  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.80train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.1263 Grad: 114189.0469  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.80train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 57s) Loss: 1.1141 Grad: 57403.7305  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 50s) Loss: 1.1105 Grad: 87788.7969  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:45,  2.80train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 1.1018 Grad: 57056.1016  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:38,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 1.0944 Grad: 84374.3281  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.80train_batch/s]

Epoch: [1][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 1.0904 Grad: 291692.1250  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.80train_batch/s]

Epoch: [1][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 1.0870 Grad: 190214.4375  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.80train_batch/s]

Epoch: [1][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 1.0841 Grad: 83370.7891  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:56<02:10,  2.80train_batch/s]

Epoch: [1][500/865] Elapsed 2m 56s (remain 2m 8s) Loss: 1.0750 Grad: 108225.2344  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:03<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 3s (remain 2m 1s) Loss: 1.0683 Grad: 113347.5234  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:10<01:55,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 10s (remain 1m 54s) Loss: 1.0617 Grad: 99211.4688  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.80train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.0562 Grad: 135396.3281  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.0491 Grad: 108749.2188  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.80train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.0415 Grad: 73052.1719  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.0359 Grad: 137361.0469  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.80train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0340 Grad: 210132.1094  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0304 Grad: 106387.3984  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 4s) Loss: 1.0257 Grad: 115122.3203  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 1.0219 Grad: 61017.3906  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 1.0198 Grad: 78422.9453  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 1.0153 Grad: 107688.0312  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 1.0107 Grad: 92189.8906  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 1.0049 Grad: 37899.4727  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 0.9992 Grad: 63518.2266  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:49<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 49s (remain 0m 15s) Loss: 0.9947 Grad: 194957.2188  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:56<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 56s (remain 0m 8s) Loss: 0.9946 Grad: 98280.3594  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:03<00:01,  2.79train_batch/s]

Epoch: [1][860/865] Elapsed 5m 3s (remain 0m 1s) Loss: 0.9934 Grad: 43028.2656  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.9929 Grad: 37949.0117  LR: 0.00002000  


Validation:   1%|          | 3/433 [00:00<00:54,  7.91valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 32s) Loss: 0.8513 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.40valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 1.0543 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.46valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.9893 


Validation:  15%|█▍        | 63/433 [00:06<00:34, 10.59valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.9564 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.47valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.9557 


Validation:  24%|██▍       | 103/433 [00:09<00:31, 10.57valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.9396 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.46valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 30s) Loss: 0.9568 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.46valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 0.9759 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.44valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.9703 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.44valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.9647 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.48valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.9535 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.68valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.9564 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.48valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.9566 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.51valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.9470 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.52valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.9461 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.46valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.9481 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.46valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.9514 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.46valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.9481 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.46valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.9482 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.52valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.9476 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.45valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.9486 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.45valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.9486 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.47valid_batch/s]
Epoch 1 - avg_train_loss: 0.9929  avg_val_loss: 0.9504  time: 347s
Epoch 1 - Score: 0.6996
Epoch 1 - Save Best Score: 0.6996 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.9504 


Train:   0%|          | 1/865 [00:00<06:34,  2.19train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 34s) Loss: 1.0455 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 0.9622 Grad: 152137.5781  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 0.9116 Grad: 106934.6562  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:48,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.9078 Grad: 385520.0938  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.8826 Grad: 145484.2344  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8813 Grad: 174148.4531  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8724 Grad: 131615.6406  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8583 Grad: 92378.5000  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:57<04:12,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 0.8534 Grad: 169523.4062  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 0.8411 Grad: 260821.2812  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 55s) Loss: 0.8386 Grad: 180633.0312  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8326 Grad: 173399.7188  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8271 Grad: 86774.2422  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8272 Grad: 155125.6094  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8260 Grad: 117791.8594  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8265 Grad: 355215.7188  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8206 Grad: 161986.0156  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8209 Grad: 113364.1797  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.8189 Grad: 138061.1250  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.8173 Grad: 176257.6250  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 0.8179 Grad: 199806.1719  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 37s) Loss: 0.8175 Grad: 69718.8125  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:36<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 36s (remain 2m 29s) Loss: 0.8182 Grad: 172901.1250  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:43<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 43s (remain 2m 22s) Loss: 0.8215 Grad: 89966.8359  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 0.8194 Grad: 151592.0156  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.8195 Grad: 260302.5312  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.8154 Grad: 121098.5625  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.8147 Grad: 131092.2969  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.8150 Grad: 147195.7188  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.8118 Grad: 48849.2422  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.78train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.8106 Grad: 125458.1641  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.8118 Grad: 107888.3906  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8123 Grad: 54839.7266  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8150 Grad: 164213.0156  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.8165 Grad: 77296.4453  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.80train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.8168 Grad: 68416.3984  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.8194 Grad: 84549.4844  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:22<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 22s (remain 0m 43s) Loss: 0.8189 Grad: 45775.0391  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:29<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 29s (remain 0m 36s) Loss: 0.8185 Grad: 65222.7461  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 0.8179 Grad: 45445.3359  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 0.8183 Grad: 62973.9648  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.8174 Grad: 84612.0234  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8176 Grad: 219146.8281  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8180 Grad: 50890.7930  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8182 Grad: 77208.9844  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:54,  7.94valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 32s) Loss: 0.4920 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.39valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 1.0473 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.45valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 1.0236 


Validation:  15%|█▍        | 63/433 [00:06<00:34, 10.59valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.9935 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.47valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.9932 


Validation:  24%|██▍       | 103/433 [00:09<00:31, 10.58valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.9876 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.46valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 30s) Loss: 0.9980 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.46valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 1.0043 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.48valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.9940 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.44valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.9942 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.49valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.9897 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.67valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.9923 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.47valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.9848 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.50valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.9759 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.52valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.9766 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.47valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.9746 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.46valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.9725 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.46valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.9692 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.46valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.9666 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.51valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.9602 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.46valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.9597 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.48valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.9608 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.48valid_batch/s]
Epoch 2 - avg_train_loss: 0.8182  avg_val_loss: 0.9643  time: 347s
Epoch 2 - Score: 0.7300
Epoch 2 - Save Best Score: 0.7300 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.9643 


Score: 0.7300
Train:   0%|          | 1/865 [00:00<06:58,  2.06train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 6m 59s) Loss: 1.9007 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:01,  2.80train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.5815 Grad: 76426.7188  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.5175 Grad: 67681.0312  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 1.4613 Grad: 59973.5586  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:41,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.4044 Grad: 86653.0547  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:34,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.3627 Grad: 89074.9766  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.3317 Grad: 64115.3438  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.3035 Grad: 65152.5469  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 1.2775 Grad: 57101.1875  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [1][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 1.2734 Grad: 121665.0234  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.2598 Grad: 120576.2578  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.2460 Grad: 68590.2266  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.2281 Grad: 101849.7578  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.2145 Grad: 180147.7188  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.2019 Grad: 234172.0625  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.1911 Grad: 74642.9844  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.78train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.1804 Grad: 60638.3203  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:08,  2.78train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.1682 Grad: 59115.3906  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 1.1588 Grad: 63487.4375  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 1.1531 Grad: 101705.5781  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 1.1496 Grad: 122689.3594  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 37s) Loss: 1.1413 Grad: 98983.4141  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:36<02:32,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 36s (remain 2m 30s) Loss: 1.1377 Grad: 83741.3984  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:43<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 43s (remain 2m 22s) Loss: 1.1302 Grad: 73437.8984  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 1.1238 Grad: 119392.6094  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 1.1161 Grad: 56992.1289  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 1.1085 Grad: 53297.9727  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:11<01:55,  2.80train_batch/s]

Epoch: [1][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 1.1013 Grad: 95592.2891  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:49,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.0940 Grad: 65582.0547  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.0879 Grad: 92722.4141  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.0807 Grad: 96100.0781  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.0721 Grad: 62108.7305  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0687 Grad: 71214.9844  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0692 Grad: 64021.4453  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 1.0650 Grad: 189939.8125  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 58s) Loss: 1.0630 Grad: 89816.4375  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:15<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 15s (remain 0m 50s) Loss: 1.0576 Grad: 215562.3125  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:22<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 22s (remain 0m 43s) Loss: 1.0535 Grad: 73089.0781  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:29<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 29s (remain 0m 36s) Loss: 1.0491 Grad: 87720.0078  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 1.0458 Grad: 60143.3750  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 1.0408 Grad: 49061.0312  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 1.0373 Grad: 103713.9141  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 1.0333 Grad: 125064.1797  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [1][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 1.0313 Grad: 72858.5156  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:06<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 1.0303 Grad: 138281.1875  LR: 0.00002000  


Validation:   1%|          | 3/433 [00:00<00:55,  7.68valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 39s) Loss: 1.1382 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.40valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 42s) Loss: 1.0800 


Validation:  10%|▉         | 43/433 [00:04<00:36, 10.58valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 1.0542 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.51valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 1.0221 


Validation:  19%|█▉        | 83/433 [00:08<00:32, 10.83valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 1.0558 


Validation:  24%|██▍       | 103/433 [00:09<00:31, 10.54valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 1.0636 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.46valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 29s) Loss: 1.0470 


Validation:  33%|███▎      | 143/433 [00:13<00:26, 10.98valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 1.0592 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.51valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 1.0658 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.48valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 1.0762 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.67valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 1.0712 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.67valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 1.0714 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.48valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 1.0802 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.47valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 1.0764 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.47valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 1.0794 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.47valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 1.0771 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.45valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 1.0777 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.48valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 1.0853 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.58valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 1.0817 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.75valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 1.0729 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.46valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 1.0816 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 11.13valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 1.0761 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.51valid_batch/s]
Epoch 1 - avg_train_loss: 1.0303  avg_val_loss: 1.0935  time: 347s
Epoch 1 - Score: 0.6501
Epoch 1 - Save Best Score: 0.6501 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 1.0935 


Train:   0%|          | 1/865 [00:00<06:55,  2.08train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 55s) Loss: 1.6100 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 2s) Loss: 1.0268 Grad: 172565.5469  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 53s) Loss: 0.9095 Grad: 164335.5000  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:48,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.8901 Grad: 112496.4688  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 38s) Loss: 0.8637 Grad: 157460.6094  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8618 Grad: 204349.2812  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:27,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8560 Grad: 190436.4844  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8575 Grad: 274234.6250  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:57<04:12,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 57s (remain 4m 9s) Loss: 0.8511 Grad: 147475.7969  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.78train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 0.8480 Grad: 302677.8125  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 55s) Loss: 0.8525 Grad: 164170.7812  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.80train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 48s) Loss: 0.8499 Grad: 313952.0000  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8489 Grad: 133289.3125  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8490 Grad: 94735.2188  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8505 Grad: 267500.5312  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8500 Grad: 117274.3828  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8498 Grad: 181702.1875  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:08,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8499 Grad: 111405.4219  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.8493 Grad: 118717.0000  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.8500 Grad: 84509.4688  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 0.8520 Grad: 65672.4297  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 37s) Loss: 0.8525 Grad: 81535.1641  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:36<02:32,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 36s (remain 2m 30s) Loss: 0.8567 Grad: 85338.8750  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:43<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 43s (remain 2m 22s) Loss: 0.8587 Grad: 49288.0234  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 0.8579 Grad: 48849.2227  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.8571 Grad: 107850.1094  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.8574 Grad: 137403.8594  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.8555 Grad: 72171.3125  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.8566 Grad: 144683.4531  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.8569 Grad: 89573.9922  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.8578 Grad: 75015.0312  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.8598 Grad: 154615.3906  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.78train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8598 Grad: 104994.7344  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8629 Grad: 169622.5312  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:06,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.8656 Grad: 36451.3047  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 58s) Loss: 0.8659 Grad: 51874.7695  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:15<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 15s (remain 0m 50s) Loss: 0.8654 Grad: 102954.1094  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:22<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 22s (remain 0m 43s) Loss: 0.8641 Grad: 63538.1641  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:29<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 29s (remain 0m 36s) Loss: 0.8604 Grad: 83423.5312  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 0.8616 Grad: 84635.2969  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 0.8590 Grad: 98206.9297  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.8591 Grad: 135678.7188  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8591 Grad: 64951.3594  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8610 Grad: 207294.9844  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:06<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8618 Grad: 60905.2812  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:56,  7.62valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 41s) Loss: 1.3172 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.41valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 42s) Loss: 0.8811 


Validation:  10%|▉         | 43/433 [00:04<00:36, 10.58valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.8240 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.51valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.8076 


Validation:  19%|█▉        | 83/433 [00:08<00:32, 10.84valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.8244 


Validation:  24%|██▍       | 103/433 [00:09<00:31, 10.53valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.8333 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.47valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 29s) Loss: 0.8162 


Validation:  33%|███▎      | 143/433 [00:13<00:26, 10.99valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 0.8236 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.51valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.8369 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.49valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.8474 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.66valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.8470 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.67valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.8485 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.48valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.8549 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.46valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.8540 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.46valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8605 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.46valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.8614 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.44valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.8630 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.46valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.8679 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.57valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.8671 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.74valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.8613 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.46valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.8673 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 11.14valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.8633 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.51valid_batch/s]
Epoch 2 - avg_train_loss: 0.8618  avg_val_loss: 0.8759  time: 347s
Epoch 2 - Score: 0.7477
Epoch 2 - Save Best Score: 0.7477 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.8759 


Score: 0.7477
Train:   0%|          | 1/865 [00:00<07:05,  2.03train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 7m 5s) Loss: 2.2606 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.6728 Grad: 43642.8438  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.5009 Grad: 66156.4844  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:48,  2.79train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 1.4031 Grad: 52728.2969  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.3383 Grad: 49522.0039  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.3033 Grad: 83319.2031  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.2729 Grad: 42366.5273  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.2553 Grad: 119643.4844  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:57<04:11,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 1.2304 Grad: 83957.6172  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:04,  2.79train_batch/s]

Epoch: [1][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 1.2186 Grad: 123893.1250  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:58,  2.79train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.2161 Grad: 253829.7500  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.2005 Grad: 83029.4766  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.1792 Grad: 52503.3516  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.1673 Grad: 106153.5156  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.1602 Grad: 261355.6719  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.1404 Grad: 85929.8672  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.1284 Grad: 78543.6094  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.1203 Grad: 87595.9531  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:01,  2.78train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 1.1127 Grad: 186703.8438  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<03:03,  2.64train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 1.1032 Grad: 56276.0938  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:22<02:57,  2.62train_batch/s]

Epoch: [1][400/865] Elapsed 2m 22s (remain 2m 44s) Loss: 1.0941 Grad: 129035.4141  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:29<02:39,  2.78train_batch/s]

Epoch: [1][420/865] Elapsed 2m 29s (remain 2m 37s) Loss: 1.0874 Grad: 90726.4844  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:36<02:33,  2.77train_batch/s]

Epoch: [1][440/865] Elapsed 2m 36s (remain 2m 30s) Loss: 1.0819 Grad: 79912.8672  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:43<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 43s (remain 2m 23s) Loss: 1.0766 Grad: 83310.7266  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 50s (remain 2m 16s) Loss: 1.0681 Grad: 96078.8672  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:58<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 58s (remain 2m 9s) Loss: 1.0621 Grad: 112504.6797  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:05<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 5s (remain 2m 2s) Loss: 1.0555 Grad: 221373.5312  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:12<01:55,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 12s (remain 1m 55s) Loss: 1.0505 Grad: 86167.9062  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:19<01:48,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 19s (remain 1m 47s) Loss: 1.0429 Grad: 199229.4688  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:26<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 26s (remain 1m 40s) Loss: 1.0373 Grad: 187093.7031  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:33<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 33s (remain 1m 33s) Loss: 1.0322 Grad: 297941.7812  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:40<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 40s (remain 1m 26s) Loss: 1.0270 Grad: 105098.1641  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:47<01:20,  2.80train_batch/s]

Epoch: [1][640/865] Elapsed 3m 47s (remain 1m 19s) Loss: 1.0247 Grad: 201088.5156  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:54<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 54s (remain 1m 12s) Loss: 1.0211 Grad: 116334.0703  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:01<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 1s (remain 1m 5s) Loss: 1.0177 Grad: 86842.9688  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:08<00:58,  2.80train_batch/s]

Epoch: [1][700/865] Elapsed 4m 8s (remain 0m 58s) Loss: 1.0156 Grad: 166605.9219  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:15<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 15s (remain 0m 51s) Loss: 1.0143 Grad: 77090.0469  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:22<00:44,  2.80train_batch/s]

Epoch: [1][740/865] Elapsed 4m 22s (remain 0m 43s) Loss: 1.0111 Grad: 95518.5781  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:29<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 29s (remain 0m 36s) Loss: 1.0064 Grad: 123656.9922  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 1.0021 Grad: 97022.2500  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:44<00:22,  2.80train_batch/s]

Epoch: [1][800/865] Elapsed 4m 44s (remain 0m 22s) Loss: 0.9972 Grad: 61018.8789  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:51<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 51s (remain 0m 15s) Loss: 0.9941 Grad: 247740.0781  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:58<00:08,  2.80train_batch/s]

Epoch: [1][840/865] Elapsed 4m 58s (remain 0m 8s) Loss: 0.9911 Grad: 82008.1172  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:05<00:01,  2.80train_batch/s]

Epoch: [1][860/865] Elapsed 5m 5s (remain 0m 1s) Loss: 0.9914 Grad: 51628.2695  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:06<00:00,  2.82train_batch/s]


Epoch: [1][864/865] Elapsed 5m 6s (remain 0m 0s) Loss: 0.9908 Grad: 142620.5781  LR: 0.00002000  


Validation:   1%|          | 3/433 [00:00<00:55,  7.72valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 37s) Loss: 0.4652 


Validation:   5%|▌         | 23/433 [00:02<00:38, 10.67valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 0.8044 


Validation:  10%|▉         | 43/433 [00:04<00:35, 11.03valid_batch/s]

EVAL: [40/433] Elapsed 0m 3s (remain 0m 37s) Loss: 0.8813 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.50valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 35s) Loss: 0.9333 


Validation:  19%|█▉        | 83/433 [00:07<00:33, 10.48valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 33s) Loss: 0.9425 


Validation:  24%|██▍       | 103/433 [00:09<00:31, 10.52valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.9805 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.64valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 29s) Loss: 0.9857 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.48valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 27s) Loss: 0.9923 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.45valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 25s) Loss: 0.9893 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.47valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.9995 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.65valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 1.0164 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.51valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 1.0214 


Validation:  56%|█████▌    | 243/433 [00:23<00:17, 10.60valid_batch/s]

EVAL: [240/433] Elapsed 0m 22s (remain 0m 18s) Loss: 1.0186 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.56valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 1.0285 


Validation:  65%|██████▌   | 283/433 [00:26<00:14, 10.58valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 1.0261 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.52valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 1.0317 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.58valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 1.0286 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.50valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 1.0317 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.48valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 1.0338 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.50valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 1.0366 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.53valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 1.0315 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.48valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 1.0315 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.51valid_batch/s]
Epoch 1 - avg_train_loss: 0.9908  avg_val_loss: 1.0408  time: 348s
Epoch 1 - Score: 0.6680
Epoch 1 - Save Best Score: 0.6680 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 1.0408 


Train:   0%|          | 1/865 [00:00<06:45,  2.13train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 45s) Loss: 0.8361 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 0.9077 Grad: 178425.0469  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 0.8983 Grad: 178518.1562  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.8767 Grad: 298065.8438  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.8578 Grad: 130087.1562  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8492 Grad: 227454.8438  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8326 Grad: 178069.9844  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8290 Grad: 230492.5312  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:56<04:03,  2.89train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 0.8257 Grad: 61824.5508  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:03<04:04,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 3s (remain 4m 1s) Loss: 0.8335 Grad: 89594.3750  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.80train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.8318 Grad: 104715.9375  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8403 Grad: 192493.4844  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.80train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8408 Grad: 51797.2500  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.80train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8392 Grad: 95619.0703  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8340 Grad: 94742.0391  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8317 Grad: 105435.9922  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8296 Grad: 106767.5781  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8230 Grad: 134633.9844  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.80train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.8194 Grad: 51430.9180  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 50s) Loss: 0.8148 Grad: 98425.1328  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 0.8164 Grad: 65323.1484  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 0.8165 Grad: 59792.1523  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:32,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.8191 Grad: 66128.1094  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 0.8247 Grad: 129344.7734  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 0.8260 Grad: 65807.1641  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:56<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 56s (remain 2m 8s) Loss: 0.8283 Grad: 216583.7188  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:02,  2.80train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.8282 Grad: 134593.0000  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.8267 Grad: 56127.3672  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.8259 Grad: 73329.0703  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.8244 Grad: 121501.1016  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.8227 Grad: 94010.7969  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.8218 Grad: 133722.8594  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8270 Grad: 96848.2031  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8314 Grad: 230948.6250  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.80train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 4s) Loss: 0.8322 Grad: 175088.8281  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.8315 Grad: 145394.9844  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.80train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.8298 Grad: 74959.2891  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.8298 Grad: 86311.0469  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.8287 Grad: 60213.9844  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 0.8273 Grad: 85473.6875  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 0.8255 Grad: 56109.0977  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:49<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 49s (remain 0m 15s) Loss: 0.8259 Grad: 80631.6719  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8263 Grad: 138047.1875  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.80train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8268 Grad: 89278.7266  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8268 Grad: 94724.2578  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:55,  7.76valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 36s) Loss: 1.0972 


Validation:   5%|▌         | 23/433 [00:02<00:38, 10.66valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 0.7088 


Validation:  10%|▉         | 43/433 [00:04<00:35, 11.02valid_batch/s]

EVAL: [40/433] Elapsed 0m 3s (remain 0m 37s) Loss: 0.7474 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.48valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 35s) Loss: 0.7849 


Validation:  19%|█▉        | 83/433 [00:07<00:33, 10.48valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 33s) Loss: 0.7694 


Validation:  24%|██▍       | 103/433 [00:09<00:31, 10.50valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.8095 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.64valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 29s) Loss: 0.8175 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.47valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 27s) Loss: 0.8287 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.46valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 25s) Loss: 0.8339 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.47valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.8389 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.66valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.8464 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.50valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.8458 


Validation:  56%|█████▌    | 243/433 [00:23<00:17, 10.60valid_batch/s]

EVAL: [240/433] Elapsed 0m 22s (remain 0m 18s) Loss: 0.8420 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.56valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.8465 


Validation:  65%|██████▌   | 283/433 [00:26<00:14, 10.57valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8419 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.52valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.8507 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.57valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.8477 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.48valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.8454 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.46valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.8443 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.50valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.8479 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.52valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.8466 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.47valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.8451 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.51valid_batch/s]
Epoch 2 - avg_train_loss: 0.8268  avg_val_loss: 0.8513  time: 347s
Epoch 2 - Score: 0.7724
Epoch 2 - Save Best Score: 0.7724 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.8513 


Score: 0.7724
Train:   0%|          | 1/865 [00:00<06:55,  2.08train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 6m 55s) Loss: 1.6055 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:01,  2.80train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.5438 Grad: 43011.7695  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.80train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.4358 Grad: 47659.1836  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 1.3551 Grad: 73486.0391  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.2983 Grad: 120452.3750  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.2629 Grad: 52098.6719  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.2453 Grad: 57626.6992  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.2210 Grad: 58847.4609  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 1.1971 Grad: 50016.6445  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:04,  2.79train_batch/s]

Epoch: [1][180/865] Elapsed 1m 4s (remain 4m 1s) Loss: 1.1813 Grad: 78193.2891  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.80train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.1738 Grad: 154132.1250  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.80train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.1595 Grad: 82258.2656  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.1404 Grad: 50106.5508  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.1381 Grad: 118537.4766  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.1271 Grad: 101850.4062  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.1113 Grad: 56003.1836  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.0958 Grad: 65038.0781  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.0861 Grad: 88383.5547  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.80train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 1.0749 Grad: 70643.2656  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 1.0712 Grad: 98325.6562  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 1.0609 Grad: 128948.2812  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:38,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 1.0565 Grad: 66051.6172  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 1.0557 Grad: 144957.5000  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 1.0517 Grad: 107552.6797  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 1.0503 Grad: 176041.3281  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 1.0479 Grad: 89966.3203  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:04<02:02,  2.80train_batch/s]

Epoch: [1][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 1.0410 Grad: 63941.6367  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:11<01:55,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 1.0347 Grad: 101857.6172  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.0306 Grad: 94012.2656  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.0251 Grad: 66313.6094  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.0211 Grad: 93253.2656  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.80train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.0178 Grad: 57016.7656  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0152 Grad: 37857.6055  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0128 Grad: 65971.0312  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.80train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 1.0099 Grad: 76648.4766  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.80train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 1.0073 Grad: 63686.9453  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.80train_batch/s]

Epoch: [1][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 1.0029 Grad: 85691.3516  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 1.0003 Grad: 83977.8750  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.80train_batch/s]

Epoch: [1][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.9951 Grad: 105942.2656  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 0.9930 Grad: 52395.9297  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 0.9894 Grad: 46079.9922  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.9862 Grad: 98981.1562  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.9855 Grad: 64606.0898  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [1][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.9847 Grad: 74089.6875  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.9843 Grad: 62263.9375  LR: 0.00002000  


Validation:   1%|          | 3/433 [00:00<00:56,  7.66valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 39s) Loss: 1.4839 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.50valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 0.8296 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.48valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.8724 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.48valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.8706 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.48valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.8684 


Validation:  24%|██▍       | 103/433 [00:09<00:30, 10.75valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 32s) Loss: 0.8733 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.49valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 30s) Loss: 0.8690 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.61valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 0.8606 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.48valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.8744 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.49valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.8716 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.48valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.8673 


Validation:  52%|█████▏    | 223/433 [00:21<00:20, 10.49valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.8704 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.52valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.8683 


Validation:  61%|██████    | 263/433 [00:25<00:15, 10.76valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.8724 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.56valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8720 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.50valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.8704 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.55valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.8710 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.76valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.8686 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.49valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.8694 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.48valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.8696 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.60valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.8695 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.72valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.8705 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.50valid_batch/s]
Epoch 1 - avg_train_loss: 0.9843  avg_val_loss: 0.8680  time: 347s
Epoch 1 - Score: 0.7715
Epoch 1 - Save Best Score: 0.7715 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.8680 


Train:   0%|          | 1/865 [00:00<07:07,  2.02train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 55s) Loss: 0.9077 Grad: 482629.2500  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:01,  2.80train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 0s) Loss: 0.8796 Grad: 116207.5000  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 0.8546 Grad: 142713.4219  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 0.8572 Grad: 167212.9844  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.8345 Grad: 263914.3438  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8159 Grad: 104482.1641  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8152 Grad: 123669.6328  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8019 Grad: 84730.2344  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:56<04:11,  2.80train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 0.7967 Grad: 140561.2969  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 3s (remain 4m 1s) Loss: 0.7943 Grad: 211771.5938  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.7995 Grad: 131825.7344  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8001 Grad: 240184.9219  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8024 Grad: 413471.3125  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.80train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8072 Grad: 262269.6875  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8039 Grad: 326981.5938  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8065 Grad: 211479.3750  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8064 Grad: 151465.0156  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8035 Grad: 127835.5078  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.8046 Grad: 192367.8125  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.8023 Grad: 249616.9062  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 0.8034 Grad: 130891.8672  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 0.8017 Grad: 206354.0781  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.8038 Grad: 201941.6406  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 0.8052 Grad: 159531.8438  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 0.8085 Grad: 255962.0469  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.80train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.8089 Grad: 281260.5312  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.8102 Grad: 165478.0781  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.8070 Grad: 255912.4375  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.8043 Grad: 168978.1094  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.8031 Grad: 156681.7344  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.8015 Grad: 212004.3281  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.80train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.8017 Grad: 90666.7500  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8025 Grad: 56182.3711  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8041 Grad: 82417.2344  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.8067 Grad: 123462.3984  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.8083 Grad: 68483.5078  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.8132 Grad: 58775.1484  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.8136 Grad: 50654.8516  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.80train_batch/s]

Epoch: [2][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.8139 Grad: 46424.8008  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 0.8145 Grad: 33176.3203  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.80train_batch/s]

Epoch: [2][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 0.8140 Grad: 53877.0312  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.8149 Grad: 30803.3809  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8172 Grad: 41229.5039  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8189 Grad: 31698.9609  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8197 Grad: 41591.9023  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:55,  7.72valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 37s) Loss: 1.4877 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.49valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 41s) Loss: 0.7991 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.47valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.8505 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.47valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.8717 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.47valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.8725 


Validation:  24%|██▍       | 103/433 [00:09<00:30, 10.73valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 32s) Loss: 0.8574 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.48valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 30s) Loss: 0.8588 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.60valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 28s) Loss: 0.8473 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.48valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.8657 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.50valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.8590 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.48valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.8562 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.50valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.8614 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.55valid_batch/s]

EVAL: [240/433] Elapsed 0m 23s (remain 0m 18s) Loss: 0.8584 


Validation:  61%|██████    | 263/433 [00:25<00:15, 10.75valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.8639 


Validation:  65%|██████▌   | 283/433 [00:27<00:14, 10.55valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8661 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.52valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.8670 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.56valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.8733 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.77valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.8679 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.50valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.8724 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.48valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.8727 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.58valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.8693 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.69valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.8694 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.50valid_batch/s]
Epoch 2 - avg_train_loss: 0.8197  avg_val_loss: 0.8669  time: 347s
Epoch 2 - Score: 0.7763
Epoch 2 - Save Best Score: 0.7763 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.8669 


Score: 0.7763
Train:   0%|          | 1/865 [00:00<06:49,  2.11train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 6m 49s) Loss: 2.3061 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:01,  2.80train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 0s) Loss: 1.6815 Grad: 64385.3672  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.80train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.5553 Grad: 32679.4707  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 1.4735 Grad: 46701.0000  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.4165 Grad: 49199.0977  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.3868 Grad: 53856.1562  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.80train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.3433 Grad: 77406.7734  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 15s) Loss: 1.3252 Grad: 54544.3789  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 1.2991 Grad: 89656.0703  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:03<04:05,  2.79train_batch/s]

Epoch: [1][180/865] Elapsed 1m 3s (remain 4m 1s) Loss: 1.2765 Grad: 189221.0156  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.2583 Grad: 67631.6562  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.80train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.2444 Grad: 216657.4844  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.2267 Grad: 127871.1875  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.80train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.2066 Grad: 71436.8672  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.1913 Grad: 83955.2188  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.1778 Grad: 61424.7969  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.1643 Grad: 64987.9883  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.1518 Grad: 68024.1250  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<02:59,  2.81train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 1.1377 Grad: 48236.7031  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 1.1344 Grad: 59289.4766  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 1.1242 Grad: 53502.3242  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:38,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 1.1150 Grad: 68125.3438  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 1.1065 Grad: 72286.0547  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 1.0996 Grad: 82853.3594  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 1.0895 Grad: 90990.5312  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 1.0823 Grad: 103506.1016  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 1.0752 Grad: 96865.3125  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 1.0690 Grad: 67515.5078  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:49,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.0593 Grad: 81407.9375  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.0551 Grad: 52698.1641  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.80train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.0490 Grad: 114113.7891  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.80train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.0434 Grad: 125587.6328  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0399 Grad: 85166.5391  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:12,  2.80train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0378 Grad: 91300.8906  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 4s) Loss: 1.0349 Grad: 94590.4453  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.80train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 1.0343 Grad: 66857.6250  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 1.0291 Grad: 75920.8906  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 1.0253 Grad: 48321.3047  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 1.0216 Grad: 114946.1172  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 1.0192 Grad: 113011.7891  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 1.0150 Grad: 60464.7070  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 1.0112 Grad: 101292.5234  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 1.0084 Grad: 75839.7578  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [1][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 1.0044 Grad: 99297.6875  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 1.0039 Grad: 131582.4219  LR: 0.00002000  


Validation:   1%|          | 3/433 [00:00<00:57,  7.51valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 44s) Loss: 0.9266 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.42valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 42s) Loss: 0.9070 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.46valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.9626 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.52valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.9994 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.58valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.9846 


Validation:  24%|██▍       | 103/433 [00:09<00:30, 10.82valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.9862 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.53valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 29s) Loss: 1.0048 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.52valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 27s) Loss: 1.0260 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.54valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 1.0254 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.47valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 1.0300 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.49valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 1.0294 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.84valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 1.0195 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.49valid_batch/s]

EVAL: [240/433] Elapsed 0m 22s (remain 0m 18s) Loss: 1.0218 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.59valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 1.0199 


Validation:  65%|██████▌   | 283/433 [00:26<00:14, 10.57valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 1.0244 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.47valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 1.0359 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.79valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 1.0321 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.49valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 1.0297 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.47valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 1.0223 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.52valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 1.0245 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.47valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 1.0209 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.47valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 1.0189 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.51valid_batch/s]
Epoch 1 - avg_train_loss: 1.0039  avg_val_loss: 1.0191  time: 347s
Epoch 1 - Score: 0.7023
Epoch 1 - Save Best Score: 0.7023 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 1.0191 


Train:   0%|          | 1/865 [00:00<07:08,  2.02train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 56s) Loss: 0.4764 Grad: 338953.3125  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<04:58,  2.83train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 0.8968 Grad: 173617.9375  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 0.9012 Grad: 185793.6250  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:48,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.8644 Grad: 203899.4062  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.80train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.8459 Grad: 99254.0859  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8417 Grad: 242815.8438  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8154 Grad: 157738.1406  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:18,  2.80train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 15s) Loss: 0.8038 Grad: 93997.5469  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 0.8075 Grad: 233784.5781  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 3s (remain 4m 1s) Loss: 0.8100 Grad: 157588.0938  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:58,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.8089 Grad: 154916.2656  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.80train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8167 Grad: 123450.6484  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.80train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8137 Grad: 139266.5938  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:35,  2.80train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8124 Grad: 193188.8281  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8068 Grad: 128175.2031  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:21,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8024 Grad: 132815.4688  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8006 Grad: 194226.9844  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8004 Grad: 291730.1875  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.7967 Grad: 121973.7266  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.7963 Grad: 116058.5156  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 0.7942 Grad: 243953.2969  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:38,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 0.7940 Grad: 233097.3438  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.7961 Grad: 260772.6719  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 0.7993 Grad: 192972.5312  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 0.7978 Grad: 312534.4375  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 56s (remain 2m 8s) Loss: 0.7981 Grad: 154851.8125  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.80train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.7983 Grad: 158549.0938  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:55,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.7978 Grad: 100405.8594  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.7986 Grad: 142023.0156  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.7969 Grad: 260558.0312  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.7949 Grad: 120844.9453  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.7930 Grad: 150890.3438  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.7931 Grad: 158728.1406  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.7928 Grad: 189968.9375  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.7937 Grad: 87754.3438  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.7932 Grad: 233151.5625  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.7930 Grad: 104061.2031  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.81train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.7907 Grad: 143885.2031  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.7901 Grad: 215087.7500  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.80train_batch/s]

Epoch: [2][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 0.7892 Grad: 179447.1562  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 0.7870 Grad: 166676.5938  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.7858 Grad: 305100.0625  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.7858 Grad: 364351.2812  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.7864 Grad: 191188.8281  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.7866 Grad: 122129.7812  LR: 0.00001998  


Validation:   1%|          | 3/433 [00:00<00:56,  7.57valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 42s) Loss: 1.0575 


Validation:   5%|▌         | 23/433 [00:02<00:39, 10.42valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 42s) Loss: 1.0085 


Validation:  10%|▉         | 43/433 [00:04<00:37, 10.46valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 38s) Loss: 0.9644 


Validation:  15%|█▍        | 63/433 [00:06<00:35, 10.52valid_batch/s]

EVAL: [60/433] Elapsed 0m 5s (remain 0m 36s) Loss: 0.9480 


Validation:  19%|█▉        | 83/433 [00:08<00:33, 10.58valid_batch/s]

EVAL: [80/433] Elapsed 0m 7s (remain 0m 34s) Loss: 0.9235 


Validation:  24%|██▍       | 103/433 [00:09<00:30, 10.81valid_batch/s]

EVAL: [100/433] Elapsed 0m 9s (remain 0m 31s) Loss: 0.9257 


Validation:  28%|██▊       | 123/433 [00:11<00:29, 10.52valid_batch/s]

EVAL: [120/433] Elapsed 0m 11s (remain 0m 29s) Loss: 0.9366 


Validation:  33%|███▎      | 143/433 [00:13<00:27, 10.51valid_batch/s]

EVAL: [140/433] Elapsed 0m 13s (remain 0m 27s) Loss: 0.9467 


Validation:  38%|███▊      | 163/433 [00:15<00:25, 10.53valid_batch/s]

EVAL: [160/433] Elapsed 0m 15s (remain 0m 26s) Loss: 0.9483 


Validation:  42%|████▏     | 183/433 [00:17<00:23, 10.47valid_batch/s]

EVAL: [180/433] Elapsed 0m 17s (remain 0m 24s) Loss: 0.9457 


Validation:  47%|████▋     | 203/433 [00:19<00:21, 10.47valid_batch/s]

EVAL: [200/433] Elapsed 0m 19s (remain 0m 22s) Loss: 0.9402 


Validation:  52%|█████▏    | 223/433 [00:21<00:19, 10.82valid_batch/s]

EVAL: [220/433] Elapsed 0m 21s (remain 0m 20s) Loss: 0.9359 


Validation:  56%|█████▌    | 243/433 [00:23<00:18, 10.48valid_batch/s]

EVAL: [240/433] Elapsed 0m 22s (remain 0m 18s) Loss: 0.9420 


Validation:  61%|██████    | 263/433 [00:25<00:16, 10.60valid_batch/s]

EVAL: [260/433] Elapsed 0m 24s (remain 0m 16s) Loss: 0.9414 


Validation:  65%|██████▌   | 283/433 [00:26<00:14, 10.57valid_batch/s]

EVAL: [280/433] Elapsed 0m 26s (remain 0m 14s) Loss: 0.9425 


Validation:  70%|██████▉   | 303/433 [00:28<00:12, 10.47valid_batch/s]

EVAL: [300/433] Elapsed 0m 28s (remain 0m 12s) Loss: 0.9480 


Validation:  75%|███████▍  | 323/433 [00:30<00:10, 10.79valid_batch/s]

EVAL: [320/433] Elapsed 0m 30s (remain 0m 10s) Loss: 0.9443 


Validation:  79%|███████▉  | 343/433 [00:32<00:08, 10.50valid_batch/s]

EVAL: [340/433] Elapsed 0m 32s (remain 0m 8s) Loss: 0.9456 


Validation:  84%|████████▍ | 363/433 [00:34<00:06, 10.47valid_batch/s]

EVAL: [360/433] Elapsed 0m 34s (remain 0m 6s) Loss: 0.9380 


Validation:  88%|████████▊ | 383/433 [00:36<00:04, 10.54valid_batch/s]

EVAL: [380/433] Elapsed 0m 36s (remain 0m 4s) Loss: 0.9391 


Validation:  93%|█████████▎| 403/433 [00:38<00:02, 10.47valid_batch/s]

EVAL: [400/433] Elapsed 0m 38s (remain 0m 3s) Loss: 0.9353 


Validation:  98%|█████████▊| 423/433 [00:40<00:00, 10.47valid_batch/s]

EVAL: [420/433] Elapsed 0m 40s (remain 0m 1s) Loss: 0.9332 


Validation: 100%|██████████| 433/433 [00:41<00:00, 10.50valid_batch/s]
Epoch 2 - avg_train_loss: 0.7866  avg_val_loss: 0.9331  time: 347s
Epoch 2 - Score: 0.7769
Epoch 2 - Save Best Score: 0.7769 Model


EVAL: [432/433] Elapsed 0m 41s (remain 0m 0s) Loss: 0.9331 


Score: 0.7769
Score: 0.7617
