# 🏋️ Model Training

## ⚙️ Setup 

### 📚 Importing Libraries

Importing from packages

In [1]:
import os
import gc
import pandas as pd
import time
import numpy as np
import torch
from torch.optim import AdamW
from torch import nn
from transformers import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("../")

Importing user defined packages

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.model.deberta import CustomModel
from lib.model.epoch_functions import train_epoch, valid_epoch
from lib.model.utils import get_score
from lib.utils.utils import get_logger, seed_everything
from lib.data import read_data_loader_from_disk

In [4]:
seed_everything()

## 📖 Definitions

### 🌎 Global Variables

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
LOGGER = get_logger(Paths.MODEL_OUTPUT_PATH)

### 🛠️ Functions

In [7]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in model.named_parameters() if "model" not in n],
            "lr": decoder_lr,
            "weight_decay": 0.0,
        },
    ]

    return optimizer_parameters

In [8]:
def get_scheduler(optimizer, num_train_steps):
    if config.scheduler == "linear":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=config.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
        )

    if config.scheduler == "cosine":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=config.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
            num_cycles=config.NUM_CYCLES,
        )

In [9]:
def get_model_optimizer_and_scheduler(train_loader):
    model = CustomModel(config, config_path=None, pretrained=True)
    torch.save(model.config, Paths.MODEL_OUTPUT_PATH + "/config.pth")
    model.to(device)

    optimizer = AdamW(
        get_optimizer_params(
            model,
            encoder_lr=config.encoder_lr,
            decoder_lr=config.decoder_lr,
            weight_decay=config.weight_decay,
        ),
        lr=config.encoder_lr,
        eps=config.eps,
        betas=config.betas,
    )

    num_train_steps = int(len(train_loader) / config.batch_size_train * config.epochs)
    scheduler = get_scheduler(optimizer, num_train_steps)
    return model, optimizer, scheduler

In [10]:
def train_loop(fold):
    LOGGER.info(f"========== Fold: {fold} training ==========")

    # ======== DATA LOADER ==========
    train_loader, valid_loader = read_data_loader_from_disk(fold)
    valid_fold = pd.read_csv(os.path.join(Paths.DATA_LOADER_PATH, f"valid_{fold}.csv"))
    valid_labels = valid_fold["score"].values

    # ======== MODEL ==========
    model, optimizer, scheduler = get_model_optimizer_and_scheduler(train_loader)

    # ======= LOSS ==========
    # criterion = RMSELoss(reduction="mean") # nn.SmoothL1Loss(reduction='mean')
    criterion = nn.CrossEntropyLoss()
    softmax = nn.Softmax(dim=1)

    best_score = -np.inf
    # ====== ITERATE EPOCHS ========
    for epoch in range(config.epochs):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_loss = train_epoch(
            train_loader, model, criterion, optimizer, epoch, scheduler, device
        )

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(
            valid_loader, model, criterion, device
        )
        predictions = prediction_dict["predictions"]
        _, predictions = torch.max(softmax(torch.tensor(predictions)), dim=1)

        # ======= SCORING ==========
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "predictions": predictions},
                Paths.MODEL_OUTPUT_PATH
                + f"/{config.model.replace('/', '_')}_fold_{fold}_best.pth",
            )

    predictions = torch.load(
        Paths.MODEL_OUTPUT_PATH
        + f"/{config.model.replace('/', '_')}_fold_{fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    valid_fold["pred_score"] = predictions

    del model, optimizer, scheduler, criterion, softmax
    torch.cuda.empty_cache()
    gc.collect()

    return valid_fold

In [11]:
def get_result(oof_df):
    labels = oof_df["score"].values
    preds = oof_df["pred_score"].values
    score = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}')

## 🏁 Start Training

In [12]:
if config.train:
    oof_df = pd.DataFrame()

    for fold in range(config.n_folds):
        if fold in config.train_folds:
            _oof_df = train_loop(fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== Fold: {fold} result ==========")
            get_result(_oof_df)

    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    oof_df.to_csv(Paths.MODEL_OUTPUT_PATH + "/oof_df.csv", index=False)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/865 [00:01<15:22,  1.07s/train_batch]

Epoch: [1][0/865] Elapsed 0m 1s (remain 15m 22s) Loss: 1.9761 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:08<05:09,  2.73train_batch/s]

Epoch: [1][20/865] Elapsed 0m 8s (remain 5m 41s) Loss: 1.6795 Grad: 43470.1875  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:15<05:05,  2.70train_batch/s]

Epoch: [1][40/865] Elapsed 0m 15s (remain 5m 18s) Loss: 1.5454 Grad: 39716.7734  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:23<05:00,  2.68train_batch/s]

Epoch: [1][60/865] Elapsed 0m 23s (remain 5m 5s) Loss: 1.4579 Grad: 47418.4297  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:30<04:56,  2.65train_batch/s]

Epoch: [1][80/865] Elapsed 0m 30s (remain 4m 56s) Loss: 1.3800 Grad: 308037.5312  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:38<04:47,  2.66train_batch/s]

Epoch: [1][100/865] Elapsed 0m 38s (remain 4m 47s) Loss: 1.3310 Grad: 365267.6875  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:45<04:24,  2.81train_batch/s]

Epoch: [1][120/865] Elapsed 0m 45s (remain 4m 37s) Loss: 1.3153 Grad: 69581.8047  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:52<04:17,  2.81train_batch/s]

Epoch: [1][140/865] Elapsed 0m 52s (remain 4m 27s) Loss: 1.2924 Grad: 75013.5625  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:59<04:10,  2.81train_batch/s]

Epoch: [1][160/865] Elapsed 0m 59s (remain 4m 18s) Loss: 1.2710 Grad: 88212.3203  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:06<04:03,  2.81train_batch/s]

Epoch: [1][180/865] Elapsed 1m 6s (remain 4m 10s) Loss: 1.2530 Grad: 218084.5312  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:13<03:56,  2.81train_batch/s]

Epoch: [1][200/865] Elapsed 1m 13s (remain 4m 1s) Loss: 1.2302 Grad: 229881.7344  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:20<03:49,  2.80train_batch/s]

Epoch: [1][220/865] Elapsed 1m 20s (remain 3m 53s) Loss: 1.2227 Grad: 65725.0625  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:27<03:42,  2.80train_batch/s]

Epoch: [1][240/865] Elapsed 1m 27s (remain 3m 46s) Loss: 1.2224 Grad: 39921.3438  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:34<03:35,  2.80train_batch/s]

Epoch: [1][260/865] Elapsed 1m 34s (remain 3m 38s) Loss: 1.2193 Grad: 28989.6211  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:41<03:28,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 41s (remain 3m 30s) Loss: 1.2090 Grad: 64912.7578  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:48<03:21,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 48s (remain 3m 23s) Loss: 1.1918 Grad: 129267.5547  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:55<03:14,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 55s (remain 3m 15s) Loss: 1.1821 Grad: 122521.0859  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:02<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 2s (remain 3m 8s) Loss: 1.1718 Grad: 61552.5664  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:09<03:00,  2.79train_batch/s]

Epoch: [1][360/865] Elapsed 2m 9s (remain 3m 0s) Loss: 1.1602 Grad: 1518132.8750  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:16<02:52,  2.80train_batch/s]

Epoch: [1][380/865] Elapsed 2m 16s (remain 2m 53s) Loss: 1.1490 Grad: 94639.2891  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:23<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 23s (remain 2m 46s) Loss: 1.1421 Grad: 54956.6406  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:30<02:38,  2.80train_batch/s]

Epoch: [1][420/865] Elapsed 2m 30s (remain 2m 38s) Loss: 1.1303 Grad: 93972.7422  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:37<02:31,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 37s (remain 2m 31s) Loss: 1.1252 Grad: 149612.7812  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:44<02:24,  2.80train_batch/s]

Epoch: [1][460/865] Elapsed 2m 44s (remain 2m 24s) Loss: 1.1203 Grad: 115889.5547  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:51<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 51s (remain 2m 17s) Loss: 1.1164 Grad: 55667.0195  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:59<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 58s (remain 2m 10s) Loss: 1.1129 Grad: 21529.1562  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:06<02:03,  2.80train_batch/s]

Epoch: [1][520/865] Elapsed 3m 6s (remain 2m 2s) Loss: 1.1090 Grad: 21699.4570  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:13<01:56,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 13s (remain 1m 55s) Loss: 1.1060 Grad: 45328.5117  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:20<01:48,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 20s (remain 1m 48s) Loss: 1.0997 Grad: 20466.0449  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:27<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 27s (remain 1m 41s) Loss: 1.0958 Grad: 33721.0352  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:34<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 34s (remain 1m 34s) Loss: 1.0921 Grad: 11370.9932  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:41<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 41s (remain 1m 26s) Loss: 1.0907 Grad: 14007.5010  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:48<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 48s (remain 1m 19s) Loss: 1.0890 Grad: 13571.9512  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:55<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 55s (remain 1m 12s) Loss: 1.0936 Grad: 13624.5762  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:02<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 2s (remain 1m 5s) Loss: 1.0894 Grad: 12791.5605  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:09<00:58,  2.79train_batch/s]

Epoch: [1][700/865] Elapsed 4m 9s (remain 0m 58s) Loss: 1.0840 Grad: 11944.3867  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:16<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 16s (remain 0m 51s) Loss: 1.0794 Grad: 14814.8057  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:23<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 23s (remain 0m 44s) Loss: 1.0740 Grad: 5604.5889  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:30<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 30s (remain 0m 37s) Loss: 1.0675 Grad: 9711.2686  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:37<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 37s (remain 0m 29s) Loss: 1.0619 Grad: 19311.0195  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:45<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 44s (remain 0m 22s) Loss: 1.0573 Grad: 7916.5898  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:52<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 52s (remain 0m 15s) Loss: 1.0540 Grad: 5736.4111  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:59<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 59s (remain 0m 8s) Loss: 1.0520 Grad: 5859.7959  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:06<00:01,  2.79train_batch/s]

Epoch: [1][860/865] Elapsed 5m 6s (remain 0m 1s) Loss: 1.0492 Grad: 5388.9302  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:07<00:00,  2.81train_batch/s]


Epoch: [1][864/865] Elapsed 5m 7s (remain 0m 0s) Loss: 1.0487 Grad: 11196.6963  LR: 0.00002000  


Validation:   1%|          | 2/217 [00:00<00:50,  4.22valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 7s) Loss: 0.6314 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.40valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.9628 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.40valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.9761 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.39valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.9702 


Validation:  38%|███▊      | 82/217 [00:15<00:25,  5.40valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.9778 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.40valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.9643 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.39valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.9676 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.41valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.9577 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.40valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.9607 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.40valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.9626 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.41valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.9663 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.39valid_batch/s]
Epoch 1 - avg_train_loss: 1.0487  avg_val_loss: 0.9674  time: 348s
Epoch 1 - Score: 0.7353
Epoch 1 - Save Best Score: 0.7353 Model


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.9674 


Train:   0%|          | 1/865 [00:00<06:35,  2.19train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 35s) Loss: 1.3495 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 0s) Loss: 1.3128 Grad: 128285.6875  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.2243 Grad: 46548.6211  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:48,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 1.1499 Grad: 77122.6406  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:41,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.0983 Grad: 71806.7734  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.0648 Grad: 68658.1797  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:29,  2.76train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.0376 Grad: 50328.5195  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.0238 Grad: 58512.0312  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:57<04:10,  2.81train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 1.0117 Grad: 30153.7168  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 0.9965 Grad: 12818.2705  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.9832 Grad: 16896.0312  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:51,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.9729 Grad: 26916.6777  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.9626 Grad: 21848.0840  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.9495 Grad: 8293.0264  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.9444 Grad: 8262.7461  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:21,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.9359 Grad: 7362.6060  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.9246 Grad: 10468.7197  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.9162 Grad: 6291.0586  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.9071 Grad: 13410.7490  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.9052 Grad: 10250.0576  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 0.9009 Grad: 11607.9775  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 0.8940 Grad: 13463.3848  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.8909 Grad: 12940.8857  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.80train_batch/s]

Epoch: [2][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 0.8899 Grad: 21519.1152  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 0.8875 Grad: 15294.3359  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.8849 Grad: 13317.6514  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.8832 Grad: 14989.7217  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.8793 Grad: 8210.9590  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.8752 Grad: 9653.8096  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.8707 Grad: 6143.6323  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.8672 Grad: 8511.0488  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.8663 Grad: 8385.6982  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8637 Grad: 6851.7520  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8642 Grad: 11184.7158  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.8619 Grad: 13596.1084  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.8611 Grad: 11710.1299  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.8576 Grad: 7583.4814  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.8575 Grad: 7775.0435  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.8561 Grad: 8110.1182  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 0.8523 Grad: 6332.2109  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 0.8518 Grad: 13093.7920  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.8512 Grad: 7498.4761  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.78train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8490 Grad: 12842.1230  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8493 Grad: 9276.6748  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8491 Grad: 7492.9131  LR: 0.00001998  


Validation:   1%|          | 2/217 [00:00<00:49,  4.32valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 4s) Loss: 0.6619 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.41valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 1.0386 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.40valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 1.0360 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.41valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 1.0488 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.41valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 1.0541 


Validation:  47%|████▋     | 102/217 [00:18<00:21,  5.40valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 1.0546 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.41valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 1.0551 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.40valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 1.0495 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.40valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 1.0366 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.41valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 1.0275 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.42valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 1.0197 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.40valid_batch/s]
Epoch 2 - avg_train_loss: 0.8491  avg_val_loss: 1.0230  time: 346s
Epoch 2 - Score: 0.7348


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 1.0230 


Score: 0.7353
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/865 [00:00<07:03,  2.04train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 7m 3s) Loss: 1.9325 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.6264 Grad: 53970.0977  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.79train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 51s) Loss: 1.5344 Grad: 33403.3125  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 1.4843 Grad: 48568.6914  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.4245 Grad: 110175.0156  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.3948 Grad: 48176.3008  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:21,  2.84train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 22s) Loss: 1.3707 Grad: 315431.3750  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:17,  2.81train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 15s) Loss: 1.3548 Grad: 46798.3086  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:11,  2.80train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 1.3352 Grad: 76789.2734  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:03<04:03,  2.81train_batch/s]

Epoch: [1][180/865] Elapsed 1m 3s (remain 4m 1s) Loss: 1.3274 Grad: 59354.9688  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:10<03:57,  2.79train_batch/s]

Epoch: [1][200/865] Elapsed 1m 10s (remain 3m 54s) Loss: 1.3175 Grad: 1296901.0000  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:17<03:50,  2.79train_batch/s]

Epoch: [1][220/865] Elapsed 1m 17s (remain 3m 47s) Loss: 1.2952 Grad: 51961.3438  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:24<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 24s (remain 3m 40s) Loss: 1.2769 Grad: 31233.1426  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.2654 Grad: 52480.1719  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.78train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.2477 Grad: 26666.3672  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 18s) Loss: 1.2259 Grad: 64115.5898  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 11s) Loss: 1.2077 Grad: 49129.9336  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 4s) Loss: 1.1949 Grad: 48400.6289  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 57s) Loss: 1.1814 Grad: 24394.1504  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 50s) Loss: 1.1680 Grad: 46559.4922  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 1.1544 Grad: 50163.1367  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 1.1476 Grad: 50242.5586  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 1.1399 Grad: 62878.6406  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 1.1316 Grad: 38978.8398  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 1.1275 Grad: 37508.9727  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:56<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 56s (remain 2m 8s) Loss: 1.1174 Grad: 27329.4141  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:03<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 3s (remain 2m 1s) Loss: 1.1094 Grad: 20704.5254  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 10s (remain 1m 54s) Loss: 1.1035 Grad: 73766.3203  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.0966 Grad: 52261.0352  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.0892 Grad: 38368.0156  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.0814 Grad: 33707.5352  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.0769 Grad: 85455.5625  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0710 Grad: 115272.6172  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0732 Grad: 487484.6875  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 4s) Loss: 1.0736 Grad: 28114.8633  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 1.0670 Grad: 98318.1016  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 1.0651 Grad: 39174.8789  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 1.0595 Grad: 16136.4414  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 1.0549 Grad: 132815.4688  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 1.0488 Grad: 18345.8770  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 1.0451 Grad: 28406.2871  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:49<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 49s (remain 0m 15s) Loss: 1.0409 Grad: 13231.5996  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 56s (remain 0m 8s) Loss: 1.0368 Grad: 17737.0059  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [1][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 1.0360 Grad: 17846.4414  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 1.0352 Grad: 15815.4004  LR: 0.00002000  


Validation:   1%|          | 2/217 [00:00<00:52,  4.06valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 12s) Loss: 0.9725 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.39valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.7565 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.40valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.7623 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.40valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.7735 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.42valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.7914 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.40valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.7996 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.41valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.7965 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.40valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8040 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.40valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.8100 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.40valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.8185 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.40valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.8180 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.39valid_batch/s]
Epoch 1 - avg_train_loss: 1.0352  avg_val_loss: 0.8228  time: 346s
Epoch 1 - Score: 0.7856
Epoch 1 - Save Best Score: 0.7856 Model


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.8228 


Train:   0%|          | 1/865 [00:00<06:59,  2.06train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 0s) Loss: 1.0676 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 2s) Loss: 0.9832 Grad: 228734.1562  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 0.9535 Grad: 58812.5430  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:48,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.9086 Grad: 85671.8359  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.8849 Grad: 122489.4141  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.80train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8707 Grad: 97246.9531  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8572 Grad: 63969.6602  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8586 Grad: 71441.3828  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:56<04:11,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 0.8512 Grad: 79125.7500  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:04,  2.80train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 1s) Loss: 0.8464 Grad: 69168.4609  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.8403 Grad: 63041.6055  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8379 Grad: 133723.7812  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8296 Grad: 48071.2656  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8300 Grad: 72729.1484  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8266 Grad: 152477.0781  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8215 Grad: 73030.2031  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8169 Grad: 108017.2266  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.80train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8101 Grad: 57097.5195  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.8048 Grad: 68231.6328  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.8028 Grad: 80278.0547  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 0.7999 Grad: 101589.1875  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 0.8037 Grad: 113111.9922  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.8049 Grad: 76266.3438  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 0.8030 Grad: 138707.0625  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 0.8050 Grad: 58890.6523  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.80train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.8072 Grad: 43602.4844  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.8040 Grad: 26731.8594  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:55,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.8027 Grad: 39032.7031  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.8037 Grad: 35374.3086  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.80train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.8043 Grad: 35457.2227  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.80train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.7996 Grad: 77012.8750  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.8019 Grad: 95421.1797  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8028 Grad: 25919.9238  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8043 Grad: 46853.3711  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.8047 Grad: 77774.9531  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.8052 Grad: 54391.4258  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.8052 Grad: 22659.1191  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.8041 Grad: 30610.2500  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.8013 Grad: 21712.2852  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 0.8007 Grad: 21985.2148  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 0.8009 Grad: 24488.9863  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.8035 Grad: 23449.4277  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8063 Grad: 43676.5820  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8095 Grad: 24965.1504  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8093 Grad: 29500.2832  LR: 0.00001998  


Validation:   1%|          | 2/217 [00:00<00:53,  4.02valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 13s) Loss: 0.7584 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.39valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.7293 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.41valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.7477 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.41valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.7589 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.41valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.7737 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.40valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.7819 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.41valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.7780 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.40valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.7911 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.39valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.8017 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.40valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.8088 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.39valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.8068 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.39valid_batch/s]
Epoch 2 - avg_train_loss: 0.8093  avg_val_loss: 0.8129  time: 346s
Epoch 2 - Score: 0.8072
Epoch 2 - Save Best Score: 0.8072 Model


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.8129 


Score: 0.8072
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/865 [00:00<06:56,  2.07train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 6m 57s) Loss: 1.8244 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.5910 Grad: 48185.8281  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.80train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.5154 Grad: 41902.5352  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:49,  2.78train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 1.4354 Grad: 94396.1016  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.3780 Grad: 68608.4219  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:34,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.3424 Grad: 63138.3438  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.3093 Grad: 72258.9141  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.2917 Grad: 72826.8828  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 1.2663 Grad: 115134.3438  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:04,  2.79train_batch/s]

Epoch: [1][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 1.2474 Grad: 60155.3633  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.2347 Grad: 31921.3320  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.2274 Grad: 19949.8027  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.2200 Grad: 18073.5742  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.2007 Grad: 23004.9004  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.1857 Grad: 10513.7158  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.1661 Grad: 32749.0996  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:15,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.1507 Grad: 19618.5488  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.1389 Grad: 29488.8359  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 1.1317 Grad: 23643.6367  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 1.1220 Grad: 23635.8340  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 1.1165 Grad: 15200.5840  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 1.1155 Grad: 23766.6543  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 1.1161 Grad: 21804.9766  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 1.1081 Grad: 32312.3633  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 1.1008 Grad: 21179.6172  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 1.0933 Grad: 17000.0742  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 1.0861 Grad: 28585.1348  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 1.0767 Grad: 10867.5361  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:49,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.0693 Grad: 13330.8223  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.0619 Grad: 10683.8223  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.0557 Grad: 15600.2383  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.0536 Grad: 17741.3379  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.80train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0496 Grad: 13503.3760  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0453 Grad: 29047.3594  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 1.0442 Grad: 35757.2422  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 1.0391 Grad: 20915.6250  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.80train_batch/s]

Epoch: [1][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 1.0334 Grad: 15532.6621  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 1.0308 Grad: 15698.9941  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 1.0255 Grad: 23974.2852  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 1.0206 Grad: 12439.4727  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.80train_batch/s]

Epoch: [1][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 1.0170 Grad: 13716.1152  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 1.0124 Grad: 31521.9375  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 1.0108 Grad: 18220.7402  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.78train_batch/s]

Epoch: [1][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 1.0082 Grad: 12102.8047  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 1.0082 Grad: 19870.6152  LR: 0.00002000  


Validation:   1%|          | 2/217 [00:00<00:54,  3.97valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 15s) Loss: 0.9087 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.40valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.7482 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.40valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.8045 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.41valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.8276 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.40valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.8528 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.40valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.8711 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.42valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.8718 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.41valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8705 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.41valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.8696 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.41valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.8704 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.41valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.8683 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.40valid_batch/s]
Epoch 1 - avg_train_loss: 1.0082  avg_val_loss: 0.8758  time: 346s
Epoch 1 - Score: 0.7468
Epoch 1 - Save Best Score: 0.7468 Model


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.8758 


Train:   0%|          | 1/865 [00:00<07:01,  2.05train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 1s) Loss: 1.0401 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 2s) Loss: 0.8995 Grad: 117731.6719  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 53s) Loss: 0.9036 Grad: 135377.8438  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.8627 Grad: 99812.7969  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 38s) Loss: 0.8324 Grad: 135572.8750  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.8119 Grad: 144990.4844  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.8253 Grad: 154720.7188  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8291 Grad: 107450.6562  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:57<04:12,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 57s (remain 4m 9s) Loss: 0.8271 Grad: 110048.6953  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 2s) Loss: 0.8141 Grad: 163688.3281  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 55s) Loss: 0.8052 Grad: 184788.3750  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8015 Grad: 242878.4062  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.7964 Grad: 166969.3438  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.7914 Grad: 145441.8906  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.7849 Grad: 198371.1406  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.7858 Grad: 199275.1562  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.7834 Grad: 155746.4688  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.7819 Grad: 136433.8750  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.7790 Grad: 130763.0391  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.7768 Grad: 205020.8438  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 0.7773 Grad: 136699.1562  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 37s) Loss: 0.7769 Grad: 179116.8438  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:36<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 36s (remain 2m 30s) Loss: 0.7762 Grad: 186854.2188  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:43<02:23,  2.81train_batch/s]

Epoch: [2][460/865] Elapsed 2m 43s (remain 2m 22s) Loss: 0.7754 Grad: 228966.8125  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 0.7766 Grad: 246533.6094  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.7774 Grad: 183929.2188  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.7761 Grad: 189974.0625  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.7745 Grad: 178460.3438  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.7720 Grad: 134419.2344  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.7703 Grad: 181816.4375  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.7697 Grad: 128269.4062  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.7682 Grad: 143892.7188  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.7681 Grad: 176024.6250  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.7690 Grad: 61840.2578  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.7679 Grad: 158018.4219  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 58s) Loss: 0.7678 Grad: 171544.9062  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:15<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 15s (remain 0m 50s) Loss: 0.7679 Grad: 145329.6562  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:22<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 22s (remain 0m 43s) Loss: 0.7670 Grad: 212876.5469  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:29<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 29s (remain 0m 36s) Loss: 0.7653 Grad: 168896.3438  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 0.7652 Grad: 119754.9141  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 0.7642 Grad: 270327.9062  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.7637 Grad: 218574.7188  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.7622 Grad: 273298.3125  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.79train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.7610 Grad: 187147.8125  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.7603 Grad: 156694.0938  LR: 0.00001998  


Validation:   1%|          | 2/217 [00:00<00:53,  4.02valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 13s) Loss: 0.9741 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.41valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.7763 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.40valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.7808 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.41valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.7877 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.40valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.8080 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.40valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.8132 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.41valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.8102 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.41valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8008 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.41valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.7987 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.40valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.7966 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.41valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.7934 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.40valid_batch/s]
Epoch 2 - avg_train_loss: 0.7603  avg_val_loss: 0.8019  time: 346s
Epoch 2 - Score: 0.8097
Epoch 2 - Save Best Score: 0.8097 Model


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.8019 


Score: 0.8097
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/865 [00:00<07:01,  2.05train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 7m 1s) Loss: 2.0673 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:02,  2.79train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.5951 Grad: 75527.4062  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:53,  2.80train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.4666 Grad: 53347.1602  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:45,  2.82train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 1.3990 Grad: 20449.4121  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:41,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.3431 Grad: 47256.6719  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.3045 Grad: 34263.8281  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.2840 Grad: 32262.2051  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.2541 Grad: 32365.0410  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 1.2366 Grad: 95376.3281  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [1][180/865] Elapsed 1m 4s (remain 4m 1s) Loss: 1.2168 Grad: 139264.1406  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.80train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.2066 Grad: 1654972.3750  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.2010 Grad: 25943.9277  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.1872 Grad: 46439.8945  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.1746 Grad: 39083.4453  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.1649 Grad: 12379.3926  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:21,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.1612 Grad: 19702.7344  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.1477 Grad: 23059.3379  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.1341 Grad: 27285.1797  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 1.1243 Grad: 21555.6016  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 1.1182 Grad: 53126.9453  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 1.1132 Grad: 20286.0391  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:38,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 1.1063 Grad: 345167.8438  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 1.1000 Grad: 55916.9336  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 1.1005 Grad: 14948.2480  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.80train_batch/s]

Epoch: [1][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 1.0946 Grad: 521317.3438  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 1.0865 Grad: 19391.6953  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.80train_batch/s]

Epoch: [1][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 1.0804 Grad: 82308.2500  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 1.0762 Grad: 18897.9668  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.0688 Grad: 16676.5430  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.0670 Grad: 14481.2129  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.0629 Grad: 8504.6348  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.0631 Grad: 8963.3047  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0598 Grad: 5605.7041  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0594 Grad: 7956.5625  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 1.0570 Grad: 4865.7207  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 1.0559 Grad: 4499.7227  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [1][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 1.0570 Grad: 5392.0005  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [1][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 1.0527 Grad: 3044.1206  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 1.0486 Grad: 14812.4189  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 1.0432 Grad: 4730.8618  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [1][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 1.0400 Grad: 3282.2888  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 1.0392 Grad: 4701.9155  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [1][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 1.0377 Grad: 11548.1113  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.80train_batch/s]

Epoch: [1][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 1.0355 Grad: 4314.1973  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 1.0349 Grad: 5021.1855  LR: 0.00002000  


Validation:   1%|          | 2/217 [00:00<00:54,  3.94valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 16s) Loss: 1.0439 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.40valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.9023 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.41valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.9116 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.41valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.9107 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.41valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.9132 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.40valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.9024 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.41valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.9004 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.40valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8955 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.42valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.8983 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.40valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.9029 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.41valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.9004 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.39valid_batch/s]
Epoch 1 - avg_train_loss: 1.0349  avg_val_loss: 0.9080  time: 346s
Epoch 1 - Score: 0.7876
Epoch 1 - Save Best Score: 0.7876 Model


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.9080 


Train:   0%|          | 1/865 [00:00<07:28,  1.93train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 15s) Loss: 0.8487 Grad: 524389.0625  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<04:59,  2.81train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.1176 Grad: 80163.1719  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.0504 Grad: 40754.9375  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.80train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 0.9899 Grad: 44465.4219  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.9380 Grad: 44138.6367  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.9241 Grad: 38268.1484  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.9228 Grad: 46242.1797  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.9047 Grad: 101738.6484  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 0.8954 Grad: 107487.6484  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:04,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 1s) Loss: 0.8824 Grad: 21910.0137  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.80train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.8723 Grad: 39094.4141  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.8621 Grad: 42052.1172  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.8592 Grad: 40343.0000  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.8566 Grad: 299646.9688  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:28,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.8500 Grad: 23088.4961  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:21,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.8448 Grad: 23576.4902  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.8443 Grad: 694569.8125  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.8414 Grad: 25831.6328  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.8405 Grad: 18572.4023  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.8348 Grad: 314925.1875  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 0.8353 Grad: 50177.8945  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:38,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 0.8372 Grad: 201806.2344  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.8347 Grad: 14758.0566  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.80train_batch/s]

Epoch: [2][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 0.8333 Grad: 30205.0371  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 0.8306 Grad: 15555.7314  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.8264 Grad: 31848.7793  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.8290 Grad: 26962.1738  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.8288 Grad: 23417.4434  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.8271 Grad: 21593.8984  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.80train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.8261 Grad: 25136.0762  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.8272 Grad: 24206.8320  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.8252 Grad: 16537.4199  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.8255 Grad: 31328.6348  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.8265 Grad: 15335.0176  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.80train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.8265 Grad: 13914.3174  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.8276 Grad: 18662.0195  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.8242 Grad: 17255.2324  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.8241 Grad: 20558.6445  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.8228 Grad: 22055.4531  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [2][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 0.8230 Grad: 25662.9531  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 0.8231 Grad: 20488.5918  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.8236 Grad: 24717.2969  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.8232 Grad: 16888.7598  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.80train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.8223 Grad: 20498.8047  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.8230 Grad: 12257.2090  LR: 0.00001998  


Validation:   1%|          | 2/217 [00:00<00:53,  4.02valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 13s) Loss: 1.4417 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.41valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.8547 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.41valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.8392 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.41valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.8395 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.41valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.8312 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.41valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.8254 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.40valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.8339 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.40valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.8363 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.41valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.8415 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.42valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.8414 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.40valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.8432 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.40valid_batch/s]
Epoch 2 - avg_train_loss: 0.8230  avg_val_loss: 0.8463  time: 346s
Epoch 2 - Score: 0.7837


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.8463 


Score: 0.7876
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Train:   0%|          | 1/865 [00:00<07:04,  2.04train_batch/s]

Epoch: [1][0/865] Elapsed 0m 0s (remain 7m 4s) Loss: 2.5247 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:07<05:01,  2.80train_batch/s]

Epoch: [1][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 1.7310 Grad: 48490.3594  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:14<04:54,  2.79train_batch/s]

Epoch: [1][40/865] Elapsed 0m 14s (remain 4m 52s) Loss: 1.6058 Grad: 48519.6836  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [1][60/865] Elapsed 0m 21s (remain 4m 44s) Loss: 1.5356 Grad: 305154.6562  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [1][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 1.4844 Grad: 83763.3750  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [1][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 1.4596 Grad: 570538.8750  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [1][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 1.4304 Grad: 31577.5059  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [1][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 1.4001 Grad: 49662.8516  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [1][160/865] Elapsed 0m 56s (remain 4m 8s) Loss: 1.3846 Grad: 37476.0430  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:04<04:04,  2.80train_batch/s]

Epoch: [1][180/865] Elapsed 1m 3s (remain 4m 1s) Loss: 1.3610 Grad: 44941.4102  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:11<03:56,  2.81train_batch/s]

Epoch: [1][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 1.3491 Grad: 28490.6582  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [1][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 1.3261 Grad: 25471.4453  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [1][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 1.3178 Grad: 58630.7969  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [1][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 1.3021 Grad: 16305.3887  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:39<03:29,  2.79train_batch/s]

Epoch: [1][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 1.2851 Grad: 21838.3281  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:46<03:21,  2.79train_batch/s]

Epoch: [1][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 1.2635 Grad: 29969.0039  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [1][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 1.2440 Grad: 15718.0273  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [1][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 1.2255 Grad: 22081.0430  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [1][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 1.2114 Grad: 36489.3555  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.80train_batch/s]

Epoch: [1][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 1.2003 Grad: 20928.2070  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [1][400/865] Elapsed 2m 21s (remain 2m 43s) Loss: 1.1904 Grad: 19142.1621  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:28<02:38,  2.79train_batch/s]

Epoch: [1][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 1.1786 Grad: 15151.0869  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [1][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 1.1698 Grad: 29353.0332  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [1][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 1.1621 Grad: 32373.2480  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [02:49<02:17,  2.79train_batch/s]

Epoch: [1][480/865] Elapsed 2m 49s (remain 2m 15s) Loss: 1.1528 Grad: 43980.2344  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [1][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 1.1450 Grad: 14459.8281  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [1][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 1.1370 Grad: 10430.9375  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:11<01:56,  2.79train_batch/s]

Epoch: [1][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 1.1286 Grad: 26878.6074  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [1][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 1.1217 Grad: 35050.2734  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.80train_batch/s]

Epoch: [1][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 1.1139 Grad: 12124.4873  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [1][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 1.1073 Grad: 42626.8828  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [1][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 1.1019 Grad: 53720.5312  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [1][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 1.0984 Grad: 10930.1445  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [03:53<01:12,  2.80train_batch/s]

Epoch: [1][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 1.0958 Grad: 29188.5215  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [1][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 1.0914 Grad: 13494.6484  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [1][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 1.0862 Grad: 25736.4941  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.80train_batch/s]

Epoch: [1][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 1.0783 Grad: 17722.4258  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.80train_batch/s]

Epoch: [1][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 1.0735 Grad: 20750.2715  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [1][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 1.0671 Grad: 12377.6318  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:35<00:30,  2.79train_batch/s]

Epoch: [1][780/865] Elapsed 4m 35s (remain 0m 29s) Loss: 1.0611 Grad: 24512.1484  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [04:42<00:22,  2.80train_batch/s]

Epoch: [1][800/865] Elapsed 4m 42s (remain 0m 22s) Loss: 1.0549 Grad: 15175.8301  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [1][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 1.0496 Grad: 23580.1035  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.80train_batch/s]

Epoch: [1][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 1.0519 Grad: 23614.2754  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.80train_batch/s]

Epoch: [1][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 1.0487 Grad: 28325.5488  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [1][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 1.0490 Grad: 18599.1562  LR: 0.00002000  


Validation:   1%|          | 2/217 [00:00<00:53,  3.99valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 14s) Loss: 1.1807 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.40valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.9070 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.41valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.8533 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.40valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.8831 


Validation:  38%|███▊      | 82/217 [00:15<00:24,  5.41valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.8864 


Validation:  47%|████▋     | 102/217 [00:19<00:21,  5.41valid_batch/s]

EVAL: [100/217] Elapsed 0m 18s (remain 0m 21s) Loss: 0.8824 


Validation:  56%|█████▌    | 122/217 [00:22<00:17,  5.41valid_batch/s]

EVAL: [120/217] Elapsed 0m 22s (remain 0m 17s) Loss: 0.8902 


Validation:  65%|██████▌   | 142/217 [00:26<00:13,  5.40valid_batch/s]

EVAL: [140/217] Elapsed 0m 26s (remain 0m 14s) Loss: 0.9005 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.41valid_batch/s]

EVAL: [160/217] Elapsed 0m 29s (remain 0m 10s) Loss: 0.9015 


Validation:  84%|████████▍ | 182/217 [00:33<00:06,  5.41valid_batch/s]

EVAL: [180/217] Elapsed 0m 33s (remain 0m 6s) Loss: 0.8948 


Validation:  93%|█████████▎| 202/217 [00:37<00:02,  5.41valid_batch/s]

EVAL: [200/217] Elapsed 0m 37s (remain 0m 2s) Loss: 0.8951 


Validation: 100%|██████████| 217/217 [00:40<00:00,  5.40valid_batch/s]
Epoch 1 - avg_train_loss: 1.0490  avg_val_loss: 0.8946  time: 346s
Epoch 1 - Score: 0.7785
Epoch 1 - Save Best Score: 0.7785 Model


EVAL: [216/217] Elapsed 0m 40s (remain 0m 0s) Loss: 0.8946 


Train:   0%|          | 1/865 [00:00<06:58,  2.07train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 6m 58s) Loss: 0.6321 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:07<05:01,  2.80train_batch/s]

Epoch: [2][20/865] Elapsed 0m 7s (remain 5m 1s) Loss: 0.8129 Grad: 187353.9375  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:14<04:55,  2.79train_batch/s]

Epoch: [2][40/865] Elapsed 0m 14s (remain 4m 53s) Loss: 0.8104 Grad: 229611.5938  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:21<04:47,  2.79train_batch/s]

Epoch: [2][60/865] Elapsed 0m 21s (remain 4m 45s) Loss: 0.7880 Grad: 160164.0312  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:28<04:40,  2.79train_batch/s]

Epoch: [2][80/865] Elapsed 0m 28s (remain 4m 37s) Loss: 0.7940 Grad: 230314.4375  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:35<04:33,  2.79train_batch/s]

Epoch: [2][100/865] Elapsed 0m 35s (remain 4m 30s) Loss: 0.7917 Grad: 202978.7969  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:42<04:26,  2.79train_batch/s]

Epoch: [2][120/865] Elapsed 0m 42s (remain 4m 23s) Loss: 0.7973 Grad: 239596.9219  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:49<04:19,  2.79train_batch/s]

Epoch: [2][140/865] Elapsed 0m 49s (remain 4m 16s) Loss: 0.8019 Grad: 128594.2656  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [00:56<04:12,  2.79train_batch/s]

Epoch: [2][160/865] Elapsed 0m 56s (remain 4m 9s) Loss: 0.8039 Grad: 161248.3125  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:04<04:05,  2.79train_batch/s]

Epoch: [2][180/865] Elapsed 1m 4s (remain 4m 1s) Loss: 0.8062 Grad: 106295.4844  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:11<03:57,  2.79train_batch/s]

Epoch: [2][200/865] Elapsed 1m 11s (remain 3m 54s) Loss: 0.7976 Grad: 124940.1094  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:18<03:50,  2.79train_batch/s]

Epoch: [2][220/865] Elapsed 1m 18s (remain 3m 47s) Loss: 0.7961 Grad: 93511.1328  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:25<03:43,  2.79train_batch/s]

Epoch: [2][240/865] Elapsed 1m 25s (remain 3m 40s) Loss: 0.7965 Grad: 125135.6875  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:32<03:36,  2.79train_batch/s]

Epoch: [2][260/865] Elapsed 1m 32s (remain 3m 33s) Loss: 0.7888 Grad: 102518.5938  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:39<03:28,  2.79train_batch/s]

Epoch: [2][280/865] Elapsed 1m 39s (remain 3m 26s) Loss: 0.7912 Grad: 173836.0625  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:46<03:22,  2.79train_batch/s]

Epoch: [2][300/865] Elapsed 1m 46s (remain 3m 19s) Loss: 0.7975 Grad: 247257.3281  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [01:53<03:14,  2.79train_batch/s]

Epoch: [2][320/865] Elapsed 1m 53s (remain 3m 12s) Loss: 0.7969 Grad: 410769.9375  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:00<03:07,  2.79train_batch/s]

Epoch: [2][340/865] Elapsed 2m 0s (remain 3m 5s) Loss: 0.7973 Grad: 137089.0000  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:07<03:00,  2.79train_batch/s]

Epoch: [2][360/865] Elapsed 2m 7s (remain 2m 58s) Loss: 0.7972 Grad: 129576.7109  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:14<02:53,  2.79train_batch/s]

Epoch: [2][380/865] Elapsed 2m 14s (remain 2m 51s) Loss: 0.7971 Grad: 346884.5000  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:21<02:46,  2.79train_batch/s]

Epoch: [2][400/865] Elapsed 2m 21s (remain 2m 44s) Loss: 0.7987 Grad: 172284.2812  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:28<02:39,  2.79train_batch/s]

Epoch: [2][420/865] Elapsed 2m 28s (remain 2m 36s) Loss: 0.7979 Grad: 242835.3906  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:35<02:31,  2.79train_batch/s]

Epoch: [2][440/865] Elapsed 2m 35s (remain 2m 29s) Loss: 0.7974 Grad: 189892.2031  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:42<02:24,  2.79train_batch/s]

Epoch: [2][460/865] Elapsed 2m 42s (remain 2m 22s) Loss: 0.7982 Grad: 191127.2031  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [02:50<02:17,  2.79train_batch/s]

Epoch: [2][480/865] Elapsed 2m 50s (remain 2m 15s) Loss: 0.7980 Grad: 139292.7969  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [02:57<02:10,  2.79train_batch/s]

Epoch: [2][500/865] Elapsed 2m 57s (remain 2m 8s) Loss: 0.7977 Grad: 149914.2031  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:04<02:03,  2.79train_batch/s]

Epoch: [2][520/865] Elapsed 3m 4s (remain 2m 1s) Loss: 0.7977 Grad: 145240.2812  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:11<01:55,  2.79train_batch/s]

Epoch: [2][540/865] Elapsed 3m 11s (remain 1m 54s) Loss: 0.7953 Grad: 279005.4375  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:18<01:48,  2.79train_batch/s]

Epoch: [2][560/865] Elapsed 3m 18s (remain 1m 47s) Loss: 0.7958 Grad: 125450.2891  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:25<01:41,  2.79train_batch/s]

Epoch: [2][580/865] Elapsed 3m 25s (remain 1m 40s) Loss: 0.7940 Grad: 391625.2812  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:32<01:34,  2.79train_batch/s]

Epoch: [2][600/865] Elapsed 3m 32s (remain 1m 33s) Loss: 0.7946 Grad: 247164.2656  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:39<01:27,  2.79train_batch/s]

Epoch: [2][620/865] Elapsed 3m 39s (remain 1m 26s) Loss: 0.7953 Grad: 194157.3125  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [03:46<01:20,  2.79train_batch/s]

Epoch: [2][640/865] Elapsed 3m 46s (remain 1m 19s) Loss: 0.7968 Grad: 161889.2188  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [03:53<01:13,  2.79train_batch/s]

Epoch: [2][660/865] Elapsed 3m 53s (remain 1m 12s) Loss: 0.7957 Grad: 180356.6719  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:00<01:05,  2.79train_batch/s]

Epoch: [2][680/865] Elapsed 4m 0s (remain 1m 5s) Loss: 0.7963 Grad: 266367.2500  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:07<00:58,  2.79train_batch/s]

Epoch: [2][700/865] Elapsed 4m 7s (remain 0m 57s) Loss: 0.7949 Grad: 126407.8672  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:14<00:51,  2.79train_batch/s]

Epoch: [2][720/865] Elapsed 4m 14s (remain 0m 50s) Loss: 0.7935 Grad: 181674.4531  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:21<00:44,  2.79train_batch/s]

Epoch: [2][740/865] Elapsed 4m 21s (remain 0m 43s) Loss: 0.7908 Grad: 196885.5156  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:28<00:37,  2.79train_batch/s]

Epoch: [2][760/865] Elapsed 4m 28s (remain 0m 36s) Loss: 0.7908 Grad: 152030.5625  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [04:36<00:30,  2.80train_batch/s]

Epoch: [2][780/865] Elapsed 4m 36s (remain 0m 29s) Loss: 0.7933 Grad: 146577.1562  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [04:43<00:22,  2.79train_batch/s]

Epoch: [2][800/865] Elapsed 4m 43s (remain 0m 22s) Loss: 0.7941 Grad: 256008.7969  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [04:50<00:15,  2.79train_batch/s]

Epoch: [2][820/865] Elapsed 4m 50s (remain 0m 15s) Loss: 0.7941 Grad: 101616.9375  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [04:57<00:08,  2.79train_batch/s]

Epoch: [2][840/865] Elapsed 4m 57s (remain 0m 8s) Loss: 0.7931 Grad: 194350.5156  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:04<00:01,  2.80train_batch/s]

Epoch: [2][860/865] Elapsed 5m 4s (remain 0m 1s) Loss: 0.7929 Grad: 185670.3438  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:05<00:00,  2.83train_batch/s]


Epoch: [2][864/865] Elapsed 5m 5s (remain 0m 0s) Loss: 0.7932 Grad: 237156.1250  LR: 0.00001998  


Validation:   1%|          | 2/217 [00:00<00:53,  4.00valid_batch/s]

EVAL: [0/217] Elapsed 0m 0s (remain 1m 14s) Loss: 0.9785 


Validation:  10%|█         | 22/217 [00:04<00:36,  5.40valid_batch/s]

EVAL: [20/217] Elapsed 0m 4s (remain 0m 37s) Loss: 0.7800 


Validation:  19%|█▉        | 42/217 [00:07<00:32,  5.41valid_batch/s]

EVAL: [40/217] Elapsed 0m 7s (remain 0m 33s) Loss: 0.7666 


Validation:  29%|██▊       | 62/217 [00:11<00:28,  5.41valid_batch/s]

EVAL: [60/217] Elapsed 0m 11s (remain 0m 29s) Loss: 0.7931 


Validation:  38%|███▊      | 82/217 [00:15<00:26,  5.13valid_batch/s]

EVAL: [80/217] Elapsed 0m 15s (remain 0m 25s) Loss: 0.8094 


Validation:  47%|████▋     | 102/217 [00:19<00:22,  5.02valid_batch/s]

EVAL: [100/217] Elapsed 0m 19s (remain 0m 22s) Loss: 0.8171 


Validation:  56%|█████▌    | 122/217 [00:23<00:18,  5.09valid_batch/s]

EVAL: [120/217] Elapsed 0m 23s (remain 0m 18s) Loss: 0.8117 


Validation:  65%|██████▌   | 142/217 [00:27<00:14,  5.35valid_batch/s]

EVAL: [140/217] Elapsed 0m 27s (remain 0m 14s) Loss: 0.8191 


Validation:  75%|███████▍  | 162/217 [00:30<00:10,  5.33valid_batch/s]

EVAL: [160/217] Elapsed 0m 30s (remain 0m 10s) Loss: 0.8226 


Validation:  84%|████████▍ | 182/217 [00:34<00:06,  5.38valid_batch/s]

EVAL: [180/217] Elapsed 0m 34s (remain 0m 6s) Loss: 0.8149 


Validation:  93%|█████████▎| 202/217 [00:38<00:02,  5.39valid_batch/s]

EVAL: [200/217] Elapsed 0m 38s (remain 0m 3s) Loss: 0.8175 


Validation: 100%|██████████| 217/217 [00:41<00:00,  5.27valid_batch/s]
Epoch 2 - avg_train_loss: 0.7932  avg_val_loss: 0.8195  time: 347s
Epoch 2 - Score: 0.7903
Epoch 2 - Save Best Score: 0.7903 Model


EVAL: [216/217] Elapsed 0m 41s (remain 0m 0s) Loss: 0.8195 


Score: 0.7903
Score: 0.7875
