# 🏋️ Model Training

## ⚙️ Setup 

### 📚 Importing Libraries

Importing from packages

In [1]:
import os
import gc
import pandas as pd
import time
import numpy as np
from sklearn.metrics import cohen_kappa_score
import torch
from torch.optim import AdamW
from typing import Tuple
from torch.utils.data import DataLoader
from torch import nn
from transformers import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

In [2]:
os.chdir("../")

Importing user defined packages

In [3]:
from lib.config import Config
from lib.paths import Paths
from lib.model.deberta import CustomModel
from lib.model.epoch_functions import train_epoch, valid_epoch
from lib.utils.utils import get_logger
from lib.data import read_data_loader_from_disk

## Definitions

### 🌎 Global Variables

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
LOGGER = get_logger(Paths.MODEL_OUTPUT_PATH)

### 🛠️ Functions

In [6]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "lr": encoder_lr,
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in model.named_parameters() if "model" not in n],
            "lr": decoder_lr,
            "weight_decay": 0.0,
        },
    ]

    return optimizer_parameters

In [7]:
def get_scheduler(cfg: Config, optimizer, num_train_steps):
    if cfg.SCHEDULER == "linear":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
        )
    
    if cfg.SCHEDULER == "cosine":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps,
            num_cycles=cfg.NUM_CYCLES,
        )

In [8]:
def get_score(y_true, y_pred):
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return score

In [9]:
def get_model_optimizer_and_scheduler(train_loader):
    model = CustomModel(Config, config_path=None, pretrained=True)
    torch.save(model.config, Paths.MODEL_OUTPUT_PATH + "/config.pth")
    model.to(device)

    optimizer = AdamW(
        get_optimizer_params(
            model,
            encoder_lr=Config.ENCODER_LR,
            decoder_lr=Config.DECODER_LR,
            weight_decay=Config.WEIGHT_DECAY,
        ),
        lr=Config.ENCODER_LR,
        eps=Config.EPS,
        betas=Config.BETAS,
    )

    num_train_steps = int(len(train_loader) / Config.BATCH_SIZE_TRAIN * Config.EPOCHS)
    scheduler = get_scheduler(Config, optimizer, num_train_steps)
    return model, optimizer, scheduler

In [11]:
def train_loop(fold):
    LOGGER.info(f"========== Fold: {fold} training ==========")

    # ======== DATA LOADER ==========
    train_loader, valid_loader = read_data_loader_from_disk(fold)
    valid_fold = pd.read_csv(os.path.join(Paths.DATA_LOADER_PATH, f"valid_{fold}.csv"))
    valid_labels = valid_fold["score"].values

    # ======== MODEL ==========
    model, optimizer, scheduler = get_model_optimizer_and_scheduler(train_loader)

    # ======= LOSS ==========
    # criterion = RMSELoss(reduction="mean") # nn.SmoothL1Loss(reduction='mean')
    criterion = nn.CrossEntropyLoss()
    softmax = nn.Softmax(dim=1)

    best_score = -np.inf
    # ====== ITERATE EPOCHS ========
    for epoch in range(Config.EPOCHS):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_loss = train_epoch(
            train_loader, model, criterion, optimizer, epoch, scheduler, device
        )

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(
            valid_loader, model, criterion, device
        )
        predictions = prediction_dict["predictions"]
        _, predictions = torch.max(softmax(torch.tensor(predictions)), dim=1)

        # ======= SCORING ==========
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "predictions": predictions},
                Paths.MODEL_OUTPUT_PATH
                + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
            )

    predictions = torch.load(
        Paths.MODEL_OUTPUT_PATH
        + f"/{Config.MODEL.replace('/', '_')}_fold_{fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    valid_fold["pred_score"] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_fold

In [12]:
def get_result(oof_df):
    labels = oof_df["score"].values
    preds = oof_df["pred_score"].values
    score = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}')

In [13]:
if Config.TRAIN:
    oof_df = pd.DataFrame()

    for fold in range(Config.N_FOLDS):
        if fold in Config.TRAIN_FOLDS:
            _oof_df = train_loop(fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== Fold: {fold} result ==========")
            get_result(_oof_df)

    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    oof_df.to_csv(Paths.MODEL_OUTPUT_PATH + "/oof_df.csv", index=False)



Train:   0%|          | 1/865 [00:01<14:56,  1.04s/train_batch]

Epoch: [1][0/865] Elapsed 0m 1s (remain 14m 57s) Loss: 1.7507 Grad: inf  LR: 0.00002000  


Train:   2%|▏         | 21/865 [00:08<05:21,  2.62train_batch/s]

Epoch: [1][20/865] Elapsed 0m 8s (remain 5m 42s) Loss: 1.5755 Grad: 56959.6328  LR: 0.00001819  


Train:   5%|▍         | 41/865 [00:16<05:13,  2.63train_batch/s]

Epoch: [1][40/865] Elapsed 0m 16s (remain 5m 22s) Loss: 1.4933 Grad: 71229.9375  LR: 0.00001369  


Train:   7%|▋         | 61/865 [00:23<05:04,  2.64train_batch/s]

Epoch: [1][60/865] Elapsed 0m 23s (remain 5m 9s) Loss: 1.4307 Grad: 96200.7109  LR: 0.00000798  


Train:   9%|▉         | 81/865 [00:30<04:56,  2.64train_batch/s]

Epoch: [1][80/865] Elapsed 0m 30s (remain 4m 59s) Loss: 1.3671 Grad: 61383.4414  LR: 0.00000293  


Train:  12%|█▏        | 101/865 [00:38<04:50,  2.63train_batch/s]

Epoch: [1][100/865] Elapsed 0m 38s (remain 4m 50s) Loss: 1.3226 Grad: 64941.1562  LR: 0.00000021  


Train:  14%|█▍        | 121/865 [00:45<04:41,  2.64train_batch/s]

Epoch: [1][120/865] Elapsed 0m 45s (remain 4m 42s) Loss: 1.2943 Grad: 75878.5625  LR: 0.00000071  


Train:  16%|█▋        | 141/865 [00:53<04:35,  2.62train_batch/s]

Epoch: [1][140/865] Elapsed 0m 53s (remain 4m 34s) Loss: 1.2682 Grad: 54951.6797  LR: 0.00000426  


Train:  19%|█▊        | 161/865 [01:01<04:32,  2.59train_batch/s]

Epoch: [1][160/865] Elapsed 1m 1s (remain 4m 26s) Loss: 1.2528 Grad: 73908.5547  LR: 0.00000971  


Train:  21%|██        | 181/865 [01:08<04:19,  2.63train_batch/s]

Epoch: [1][180/865] Elapsed 1m 8s (remain 4m 19s) Loss: 1.2404 Grad: 166565.7500  LR: 0.00001525  


Train:  23%|██▎       | 201/865 [01:16<04:14,  2.61train_batch/s]

Epoch: [1][200/865] Elapsed 1m 16s (remain 4m 11s) Loss: 1.2283 Grad: 233754.2969  LR: 0.00001906  


Train:  26%|██▌       | 221/865 [01:23<04:05,  2.62train_batch/s]

Epoch: [1][220/865] Elapsed 1m 23s (remain 4m 3s) Loss: 1.2112 Grad: 74786.0078  LR: 0.00001989  


Train:  28%|██▊       | 241/865 [01:31<03:59,  2.61train_batch/s]

Epoch: [1][240/865] Elapsed 1m 31s (remain 3m 56s) Loss: 1.1941 Grad: 90269.3984  LR: 0.00001747  


Train:  30%|███       | 261/865 [01:38<03:54,  2.57train_batch/s]

Epoch: [1][260/865] Elapsed 1m 38s (remain 3m 48s) Loss: 1.1876 Grad: 182027.4375  LR: 0.00001259  


Train:  32%|███▏      | 281/865 [01:46<03:44,  2.60train_batch/s]

Epoch: [1][280/865] Elapsed 1m 46s (remain 3m 41s) Loss: 1.1733 Grad: 185393.8750  LR: 0.00000685  


Train:  35%|███▍      | 301/865 [01:54<03:37,  2.60train_batch/s]

Epoch: [1][300/865] Elapsed 1m 54s (remain 3m 33s) Loss: 1.1570 Grad: 83969.9141  LR: 0.00000216  


Train:  37%|███▋      | 321/865 [02:01<03:29,  2.60train_batch/s]

Epoch: [1][320/865] Elapsed 2m 1s (remain 3m 26s) Loss: 1.1387 Grad: 76550.2422  LR: 0.00000004  


Train:  39%|███▉      | 341/865 [02:09<03:22,  2.58train_batch/s]

Epoch: [1][340/865] Elapsed 2m 9s (remain 3m 18s) Loss: 1.1260 Grad: 78081.2969  LR: 0.00000120  


Train:  42%|████▏     | 361/865 [02:17<03:15,  2.57train_batch/s]

Epoch: [1][360/865] Elapsed 2m 17s (remain 3m 11s) Loss: 1.1138 Grad: 143154.8438  LR: 0.00000525  


Train:  44%|████▍     | 381/865 [02:24<03:06,  2.59train_batch/s]

Epoch: [1][380/865] Elapsed 2m 24s (remain 3m 3s) Loss: 1.1051 Grad: 123912.5391  LR: 0.00001087  


Train:  46%|████▋     | 401/865 [02:32<02:58,  2.59train_batch/s]

Epoch: [1][400/865] Elapsed 2m 32s (remain 2m 56s) Loss: 1.0975 Grad: 88022.8359  LR: 0.00001620  


Train:  49%|████▊     | 421/865 [02:39<02:51,  2.59train_batch/s]

Epoch: [1][420/865] Elapsed 2m 39s (remain 2m 48s) Loss: 1.0969 Grad: 153657.5156  LR: 0.00001949  


Train:  51%|█████     | 441/865 [02:47<02:45,  2.57train_batch/s]

Epoch: [1][440/865] Elapsed 2m 47s (remain 2m 41s) Loss: 1.0906 Grad: 248705.1562  LR: 0.00001966  


Train:  53%|█████▎    | 461/865 [02:55<02:37,  2.57train_batch/s]

Epoch: [1][460/865] Elapsed 2m 55s (remain 2m 33s) Loss: 1.0814 Grad: 143863.8438  LR: 0.00001665  


Train:  56%|█████▌    | 481/865 [03:02<02:28,  2.59train_batch/s]

Epoch: [1][480/865] Elapsed 3m 2s (remain 2m 26s) Loss: 1.0777 Grad: 231850.3750  LR: 0.00001145  


Train:  58%|█████▊    | 501/865 [03:10<02:22,  2.56train_batch/s]

Epoch: [1][500/865] Elapsed 3m 10s (remain 2m 18s) Loss: 1.0715 Grad: 209188.1406  LR: 0.00000577  


Train:  60%|██████    | 521/865 [03:18<02:12,  2.59train_batch/s]

Epoch: [1][520/865] Elapsed 3m 18s (remain 2m 10s) Loss: 1.0621 Grad: 114463.0625  LR: 0.00000149  


Train:  63%|██████▎   | 541/865 [03:26<02:06,  2.56train_batch/s]

Epoch: [1][540/865] Elapsed 3m 25s (remain 2m 3s) Loss: 1.0552 Grad: 74120.0703  LR: 0.00000000  


Train:  65%|██████▍   | 561/865 [03:33<01:59,  2.55train_batch/s]

Epoch: [1][560/865] Elapsed 3m 33s (remain 1m 55s) Loss: 1.0475 Grad: 64517.5898  LR: 0.00000181  


Train:  67%|██████▋   | 581/865 [03:41<01:51,  2.54train_batch/s]

Epoch: [1][580/865] Elapsed 3m 41s (remain 1m 48s) Loss: 1.0394 Grad: 70623.3672  LR: 0.00000631  


Train:  69%|██████▉   | 601/865 [03:49<01:45,  2.51train_batch/s]

Epoch: [1][600/865] Elapsed 3m 49s (remain 1m 40s) Loss: 1.0347 Grad: 191911.2188  LR: 0.00001202  


Train:  72%|███████▏  | 621/865 [03:56<01:34,  2.58train_batch/s]

Epoch: [1][620/865] Elapsed 3m 56s (remain 1m 33s) Loss: 1.0307 Grad: 82344.4062  LR: 0.00001707  


Train:  74%|███████▍  | 641/865 [04:04<01:28,  2.54train_batch/s]

Epoch: [1][640/865] Elapsed 4m 4s (remain 1m 25s) Loss: 1.0313 Grad: 64755.4023  LR: 0.00001979  


Train:  76%|███████▋  | 661/865 [04:12<01:19,  2.56train_batch/s]

Epoch: [1][660/865] Elapsed 4m 12s (remain 1m 17s) Loss: 1.0299 Grad: 74849.1641  LR: 0.00001929  


Train:  79%|███████▊  | 681/865 [04:20<01:11,  2.57train_batch/s]

Epoch: [1][680/865] Elapsed 4m 20s (remain 1m 10s) Loss: 1.0276 Grad: 52250.8320  LR: 0.00001574  


Train:  81%|████████  | 701/865 [04:27<01:04,  2.54train_batch/s]

Epoch: [1][700/865] Elapsed 4m 27s (remain 1m 2s) Loss: 1.0231 Grad: 142945.4219  LR: 0.00001029  


Train:  83%|████████▎ | 721/865 [04:35<00:56,  2.56train_batch/s]

Epoch: [1][720/865] Elapsed 4m 35s (remain 0m 55s) Loss: 1.0201 Grad: 123455.9531  LR: 0.00000475  


Train:  86%|████████▌ | 741/865 [04:43<00:48,  2.58train_batch/s]

Epoch: [1][740/865] Elapsed 4m 43s (remain 0m 47s) Loss: 1.0182 Grad: 61855.5742  LR: 0.00000094  


Train:  88%|████████▊ | 761/865 [04:50<00:40,  2.54train_batch/s]

Epoch: [1][760/865] Elapsed 4m 50s (remain 0m 39s) Loss: 1.0155 Grad: 185698.4531  LR: 0.00000011  


Train:  90%|█████████ | 781/865 [04:58<00:32,  2.55train_batch/s]

Epoch: [1][780/865] Elapsed 4m 58s (remain 0m 32s) Loss: 1.0127 Grad: 101255.4297  LR: 0.00000253  


Train:  93%|█████████▎| 801/865 [05:06<00:24,  2.57train_batch/s]

Epoch: [1][800/865] Elapsed 5m 6s (remain 0m 24s) Loss: 1.0073 Grad: 70524.8125  LR: 0.00000741  


Train:  95%|█████████▍| 821/865 [05:14<00:17,  2.53train_batch/s]

Epoch: [1][820/865] Elapsed 5m 14s (remain 0m 16s) Loss: 1.0051 Grad: 128059.5625  LR: 0.00001315  


Train:  97%|█████████▋| 841/865 [05:21<00:09,  2.51train_batch/s]

Epoch: [1][840/865] Elapsed 5m 21s (remain 0m 9s) Loss: 1.0039 Grad: 94281.1328  LR: 0.00001784  


Train: 100%|█████████▉| 861/865 [05:29<00:01,  2.56train_batch/s]

Epoch: [1][860/865] Elapsed 5m 29s (remain 0m 1s) Loss: 1.0024 Grad: 81034.7891  LR: 0.00001996  


Train: 100%|██████████| 865/865 [05:31<00:00,  2.61train_batch/s]


Epoch: [1][864/865] Elapsed 5m 31s (remain 0m 0s) Loss: 1.0014 Grad: 119348.3828  LR: 0.00002000  


Validation:   0%|          | 2/433 [00:00<01:06,  6.52valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 37s) Loss: 0.5964 


Validation:   5%|▌         | 22/433 [00:02<00:42,  9.65valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 45s) Loss: 0.9756 


Validation:  10%|▉         | 42/433 [00:04<00:40,  9.66valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 41s) Loss: 0.9384 


Validation:  14%|█▍        | 62/433 [00:06<00:37,  9.80valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 39s) Loss: 0.9189 


Validation:  19%|█▉        | 82/433 [00:08<00:36,  9.61valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 37s) Loss: 0.9102 


Validation:  24%|██▎       | 102/433 [00:10<00:33,  9.80valid_batch/s]

EVAL: [100/433] Elapsed 0m 10s (remain 0m 34s) Loss: 0.8899 


Validation:  28%|██▊       | 122/433 [00:12<00:32,  9.60valid_batch/s]

EVAL: [120/433] Elapsed 0m 12s (remain 0m 32s) Loss: 0.9046 


Validation:  33%|███▎      | 142/433 [00:14<00:30,  9.51valid_batch/s]

EVAL: [140/433] Elapsed 0m 14s (remain 0m 30s) Loss: 0.9088 


Validation:  37%|███▋      | 162/433 [00:16<00:27,  9.81valid_batch/s]

EVAL: [160/433] Elapsed 0m 16s (remain 0m 28s) Loss: 0.9035 


Validation:  42%|████▏     | 182/433 [00:18<00:24, 10.19valid_batch/s]

EVAL: [180/433] Elapsed 0m 18s (remain 0m 26s) Loss: 0.8958 


Validation:  47%|████▋     | 202/433 [00:20<00:22, 10.10valid_batch/s]

EVAL: [200/433] Elapsed 0m 20s (remain 0m 23s) Loss: 0.8885 


Validation:  51%|█████▏    | 222/433 [00:22<00:20, 10.20valid_batch/s]

EVAL: [220/433] Elapsed 0m 22s (remain 0m 21s) Loss: 0.8896 


Validation:  56%|█████▌    | 242/433 [00:24<00:18, 10.38valid_batch/s]

EVAL: [240/433] Elapsed 0m 24s (remain 0m 19s) Loss: 0.8831 


Validation:  61%|██████    | 262/433 [00:26<00:16, 10.21valid_batch/s]

EVAL: [260/433] Elapsed 0m 26s (remain 0m 17s) Loss: 0.8742 


Validation:  65%|██████▌   | 282/433 [00:28<00:14, 10.48valid_batch/s]

EVAL: [280/433] Elapsed 0m 28s (remain 0m 15s) Loss: 0.8737 


Validation:  70%|██████▉   | 302/433 [00:30<00:13, 10.01valid_batch/s]

EVAL: [300/433] Elapsed 0m 30s (remain 0m 13s) Loss: 0.8752 


Validation:  74%|███████▍  | 322/433 [00:32<00:11,  9.86valid_batch/s]

EVAL: [320/433] Elapsed 0m 32s (remain 0m 11s) Loss: 0.8754 


Validation:  79%|███████▉  | 342/433 [00:34<00:09,  9.98valid_batch/s]

EVAL: [340/433] Elapsed 0m 34s (remain 0m 9s) Loss: 0.8727 


Validation:  84%|████████▍ | 363/433 [00:36<00:07,  9.99valid_batch/s]

EVAL: [360/433] Elapsed 0m 36s (remain 0m 7s) Loss: 0.8722 


Validation:  88%|████████▊ | 381/433 [00:38<00:05, 10.09valid_batch/s]

EVAL: [380/433] Elapsed 0m 38s (remain 0m 5s) Loss: 0.8709 


Validation:  93%|█████████▎| 402/433 [00:40<00:03,  9.70valid_batch/s]

EVAL: [400/433] Elapsed 0m 40s (remain 0m 3s) Loss: 0.8734 


Validation:  97%|█████████▋| 422/433 [00:42<00:01,  9.97valid_batch/s]

EVAL: [420/433] Elapsed 0m 42s (remain 0m 1s) Loss: 0.8742 


Validation: 100%|██████████| 433/433 [00:43<00:00,  9.88valid_batch/s]
Epoch 1 - avg_train_loss: 1.0014  avg_val_loss: 0.8780  time: 375s
Epoch 1 - Score: 0.7711
Epoch 1 - Save Best Score: 0.7711 Model


EVAL: [432/433] Elapsed 0m 43s (remain 0m 0s) Loss: 0.8780 


Train:   0%|          | 1/865 [00:00<07:01,  2.05train_batch/s]

Epoch: [2][0/865] Elapsed 0m 0s (remain 7m 1s) Loss: 1.0039 Grad: inf  LR: 0.00001998  


Train:   2%|▏         | 21/865 [00:08<05:27,  2.58train_batch/s]

Epoch: [2][20/865] Elapsed 0m 8s (remain 5m 26s) Loss: 0.8433 Grad: 147127.6875  LR: 0.00001802  


Train:   5%|▍         | 41/865 [00:15<05:20,  2.57train_batch/s]

Epoch: [2][40/865] Elapsed 0m 15s (remain 5m 12s) Loss: 0.8399 Grad: 84825.5391  LR: 0.00001342  


Train:   7%|▋         | 61/865 [00:23<05:13,  2.57train_batch/s]

Epoch: [2][60/865] Elapsed 0m 23s (remain 5m 6s) Loss: 0.8279 Grad: 123441.7891  LR: 0.00000769  


Train:   9%|▉         | 81/865 [00:30<05:05,  2.57train_batch/s]

Epoch: [2][80/865] Elapsed 0m 30s (remain 4m 59s) Loss: 0.7900 Grad: 192891.8438  LR: 0.00000273  


Train:  12%|█▏        | 101/865 [00:38<04:56,  2.57train_batch/s]

Epoch: [2][100/865] Elapsed 0m 38s (remain 4m 52s) Loss: 0.7928 Grad: 129518.9688  LR: 0.00000015  


Train:  14%|█▍        | 121/865 [00:46<04:48,  2.58train_batch/s]

Epoch: [2][120/865] Elapsed 0m 46s (remain 4m 44s) Loss: 0.7904 Grad: 136688.3594  LR: 0.00000082  


Train:  16%|█▋        | 141/865 [00:53<04:40,  2.58train_batch/s]

Epoch: [2][140/865] Elapsed 0m 53s (remain 4m 37s) Loss: 0.7912 Grad: 184443.3594  LR: 0.00000450  


Train:  19%|█▊        | 161/865 [01:01<04:34,  2.57train_batch/s]

Epoch: [2][160/865] Elapsed 1m 1s (remain 4m 29s) Loss: 0.7934 Grad: 156899.1562  LR: 0.00001000  


Train:  21%|██        | 181/865 [01:09<04:25,  2.57train_batch/s]

Epoch: [2][180/865] Elapsed 1m 9s (remain 4m 21s) Loss: 0.7992 Grad: 154463.6562  LR: 0.00001550  


Train:  23%|██▎       | 201/865 [01:16<04:18,  2.57train_batch/s]

Epoch: [2][200/865] Elapsed 1m 16s (remain 4m 13s) Loss: 0.8057 Grad: 134662.3594  LR: 0.00001918  


Train:  26%|██▌       | 221/865 [01:24<04:07,  2.60train_batch/s]

Epoch: [2][220/865] Elapsed 1m 24s (remain 4m 6s) Loss: 0.8044 Grad: 136710.5469  LR: 0.00001985  


Train:  28%|██▊       | 241/865 [01:32<03:59,  2.60train_batch/s]

Epoch: [2][240/865] Elapsed 1m 32s (remain 3m 58s) Loss: 0.8086 Grad: 312878.9688  LR: 0.00001727  


Train:  30%|███       | 261/865 [01:39<03:55,  2.57train_batch/s]

Epoch: [2][260/865] Elapsed 1m 39s (remain 3m 50s) Loss: 0.8103 Grad: 155318.7344  LR: 0.00001231  


Train:  32%|███▏      | 281/865 [01:47<03:47,  2.56train_batch/s]

Epoch: [2][280/865] Elapsed 1m 47s (remain 3m 43s) Loss: 0.8144 Grad: 177463.0312  LR: 0.00000658  


Train:  35%|███▍      | 301/865 [01:55<03:42,  2.53train_batch/s]

Epoch: [2][300/865] Elapsed 1m 55s (remain 3m 35s) Loss: 0.8154 Grad: 210208.3281  LR: 0.00000198  


Train:  37%|███▋      | 321/865 [02:02<03:31,  2.57train_batch/s]

Epoch: [2][320/865] Elapsed 2m 2s (remain 3m 28s) Loss: 0.8173 Grad: 143607.0938  LR: 0.00000002  


Train:  39%|███▉      | 341/865 [02:10<03:24,  2.57train_batch/s]

Epoch: [2][340/865] Elapsed 2m 10s (remain 3m 20s) Loss: 0.8143 Grad: 121374.5859  LR: 0.00000134  


Train:  42%|████▏     | 361/865 [02:18<03:16,  2.56train_batch/s]

Epoch: [2][360/865] Elapsed 2m 18s (remain 3m 13s) Loss: 0.8127 Grad: 115622.3750  LR: 0.00000551  


Train:  44%|████▍     | 381/865 [02:25<03:08,  2.57train_batch/s]

Epoch: [2][380/865] Elapsed 2m 25s (remain 3m 5s) Loss: 0.8123 Grad: 105358.1016  LR: 0.00001116  


Train:  46%|████▋     | 401/865 [02:33<03:00,  2.58train_batch/s]

Epoch: [2][400/865] Elapsed 2m 33s (remain 2m 57s) Loss: 0.8095 Grad: 442036.4375  LR: 0.00001643  


Train:  49%|████▊     | 421/865 [02:41<02:52,  2.57train_batch/s]

Epoch: [2][420/865] Elapsed 2m 41s (remain 2m 50s) Loss: 0.8077 Grad: 304042.5938  LR: 0.00001958  


Train:  51%|█████     | 441/865 [02:48<02:45,  2.56train_batch/s]

Epoch: [2][440/865] Elapsed 2m 48s (remain 2m 42s) Loss: 0.8094 Grad: 173051.6406  LR: 0.00001958  


Train:  53%|█████▎    | 461/865 [02:56<02:38,  2.54train_batch/s]

Epoch: [2][460/865] Elapsed 2m 56s (remain 2m 34s) Loss: 0.8093 Grad: 126492.5078  LR: 0.00001643  


Train:  56%|█████▌    | 481/865 [03:04<02:30,  2.55train_batch/s]

Epoch: [2][480/865] Elapsed 3m 4s (remain 2m 27s) Loss: 0.8080 Grad: 257020.5156  LR: 0.00001116  


Train:  58%|█████▊    | 501/865 [03:12<02:21,  2.57train_batch/s]

Epoch: [2][500/865] Elapsed 3m 12s (remain 2m 19s) Loss: 0.8086 Grad: 170908.9062  LR: 0.00000551  


Train:  60%|██████    | 521/865 [03:19<02:15,  2.54train_batch/s]

Epoch: [2][520/865] Elapsed 3m 19s (remain 2m 11s) Loss: 0.8088 Grad: 146850.2188  LR: 0.00000134  


Train:  63%|██████▎   | 541/865 [03:27<02:07,  2.55train_batch/s]

Epoch: [2][540/865] Elapsed 3m 27s (remain 2m 4s) Loss: 0.8093 Grad: 162178.2812  LR: 0.00000002  


Train:  65%|██████▍   | 561/865 [03:35<01:57,  2.58train_batch/s]

Epoch: [2][560/865] Elapsed 3m 35s (remain 1m 56s) Loss: 0.8079 Grad: 191380.3281  LR: 0.00000198  


Train:  67%|██████▋   | 581/865 [03:43<01:51,  2.54train_batch/s]

Epoch: [2][580/865] Elapsed 3m 43s (remain 1m 49s) Loss: 0.8075 Grad: 205737.2500  LR: 0.00000658  


Train:  69%|██████▉   | 601/865 [03:50<01:42,  2.57train_batch/s]

Epoch: [2][600/865] Elapsed 3m 50s (remain 1m 41s) Loss: 0.8061 Grad: 163043.1719  LR: 0.00001231  


Train:  72%|███████▏  | 621/865 [03:58<01:34,  2.58train_batch/s]

Epoch: [2][620/865] Elapsed 3m 58s (remain 1m 33s) Loss: 0.8060 Grad: 272602.0000  LR: 0.00001727  


Train:  74%|███████▍  | 641/865 [04:06<01:28,  2.54train_batch/s]

Epoch: [2][640/865] Elapsed 4m 6s (remain 1m 26s) Loss: 0.8047 Grad: 243611.7969  LR: 0.00001985  


Train:  76%|███████▋  | 661/865 [04:13<01:19,  2.57train_batch/s]

Epoch: [2][660/865] Elapsed 4m 13s (remain 1m 18s) Loss: 0.8041 Grad: 164437.8281  LR: 0.00001918  


Train:  79%|███████▊  | 681/865 [04:21<01:12,  2.52train_batch/s]

Epoch: [2][680/865] Elapsed 4m 21s (remain 1m 10s) Loss: 0.8012 Grad: 113713.7734  LR: 0.00001550  


Train:  81%|████████  | 701/865 [04:29<01:03,  2.57train_batch/s]

Epoch: [2][700/865] Elapsed 4m 29s (remain 1m 3s) Loss: 0.8014 Grad: 202443.5312  LR: 0.00001000  


Train:  83%|████████▎ | 721/865 [04:37<00:56,  2.54train_batch/s]

Epoch: [2][720/865] Elapsed 4m 37s (remain 0m 55s) Loss: 0.8010 Grad: 134918.7812  LR: 0.00000450  


Train:  86%|████████▌ | 741/865 [04:44<00:48,  2.57train_batch/s]

Epoch: [2][740/865] Elapsed 4m 44s (remain 0m 47s) Loss: 0.8026 Grad: 143057.7344  LR: 0.00000082  


Train:  88%|████████▊ | 761/865 [04:52<00:40,  2.58train_batch/s]

Epoch: [2][760/865] Elapsed 4m 52s (remain 0m 39s) Loss: 0.8012 Grad: 249027.9219  LR: 0.00000015  


Train:  90%|█████████ | 781/865 [05:00<00:32,  2.57train_batch/s]

Epoch: [2][780/865] Elapsed 5m 0s (remain 0m 32s) Loss: 0.8005 Grad: 195532.2031  LR: 0.00000273  


Train:  93%|█████████▎| 801/865 [05:07<00:24,  2.57train_batch/s]

Epoch: [2][800/865] Elapsed 5m 7s (remain 0m 24s) Loss: 0.7989 Grad: 112372.7812  LR: 0.00000769  


Train:  95%|█████████▍| 821/865 [05:15<00:17,  2.57train_batch/s]

Epoch: [2][820/865] Elapsed 5m 15s (remain 0m 16s) Loss: 0.7971 Grad: 189060.3906  LR: 0.00001342  


Train:  97%|█████████▋| 841/865 [05:23<00:09,  2.57train_batch/s]

Epoch: [2][840/865] Elapsed 5m 23s (remain 0m 9s) Loss: 0.7968 Grad: 234368.0781  LR: 0.00001802  


Train: 100%|█████████▉| 861/865 [05:30<00:01,  2.58train_batch/s]

Epoch: [2][860/865] Elapsed 5m 30s (remain 0m 1s) Loss: 0.7972 Grad: 209049.6719  LR: 0.00001998  


Train: 100%|██████████| 865/865 [05:32<00:00,  2.60train_batch/s]


Epoch: [2][864/865] Elapsed 5m 32s (remain 0m 0s) Loss: 0.7969 Grad: 140028.6094  LR: 0.00001998  


Validation:   0%|          | 2/433 [00:00<01:04,  6.65valid_batch/s]

EVAL: [0/433] Elapsed 0m 0s (remain 1m 34s) Loss: 0.5292 


Validation:   5%|▌         | 22/433 [00:02<00:42,  9.71valid_batch/s]

EVAL: [20/433] Elapsed 0m 2s (remain 0m 44s) Loss: 0.9253 


Validation:  10%|▉         | 42/433 [00:04<00:41,  9.53valid_batch/s]

EVAL: [40/433] Elapsed 0m 4s (remain 0m 41s) Loss: 0.8832 


Validation:  14%|█▍        | 62/433 [00:06<00:37,  9.80valid_batch/s]

EVAL: [60/433] Elapsed 0m 6s (remain 0m 39s) Loss: 0.8880 


Validation:  19%|█▉        | 82/433 [00:08<00:36,  9.62valid_batch/s]

EVAL: [80/433] Elapsed 0m 8s (remain 0m 36s) Loss: 0.8709 


Validation:  24%|██▎       | 102/433 [00:10<00:33,  9.84valid_batch/s]

EVAL: [100/433] Elapsed 0m 10s (remain 0m 34s) Loss: 0.8574 


Validation:  28%|██▊       | 122/433 [00:12<00:32,  9.71valid_batch/s]

EVAL: [120/433] Elapsed 0m 12s (remain 0m 32s) Loss: 0.8736 


Validation:  33%|███▎      | 142/433 [00:14<00:30,  9.66valid_batch/s]

EVAL: [140/433] Elapsed 0m 14s (remain 0m 30s) Loss: 0.8781 


Validation:  37%|███▋      | 162/433 [00:16<00:28,  9.59valid_batch/s]

EVAL: [160/433] Elapsed 0m 16s (remain 0m 28s) Loss: 0.8675 


Validation:  42%|████▏     | 182/433 [00:18<00:25,  9.70valid_batch/s]

EVAL: [180/433] Elapsed 0m 18s (remain 0m 26s) Loss: 0.8627 


Validation:  47%|████▋     | 202/433 [00:20<00:23,  9.75valid_batch/s]

EVAL: [200/433] Elapsed 0m 20s (remain 0m 24s) Loss: 0.8528 


Validation:  52%|█████▏    | 223/433 [00:23<00:20, 10.01valid_batch/s]

EVAL: [220/433] Elapsed 0m 22s (remain 0m 22s) Loss: 0.8540 


Validation:  56%|█████▌    | 242/433 [00:25<00:19,  9.74valid_batch/s]

EVAL: [240/433] Elapsed 0m 24s (remain 0m 19s) Loss: 0.8478 


Validation:  61%|██████    | 262/433 [00:27<00:17,  9.69valid_batch/s]

EVAL: [260/433] Elapsed 0m 27s (remain 0m 17s) Loss: 0.8381 


Validation:  65%|██████▌   | 282/433 [00:29<00:15,  9.70valid_batch/s]

EVAL: [280/433] Elapsed 0m 29s (remain 0m 15s) Loss: 0.8423 


Validation:  70%|██████▉   | 302/433 [00:31<00:13,  9.62valid_batch/s]

EVAL: [300/433] Elapsed 0m 31s (remain 0m 13s) Loss: 0.8459 


Validation:  74%|███████▍  | 322/433 [00:33<00:10, 10.19valid_batch/s]

EVAL: [320/433] Elapsed 0m 33s (remain 0m 11s) Loss: 0.8442 


Validation:  79%|███████▉  | 342/433 [00:35<00:09,  9.59valid_batch/s]

EVAL: [340/433] Elapsed 0m 35s (remain 0m 9s) Loss: 0.8427 


Validation:  84%|████████▎ | 362/433 [00:37<00:07,  9.54valid_batch/s]

EVAL: [360/433] Elapsed 0m 37s (remain 0m 7s) Loss: 0.8462 


Validation:  88%|████████▊ | 382/433 [00:39<00:05,  9.62valid_batch/s]

EVAL: [380/433] Elapsed 0m 39s (remain 0m 5s) Loss: 0.8461 


Validation:  93%|█████████▎| 402/433 [00:41<00:03,  9.55valid_batch/s]

EVAL: [400/433] Elapsed 0m 41s (remain 0m 3s) Loss: 0.8493 


Validation:  97%|█████████▋| 422/433 [00:43<00:01,  9.57valid_batch/s]

EVAL: [420/433] Elapsed 0m 43s (remain 0m 1s) Loss: 0.8483 


Validation: 100%|██████████| 433/433 [00:44<00:00,  9.66valid_batch/s]
Epoch 2 - avg_train_loss: 0.7969  avg_val_loss: 0.8483  time: 377s
Epoch 2 - Score: 0.7850
Epoch 2 - Save Best Score: 0.7850 Model


EVAL: [432/433] Elapsed 0m 44s (remain 0m 0s) Loss: 0.8483 


Score: 0.7850
Score: 0.7850
