# <b><span style='color:#F1A424'>|</span> HMS Wavenet All Channels</b><a class='anchor' id='configuration'></a> [↑](#top) 

In [1]:
import gc
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import pandas as pd
import random
import time
import torch
import torch.nn as nn


from glob import glob
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from typing import Dict, List

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Using', torch.cuda.device_count(), 'GPU(s)')
!mkdir models5

Using 1 GPU(s)
mkdir: cannot create directory ‘models5’: File exists


# <b><span style='color:#F1A424'>|</span> Configuration</b><a class='anchor' id='configuration'></a> [↑](#top) 

***

In [2]:
class config:
    AMP = True
    BATCH_SIZE_TRAIN = 64#32
    BATCH_SIZE_VALID = 64#32
    EPOCHS = 15
    FOLDS = 4
    GRADIENT_ACCUMULATION_STEPS = 1
    MAX_GRAD_NORM = 1e7
    NUM_WORKERS = 1 #multiprocessing.cpu_count()
    PRINT_FREQ = 50
    SEED = 30#20
    TRAIN_FULL_DATA = False
    VISUALIZE = True
    WEIGHT_DECAY = 0.01
    
    
class paths:
    OUTPUT_DIR = "/kaggle/working/"
    TRAIN_CSV = "../shared_data/hms_data/raw_data/train.csv"
    TRAIN_EEGS = "../shared_data/hms_data/raw_data/train_eegs/"

# <b><span style='color:#F1A424'>|</span> Utils</b><a class='anchor' id='utils'></a> [↑](#top) 

***

Utility functions.

In [3]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s: float):
    "Convert to minutes."
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since: float, percent: float):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))




def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) 
    
    
def sep():
    print("-"*100)

    
target_preds = [x + "_pred" for x in ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']]
label_to_num = {'Seizure': 0, 'LPD': 1, 'GPD': 2, 'LRDA': 3, 'GRDA': 4, 'Other':5}
num_to_label = {v: k for k, v in label_to_num.items()}
seed_everything(config.SEED)

# <b><span style='color:#F1A424'>|</span> Load Data</b><a class='anchor' id='load_data'></a> [↑](#top) 

***

Load the competition's data.

In [4]:
train_df = pd.read_csv(paths.TRAIN_CSV)
label_cols = train_df.columns[-6:]
print(f"Train cataframe shape is: {train_df.shape}")
print(f"Labels: {list(label_cols)}")
train_df.head()

Train cataframe shape is: (106800, 15)
Labels: ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


### <b><span style='color:#F1A424'>Read one EEG parquet</span></b>

All of the EEG data (for both train and test) was collected at a frequency of 200 samples per second,

Each EEG parquet results in a dataframe with `seconds` rows and 20 columns.

- EEG features are: `['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']`
- We will use these features: `['Fp1','T3','C3','O1','Fp2','C4','T4','O2']`



In [5]:
#eeg_df = pd.read_parquet(paths.TRAIN_EEGS + "100261680.parquet")
#eeg_features = eeg_df.columns
#print(f'There are {len(eeg_features)} raw eeg features')
#print(list(eeg_features))
eeg_features = ['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']
#['Fp1','T3','C3','O1','Fp2','C4','T4','O2']
feature_to_index = {x:y for x,y in zip(eeg_features, range(len(eeg_features)))}

### <b><span style='color:#F1A424'>Read all EEG parquets</span></b>

In [None]:
### read in each npy file

FILT_EEG_PATH = '../shared_data/hms_data/all_channels_filt_eegs/all_channels_filt_eegs' # all_filt_eegs/'
#all_eegs = np.empty((0, 10000, 20))  # this will hold all the filtered eegs
all_eegs = dict() # this will hold all the filtered eegs

for i in range (1,172):
        #print(i)
        this_dict = np.load(FILT_EEG_PATH + '{:03d}'.format(i) + '.npy', allow_pickle=True).item()
        all_eegs.update(this_dict)

print(len(all_eegs))


In [None]:
#labels = pd.DataFrame(np.load('../shared_data/hms_data/eeg_filt_labels.npy',allow_pickle=True))

In [None]:
train_df

# <b><span style='color:#F1A424'>|</span> Data pre-processing</b><a class='anchor' id='preprocessing'></a> [↑](#top) 

***

In [None]:
eeg_ids = train_df.eeg_id.unique()
len(eeg_ids)

In [None]:
df = pd.read_csv(paths.TRAIN_CSV)
df = df[df['eeg_id'].isin(all_eegs.keys())]
label_cols = df.columns[-6:]

train_df = df.groupby('eeg_id')[['patient_id']].agg('first')
aux = df.groupby('eeg_id')[label_cols].agg('sum') 

for label in label_cols:
    train_df[label] = aux[label].values
    
y_data = train_df[label_cols].values
y_data = y_data / y_data.sum(axis=1,keepdims=True)
train_df[label_cols] = y_data

aux = df.groupby('eeg_id')[['expert_consensus']].agg('first')
train_df['target'] = aux

del aux, y_data, df

train_df = train_df.reset_index()
train_df = train_df.loc[train_df.eeg_id.isin(eeg_ids)]
print(f"Train dataframe with unique eeg_id has shape: {train_df.shape}")
train_df.head()

In [None]:
train_df

# <b><span style='color:#F1A424'>|</span> Validation</b><a class='anchor' id='validation'></a> [↑](#top) 

***

We train using `GroupKFold` on `patient_id`.

In [None]:
from sklearn.model_selection import KFold, StratifiedGroupKFold


gkf = StratifiedGroupKFold(n_splits=config.FOLDS,shuffle=True,random_state=20)#31)#42)
for fold, (train_index, valid_index) in enumerate(gkf.split(train_df, train_df.target, train_df.patient_id)):
    train_df.loc[valid_index, "fold"] = int(fold)
    
display(train_df.groupby('fold').size()), sep()
display(train_df.head())

# <b><span style='color:#F1A424'>|</span> Dataset</b><a class='anchor' id='dataset'></a> [↑](#top) 

***

In [None]:
class CustomDataset(Dataset):
    def __init__(
        self, df: pd.DataFrame, config, mode: str = 'train',
        eegs: Dict[int, np.ndarray] = all_eegs, downsample: int = 5
    ): 
        self.df = df
        self.config = config
        self.batch_size = self.config.BATCH_SIZE_TRAIN
        self.mode = mode
        self.eegs = eegs
        self.downsample = downsample
        
    def __len__(self):
        """
        Length of dataset.
        """
        return len(self.df)
        
    def __getitem__(self, index):
        """
        Get one item.
        """
        X, y = self.__data_generation(index)
        X = X[::self.downsample,:]
        output = {
            "X": torch.tensor(X, dtype=torch.float32),
            "y": torch.tensor(y, dtype=torch.float32)
        }
        return output
                        
    def __data_generation(self, index):
        row = self.df.iloc[index]
        X = np.zeros((10_000, 18), dtype='float32')
        y = np.zeros(6, dtype='float32')
        data = self.eegs[row.eeg_id]
        
        # === Feature engineering ===
        
        # Chains:
        
        #['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']
        
        
        X[:,0] = data[:,feature_to_index['Fp1']] - data[:,feature_to_index['F7']]
        X[:,1] = data[:,feature_to_index['F7']] - data[:,feature_to_index['T3']]
        X[:,2] = data[:,feature_to_index['T3']] - data[:,feature_to_index['T5']]
        X[:,3] = data[:,feature_to_index['T5']] - data[:,feature_to_index['O1']]
        
        X[:,4] = data[:,feature_to_index['Fp1']] - data[:,feature_to_index['F3']]
        X[:,5] = data[:,feature_to_index['F3']] - data[:,feature_to_index['C3']]
        X[:,6] = data[:,feature_to_index['C3']] - data[:,feature_to_index['P3']]
        X[:,7] = data[:,feature_to_index['P3']] - data[:,feature_to_index['O1']]
        
        X[:,8] = data[:,feature_to_index['Fp2']] - data[:,feature_to_index['F4']]
        X[:,9] = data[:,feature_to_index['F4']] - data[:,feature_to_index['C4']]
        X[:,10] = data[:,feature_to_index['C4']] - data[:,feature_to_index['P4']]
        X[:,11] = data[:,feature_to_index['P4']] - data[:,feature_to_index['O2']]
        
        X[:,12] = data[:,feature_to_index['Fp2']] - data[:,feature_to_index['F8']]
        X[:,13] = data[:,feature_to_index['F8']] - data[:,feature_to_index['T4']]
        X[:,14] = data[:,feature_to_index['T4']] - data[:,feature_to_index['T6']]
        X[:,15] = data[:,feature_to_index['T6']] - data[:,feature_to_index['O2']]
        
        X[:,16] = data[:,feature_to_index['Fz']] - data[:,feature_to_index['Cz']]
        X[:,17] = data[:,feature_to_index['Cz']] - data[:,feature_to_index['Pz']]
        
        
        # Raw EEGs
        #X = data[:,:19]
        # === Standarize ===
        X = np.clip(X,-1024, 1024)
        X = np.nan_to_num(X, nan=0) / 32.0

#         # === Butter Low-pass Filter ===
#         X = butter_lowpass_filter(X)
        
        if self.mode != 'test':
            y = row[label_cols].values.astype(np.float32)
            
        return X, y

# <b><span style='color:#F1A424'>|</span> DataLoader</b><a class='anchor' id='dataloader'></a> [↑](#top) 

***

In [None]:
train_dataset = CustomDataset(train_df, config, mode="train")
train_loader = DataLoader(
    train_dataset,
    batch_size=config.BATCH_SIZE_TRAIN,
    shuffle=False,
    num_workers=config.NUM_WORKERS, pin_memory=True, drop_last=True
)
output = train_dataset[0]
X, y = output["X"], output["y"]
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

### <b><span style='color:#F1A424'> Visualize DataLoader</span></b>


In [None]:
if config.VISUALIZE:
    for batch in train_loader:
        X = batch.pop("X")
        y = batch.pop("y")
        for item in range(4):
            plt.figure(figsize=(20,4))
            offset = 0
            for col in range(X.shape[-1]):
                if col != 0:
                    offset -= X[item,:,col].min()
                plt.plot(range(2_000), X[item,:,col]+offset,label=f'feature {col+1}')
                offset += X[item,:,col].max()
            tt = f'{y[col][0]:0.1f}'
            for t in y[col][1:]:
                tt += f', {t:0.1f}'
            plt.title(f'EEG_Id = {eeg_ids[item]}\nTarget = {tt}',size=14)
            plt.legend()
            plt.show()
        break

# <b><span style='color:#F1A424'>|</span> Model</b><a class='anchor' id='model'></a> [↑](#top) 

***

<center><img width = 800 src="https://raw.githubusercontent.com/cdeotte/Kaggle_Images/main/Jan-2024/wave-model.png"></center>

In [4]:
class Wave_Block(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, dilation_rates: int, kernel_size: int = 3):
        """
        WaveNet building block.
        :param in_channels: number of input channels.
        :param out_channels: number of output channels.
        :param dilation_rates: how many levels of dilations are used.
        :param kernel_size: size of the convolving kernel.
        """
        super(Wave_Block, self).__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()
        self.convs.append(nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=True))
        
        dilation_rates = [2 ** i for i in range(dilation_rates)]
        for dilation_rate in dilation_rates:
            self.filter_convs.append(
                nn.Conv1d(out_channels, out_channels, kernel_size=kernel_size,
                          padding=int((dilation_rate*(kernel_size-1))/2), dilation=dilation_rate))
            self.gate_convs.append(
                nn.Conv1d(out_channels, out_channels, kernel_size=kernel_size,
                          padding=int((dilation_rate*(kernel_size-1))/2), dilation=dilation_rate))
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1, bias=True))
        
        for i in range(len(self.convs)):
            nn.init.xavier_uniform_(self.convs[i].weight, gain=nn.init.calculate_gain('relu'))
            nn.init.zeros_(self.convs[i].bias)

        for i in range(len(self.filter_convs)):
            nn.init.xavier_uniform_(self.filter_convs[i].weight, gain=nn.init.calculate_gain('relu'))
            nn.init.zeros_(self.filter_convs[i].bias)

        for i in range(len(self.gate_convs)):
            nn.init.xavier_uniform_(self.gate_convs[i].weight, gain=nn.init.calculate_gain('relu'))
            nn.init.zeros_(self.gate_convs[i].bias)

    def forward(self, x):
        x = self.convs[0](x)
        res = x
        for i in range(self.num_rates):
            tanh_out = torch.tanh(self.filter_convs[i](x))
            sigmoid_out = torch.sigmoid(self.gate_convs[i](x))
            x = tanh_out * sigmoid_out
            x = self.convs[i + 1](x) 
            res = res + x
        return res
    
class WaveNet(nn.Module):
    def __init__(self, input_channels: int = 1, kernel_size: int = 3):
        super(WaveNet, self).__init__()
        self.model = nn.Sequential(
                Wave_Block(input_channels, 8, 12, kernel_size),
                Wave_Block(8, 16, 8, kernel_size),
                Wave_Block(16, 32, 4, kernel_size),
                Wave_Block(32, 64, 1, kernel_size) 
        )
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x.permute(0, 2, 1) 
        output = self.model(x)
        return output


class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.model = WaveNet()
        self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)
        self.dropout = 0.0
        self.head = nn.Sequential(
            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(64, 6)
        )
        
    def forward(self, x: torch.Tensor):
        """
        Forwward pass.
        """
        x1 = self.model(x[:, :, 0:1])
        x1 = self.global_avg_pooling(x1)
        x1 = x1.squeeze()
        x2 = self.model(x[:, :, 1:2])
        x2 = self.global_avg_pooling(x2)
        x2 = x2.squeeze()
        x3 = self.model(x[:, :, 2:3])
        x3 = self.global_avg_pooling(x3)
        x3 = x3.squeeze()
        x4 = self.model(x[:, :, 3:4])
        x4 = self.global_avg_pooling(x4)
        x4 = x4.squeeze()
        z1 = torch.mean(torch.stack([x1, x2, x3, x4]), dim=0)

        x1 = self.model(x[:, :, 4:5])
        x1 = self.global_avg_pooling(x1)
        x1 = x1.squeeze()
        x2 = self.model(x[:, :, 5:6])
        x2 = self.global_avg_pooling(x2)
        x2 = x2.squeeze()
        x3 = self.model(x[:, :, 6:7])
        x3 = self.global_avg_pooling(x3)
        x3 = x3.squeeze()
        x4 = self.model(x[:, :, 7:8])
        x4 = self.global_avg_pooling(x4)
        x4 = x4.squeeze()
        z2 = torch.mean(torch.stack([x1, x2, x3, x4]), dim=0)
        
        x1 = self.model(x[:, :, 8:9])
        x1 = self.global_avg_pooling(x1)
        x1 = x1.squeeze()
        x2 = self.model(x[:, :, 9:10])
        x2 = self.global_avg_pooling(x2)
        x2 = x2.squeeze()
        x3 = self.model(x[:, :, 10:11])
        x3 = self.global_avg_pooling(x3)
        x3 = x3.squeeze()
        x4 = self.model(x[:, :, 11:12])
        x4 = self.global_avg_pooling(x4)
        x4 = x4.squeeze()
        z3 = torch.mean(torch.stack([x1, x2, x3, x4]), dim=0)
        
        x1 = self.model(x[:, :, 12:13])
        x1 = self.global_avg_pooling(x1)
        x1 = x1.squeeze()
        x2 = self.model(x[:, :, 13:14])
        x2 = self.global_avg_pooling(x2)
        x2 = x2.squeeze()
        x3 = self.model(x[:, :, 14:15])
        x3 = self.global_avg_pooling(x3)
        x3 = x3.squeeze()
        x4 = self.model(x[:, :, 15:16])
        x4 = self.global_avg_pooling(x4)
        x4 = x4.squeeze()
        z4 = torch.mean(torch.stack([x1, x2, x3, x4]), dim=0)
        
        x1 = self.model(x[:, :, 16:17])
        x1 = self.global_avg_pooling(x1)
        x1 = x1.squeeze()
        x2 = self.model(x[:, :, 17:18])
        x2 = self.global_avg_pooling(x2)
        x2 = x2.squeeze()
        z5 = torch.mean(torch.stack([x1, x2]), dim=0)
        
        y = torch.cat([z1, z2, z3, z4], dim=1)
        y = self.head(y)
        
        return y

model = CustomModel()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 98086


# <b><span style='color:#F1A424'>|</span> Scheduler</b><a class='anchor' id='scheduler'></a> [↑](#top) 

***

In [16]:
model = CustomModel()
fold = 3
# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the state_dict from the saved checkpoint
state_dict_path = f"modelsA6/wavenet_fold_{fold}_best.pth"
state_dict = torch.load(state_dict_path)['model']

# Remove "module." from the keys in the state_dict
new_state_dict = {key.replace("module.", ""): value for key, value in state_dict.items()}

# Load the modified state_dict into the model
model.load_state_dict(new_state_dict)

# Save the modified model state_dict back to the checkpoint
torch.save({'model': model.state_dict(),
            'predictions': torch.load(state_dict_path)['predictions'],
            'best_loss': torch.load(state_dict_path)['best_loss']},
           state_dict_path)

In [9]:
state_dict_path = f"modelsA8/wavenet_fold_1_best.pth"
state_dict = torch.load(state_dict_path)['model']

In [10]:
state_dict

OrderedDict([('module.model.model.0.convs.0.weight',
              tensor([[[-1.0659]],
              
                      [[-0.4965]],
              
                      [[ 0.3991]],
              
                      [[ 0.6779]],
              
                      [[-0.4878]],
              
                      [[ 0.7320]],
              
                      [[ 0.2448]],
              
                      [[ 0.0442]]], device='cuda:0')),
             ('module.model.model.0.convs.0.bias',
              tensor([-0.0284, -0.0069, -0.0057,  0.0006,  0.0031,  0.0127, -0.0197,  0.0041],
                     device='cuda:0')),
             ('module.model.model.0.convs.1.weight',
              tensor([[[ 0.0985],
                       [ 0.6906],
                       [-0.8392],
                       [-0.0186],
                       [-0.1998],
                       [ 0.7256],
                       [-0.3579],
                       [ 0.7902]],
              
               

In [None]:
from torch.optim.lr_scheduler import OneCycleLR

EPOCHS = config.EPOCHS
BATCHES = len(train_loader)
steps = []
lrs = []
optim_lrs = []
model = CustomModel()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = OneCycleLR(
    optimizer,
    max_lr=1e-3,
    epochs=config.EPOCHS,
    steps_per_epoch=len(train_loader),
    pct_start=0.1,
    anneal_strategy="cos",
    final_div_factor=100,
)
for epoch in range(EPOCHS):
    for batch in range(BATCHES):
        scheduler.step()
        lrs.append(scheduler.get_last_lr()[0])
        steps.append(epoch * BATCHES + batch)

max_lr = max(lrs)
min_lr = min(lrs)
print(f"Maximum LR: {max_lr} | Minimum LR: {min_lr}")
plt.figure()
plt.plot(steps, lrs, label='OneCycle')
plt.ticklabel_format(axis='y', style='sci', scilimits=(0,0))
plt.xlabel("Step")
plt.ylabel("Learning Rate")
plt.show()

# <b><span style='color:#F1A424'>|</span> Loss Function</b><a class='anchor' id='loss'></a> [↑](#top) 

***


In [None]:
import torch.nn.functional as F

# === Reduction = "mean" ===
criterion = nn.KLDivLoss(reduction="mean")
y_pred = F.log_softmax(torch.randn(6, 2, requires_grad=True), dim=1)
y_true = F.softmax(torch.rand(6, 2), dim=1)
print(f"Predictions: {y_pred}")
print(f"Targets: {y_true}")
output = criterion(y_pred, y_true)
print(f"Output: {output}")

print("\n", "="*100, "\n")

# === Reduction = "batchmean" ===
criterion = nn.KLDivLoss(reduction="batchmean")
y_pred = F.log_softmax(torch.randn(2, 6, requires_grad=True), dim=1)
y_true = F.softmax(torch.rand(2, 6), dim=1)
print(f"Predictions: {y_pred}")
print(f"Targets: {y_true}")
output = criterion(y_pred, y_true)
print(f"Output: {output}")

# <b><span style='color:#F1A424'>|</span> Train and Validation Functions</b><a class='anchor' id='functions'></a> [↑](#top) 

***

In [None]:
def train_epoch(train_loader, model, optimizer, epoch, scheduler, device):
    """One epoch training pass."""
    model.train()
    criterion = nn.KLDivLoss(reduction="batchmean")
    scaler = torch.cuda.amp.GradScaler(enabled=config.AMP)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    
    # ========== ITERATE OVER TRAIN BATCHES ============
    with tqdm(train_loader, unit="train_batch", desc='Train') as tqdm_train_loader:
        for step, batch in enumerate(tqdm_train_loader):
            X = batch.pop("X").to(device) # send inputs to `device`
            y = batch.pop("y").to(device) # send labels to `device`
            batch_size = y.size(0)
            with torch.cuda.amp.autocast(enabled=config.AMP):
                y_preds = model(X)
                loss = criterion(F.log_softmax(y_preds, dim=1), y)
            if config.GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / config.GRADIENT_ACCUMULATION_STEPS
            losses.update(loss.item(), batch_size)
            scaler.scale(loss).backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.MAX_GRAD_NORM)
            
            if (step + 1) % config.GRADIENT_ACCUMULATION_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                global_step += 1
                scheduler.step()
            end = time.time()

            # ========== LOG INFO ==========
            if step % config.PRINT_FREQ == 0 or step == (len(train_loader)-1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Elapsed {remain:s} '
                      'Loss: {loss.avg:.4f} '
                      'Grad: {grad_norm:.4f}  '
                      'LR: {lr:.8f}  '
                      .format(epoch+1, step, len(train_loader), 
                              remain=timeSince(start, float(step+1)/len(train_loader)),
                              loss=losses,
                              grad_norm=grad_norm,
                              lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_epoch(valid_loader, model, device):
    model.eval() 
    softmax = nn.Softmax(dim=1)
    losses = AverageMeter()
    prediction_dict = {}
    preds = []
    start = end = time.time()
    with tqdm(valid_loader, unit="valid_batch", desc='Validation') as tqdm_valid_loader:
        for step, batch in enumerate(tqdm_valid_loader):
            X = batch.pop("X").to(device) 
            y = batch.pop("y").to(device)
            batch_size = y.size(0)
            with torch.no_grad():
                y_preds = model(X)
                loss = criterion(F.log_softmax(y_preds, dim=1), y)
            if config.GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / config.GRADIENT_ACCUMULATION_STEPS
            losses.update(loss.item(), batch_size)
            y_preds = softmax(y_preds)
            preds.append(y_preds.to('cpu').numpy()) 
            end = time.time()

            # ========== LOG INFO ==========
            if step % config.PRINT_FREQ == 0 or step == (len(valid_loader)-1):
                print('EVAL: [{0}/{1}] '
                      'Elapsed {remain:s} '
                      'Loss: {loss.avg:.4f} '
                      .format(step, len(valid_loader),
                              remain=timeSince(start, float(step+1)/len(valid_loader)),
                              loss=losses))
                
    prediction_dict["predictions"] = np.concatenate(preds)
    return losses.avg, prediction_dict

# <b><span style='color:#F1A424'>|</span> Train Loop</b><a class='anchor' id='train_loop'></a> [↑](#top) 

***

In [None]:
def train_loop(df, fold):
    
    #LOGGER.info(f"========== Fold: {fold} training ==========")
    print((f"========== Fold: {fold} training =========="))

    # ======== SPLIT ==========
    train_folds = df[df['fold'] != fold].reset_index(drop=True)
    valid_folds = df[df['fold'] == fold].reset_index(drop=True)
    
    # ======== DATASETS ==========
    train_dataset = CustomDataset(train_folds, config, mode="train")
    valid_dataset = CustomDataset(valid_folds, config, mode="train")
    
    # ======== DATALOADERS ==========
    train_loader = DataLoader(train_dataset,
                              batch_size=config.BATCH_SIZE_TRAIN,
                              shuffle=True,
                              num_workers=config.NUM_WORKERS, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.BATCH_SIZE_VALID,
                              shuffle=False,
                              num_workers=config.NUM_WORKERS, pin_memory=True, drop_last=False)
    
    # ======== MODEL ==========
    model = CustomModel()
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.1, weight_decay=config.WEIGHT_DECAY)
    scheduler = OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=config.EPOCHS,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy="cos",
        final_div_factor=100,
    )

    # ======= LOSS ==========
    criterion = nn.KLDivLoss(reduction="batchmean")
    
    best_loss = np.inf
    # ====== ITERATE EPOCHS ========
    for epoch in range(config.EPOCHS):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_train_loss = train_epoch(train_loader, model, optimizer, epoch, scheduler, device)

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(valid_loader, model, device)
        predictions = prediction_dict["predictions"]
        
        # ======= SCORING ==========
        elapsed = time.time() - start_time

        #LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            #LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            print(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                         f"modelsA4/wavenet_fold_{fold}_best.pth")

    predictions = torch.load(f"modelsA4/wavenet_fold_{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[target_preds] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [None]:
def train_loop(df, fold, prevmodel = None, best_loss = np.inf):
    
    #LOGGER.info(f"========== Fold: {fold} training ==========")
    print((f"========== Fold: {fold} training =========="))

    # ======== SPLIT ==========
    train_folds = df[df['fold'] != fold].reset_index(drop=True)
    valid_folds = df[df['fold'] == fold].reset_index(drop=True)
    
    # ======== DATASETS ==========
    train_dataset = CustomDataset(train_folds, config, mode="train")
    valid_dataset = CustomDataset(valid_folds, config, mode="train")
    
    # ======== DATALOADERS ==========
    train_loader = DataLoader(train_dataset,
                              batch_size=config.BATCH_SIZE_TRAIN,
                              shuffle=True,
                              num_workers=config.NUM_WORKERS, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.BATCH_SIZE_VALID,
                              shuffle=False,
                              num_workers=config.NUM_WORKERS, pin_memory=True, drop_last=False)
    
    # ======== MODEL ==========
    model = CustomModel()

    # Move the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Load the saved state dictionary
    state_dict = torch.load(prevmodel)['model']
    #state_dict = {f"module.{key}": value for key, value in state_dict.items()}

    # Load the modified state dictionary into the model
    model = nn.DataParallel(model)  # Wrap the model with DataParallel
    model.load_state_dict(state_dict)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.1, weight_decay=config.WEIGHT_DECAY)
    scheduler = OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=config.EPOCHS,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy="cos",
        final_div_factor=100,
    )

    # ======= LOSS ==========
    criterion = nn.KLDivLoss(reduction="batchmean")
    
    # ====== ITERATE EPOCHS ========
    for epoch in range(config.EPOCHS):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_train_loss = train_epoch(train_loader, model, optimizer, epoch, scheduler, device)

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(valid_loader, model, device)
        predictions = prediction_dict["predictions"]
        
        # ======= SCORING ==========
        elapsed = time.time() - start_time

        #LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            #LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            print(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions,
                        'best_loss': best_loss},
                         f"modelsA8/wavenet_fold_{fold}_best.pth")

    predictions = torch.load(f"modelsA8/wavenet_fold_{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[target_preds] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

# <b><span style='color:#F1A424'>|</span> Train</b><a class='anchor' id='train'></a> [↑](#top) 

***

In [None]:
def get_result(oof_df):
    kl_loss = nn.KLDivLoss(reduction="batchmean")
    labels = torch.tensor(oof_df[label_cols].values)
    preds = torch.tensor(oof_df[target_preds].values)
    preds = F.log_softmax(preds, dim=1)
    result = kl_loss(preds, labels)
    return result



In [None]:
#use modelsA6 add to modelsA4
#use models2 and models6

In [None]:
if not config.TRAIN_FULL_DATA:
        oof_df = pd.DataFrame()
        for fold in range(config.FOLDS):
            if fold in [2,3,4]:
                _oof_df = train_loop(train_df, fold,f"modelsA6/wavenet_fold_{fold}_best.pth")#, torch.load(f"models/wavenet_fold_{fold}_best.pth")["best_loss"])
                oof_df = pd.concat([oof_df, _oof_df])
                print(f"========== Fold {fold} result: {get_result(_oof_df)} ==========")
                #LOGGER.info(f"========== Fold {fold} finished ==========")
        oof_df = oof_df.reset_index(drop=True)
        oof_df.to_csv('modelsA8/oof_df.csv', index=False)
else:
    train_loop_full_data(train_df)

In [28]:
        oof_df = oof_df.reset_index(drop=True)
        oof_df.to_csv('modelsA8/oof_df.csv', index=False)

In [50]:
!mkdir modelsA8

In [35]:
if not config.TRAIN_FULL_DATA:
        oof_df = pd.DataFrame()
        for fold in range(config.FOLDS):
            if fold in [0,1,2,3,4]:
                _oof_df = train_loop(train_df, fold,f"models/wavenet_fold_{fold}_best.pth")#, torch.load(f"models/wavenet_fold_{fold}_best.pth")["best_loss"])
                oof_df = pd.concat([oof_df, _oof_df])
                print(f"========== Fold {fold} result: {get_result(_oof_df)} ==========")
                #LOGGER.info(f"========== Fold {fold} finished ==========")
        oof_df = oof_df.reset_index(drop=True)
        oof_df.to_csv('models3/oof_df.csv', index=False)
else:
    train_loop_full_data(train_df)



Train:   2%|▏         | 1/60 [00:02<01:59,  2.03s/train_batch]

Epoch: [1][0/60] Elapsed 0m 2s (remain 1m 59s) Loss: 1.5122 Grad: inf  LR: 0.00004017  


Train:  85%|████████▌ | 51/60 [00:23<00:03,  2.35train_batch/s]

Epoch: [1][50/60] Elapsed 0m 23s (remain 0m 4s) Loss: 1.6079 Grad: 117629.7109  LR: 0.00041319  


Train: 100%|██████████| 60/60 [00:27<00:00,  2.16train_batch/s]


Epoch: [1][59/60] Elapsed 0m 27s (remain 0m 0s) Loss: 1.5455 Grad: 130999.7266  LR: 0.00052634  


Validation:  10%|▉         | 2/21 [00:01<00:08,  2.24valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 16s) Loss: 1.7033 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.99valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 1.2556 
Epoch 1 - avg_train_loss: 1.5455  avg_val_loss: 1.2556  time: 33s
Epoch 1 - Save Best Loss: 1.2556 Model


Train:   2%|▏         | 1/60 [00:00<00:40,  1.45train_batch/s]

Epoch: [2][0/60] Elapsed 0m 0s (remain 0m 40s) Loss: 1.0144 Grad: 400968.0938  LR: 0.00053900  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.56train_batch/s]

Epoch: [2][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.8021 Grad: 148870.7344  LR: 0.00098933  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.55train_batch/s]


Epoch: [2][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.7865 Grad: 174234.2812  LR: 0.00100000  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.04valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 1.0254 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.10valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7939 
Epoch 2 - avg_train_loss: 0.7865  avg_val_loss: 0.7939  time: 29s
Epoch 2 - Save Best Loss: 0.7939 Model


Train:   2%|▏         | 1/60 [00:00<00:42,  1.40train_batch/s]

Epoch: [3][0/60] Elapsed 0m 0s (remain 0m 42s) Loss: 0.5495 Grad: 281016.9375  LR: 0.00099999  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.27train_batch/s]

Epoch: [3][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.5076 Grad: 67053.5000  LR: 0.00099429  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.31train_batch/s]


Epoch: [3][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.5072 Grad: 68906.7656  LR: 0.00099215  


Validation:   5%|▍         | 1/21 [00:00<00:13,  1.50valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.7225 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.75valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.6127 
Epoch 3 - avg_train_loss: 0.5072  avg_val_loss: 0.6127  time: 32s
Epoch 3 - Save Best Loss: 0.6127 Model


Train:   2%|▏         | 1/60 [00:00<00:33,  1.75train_batch/s]

Epoch: [4][0/60] Elapsed 0m 0s (remain 0m 33s) Loss: 0.3407 Grad: 218836.1719  LR: 0.00099189  


Train:  85%|████████▌ | 51/60 [00:18<00:03,  2.33train_batch/s]

Epoch: [4][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.3809 Grad: 252542.5469  LR: 0.00097371  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.61train_batch/s]


Epoch: [4][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3782 Grad: 243546.9219  LR: 0.00096936  


Validation:  10%|▉         | 2/21 [00:00<00:06,  2.87valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.6060 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.86valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.6231 
Epoch 4 - avg_train_loss: 0.3782  avg_val_loss: 0.6231  time: 28s


Train:   2%|▏         | 1/60 [00:00<00:40,  1.46train_batch/s]

Epoch: [5][0/60] Elapsed 0m 0s (remain 0m 40s) Loss: 0.2475 Grad: 171739.5000  LR: 0.00096886  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.26train_batch/s]

Epoch: [5][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.2953 Grad: 110822.6250  LR: 0.00093874  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.26train_batch/s]


Epoch: [5][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.2920 Grad: 105272.7891  LR: 0.00093231  


Validation:   5%|▍         | 1/21 [00:00<00:10,  1.88valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.5925 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.42valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5042 
Epoch 5 - avg_train_loss: 0.2920  avg_val_loss: 0.5042  time: 31s
Epoch 5 - Save Best Loss: 0.5042 Model


Train:   2%|▏         | 1/60 [00:00<00:45,  1.29train_batch/s]

Epoch: [6][0/60] Elapsed 0m 0s (remain 0m 45s) Loss: 0.2062 Grad: 152835.9062  LR: 0.00093158  


Train:  85%|████████▌ | 51/60 [00:19<00:04,  2.25train_batch/s]

Epoch: [6][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.2384 Grad: 48321.6914  LR: 0.00089044  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.55train_batch/s]


Epoch: [6][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2422 Grad: 58963.5273  LR: 0.00088213  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.35valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.5086 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.03valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.4254 
Epoch 6 - avg_train_loss: 0.2422  avg_val_loss: 0.4254  time: 29s
Epoch 6 - Save Best Loss: 0.4254 Model


Train:   2%|▏         | 1/60 [00:00<00:51,  1.14train_batch/s]

Epoch: [7][0/60] Elapsed 0m 0s (remain 0m 52s) Loss: 0.1849 Grad: 154016.2656  LR: 0.00088119  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.27train_batch/s]

Epoch: [7][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.1878 Grad: 232370.2031  LR: 0.00083029  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.25train_batch/s]


Epoch: [7][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1914 Grad: 109137.7969  LR: 0.00082035  


Validation:  10%|▉         | 2/21 [00:00<00:04,  4.11valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 8s) Loss: 0.5661 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.80valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5044 
Epoch 7 - avg_train_loss: 0.1914  avg_val_loss: 0.5044  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:33,  1.77train_batch/s]

Epoch: [8][0/60] Elapsed 0m 0s (remain 0m 33s) Loss: 0.1613 Grad: 153219.1875  LR: 0.00081923  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.26train_batch/s]

Epoch: [8][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.1651 Grad: 87827.7734  LR: 0.00076010  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.38train_batch/s]

Epoch: [8][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1683 Grad: 126108.2656  LR: 0.00074884  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.41train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:11,  1.71valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4101 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.87valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.4089 
Epoch 8 - avg_train_loss: 0.1683  avg_val_loss: 0.4089  time: 30s
Epoch 8 - Save Best Loss: 0.4089 Model


Train:   2%|▏         | 1/60 [00:00<00:56,  1.05train_batch/s]

Epoch: [9][0/60] Elapsed 0m 0s (remain 0m 56s) Loss: 0.1122 Grad: 122829.0078  LR: 0.00074758  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.28train_batch/s]

Epoch: [9][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.1447 Grad: 28782.8477  LR: 0.00068202  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.31train_batch/s]


Epoch: [9][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1449 Grad: 44073.0391  LR: 0.00066977  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.33valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.3988 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.20valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3941 
Epoch 9 - avg_train_loss: 0.1449  avg_val_loss: 0.3941  time: 31s
Epoch 9 - Save Best Loss: 0.3941 Model


Train:   2%|▏         | 1/60 [00:00<00:39,  1.48train_batch/s]

Epoch: [10][0/60] Elapsed 0m 0s (remain 0m 39s) Loss: 0.1363 Grad: inf  LR: 0.00066841  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.49train_batch/s]

Epoch: [10][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.1283 Grad: 33455.4219  LR: 0.00059842  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.33train_batch/s]

Epoch: [10][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1293 Grad: 43291.6758  LR: 0.00058556  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.51train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:14,  1.35valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3587 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.54valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3816 
Epoch 10 - avg_train_loss: 0.1293  avg_val_loss: 0.3816  time: 30s
Epoch 10 - Save Best Loss: 0.3816 Model


Train:   2%|▏         | 1/60 [00:01<01:01,  1.04s/train_batch]

Epoch: [11][0/60] Elapsed 0m 1s (remain 1m 0s) Loss: 0.0974 Grad: 121799.6562  LR: 0.00058412  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.52train_batch/s]

Epoch: [11][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.1084 Grad: 69488.0000  LR: 0.00051183  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.38train_batch/s]


Epoch: [11][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1082 Grad: 70890.5781  LR: 0.00049875  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.46valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.4025 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.32valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3781 
Epoch 11 - avg_train_loss: 0.1082  avg_val_loss: 0.3781  time: 30s
Epoch 11 - Save Best Loss: 0.3781 Model


Train:   2%|▏         | 1/60 [00:00<00:44,  1.32train_batch/s]

Epoch: [12][0/60] Elapsed 0m 0s (remain 0m 44s) Loss: 0.0685 Grad: 99135.3828  LR: 0.00049729  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.30train_batch/s]

Epoch: [12][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.1010 Grad: 59115.7031  LR: 0.00042489  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.29train_batch/s]


Epoch: [12][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1066 Grad: 50604.8398  LR: 0.00041198  


Validation:   5%|▍         | 1/21 [00:00<00:10,  1.84valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.3506 


Validation:  95%|█████████▌| 20/21 [00:04<00:00,  3.95valid_batch/s]

EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3889 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.07valid_batch/s]


Epoch 12 - avg_train_loss: 0.1066  avg_val_loss: 0.3889  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:56,  1.05train_batch/s]

Epoch: [13][0/60] Elapsed 0m 0s (remain 0m 56s) Loss: 0.1074 Grad: 163999.0781  LR: 0.00041055  


Train:  85%|████████▌ | 51/60 [00:20<00:02,  3.01train_batch/s]

Epoch: [13][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.0876 Grad: 97884.6016  LR: 0.00034023  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.57train_batch/s]


Epoch: [13][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0928 Grad: 74956.7500  LR: 0.00032789  


Validation:  10%|▉         | 2/21 [00:00<00:06,  2.85valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3789 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.30valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3726 
Epoch 13 - avg_train_loss: 0.0928  avg_val_loss: 0.3726  time: 28s
Epoch 13 - Save Best Loss: 0.3726 Model


Train:   2%|▏         | 1/60 [00:00<00:43,  1.34train_batch/s]

Epoch: [14][0/60] Elapsed 0m 0s (remain 0m 44s) Loss: 0.0873 Grad: 115333.5234  LR: 0.00032653  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.27train_batch/s]

Epoch: [14][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.0856 Grad: 35631.2461  LR: 0.00026044  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.38train_batch/s]

Epoch: [14][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0854 Grad: 69150.3750  LR: 0.00024904  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.24train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:14,  1.34valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3556 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.67valid_batch/s]

EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3712 
Epoch 14 - avg_train_loss: 0.0854  avg_val_loss: 0.3712  time: 33s
Epoch 14 - Save Best Loss: 0.3712 Model



Train:   2%|▏         | 1/60 [00:00<00:47,  1.23train_batch/s]

Epoch: [15][0/60] Elapsed 0m 0s (remain 0m 47s) Loss: 0.0808 Grad: 116969.0625  LR: 0.00024779  


Train:  85%|████████▌ | 51/60 [00:18<00:03,  2.75train_batch/s]

Epoch: [15][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.0792 Grad: 53072.3203  LR: 0.00018793  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.67train_batch/s]


Epoch: [15][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0783 Grad: 65277.6445  LR: 0.00017782  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.12valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3284 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.31valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3562 
Epoch 15 - avg_train_loss: 0.0783  avg_val_loss: 0.3562  time: 27s
Epoch 15 - Save Best Loss: 0.3562 Model


Train:   2%|▏         | 1/60 [00:00<00:49,  1.19train_batch/s]

Epoch: [16][0/60] Elapsed 0m 0s (remain 0m 49s) Loss: 0.0784 Grad: 95575.9844  LR: 0.00017671  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.26train_batch/s]

Epoch: [16][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.0737 Grad: 49185.1172  LR: 0.00012491  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.34train_batch/s]


Epoch: [16][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0730 Grad: 41080.1289  LR: 0.00011640  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.27valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.3493 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.24valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3569 
Epoch 16 - avg_train_loss: 0.0730  avg_val_loss: 0.3569  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:40,  1.45train_batch/s]

Epoch: [17][0/60] Elapsed 0m 0s (remain 0m 40s) Loss: 0.0534 Grad: 74454.7266  LR: 0.00011547  


Train:  85%|████████▌ | 51/60 [00:17<00:03,  2.66train_batch/s]

Epoch: [17][50/60] Elapsed 0m 17s (remain 0m 3s) Loss: 0.0668 Grad: 41356.1406  LR: 0.00007329  


Train: 100%|██████████| 60/60 [00:21<00:00,  2.75train_batch/s]


Epoch: [17][59/60] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0680 Grad: 39264.7305  LR: 0.00006664  


Validation:  10%|▉         | 2/21 [00:00<00:06,  2.99valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3491 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.06valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3520 
Epoch 17 - avg_train_loss: 0.0680  avg_val_loss: 0.3520  time: 27s
Epoch 17 - Save Best Loss: 0.3520 Model


Train:   2%|▏         | 1/60 [00:00<00:53,  1.10train_batch/s]

Epoch: [18][0/60] Elapsed 0m 0s (remain 0m 53s) Loss: 0.0665 Grad: 83691.2734  LR: 0.00006591  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.27train_batch/s]

Epoch: [18][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0663 Grad: 36600.7461  LR: 0.00003465  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.27train_batch/s]


Epoch: [18][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0664 Grad: 68343.9219  LR: 0.00003005  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.30valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.3517 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.38valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3579 
Epoch 18 - avg_train_loss: 0.0664  avg_val_loss: 0.3579  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:56,  1.05train_batch/s]

Epoch: [19][0/60] Elapsed 0m 0s (remain 0m 56s) Loss: 0.0867 Grad: 126973.8281  LR: 0.00002956  


Train:  85%|████████▌ | 51/60 [00:18<00:03,  2.39train_batch/s]

Epoch: [19][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.0681 Grad: 39553.9805  LR: 0.00001015  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.73train_batch/s]


Epoch: [19][59/60] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0674 Grad: 50190.0430  LR: 0.00000774  


Validation:   5%|▍         | 1/21 [00:00<00:12,  1.55valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3328 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.14valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3580 
Epoch 19 - avg_train_loss: 0.0674  avg_val_loss: 0.3580  time: 27s


Train:   2%|▏         | 1/60 [00:00<00:48,  1.21train_batch/s]

Epoch: [20][0/60] Elapsed 0m 0s (remain 0m 48s) Loss: 0.0548 Grad: 95843.2969  LR: 0.00000750  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.42train_batch/s]

Epoch: [20][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0633 Grad: 29029.7207  LR: 0.00000054  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.28train_batch/s]


Epoch: [20][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0631 Grad: 32570.4082  LR: 0.00000040  


Validation:  10%|▉         | 2/21 [00:00<00:07,  2.70valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3415 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.33valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3551 
Epoch 20 - avg_train_loss: 0.0631  avg_val_loss: 0.3551  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:51,  1.14train_batch/s]

Epoch: [1][0/60] Elapsed 0m 0s (remain 0m 51s) Loss: 2.1052 Grad: inf  LR: 0.00004017  


Train:  85%|████████▌ | 51/60 [00:18<00:03,  2.39train_batch/s]

Epoch: [1][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 1.5331 Grad: 178632.1719  LR: 0.00041319  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.65train_batch/s]


Epoch: [1][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 1.4892 Grad: 108679.0703  LR: 0.00052634  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.11valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 1.3771 


Validation:  95%|█████████▌| 20/21 [00:04<00:00,  3.97valid_batch/s]

EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 1.0660 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.13valid_batch/s]


Epoch 1 - avg_train_loss: 1.4892  avg_val_loss: 1.0660  time: 28s
Epoch 1 - Save Best Loss: 1.0660 Model


Train:   2%|▏         | 1/60 [00:00<00:47,  1.25train_batch/s]

Epoch: [2][0/60] Elapsed 0m 0s (remain 0m 47s) Loss: 1.3531 Grad: 545850.0625  LR: 0.00053900  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.41train_batch/s]

Epoch: [2][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.7769 Grad: 138973.6875  LR: 0.00098933  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.38train_batch/s]


Epoch: [2][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.7599 Grad: 134304.2969  LR: 0.00100000  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.41valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.8994 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.28valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6647 
Epoch 2 - avg_train_loss: 0.7599  avg_val_loss: 0.6647  time: 30s
Epoch 2 - Save Best Loss: 0.6647 Model


Train:   2%|▏         | 1/60 [00:00<00:40,  1.45train_batch/s]

Epoch: [3][0/60] Elapsed 0m 0s (remain 0m 40s) Loss: 0.5129 Grad: 246459.7031  LR: 0.00099999  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.30train_batch/s]

Epoch: [3][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.5046 Grad: 260779.9219  LR: 0.00099429  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.57train_batch/s]


Epoch: [3][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5001 Grad: 272891.2812  LR: 0.00099215  


Validation:  10%|▉         | 2/21 [00:00<00:08,  2.32valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 15s) Loss: 0.8356 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.67valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.5725 
Epoch 3 - avg_train_loss: 0.5001  avg_val_loss: 0.5725  time: 29s
Epoch 3 - Save Best Loss: 0.5725 Model


Train:   2%|▏         | 1/60 [00:00<00:46,  1.26train_batch/s]

Epoch: [4][0/60] Elapsed 0m 0s (remain 0m 46s) Loss: 0.3175 Grad: 211468.6719  LR: 0.00099189  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.27train_batch/s]

Epoch: [4][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.3730 Grad: 143785.6875  LR: 0.00097371  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.26train_batch/s]


Epoch: [4][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.3764 Grad: 115532.1328  LR: 0.00096936  


Validation:   5%|▍         | 1/21 [00:00<00:13,  1.51valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.6728 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.66valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4985 
Epoch 4 - avg_train_loss: 0.3764  avg_val_loss: 0.4985  time: 31s
Epoch 4 - Save Best Loss: 0.4985 Model


Train:   2%|▏         | 1/60 [00:00<00:35,  1.68train_batch/s]

Epoch: [5][0/60] Elapsed 0m 0s (remain 0m 35s) Loss: 0.3530 Grad: 319743.0000  LR: 0.00096886  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.27train_batch/s]

Epoch: [5][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.3126 Grad: 139270.5156  LR: 0.00093874  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.37train_batch/s]

Epoch: [5][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.3121 Grad: 117272.7500  LR: 0.00093231  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.39train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:14,  1.35valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 14s) Loss: 0.6499 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.05valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.4853 
Epoch 5 - avg_train_loss: 0.3121  avg_val_loss: 0.4853  time: 30s
Epoch 5 - Save Best Loss: 0.4853 Model


Train:   2%|▏         | 1/60 [00:00<00:57,  1.03train_batch/s]

Epoch: [6][0/60] Elapsed 0m 0s (remain 0m 57s) Loss: 0.2187 Grad: 190001.0312  LR: 0.00093158  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.27train_batch/s]

Epoch: [6][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.2642 Grad: 87396.1328  LR: 0.00089044  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.37train_batch/s]


Epoch: [6][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.2635 Grad: 104589.1562  LR: 0.00088213  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.35valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5806 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.34valid_batch/s]

EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4686 
Epoch 6 - avg_train_loss: 0.2635  avg_val_loss: 0.4686  time: 30s
Epoch 6 - Save Best Loss: 0.4686 Model



Train:   2%|▏         | 1/60 [00:00<00:48,  1.22train_batch/s]

Epoch: [7][0/60] Elapsed 0m 0s (remain 0m 48s) Loss: 0.1579 Grad: 144523.4688  LR: 0.00088119  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.32train_batch/s]

Epoch: [7][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.2163 Grad: 59950.6094  LR: 0.00083029  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.46train_batch/s]


Epoch: [7][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2198 Grad: 125179.5781  LR: 0.00082035  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.03valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5015 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.19valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4070 
Epoch 7 - avg_train_loss: 0.2198  avg_val_loss: 0.4070  time: 29s
Epoch 7 - Save Best Loss: 0.4070 Model


Train:   2%|▏         | 1/60 [00:00<00:49,  1.18train_batch/s]

Epoch: [8][0/60] Elapsed 0m 0s (remain 0m 49s) Loss: 0.1496 Grad: 113483.0625  LR: 0.00081923  


Train:  85%|████████▌ | 51/60 [00:22<00:02,  3.34train_batch/s]

Epoch: [8][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.1832 Grad: 58300.9531  LR: 0.00076010  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.79train_batch/s]

Epoch: [8][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1820 Grad: 70488.8750  LR: 0.00074884  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.37train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:06,  3.03valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5214 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.61valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3788 
Epoch 8 - avg_train_loss: 0.1820  avg_val_loss: 0.3788  time: 30s
Epoch 8 - Save Best Loss: 0.3788 Model


Train:   2%|▏         | 1/60 [00:00<00:36,  1.60train_batch/s]

Epoch: [9][0/60] Elapsed 0m 0s (remain 0m 37s) Loss: 0.1119 Grad: 118590.1797  LR: 0.00074758  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.26train_batch/s]

Epoch: [9][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.1552 Grad: 158851.0781  LR: 0.00068202  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.30train_batch/s]


Epoch: [9][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1570 Grad: 199582.8125  LR: 0.00066977  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.18valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.4674 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.16valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4080 
Epoch 9 - avg_train_loss: 0.1570  avg_val_loss: 0.4080  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:40,  1.44train_batch/s]

Epoch: [10][0/60] Elapsed 0m 0s (remain 0m 41s) Loss: 0.1526 Grad: 153952.8125  LR: 0.00066841  


Train:  85%|████████▌ | 51/60 [00:20<00:02,  3.35train_batch/s]

Epoch: [10][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.1361 Grad: 249383.9688  LR: 0.00059842  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.69train_batch/s]

Epoch: [10][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1396 Grad: 201443.6094  LR: 0.00058556  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.47train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:11,  1.78valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5145 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.34valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3417 
Epoch 10 - avg_train_loss: 0.1396  avg_val_loss: 0.3417  time: 29s
Epoch 10 - Save Best Loss: 0.3417 Model


Train:   2%|▏         | 1/60 [00:00<00:48,  1.22train_batch/s]

Epoch: [11][0/60] Elapsed 0m 0s (remain 0m 47s) Loss: 0.1342 Grad: 135554.6719  LR: 0.00058412  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.31train_batch/s]

Epoch: [11][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.1258 Grad: 59788.6328  LR: 0.00051183  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.33train_batch/s]

Epoch: [11][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1256 Grad: 66773.7734  LR: 0.00049875  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.34train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:13,  1.49valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5298 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.86valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3912 
Epoch 11 - avg_train_loss: 0.1256  avg_val_loss: 0.3912  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:55,  1.06train_batch/s]

Epoch: [12][0/60] Elapsed 0m 0s (remain 0m 54s) Loss: 0.1195 Grad: 141191.0000  LR: 0.00049729  


Train:  85%|████████▌ | 51/60 [00:20<00:02,  3.15train_batch/s]

Epoch: [12][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.1090 Grad: 126268.4297  LR: 0.00042489  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.55train_batch/s]


Epoch: [12][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1103 Grad: 102990.3281  LR: 0.00041198  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.42valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.4397 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.16valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3514 
Epoch 12 - avg_train_loss: 0.1103  avg_val_loss: 0.3514  time: 29s


Train:   2%|▏         | 1/60 [00:00<00:44,  1.32train_batch/s]

Epoch: [13][0/60] Elapsed 0m 0s (remain 0m 44s) Loss: 0.0934 Grad: 173295.0469  LR: 0.00041055  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.26train_batch/s]

Epoch: [13][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.0977 Grad: 95575.3594  LR: 0.00034023  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.30train_batch/s]


Epoch: [13][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0992 Grad: 126409.0781  LR: 0.00032789  


Validation:   5%|▍         | 1/21 [00:00<00:15,  1.27valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 15s) Loss: 0.4309 


Validation: 100%|██████████| 21/21 [00:06<00:00,  3.45valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3150 
Epoch 13 - avg_train_loss: 0.0992  avg_val_loss: 0.3150  time: 32s
Epoch 13 - Save Best Loss: 0.3150 Model


Train:   2%|▏         | 1/60 [00:00<00:41,  1.41train_batch/s]

Epoch: [14][0/60] Elapsed 0m 0s (remain 0m 42s) Loss: 0.1067 Grad: 134812.9844  LR: 0.00032653  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.53train_batch/s]

Epoch: [14][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.0871 Grad: 79609.2969  LR: 0.00026044  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.68train_batch/s]


Epoch: [14][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0874 Grad: 106260.7578  LR: 0.00024904  


Validation:   5%|▍         | 1/21 [00:00<00:11,  1.74valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4567 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.68valid_batch/s]

EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3042 


Validation: 100%|██████████| 21/21 [00:06<00:00,  3.48valid_batch/s]


Epoch 14 - avg_train_loss: 0.0874  avg_val_loss: 0.3042  time: 28s
Epoch 14 - Save Best Loss: 0.3042 Model


Train:   2%|▏         | 1/60 [00:00<00:56,  1.05train_batch/s]

Epoch: [15][0/60] Elapsed 0m 0s (remain 0m 56s) Loss: 0.0871 Grad: 128234.3594  LR: 0.00024779  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.27train_batch/s]

Epoch: [15][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.0806 Grad: 170729.5469  LR: 0.00018793  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.24train_batch/s]


Epoch: [15][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0807 Grad: 102269.3984  LR: 0.00017782  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.21valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.4270 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.17valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3122 
Epoch 15 - avg_train_loss: 0.0807  avg_val_loss: 0.3122  time: 32s


Train:   2%|▏         | 1/60 [00:00<00:41,  1.41train_batch/s]

Epoch: [16][0/60] Elapsed 0m 0s (remain 0m 41s) Loss: 0.0911 Grad: 116937.2500  LR: 0.00017671  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.32train_batch/s]

Epoch: [16][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.0773 Grad: 87406.4375  LR: 0.00012491  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.36train_batch/s]

Epoch: [16][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0772 Grad: 108112.6484  LR: 0.00011640  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.58train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:11,  1.70valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4429 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.00valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3137 
Epoch 16 - avg_train_loss: 0.0772  avg_val_loss: 0.3137  time: 29s


Train:   2%|▏         | 1/60 [00:00<00:41,  1.44train_batch/s]

Epoch: [17][0/60] Elapsed 0m 0s (remain 0m 41s) Loss: 0.0975 Grad: 182208.8281  LR: 0.00011547  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.28train_batch/s]

Epoch: [17][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0726 Grad: 81330.2266  LR: 0.00007329  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.27train_batch/s]


Epoch: [17][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0712 Grad: 78932.2891  LR: 0.00006664  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.27valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.4664 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.14valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3106 
Epoch 17 - avg_train_loss: 0.0712  avg_val_loss: 0.3106  time: 32s


Train:   2%|▏         | 1/60 [00:00<00:32,  1.80train_batch/s]

Epoch: [18][0/60] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0828 Grad: 98203.3203  LR: 0.00006591  


Train:  85%|████████▌ | 51/60 [00:17<00:03,  2.56train_batch/s]

Epoch: [18][50/60] Elapsed 0m 17s (remain 0m 3s) Loss: 0.0725 Grad: 85499.0625  LR: 0.00003465  


Train: 100%|██████████| 60/60 [00:21<00:00,  2.31train_batch/s]

Epoch: [18][59/60] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0713 Grad: 83078.0703  LR: 0.00003005  


Train: 100%|██████████| 60/60 [00:21<00:00,  2.74train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:07,  2.68valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.4592 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.57valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3075 
Epoch 18 - avg_train_loss: 0.0713  avg_val_loss: 0.3075  time: 26s


Train:   2%|▏         | 1/60 [00:00<00:49,  1.19train_batch/s]

Epoch: [19][0/60] Elapsed 0m 0s (remain 0m 49s) Loss: 0.1039 Grad: 133913.9688  LR: 0.00002956  


Train:  85%|████████▌ | 51/60 [00:22<00:04,  2.24train_batch/s]

Epoch: [19][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0677 Grad: 81598.9922  LR: 0.00001015  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.27train_batch/s]


Epoch: [19][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0681 Grad: 79317.6641  LR: 0.00000774  


Validation:  10%|▉         | 2/21 [00:00<00:07,  2.53valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4553 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.83valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3088 
Epoch 19 - avg_train_loss: 0.0681  avg_val_loss: 0.3088  time: 32s


Train:   2%|▏         | 1/60 [00:00<00:30,  1.95train_batch/s]

Epoch: [20][0/60] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0579 Grad: 65994.8906  LR: 0.00000750  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.28train_batch/s]

Epoch: [20][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.0660 Grad: 75594.7031  LR: 0.00000054  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.29train_batch/s]

Epoch: [20][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0671 Grad: 126739.6953  LR: 0.00000040  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.50train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:05,  3.36valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4515 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.29valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3067 
Epoch 20 - avg_train_loss: 0.0671  avg_val_loss: 0.3067  time: 29s


Train:   2%|▏         | 1/60 [00:01<00:59,  1.01s/train_batch]

Epoch: [1][0/60] Elapsed 0m 1s (remain 0m 59s) Loss: 1.5732 Grad: inf  LR: 0.00004017  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.40train_batch/s]

Epoch: [1][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 1.4097 Grad: 119432.5156  LR: 0.00041319  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.43train_batch/s]

Epoch: [1][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 1.3495 Grad: 93407.1172  LR: 0.00052634  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.34train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:05,  3.42valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 1.3277 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.67valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.9869 
Epoch 1 - avg_train_loss: 1.3495  avg_val_loss: 0.9869  time: 30s
Epoch 1 - Save Best Loss: 0.9869 Model


Train:   2%|▏         | 1/60 [00:00<00:39,  1.51train_batch/s]

Epoch: [2][0/60] Elapsed 0m 0s (remain 0m 39s) Loss: 0.7419 Grad: 284651.3750  LR: 0.00053900  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.52train_batch/s]

Epoch: [2][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.7030 Grad: 116793.3125  LR: 0.00098933  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.48train_batch/s]


Epoch: [2][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.6953 Grad: 122982.5703  LR: 0.00100000  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.16valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.8025 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.01valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.7420 
Epoch 2 - avg_train_loss: 0.6953  avg_val_loss: 0.7420  time: 29s
Epoch 2 - Save Best Loss: 0.7420 Model


Train:   2%|▏         | 1/60 [00:01<00:59,  1.01s/train_batch]

Epoch: [3][0/60] Elapsed 0m 0s (remain 0m 58s) Loss: 0.5422 Grad: 282633.4062  LR: 0.00099999  


Train:  85%|████████▌ | 51/60 [00:23<00:04,  2.25train_batch/s]

Epoch: [3][50/60] Elapsed 0m 23s (remain 0m 4s) Loss: 0.4757 Grad: 95148.2344  LR: 0.00099429  


Train: 100%|██████████| 60/60 [00:25<00:00,  3.24train_batch/s]

Epoch: [3][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.4727 Grad: 119714.7266  LR: 0.00099215  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.31train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:05,  3.52valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.6589 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.47valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5611 
Epoch 3 - avg_train_loss: 0.4727  avg_val_loss: 0.5611  time: 31s
Epoch 3 - Save Best Loss: 0.5611 Model


Train:   2%|▏         | 1/60 [00:00<00:46,  1.28train_batch/s]

Epoch: [4][0/60] Elapsed 0m 0s (remain 0m 46s) Loss: 0.3654 Grad: 199593.8438  LR: 0.00099189  


Train:  85%|████████▌ | 51/60 [00:21<00:04,  2.23train_batch/s]

Epoch: [4][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.3607 Grad: 96624.3672  LR: 0.00097371  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.33train_batch/s]


Epoch: [4][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.3580 Grad: 107441.6484  LR: 0.00096936  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.13valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.7103 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.24valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5108 
Epoch 4 - avg_train_loss: 0.3580  avg_val_loss: 0.5108  time: 31s
Epoch 4 - Save Best Loss: 0.5108 Model


Train:   2%|▏         | 1/60 [00:00<00:42,  1.39train_batch/s]

Epoch: [5][0/60] Elapsed 0m 0s (remain 0m 42s) Loss: 0.2468 Grad: 150245.0938  LR: 0.00096886  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.82train_batch/s]

Epoch: [5][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.2879 Grad: 94852.2969  LR: 0.00093874  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.37train_batch/s]


Epoch: [5][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.2858 Grad: 105897.8828  LR: 0.00093231  


Validation:   5%|▍         | 1/21 [00:00<00:12,  1.66valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6242 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.40valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4359 
Epoch 5 - avg_train_loss: 0.2858  avg_val_loss: 0.4359  time: 30s
Epoch 5 - Save Best Loss: 0.4359 Model


Train:   2%|▏         | 1/60 [00:00<00:34,  1.73train_batch/s]

Epoch: [6][0/60] Elapsed 0m 0s (remain 0m 34s) Loss: 0.2379 Grad: 221563.0781  LR: 0.00093158  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.35train_batch/s]

Epoch: [6][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.2403 Grad: 110538.4688  LR: 0.00089044  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.38train_batch/s]

Epoch: [6][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.2429 Grad: 113199.2031  LR: 0.00088213  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.38train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:12,  1.61valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.5661 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.30valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3910 
Epoch 6 - avg_train_loss: 0.2429  avg_val_loss: 0.3910  time: 30s
Epoch 6 - Save Best Loss: 0.3910 Model


Train:   2%|▏         | 1/60 [00:01<00:59,  1.01s/train_batch]

Epoch: [7][0/60] Elapsed 0m 1s (remain 0m 59s) Loss: 0.2359 Grad: 140531.1250  LR: 0.00088119  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.72train_batch/s]

Epoch: [7][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.2101 Grad: 73533.2109  LR: 0.00083029  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.45train_batch/s]


Epoch: [7][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2150 Grad: 70413.8047  LR: 0.00082035  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.48valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.7151 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.75valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5256 
Epoch 7 - avg_train_loss: 0.2150  avg_val_loss: 0.5256  time: 29s


Train:   2%|▏         | 1/60 [00:00<00:55,  1.06train_batch/s]

Epoch: [8][0/60] Elapsed 0m 0s (remain 0m 55s) Loss: 0.1796 Grad: 233665.0469  LR: 0.00081923  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.30train_batch/s]

Epoch: [8][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.1806 Grad: 87123.4766  LR: 0.00076010  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.25train_batch/s]


Epoch: [8][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1785 Grad: 109955.7031  LR: 0.00074884  


Validation:   5%|▍         | 1/21 [00:00<00:13,  1.50valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5575 


Validation:  95%|█████████▌| 20/21 [00:05<00:00,  4.09valid_batch/s]

EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3772 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.71valid_batch/s]


Epoch 8 - avg_train_loss: 0.1785  avg_val_loss: 0.3772  time: 32s
Epoch 8 - Save Best Loss: 0.3772 Model


Train:   2%|▏         | 1/60 [00:00<00:45,  1.29train_batch/s]

Epoch: [9][0/60] Elapsed 0m 0s (remain 0m 45s) Loss: 0.1332 Grad: 118365.8984  LR: 0.00074758  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.95train_batch/s]

Epoch: [9][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.1589 Grad: 350058.7812  LR: 0.00068202  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.56train_batch/s]


Epoch: [9][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1623 Grad: 158260.2969  LR: 0.00066977  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.28valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4335 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.34valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3683 
Epoch 9 - avg_train_loss: 0.1623  avg_val_loss: 0.3683  time: 28s
Epoch 9 - Save Best Loss: 0.3683 Model


Train:   2%|▏         | 1/60 [00:01<00:59,  1.02s/train_batch]

Epoch: [10][0/60] Elapsed 0m 1s (remain 1m 0s) Loss: 0.1066 Grad: 136659.2031  LR: 0.00066841  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.26train_batch/s]

Epoch: [10][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.1297 Grad: 152071.1094  LR: 0.00059842  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.26train_batch/s]


Epoch: [10][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1308 Grad: 158558.6094  LR: 0.00058556  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.18valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4307 


Validation:  95%|█████████▌| 20/21 [00:05<00:00,  3.19valid_batch/s]

EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3452 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.77valid_batch/s]


Epoch 10 - avg_train_loss: 0.1308  avg_val_loss: 0.3452  time: 32s
Epoch 10 - Save Best Loss: 0.3452 Model


Train:   2%|▏         | 1/60 [00:01<01:01,  1.04s/train_batch]

Epoch: [11][0/60] Elapsed 0m 1s (remain 1m 1s) Loss: 0.1228 Grad: 167633.2031  LR: 0.00058412  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.51train_batch/s]

Epoch: [11][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.1218 Grad: 121584.5469  LR: 0.00051183  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.38train_batch/s]

Epoch: [11][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1203 Grad: 94051.7500  LR: 0.00049875  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.56train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:13,  1.45valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4005 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.03valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3007 
Epoch 11 - avg_train_loss: 0.1203  avg_val_loss: 0.3007  time: 29s
Epoch 11 - Save Best Loss: 0.3007 Model


Train:   2%|▏         | 1/60 [00:00<00:53,  1.10train_batch/s]

Epoch: [12][0/60] Elapsed 0m 0s (remain 0m 53s) Loss: 0.1410 Grad: 146035.4688  LR: 0.00049729  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.25train_batch/s]

Epoch: [12][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.1109 Grad: 136428.6250  LR: 0.00042489  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.22train_batch/s]


Epoch: [12][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1088 Grad: 162342.7500  LR: 0.00041198  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.24valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.3798 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.97valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.2990 
Epoch 12 - avg_train_loss: 0.1088  avg_val_loss: 0.2990  time: 32s
Epoch 12 - Save Best Loss: 0.2990 Model


Train:   2%|▏         | 1/60 [00:00<00:45,  1.30train_batch/s]

Epoch: [13][0/60] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0759 Grad: 88337.6328  LR: 0.00041055  


Train:  85%|████████▌ | 51/60 [00:17<00:03,  2.27train_batch/s]

Epoch: [13][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.0945 Grad: 129300.5000  LR: 0.00034023  


Train: 100%|██████████| 60/60 [00:21<00:00,  2.73train_batch/s]


Epoch: [13][59/60] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0952 Grad: 139730.7188  LR: 0.00032789  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.24valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.3916 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.14valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2783 
Epoch 13 - avg_train_loss: 0.0952  avg_val_loss: 0.2783  time: 27s
Epoch 13 - Save Best Loss: 0.2783 Model


Train:   2%|▏         | 1/60 [00:00<00:50,  1.16train_batch/s]

Epoch: [14][0/60] Elapsed 0m 0s (remain 0m 49s) Loss: 0.1216 Grad: 120955.8594  LR: 0.00032653  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.29train_batch/s]

Epoch: [14][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0860 Grad: 128054.0938  LR: 0.00026044  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.27train_batch/s]


Epoch: [14][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0866 Grad: 121532.9609  LR: 0.00024904  


Validation:   5%|▍         | 1/21 [00:00<00:12,  1.54valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3497 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.14valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2928 
Epoch 14 - avg_train_loss: 0.0866  avg_val_loss: 0.2928  time: 32s


Train:   2%|▏         | 1/60 [00:00<00:32,  1.80train_batch/s]

Epoch: [15][0/60] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0912 Grad: 117488.0547  LR: 0.00024779  


Train:  85%|████████▌ | 51/60 [00:18<00:03,  2.44train_batch/s]

Epoch: [15][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.0818 Grad: 125709.0156  LR: 0.00018793  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.43train_batch/s]

Epoch: [15][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0809 Grad: 84882.0781  LR: 0.00017782  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.66train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:12,  1.57valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3772 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.06valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.2830 
Epoch 15 - avg_train_loss: 0.0809  avg_val_loss: 0.2830  time: 28s


Train:   2%|▏         | 1/60 [00:00<00:41,  1.42train_batch/s]

Epoch: [16][0/60] Elapsed 0m 0s (remain 0m 41s) Loss: 0.0659 Grad: 103372.7969  LR: 0.00017671  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.31train_batch/s]

Epoch: [16][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0781 Grad: 92275.9609  LR: 0.00012491  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.25train_batch/s]


Epoch: [16][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0774 Grad: 84703.4531  LR: 0.00011640  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.31valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.3987 


Validation:  95%|█████████▌| 20/21 [00:04<00:00,  6.54valid_batch/s]

EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2780 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.75valid_batch/s]


Epoch 16 - avg_train_loss: 0.0774  avg_val_loss: 0.2780  time: 31s
Epoch 16 - Save Best Loss: 0.2780 Model


Train:   2%|▏         | 1/60 [00:00<00:42,  1.39train_batch/s]

Epoch: [17][0/60] Elapsed 0m 0s (remain 0m 42s) Loss: 0.0678 Grad: 80914.1484  LR: 0.00011547  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.32train_batch/s]

Epoch: [17][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.0711 Grad: 93355.3516  LR: 0.00007329  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.51train_batch/s]


Epoch: [17][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0710 Grad: 89134.0000  LR: 0.00006664  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.12valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3993 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.36valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2770 
Epoch 17 - avg_train_loss: 0.0710  avg_val_loss: 0.2770  time: 29s
Epoch 17 - Save Best Loss: 0.2770 Model


Train:   2%|▏         | 1/60 [00:00<00:43,  1.35train_batch/s]

Epoch: [18][0/60] Elapsed 0m 0s (remain 0m 43s) Loss: 0.0672 Grad: 120092.6328  LR: 0.00006591  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.38train_batch/s]

Epoch: [18][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.0704 Grad: 95691.2188  LR: 0.00003465  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.35train_batch/s]


Epoch: [18][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0707 Grad: 115403.1016  LR: 0.00003005  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.24valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.3941 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.92valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2757 
Epoch 18 - avg_train_loss: 0.0707  avg_val_loss: 0.2757  time: 30s
Epoch 18 - Save Best Loss: 0.2757 Model


Train:   2%|▏         | 1/60 [00:00<00:42,  1.39train_batch/s]

Epoch: [19][0/60] Elapsed 0m 0s (remain 0m 42s) Loss: 0.0609 Grad: 65716.8750  LR: 0.00002956  


Train:  85%|████████▌ | 51/60 [00:19<00:03,  2.26train_batch/s]

Epoch: [19][50/60] Elapsed 0m 19s (remain 0m 3s) Loss: 0.0688 Grad: 104394.3359  LR: 0.00001015  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.46train_batch/s]

Epoch: [19][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0679 Grad: 68484.1094  LR: 0.00000774  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.61train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:07,  2.63valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3936 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.78valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.2753 
Epoch 19 - avg_train_loss: 0.0679  avg_val_loss: 0.2753  time: 29s
Epoch 19 - Save Best Loss: 0.2753 Model


Train:   2%|▏         | 1/60 [00:00<00:44,  1.32train_batch/s]

Epoch: [20][0/60] Elapsed 0m 0s (remain 0m 44s) Loss: 0.0696 Grad: 78662.7578  LR: 0.00000750  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.26train_batch/s]

Epoch: [20][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.0690 Grad: 103088.0781  LR: 0.00000054  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.25train_batch/s]


Epoch: [20][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0674 Grad: 48751.6055  LR: 0.00000040  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.39valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 8s) Loss: 0.3922 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.22valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2765 
Epoch 20 - avg_train_loss: 0.0674  avg_val_loss: 0.2765  time: 32s


Train:   2%|▏         | 1/60 [00:00<00:45,  1.29train_batch/s]

Epoch: [1][0/60] Elapsed 0m 0s (remain 0m 45s) Loss: 1.6932 Grad: inf  LR: 0.00004017  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.29train_batch/s]

Epoch: [1][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 1.4854 Grad: 47009.3047  LR: 0.00041319  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.39train_batch/s]


Epoch: [1][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 1.4240 Grad: 43314.2500  LR: 0.00052634  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.19valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 1.2060 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.36valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 1.1562 
Epoch 1 - avg_train_loss: 1.4240  avg_val_loss: 1.1562  time: 30s
Epoch 1 - Save Best Loss: 1.1562 Model


Train:   2%|▏         | 1/60 [00:00<00:49,  1.20train_batch/s]

Epoch: [2][0/60] Elapsed 0m 0s (remain 0m 49s) Loss: 0.9227 Grad: 422249.3438  LR: 0.00053900  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.38train_batch/s]

Epoch: [2][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.7972 Grad: 112192.0234  LR: 0.00098933  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.55train_batch/s]

Epoch: [2][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.7896 Grad: 116954.5312  LR: 0.00100000  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.37train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:05,  3.30valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.6540 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.23valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7511 
Epoch 2 - avg_train_loss: 0.7896  avg_val_loss: 0.7511  time: 30s
Epoch 2 - Save Best Loss: 0.7511 Model


Train:   2%|▏         | 1/60 [00:00<00:31,  1.88train_batch/s]

Epoch: [3][0/60] Elapsed 0m 0s (remain 0m 31s) Loss: 0.5661 Grad: 305806.6250  LR: 0.00099999  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.27train_batch/s]

Epoch: [3][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.5657 Grad: 85934.0312  LR: 0.00099429  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.43train_batch/s]

Epoch: [3][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.5590 Grad: 243317.1719  LR: 0.00099215  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.32train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:07,  2.65valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5164 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.78valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6186 
Epoch 3 - avg_train_loss: 0.5590  avg_val_loss: 0.6186  time: 30s
Epoch 3 - Save Best Loss: 0.6186 Model


Train:   2%|▏         | 1/60 [00:00<00:49,  1.20train_batch/s]

Epoch: [4][0/60] Elapsed 0m 0s (remain 0m 49s) Loss: 0.4353 Grad: 175788.3281  LR: 0.00099189  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.80train_batch/s]

Epoch: [4][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.4232 Grad: 165807.9844  LR: 0.00097371  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.78train_batch/s]

Epoch: [4][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4256 Grad: 116777.9766  LR: 0.00096936  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.50train_batch/s]
Validation:  10%|▉         | 2/21 [00:00<00:05,  3.44valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.7075 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.53valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7188 
Epoch 4 - avg_train_loss: 0.4256  avg_val_loss: 0.7188  time: 29s


Train:   2%|▏         | 1/60 [00:00<00:47,  1.23train_batch/s]

Epoch: [5][0/60] Elapsed 0m 0s (remain 0m 47s) Loss: 0.3979 Grad: 207570.3281  LR: 0.00096886  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.27train_batch/s]

Epoch: [5][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.3345 Grad: 115055.5391  LR: 0.00093874  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.29train_batch/s]


Epoch: [5][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.3344 Grad: 106072.7656  LR: 0.00093231  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.28valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.6710 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.88valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.5035 
Epoch 5 - avg_train_loss: 0.3344  avg_val_loss: 0.5035  time: 32s
Epoch 5 - Save Best Loss: 0.5035 Model


Train:   2%|▏         | 1/60 [00:00<00:51,  1.14train_batch/s]

Epoch: [6][0/60] Elapsed 0m 0s (remain 0m 51s) Loss: 0.2792 Grad: 183130.5938  LR: 0.00093158  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.47train_batch/s]

Epoch: [6][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.2669 Grad: 37663.3203  LR: 0.00089044  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.41train_batch/s]


Epoch: [6][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2675 Grad: 52628.8438  LR: 0.00088213  


Validation:  10%|▉         | 2/21 [00:00<00:04,  4.00valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 8s) Loss: 0.5760 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.27valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4235 
Epoch 6 - avg_train_loss: 0.2675  avg_val_loss: 0.4235  time: 30s
Epoch 6 - Save Best Loss: 0.4235 Model


Train:   2%|▏         | 1/60 [00:00<00:57,  1.03train_batch/s]

Epoch: [7][0/60] Elapsed 0m 0s (remain 0m 57s) Loss: 0.2225 Grad: 172980.9844  LR: 0.00088119  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.27train_batch/s]

Epoch: [7][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.2106 Grad: 23664.2109  LR: 0.00083029  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.32train_batch/s]


Epoch: [7][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.2179 Grad: 56858.4102  LR: 0.00082035  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.18valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.7247 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.29valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6289 
Epoch 7 - avg_train_loss: 0.2179  avg_val_loss: 0.6289  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:41,  1.41train_batch/s]

Epoch: [8][0/60] Elapsed 0m 0s (remain 0m 41s) Loss: 0.1414 Grad: 132202.5156  LR: 0.00081923  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.72train_batch/s]

Epoch: [8][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.2156 Grad: 46854.3945  LR: 0.00076010  


Train: 100%|██████████| 60/60 [00:23<00:00,  2.56train_batch/s]


Epoch: [8][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2135 Grad: 37990.9531  LR: 0.00074884  


Validation:  10%|▉         | 2/21 [00:00<00:06,  3.01valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.5355 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.15valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4375 
Epoch 8 - avg_train_loss: 0.2135  avg_val_loss: 0.4375  time: 29s


Train:   2%|▏         | 1/60 [00:00<00:44,  1.34train_batch/s]

Epoch: [9][0/60] Elapsed 0m 0s (remain 0m 44s) Loss: 0.1614 Grad: 150231.8125  LR: 0.00074758  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.25train_batch/s]

Epoch: [9][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.1687 Grad: 28950.4082  LR: 0.00068202  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.30train_batch/s]


Epoch: [9][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1655 Grad: 40636.6328  LR: 0.00066977  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.17valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4864 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.81valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3714 
Epoch 9 - avg_train_loss: 0.1655  avg_val_loss: 0.3714  time: 32s
Epoch 9 - Save Best Loss: 0.3714 Model


Train:   2%|▏         | 1/60 [00:00<00:50,  1.16train_batch/s]

Epoch: [10][0/60] Elapsed 0m 0s (remain 0m 50s) Loss: 0.0908 Grad: 70893.7422  LR: 0.00066841  


Train:  85%|████████▌ | 51/60 [00:17<00:02,  3.92train_batch/s]

Epoch: [10][50/60] Elapsed 0m 17s (remain 0m 3s) Loss: 0.1519 Grad: 52199.7891  LR: 0.00059842  


Train: 100%|██████████| 60/60 [00:21<00:00,  2.78train_batch/s]


Epoch: [10][59/60] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1537 Grad: 37796.1055  LR: 0.00058556  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.19valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4435 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.95valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3646 
Epoch 10 - avg_train_loss: 0.1537  avg_val_loss: 0.3646  time: 27s
Epoch 10 - Save Best Loss: 0.3646 Model


Train:   2%|▏         | 1/60 [00:00<00:49,  1.20train_batch/s]

Epoch: [11][0/60] Elapsed 0m 0s (remain 0m 49s) Loss: 0.1070 Grad: 136326.3125  LR: 0.00058412  


Train:  85%|████████▌ | 51/60 [00:21<00:03,  2.38train_batch/s]

Epoch: [11][50/60] Elapsed 0m 21s (remain 0m 3s) Loss: 0.1372 Grad: 24175.2363  LR: 0.00051183  


Train: 100%|██████████| 60/60 [00:25<00:00,  2.36train_batch/s]


Epoch: [11][59/60] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1378 Grad: 27968.7598  LR: 0.00049875  


Validation:   5%|▍         | 1/21 [00:00<00:12,  1.54valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4918 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.64valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3348 
Epoch 11 - avg_train_loss: 0.1378  avg_val_loss: 0.3348  time: 31s
Epoch 11 - Save Best Loss: 0.3348 Model


Train:   2%|▏         | 1/60 [00:00<00:54,  1.08train_batch/s]

Epoch: [12][0/60] Elapsed 0m 0s (remain 0m 55s) Loss: 0.1139 Grad: 113288.0625  LR: 0.00049729  


Train:  85%|████████▌ | 51/60 [00:18<00:03,  2.31train_batch/s]

Epoch: [12][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.1146 Grad: 25610.7695  LR: 0.00042489  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.69train_batch/s]


Epoch: [12][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1120 Grad: 22430.7266  LR: 0.00041198  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.27valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4219 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.28valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2894 
Epoch 12 - avg_train_loss: 0.1120  avg_val_loss: 0.2894  time: 27s
Epoch 12 - Save Best Loss: 0.2894 Model


Train:   2%|▏         | 1/60 [00:00<00:41,  1.44train_batch/s]

Epoch: [13][0/60] Elapsed 0m 0s (remain 0m 41s) Loss: 0.0819 Grad: 80409.1328  LR: 0.00041055  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.27train_batch/s]

Epoch: [13][50/60] Elapsed 0m 22s (remain 0m 4s) Loss: 0.1010 Grad: 27500.6777  LR: 0.00034023  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.24train_batch/s]


Epoch: [13][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0994 Grad: 27099.3438  LR: 0.00032789  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.23valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.4260 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.70valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.2970 
Epoch 13 - avg_train_loss: 0.0994  avg_val_loss: 0.2970  time: 32s


Train:   2%|▏         | 1/60 [00:00<00:43,  1.37train_batch/s]

Epoch: [14][0/60] Elapsed 0m 0s (remain 0m 43s) Loss: 0.0832 Grad: 102858.0938  LR: 0.00032653  


Train:  85%|████████▌ | 51/60 [00:18<00:04,  2.22train_batch/s]

Epoch: [14][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.0856 Grad: 38890.0430  LR: 0.00026044  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.71train_batch/s]


Epoch: [14][59/60] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0875 Grad: 28275.0020  LR: 0.00024904  


Validation:   5%|▍         | 1/21 [00:00<00:10,  1.91valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4281 


Validation: 100%|██████████| 21/21 [00:05<00:00,  3.75valid_batch/s]


EVAL: [20/21] Elapsed 0m 5s (remain 0m 0s) Loss: 0.2897 
Epoch 14 - avg_train_loss: 0.0875  avg_val_loss: 0.2897  time: 28s


Train:   2%|▏         | 1/60 [00:00<00:45,  1.30train_batch/s]

Epoch: [15][0/60] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0719 Grad: 84568.6484  LR: 0.00024779  


Train:  85%|████████▌ | 51/60 [00:22<00:04,  2.24train_batch/s]

Epoch: [15][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0806 Grad: 74858.0469  LR: 0.00018793  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.29train_batch/s]


Epoch: [15][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0794 Grad: 35510.6367  LR: 0.00017782  


Validation:   5%|▍         | 1/21 [00:00<00:11,  1.70valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3967 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.39valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2860 
Epoch 15 - avg_train_loss: 0.0794  avg_val_loss: 0.2860  time: 31s
Epoch 15 - Save Best Loss: 0.2860 Model


Train:   2%|▏         | 1/60 [00:00<00:33,  1.76train_batch/s]

Epoch: [16][0/60] Elapsed 0m 0s (remain 0m 33s) Loss: 0.0837 Grad: 96676.1172  LR: 0.00017671  


Train:  85%|████████▌ | 51/60 [00:18<00:03,  2.38train_batch/s]

Epoch: [16][50/60] Elapsed 0m 18s (remain 0m 3s) Loss: 0.0708 Grad: 41107.8086  LR: 0.00012491  


Train: 100%|██████████| 60/60 [00:22<00:00,  2.63train_batch/s]


Epoch: [16][59/60] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0706 Grad: 42670.0859  LR: 0.00011640  


Validation:  10%|▉         | 2/21 [00:00<00:06,  2.90valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4598 


Validation: 100%|██████████| 21/21 [00:05<00:00,  4.17valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2914 
Epoch 16 - avg_train_loss: 0.0706  avg_val_loss: 0.2914  time: 28s


Train:   2%|▏         | 1/60 [00:00<00:42,  1.38train_batch/s]

Epoch: [17][0/60] Elapsed 0m 0s (remain 0m 42s) Loss: 0.0754 Grad: 98619.7734  LR: 0.00011547  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.35train_batch/s]

Epoch: [17][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0709 Grad: 44523.4102  LR: 0.00007329  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.29train_batch/s]


Epoch: [17][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0718 Grad: 56597.2148  LR: 0.00006664  


Validation:  10%|▉         | 2/21 [00:00<00:05,  3.31valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 9s) Loss: 0.4235 


Validation:  95%|█████████▌| 20/21 [00:04<00:00,  5.27valid_batch/s]

EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2837 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.44valid_batch/s]


Epoch 17 - avg_train_loss: 0.0718  avg_val_loss: 0.2837  time: 31s
Epoch 17 - Save Best Loss: 0.2837 Model


Train:   2%|▏         | 1/60 [00:00<00:45,  1.31train_batch/s]

Epoch: [18][0/60] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0704 Grad: 94520.0000  LR: 0.00006591  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.28train_batch/s]

Epoch: [18][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.0687 Grad: 39339.9688  LR: 0.00003465  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.49train_batch/s]


Epoch: [18][59/60] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0683 Grad: 31201.1270  LR: 0.00003005  


Validation:   5%|▍         | 1/21 [00:00<00:10,  1.88valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4128 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.27valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2818 
Epoch 18 - avg_train_loss: 0.0683  avg_val_loss: 0.2818  time: 29s
Epoch 18 - Save Best Loss: 0.2818 Model


Train:   2%|▏         | 1/60 [00:00<00:55,  1.05train_batch/s]

Epoch: [19][0/60] Elapsed 0m 0s (remain 0m 55s) Loss: 0.0486 Grad: 63561.8086  LR: 0.00002956  


Train:  85%|████████▌ | 51/60 [00:22<00:03,  2.28train_batch/s]

Epoch: [19][50/60] Elapsed 0m 22s (remain 0m 3s) Loss: 0.0671 Grad: 45703.2852  LR: 0.00001015  


Train: 100%|██████████| 60/60 [00:26<00:00,  2.27train_batch/s]


Epoch: [19][59/60] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0671 Grad: 39844.0000  LR: 0.00000774  


Validation:  10%|▉         | 2/21 [00:00<00:04,  3.90valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 8s) Loss: 0.4069 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.89valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2824 
Epoch 19 - avg_train_loss: 0.0671  avg_val_loss: 0.2824  time: 31s


Train:   2%|▏         | 1/60 [00:00<00:46,  1.28train_batch/s]

Epoch: [20][0/60] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0788 Grad: 95822.4688  LR: 0.00000750  


Train:  85%|████████▌ | 51/60 [00:20<00:03,  2.41train_batch/s]

Epoch: [20][50/60] Elapsed 0m 20s (remain 0m 3s) Loss: 0.0660 Grad: 44557.6680  LR: 0.00000054  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.38train_batch/s]

Epoch: [20][59/60] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0660 Grad: 46149.2969  LR: 0.00000040  


Train: 100%|██████████| 60/60 [00:24<00:00,  2.46train_batch/s]
Validation:   5%|▍         | 1/21 [00:00<00:10,  1.86valid_batch/s]

EVAL: [0/21] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4175 


Validation: 100%|██████████| 21/21 [00:04<00:00,  4.30valid_batch/s]


EVAL: [20/21] Elapsed 0m 4s (remain 0m 0s) Loss: 0.2816 
Epoch 20 - avg_train_loss: 0.0660  avg_val_loss: 0.2816  time: 29s
Epoch 20 - Save Best Loss: 0.2816 Model


In [27]:
if not config.TRAIN_FULL_DATA:
        oof_df = pd.DataFrame()
        for fold in range(config.FOLDS):
            if fold in [3,4]:
                _oof_df = train_loop(train_df, fold,f"models3/wavenet_fold_{fold}_best.pth")#, torch.load(f"models/wavenet_fold_{fold}_best.pth")["best_loss"])
                oof_df = pd.concat([oof_df, _oof_df])
                print(f"========== Fold {fold} result: {get_result(_oof_df)} ==========")
                #LOGGER.info(f"========== Fold {fold} finished ==========")
        oof_df = oof_df.reset_index(drop=True)
        oof_df.to_csv('models5/oof_df.csv', index=False)
else:
    train_loop_full_data(train_df)



Train:   1%|          | 1/119 [00:13<27:03, 13.76s/train_batch]

Epoch: [1][0/119] Elapsed 0m 13s (remain 27m 3s) Loss: 2.1429 Grad: inf  LR: 0.00004004  


Train:  43%|████▎     | 51/119 [00:24<00:14,  4.67train_batch/s]

Epoch: [1][50/119] Elapsed 0m 24s (remain 0m 33s) Loss: 1.8200 Grad: 69308.2734  LR: 0.00014557  


Train:  85%|████████▍ | 101/119 [00:35<00:04,  4.44train_batch/s]

Epoch: [1][100/119] Elapsed 0m 35s (remain 0m 6s) Loss: 1.5107 Grad: 31066.9453  LR: 0.00040965  


Train: 100%|██████████| 119/119 [00:39<00:00,  4.63train_batch/s]

Epoch: [1][118/119] Elapsed 0m 39s (remain 0m 0s) Loss: 1.4226 Grad: 40888.4062  LR: 0.00052318  


Train: 100%|██████████| 119/119 [00:40<00:00,  2.97train_batch/s]
Validation:   8%|▊         | 3/40 [00:01<00:19,  1.92valid_batch/s]

EVAL: [0/40] Elapsed 0m 1s (remain 1m 9s) Loss: 1.1786 


Validation: 100%|██████████| 40/40 [00:06<00:00,  6.60valid_batch/s]

EVAL: [39/40] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9296 


Validation: 100%|██████████| 40/40 [00:07<00:00,  5.61valid_batch/s]


Epoch 1 - avg_train_loss: 1.4226  avg_val_loss: 0.9296  time: 47s
Epoch 1 - Save Best Loss: 0.9296 Model


Train:   1%|          | 1/119 [00:00<01:09,  1.70train_batch/s]

Epoch: [2][0/119] Elapsed 0m 0s (remain 1m 9s) Loss: 0.7053 Grad: 309372.4375  LR: 0.00052954  


Train:  43%|████▎     | 51/119 [00:11<00:14,  4.55train_batch/s]

Epoch: [2][50/119] Elapsed 0m 11s (remain 0m 15s) Loss: 0.7718 Grad: 73753.4766  LR: 0.00082281  


Train:  85%|████████▍ | 101/119 [00:22<00:04,  4.04train_batch/s]

Epoch: [2][100/119] Elapsed 0m 22s (remain 0m 4s) Loss: 0.7397 Grad: 74039.7266  LR: 0.00098786  


Train: 100%|██████████| 119/119 [00:27<00:00,  4.47train_batch/s]

Epoch: [2][118/119] Elapsed 0m 27s (remain 0m 0s) Loss: 0.7330 Grad: 77455.8281  LR: 0.00100000  


Train: 100%|██████████| 119/119 [00:27<00:00,  4.35train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:10,  3.57valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 20s) Loss: 1.1094 


Validation:  98%|█████████▊| 39/40 [00:05<00:00,  7.40valid_batch/s]

EVAL: [39/40] Elapsed 0m 5s (remain 0m 0s) Loss: 0.7161 


Validation: 100%|██████████| 40/40 [00:06<00:00,  6.41valid_batch/s]


Epoch 2 - avg_train_loss: 0.7330  avg_val_loss: 0.7161  time: 34s
Epoch 2 - Save Best Loss: 0.7161 Model


Train:   1%|          | 1/119 [00:00<01:09,  1.69train_batch/s]

Epoch: [3][0/119] Elapsed 0m 0s (remain 1m 9s) Loss: 0.5022 Grad: 195338.7344  LR: 0.00100000  


Train:  43%|████▎     | 51/119 [00:11<00:14,  4.55train_batch/s]

Epoch: [3][50/119] Elapsed 0m 11s (remain 0m 15s) Loss: 0.5713 Grad: 141808.8594  LR: 0.00099855  


Train:  85%|████████▍ | 101/119 [00:31<00:07,  2.50train_batch/s]

Epoch: [3][100/119] Elapsed 0m 31s (remain 0m 5s) Loss: 0.5523 Grad: 104032.9766  LR: 0.00099442  


Train: 100%|██████████| 119/119 [00:38<00:00,  2.41train_batch/s]

Epoch: [3][118/119] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5507 Grad: 109600.9766  LR: 0.00099228  


Train: 100%|██████████| 119/119 [00:39<00:00,  3.04train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:14,  2.61valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 26s) Loss: 0.6995 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.44valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.5739 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.90valid_batch/s]


Epoch 3 - avg_train_loss: 0.5507  avg_val_loss: 0.5739  time: 49s
Epoch 3 - Save Best Loss: 0.5739 Model


Train:   1%|          | 1/119 [00:00<01:46,  1.11train_batch/s]

Epoch: [4][0/119] Elapsed 0m 0s (remain 1m 46s) Loss: 0.4500 Grad: 193939.9531  LR: 0.00099215  


Train:  43%|████▎     | 51/119 [00:19<00:29,  2.33train_batch/s]

Epoch: [4][50/119] Elapsed 0m 19s (remain 0m 26s) Loss: 0.4390 Grad: 50219.1680  LR: 0.00098436  


Train:  85%|████████▍ | 101/119 [00:41<00:07,  2.30train_batch/s]

Epoch: [4][100/119] Elapsed 0m 41s (remain 0m 7s) Loss: 0.4235 Grad: 69831.7031  LR: 0.00097397  


Train: 100%|██████████| 119/119 [00:48<00:00,  3.06train_batch/s]

Epoch: [4][118/119] Elapsed 0m 48s (remain 0m 0s) Loss: 0.4260 Grad: 43546.4453  LR: 0.00096961  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.43train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:27,  1.42valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 27s) Loss: 0.8087 


Validation: 100%|██████████| 40/40 [00:08<00:00,  5.50valid_batch/s]

EVAL: [39/40] Elapsed 0m 8s (remain 0m 0s) Loss: 0.6823 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.31valid_batch/s]


Epoch 4 - avg_train_loss: 0.4260  avg_val_loss: 0.6823  time: 58s


Train:   1%|          | 1/119 [00:01<02:26,  1.24s/train_batch]

Epoch: [5][0/119] Elapsed 0m 1s (remain 2m 25s) Loss: 0.2709 Grad: 172900.1406  LR: 0.00096935  


Train:  43%|████▎     | 51/119 [00:21<00:28,  2.40train_batch/s]

Epoch: [5][50/119] Elapsed 0m 21s (remain 0m 29s) Loss: 0.3744 Grad: 100675.7578  LR: 0.00095547  


Train:  85%|████████▍ | 101/119 [00:41<00:06,  2.65train_batch/s]

Epoch: [5][100/119] Elapsed 0m 41s (remain 0m 7s) Loss: 0.3576 Grad: 65522.9414  LR: 0.00093914  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.38train_batch/s]

Epoch: [5][118/119] Elapsed 0m 49s (remain 0m 0s) Loss: 0.3577 Grad: 44355.2188  LR: 0.00093267  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.39train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:27,  1.42valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 27s) Loss: 1.7021 


Validation: 100%|██████████| 40/40 [00:10<00:00,  4.81valid_batch/s]

EVAL: [39/40] Elapsed 0m 10s (remain 0m 0s) Loss: 1.1629 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.65valid_batch/s]


Epoch 5 - avg_train_loss: 0.3577  avg_val_loss: 1.1629  time: 61s


Train:   1%|          | 1/119 [00:00<01:50,  1.07train_batch/s]

Epoch: [6][0/119] Elapsed 0m 0s (remain 1m 50s) Loss: 0.5008 Grad: 277274.9688  LR: 0.00093230  


Train:  43%|████▎     | 51/119 [00:21<00:25,  2.65train_batch/s]

Epoch: [6][50/119] Elapsed 0m 21s (remain 0m 28s) Loss: 0.2991 Grad: 47017.9609  LR: 0.00091274  


Train:  85%|████████▍ | 101/119 [00:40<00:07,  2.43train_batch/s]

Epoch: [6][100/119] Elapsed 0m 40s (remain 0m 7s) Loss: 0.3120 Grad: 56527.8398  LR: 0.00089096  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.31train_batch/s]

Epoch: [6][118/119] Elapsed 0m 48s (remain 0m 0s) Loss: 0.3098 Grad: 43627.3633  LR: 0.00088260  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.43train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:32,  1.20valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 32s) Loss: 0.5620 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.79valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.4369 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.96valid_batch/s]


Epoch 6 - avg_train_loss: 0.3098  avg_val_loss: 0.4369  time: 59s
Epoch 6 - Save Best Loss: 0.4369 Model


Train:   1%|          | 1/119 [00:00<01:21,  1.46train_batch/s]

Epoch: [7][0/119] Elapsed 0m 0s (remain 1m 21s) Loss: 0.2212 Grad: 149712.5469  LR: 0.00088212  


Train:  43%|████▎     | 51/119 [00:18<00:28,  2.40train_batch/s]

Epoch: [7][50/119] Elapsed 0m 18s (remain 0m 25s) Loss: 0.2623 Grad: 101405.3359  LR: 0.00085748  


Train:  85%|████████▍ | 101/119 [00:40<00:07,  2.27train_batch/s]

Epoch: [7][100/119] Elapsed 0m 40s (remain 0m 7s) Loss: 0.2618 Grad: 21515.2344  LR: 0.00083091  


Train: 100%|██████████| 119/119 [00:48<00:00,  3.21train_batch/s]

Epoch: [7][118/119] Elapsed 0m 48s (remain 0m 0s) Loss: 0.2685 Grad: 28342.3418  LR: 0.00082090  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.45train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:13,  2.91valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 24s) Loss: 0.5926 


Validation: 100%|██████████| 40/40 [00:08<00:00,  4.74valid_batch/s]

EVAL: [39/40] Elapsed 0m 8s (remain 0m 0s) Loss: 0.5648 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.33valid_batch/s]


Epoch 7 - avg_train_loss: 0.2685  avg_val_loss: 0.5648  time: 58s


Train:   1%|          | 1/119 [00:00<01:50,  1.07train_batch/s]

Epoch: [8][0/119] Elapsed 0m 0s (remain 1m 50s) Loss: 0.2297 Grad: 182115.3750  LR: 0.00082034  


Train:  43%|████▎     | 51/119 [00:22<00:28,  2.36train_batch/s]

Epoch: [8][50/119] Elapsed 0m 22s (remain 0m 29s) Loss: 0.2419 Grad: 51863.0977  LR: 0.00079136  


Train:  85%|████████▍ | 101/119 [00:42<00:05,  3.36train_batch/s]

Epoch: [8][100/119] Elapsed 0m 42s (remain 0m 7s) Loss: 0.2386 Grad: 25940.0996  LR: 0.00076081  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.31train_batch/s]

Epoch: [8][118/119] Elapsed 0m 49s (remain 0m 0s) Loss: 0.2358 Grad: 14349.8936  LR: 0.00074946  


Train: 100%|██████████| 119/119 [00:50<00:00,  2.37train_batch/s]
Validation:   2%|▎         | 1/40 [00:01<00:39,  1.01s/valid_batch]

EVAL: [0/40] Elapsed 0m 1s (remain 0m 39s) Loss: 0.6412 


Validation: 100%|██████████| 40/40 [00:10<00:00,  4.36valid_batch/s]

EVAL: [39/40] Elapsed 0m 10s (remain 0m 0s) Loss: 0.4383 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.71valid_batch/s]


Epoch 8 - avg_train_loss: 0.2358  avg_val_loss: 0.4383  time: 61s


Train:   1%|          | 1/119 [00:00<01:48,  1.09train_batch/s]

Epoch: [9][0/119] Elapsed 0m 0s (remain 1m 48s) Loss: 0.1762 Grad: 170570.1094  LR: 0.00074883  


Train:  43%|████▎     | 51/119 [00:21<00:21,  3.10train_batch/s]

Epoch: [9][50/119] Elapsed 0m 21s (remain 0m 28s) Loss: 0.2043 Grad: 46494.3398  LR: 0.00071639  


Train:  85%|████████▍ | 101/119 [00:39<00:07,  2.47train_batch/s]

Epoch: [9][100/119] Elapsed 0m 39s (remain 0m 7s) Loss: 0.1972 Grad: 27274.8359  LR: 0.00068280  


Train: 100%|██████████| 119/119 [00:47<00:00,  2.40train_batch/s]

Epoch: [9][118/119] Elapsed 0m 47s (remain 0m 0s) Loss: 0.1939 Grad: 16991.7051  LR: 0.00067045  


Train: 100%|██████████| 119/119 [00:47<00:00,  2.48train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:32,  1.19valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 32s) Loss: 0.4797 


Validation: 100%|██████████| 40/40 [00:10<00:00,  4.77valid_batch/s]

EVAL: [39/40] Elapsed 0m 10s (remain 0m 0s) Loss: 0.3821 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.86valid_batch/s]


Epoch 9 - avg_train_loss: 0.1939  avg_val_loss: 0.3821  time: 58s
Epoch 9 - Save Best Loss: 0.3821 Model


Train:   1%|          | 1/119 [00:00<01:47,  1.10train_batch/s]

Epoch: [10][0/119] Elapsed 0m 0s (remain 1m 47s) Loss: 0.1754 Grad: 202133.3438  LR: 0.00066976  


Train:  43%|████▎     | 51/119 [00:20<00:29,  2.30train_batch/s]

Epoch: [10][50/119] Elapsed 0m 20s (remain 0m 27s) Loss: 0.1654 Grad: 30133.8223  LR: 0.00063486  


Train:  85%|████████▍ | 101/119 [00:41<00:07,  2.48train_batch/s]

Epoch: [10][100/119] Elapsed 0m 41s (remain 0m 7s) Loss: 0.1700 Grad: 37423.9648  LR: 0.00059923  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.87train_batch/s]

Epoch: [10][118/119] Elapsed 0m 48s (remain 0m 0s) Loss: 0.1697 Grad: 34196.2188  LR: 0.00058627  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.42train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:15,  2.49valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 28s) Loss: 0.4635 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.54valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.3927 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.92valid_batch/s]


Epoch 10 - avg_train_loss: 0.1697  avg_val_loss: 0.3927  time: 59s


Train:   1%|          | 1/119 [00:01<01:58,  1.01s/train_batch]

Epoch: [11][0/119] Elapsed 0m 1s (remain 1m 58s) Loss: 0.2339 Grad: 249136.0625  LR: 0.00058555  


Train:  43%|████▎     | 51/119 [00:21<00:28,  2.40train_batch/s]

Epoch: [11][50/119] Elapsed 0m 21s (remain 0m 29s) Loss: 0.1482 Grad: 66992.6172  LR: 0.00054923  


Train:  85%|████████▍ | 101/119 [00:40<00:07,  2.41train_batch/s]

Epoch: [11][100/119] Elapsed 0m 40s (remain 0m 7s) Loss: 0.1497 Grad: 31564.5371  LR: 0.00051266  


Train: 100%|██████████| 119/119 [00:47<00:00,  2.39train_batch/s]

Epoch: [11][118/119] Elapsed 0m 47s (remain 0m 0s) Loss: 0.1478 Grad: 28980.0176  LR: 0.00049947  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.46train_batch/s]
Validation:   5%|▌         | 2/40 [00:01<00:17,  2.16valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 33s) Loss: 0.3891 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.50valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.3169 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.89valid_batch/s]


Epoch 11 - avg_train_loss: 0.1478  avg_val_loss: 0.3169  time: 59s
Epoch 11 - Save Best Loss: 0.3169 Model


Train:   1%|          | 1/119 [00:01<02:34,  1.31s/train_batch]

Epoch: [12][0/119] Elapsed 0m 1s (remain 2m 34s) Loss: 0.0941 Grad: 116621.2578  LR: 0.00049873  


Train:  43%|████▎     | 51/119 [00:20<00:25,  2.65train_batch/s]

Epoch: [12][50/119] Elapsed 0m 20s (remain 0m 27s) Loss: 0.1295 Grad: 31369.2285  LR: 0.00046212  


Train:  85%|████████▍ | 101/119 [00:40<00:07,  2.39train_batch/s]

Epoch: [12][100/119] Elapsed 0m 40s (remain 0m 7s) Loss: 0.1294 Grad: 34985.9102  LR: 0.00042571  


Train: 100%|██████████| 119/119 [00:47<00:00,  2.52train_batch/s]

Epoch: [12][118/119] Elapsed 0m 47s (remain 0m 0s) Loss: 0.1272 Grad: 54568.8867  LR: 0.00041269  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.47train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:35,  1.11valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 35s) Loss: 0.4240 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.65valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.3182 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.16valid_batch/s]


Epoch 12 - avg_train_loss: 0.1272  avg_val_loss: 0.3182  time: 58s


Train:   1%|          | 1/119 [00:00<01:35,  1.23train_batch/s]

Epoch: [13][0/119] Elapsed 0m 0s (remain 1m 36s) Loss: 0.1234 Grad: 155100.5000  LR: 0.00041197  


Train:  43%|████▎     | 51/119 [00:18<00:29,  2.28train_batch/s]

Epoch: [13][50/119] Elapsed 0m 18s (remain 0m 25s) Loss: 0.1225 Grad: 13373.9795  LR: 0.00037616  


Train:  85%|████████▍ | 101/119 [00:40<00:07,  2.28train_batch/s]

Epoch: [13][100/119] Elapsed 0m 40s (remain 0m 7s) Loss: 0.1205 Grad: 35955.5312  LR: 0.00034102  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.36train_batch/s]

Epoch: [13][118/119] Elapsed 0m 48s (remain 0m 0s) Loss: 0.1194 Grad: 17812.1309  LR: 0.00032857  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.42train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:14,  2.57valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 28s) Loss: 0.3931 


Validation: 100%|██████████| 40/40 [00:09<00:00,  5.19valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.3124 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.09valid_batch/s]


Epoch 13 - avg_train_loss: 0.1194  avg_val_loss: 0.3124  time: 59s
Epoch 13 - Save Best Loss: 0.3124 Model


Train:   1%|          | 1/119 [00:00<01:49,  1.08train_batch/s]

Epoch: [14][0/119] Elapsed 0m 0s (remain 1m 50s) Loss: 0.0962 Grad: 129077.3125  LR: 0.00032788  


Train:  43%|████▎     | 51/119 [00:22<00:27,  2.48train_batch/s]

Epoch: [14][50/119] Elapsed 0m 22s (remain 0m 30s) Loss: 0.1080 Grad: 30358.0078  LR: 0.00029397  


Train:  85%|████████▍ | 101/119 [00:42<00:06,  2.72train_batch/s]

Epoch: [14][100/119] Elapsed 0m 42s (remain 0m 7s) Loss: 0.1065 Grad: 19883.1035  LR: 0.00026117  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.42train_batch/s]

Epoch: [14][118/119] Elapsed 0m 49s (remain 0m 0s) Loss: 0.1039 Grad: 27330.3730  LR: 0.00024967  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.40train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:34,  1.14valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 34s) Loss: 0.3824 


Validation: 100%|██████████| 40/40 [00:10<00:00,  4.01valid_batch/s]

EVAL: [39/40] Elapsed 0m 10s (remain 0m 0s) Loss: 0.2979 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.65valid_batch/s]


Epoch 14 - avg_train_loss: 0.1039  avg_val_loss: 0.2979  time: 61s
Epoch 14 - Save Best Loss: 0.2979 Model


Train:   1%|          | 1/119 [00:01<02:00,  1.02s/train_batch]

Epoch: [15][0/119] Elapsed 0m 1s (remain 2m 0s) Loss: 0.0864 Grad: 116222.1016  LR: 0.00024903  


Train:  43%|████▎     | 51/119 [00:21<00:26,  2.53train_batch/s]

Epoch: [15][50/119] Elapsed 0m 21s (remain 0m 28s) Loss: 0.0914 Grad: 49849.2891  LR: 0.00021805  


Train:  85%|████████▍ | 101/119 [00:41<00:07,  2.29train_batch/s]

Epoch: [15][100/119] Elapsed 0m 41s (remain 0m 7s) Loss: 0.0909 Grad: 17943.4180  LR: 0.00018858  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.48train_batch/s]

Epoch: [15][118/119] Elapsed 0m 48s (remain 0m 0s) Loss: 0.0922 Grad: inf  LR: 0.00017837  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.44train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:27,  1.41valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 27s) Loss: 0.3825 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.73valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.2729 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.05valid_batch/s]


Epoch 15 - avg_train_loss: 0.0922  avg_val_loss: 0.2729  time: 59s
Epoch 15 - Save Best Loss: 0.2729 Model


Train:   1%|          | 1/119 [00:01<02:13,  1.13s/train_batch]

Epoch: [16][0/119] Elapsed 0m 1s (remain 2m 13s) Loss: 0.0705 Grad: 103471.6797  LR: 0.00017781  


Train:  43%|████▎     | 51/119 [00:20<00:30,  2.24train_batch/s]

Epoch: [16][50/119] Elapsed 0m 20s (remain 0m 27s) Loss: 0.0821 Grad: 75137.5156  LR: 0.00015070  


Train:  85%|████████▍ | 101/119 [00:42<00:07,  2.34train_batch/s]

Epoch: [16][100/119] Elapsed 0m 42s (remain 0m 7s) Loss: 0.0818 Grad: 15278.5762  LR: 0.00012546  


Train: 100%|██████████| 119/119 [00:50<00:00,  2.70train_batch/s]

Epoch: [16][118/119] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0816 Grad: 13157.8125  LR: 0.00011686  


Train: 100%|██████████| 119/119 [00:50<00:00,  2.35train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:15,  2.38valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 30s) Loss: 0.3873 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.47valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.2639 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.04valid_batch/s]


Epoch 16 - avg_train_loss: 0.0816  avg_val_loss: 0.2639  time: 61s
Epoch 16 - Save Best Loss: 0.2639 Model


Train:   1%|          | 1/119 [00:00<01:41,  1.16train_batch/s]

Epoch: [17][0/119] Elapsed 0m 0s (remain 1m 42s) Loss: 0.0843 Grad: 97280.7266  LR: 0.00011639  


Train:  43%|████▎     | 51/119 [00:21<00:27,  2.44train_batch/s]

Epoch: [17][50/119] Elapsed 0m 21s (remain 0m 29s) Loss: 0.0767 Grad: 57154.9492  LR: 0.00009397  


Train:  85%|████████▍ | 101/119 [00:39<00:05,  3.40train_batch/s]

Epoch: [17][100/119] Elapsed 0m 39s (remain 0m 7s) Loss: 0.0791 Grad: 38780.0312  LR: 0.00007373  


Train: 100%|██████████| 119/119 [00:46<00:00,  2.43train_batch/s]

Epoch: [17][118/119] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0785 Grad: 41180.6641  LR: 0.00006699  


Train: 100%|██████████| 119/119 [00:46<00:00,  2.56train_batch/s]
Validation:   2%|▎         | 1/40 [00:00<00:26,  1.48valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 26s) Loss: 0.3691 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.74valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.2666 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.28valid_batch/s]


Epoch 17 - avg_train_loss: 0.0785  avg_val_loss: 0.2666  time: 56s


Train:   1%|          | 1/119 [00:00<01:56,  1.02train_batch/s]

Epoch: [18][0/119] Elapsed 0m 0s (remain 1m 56s) Loss: 0.0959 Grad: 107931.6484  LR: 0.00006663  


Train:  43%|████▎     | 51/119 [00:21<00:19,  3.46train_batch/s]

Epoch: [18][50/119] Elapsed 0m 21s (remain 0m 28s) Loss: 0.0726 Grad: 106013.0938  LR: 0.00004958  


Train:  85%|████████▍ | 101/119 [00:40<00:07,  2.30train_batch/s]

Epoch: [18][100/119] Elapsed 0m 40s (remain 0m 7s) Loss: 0.0710 Grad: 154188.1406  LR: 0.00003495  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.42train_batch/s]

Epoch: [18][118/119] Elapsed 0m 48s (remain 0m 0s) Loss: 0.0726 Grad: 37560.4805  LR: 0.00003029  


Train: 100%|██████████| 119/119 [00:48<00:00,  2.44train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:15,  2.42valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 28s) Loss: 0.3772 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.46valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.2634 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.99valid_batch/s]


Epoch 18 - avg_train_loss: 0.0726  avg_val_loss: 0.2634  time: 59s
Epoch 18 - Save Best Loss: 0.2634 Model


Train:   1%|          | 1/119 [00:01<02:27,  1.25s/train_batch]

Epoch: [19][0/119] Elapsed 0m 1s (remain 2m 28s) Loss: 0.0617 Grad: 99470.8906  LR: 0.00003004  


Train:  43%|████▎     | 51/119 [00:18<00:26,  2.59train_batch/s]

Epoch: [19][50/119] Elapsed 0m 18s (remain 0m 25s) Loss: 0.0694 Grad: 75711.5469  LR: 0.00001888  


Train:  85%|████████▍ | 101/119 [00:39<00:07,  2.51train_batch/s]

Epoch: [19][100/119] Elapsed 0m 39s (remain 0m 6s) Loss: 0.0716 Grad: 47356.2891  LR: 0.00001031  


Train: 100%|██████████| 119/119 [00:46<00:00,  2.36train_batch/s]

Epoch: [19][118/119] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0711 Grad: 36232.5312  LR: 0.00000787  


Train: 100%|██████████| 119/119 [00:47<00:00,  2.52train_batch/s]
Validation:   5%|▌         | 2/40 [00:00<00:15,  2.44valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 29s) Loss: 0.3735 


Validation: 100%|██████████| 40/40 [00:08<00:00,  5.04valid_batch/s]

EVAL: [39/40] Elapsed 0m 8s (remain 0m 0s) Loss: 0.2603 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.23valid_batch/s]


Epoch 19 - avg_train_loss: 0.0711  avg_val_loss: 0.2603  time: 57s
Epoch 19 - Save Best Loss: 0.2603 Model


Train:   1%|          | 1/119 [00:00<01:52,  1.05train_batch/s]

Epoch: [20][0/119] Elapsed 0m 0s (remain 1m 53s) Loss: 0.1087 Grad: 138402.0938  LR: 0.00000774  


Train:  43%|████▎     | 51/119 [00:21<00:30,  2.25train_batch/s]

Epoch: [20][50/119] Elapsed 0m 21s (remain 0m 28s) Loss: 0.0694 Grad: 74703.6328  LR: 0.00000281  


Train:  85%|████████▍ | 101/119 [00:43<00:07,  2.34train_batch/s]

Epoch: [20][100/119] Elapsed 0m 43s (remain 0m 7s) Loss: 0.0705 Grad: 39918.0000  LR: 0.00000056  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.66train_batch/s]

Epoch: [20][118/119] Elapsed 0m 49s (remain 0m 0s) Loss: 0.0710 Grad: 78803.0469  LR: 0.00000040  


Train: 100%|██████████| 119/119 [00:49<00:00,  2.40train_batch/s]
Validation:   8%|▊         | 3/40 [00:00<00:09,  4.05valid_batch/s]

EVAL: [0/40] Elapsed 0m 0s (remain 0m 26s) Loss: 0.3726 


Validation: 100%|██████████| 40/40 [00:09<00:00,  4.22valid_batch/s]

EVAL: [39/40] Elapsed 0m 9s (remain 0m 0s) Loss: 0.2623 


Validation: 100%|██████████| 40/40 [00:10<00:00,  3.90valid_batch/s]


Epoch 20 - avg_train_loss: 0.0710  avg_val_loss: 0.2623  time: 60s


In [29]:
get_result(oof_df)

tensor(0.9396, dtype=torch.float64)