In [1]:
# basic imports
import numpy as np
import sys,os,h5py,math

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Sampler
import pytorch_lightning as pl

# Personal imports
sys.path.append(os.path.dirname(os.getcwd())) #add parent folder to PATH
import lib.models as models
from lib.metrics import accuracy,weighted_auc

## DanQ

In [2]:
class CustomDataset(Dataset):
    """Map-style dataset pairing row ``i`` of ``x`` with row ``i`` of ``y``.

    Each item is ``(x[i].long(), y[i].float())``, so the rows of both
    containers must expose ``.long()`` / ``.float()`` (e.g. torch tensors).
    """

    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def __len__(self):
        # Size is taken from the inputs only; `y` is not cross-checked here.
        return len(self.x)

    def __getitem__(self, i):
        features = self.x[i].long()
        target = self.y[i].float()
        return features, target

In [3]:
class Experiment(pl.LightningModule):
    """Lightning module that trains, validates and tests ``models.DanQ``.

    The network is optimised with ``nn.BCEWithLogitsLoss`` and Adam (lr=1e-4).
    All data are read from fixed paths under ``../data/Processed``.

    Args:
        bs: batch size used by every dataloader.
    """

    def __init__(self,bs):
        super(Experiment, self).__init__()
        self.model = models.DanQ()
        self.bs = bs #batch size
        self.loss_fn = nn.BCEWithLogitsLoss()
        # Passed to `weighted_auc` in `test_end`.
        self.class_weights = torch.load('../data/Processed/class_weights')

    def forward(self, x):
        """Return the raw model outputs (logits) for the batch ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_nb):
        """Compute the training loss for one ``(x, y)`` batch."""
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat, y)
        tensorboard_logs = {}#{'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one ``(x, y)`` batch."""
        x, y = batch
        y_hat = self.forward(x)
        return {'val_loss': self.loss_fn(y_hat, y)}

    def validation_end(self, outputs):
        """Average the per-batch validation losses."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {}#{'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def test_step(self, batch, batch_idx):
        """Return loss, predicted probabilities and labels for one test batch."""
        x, y = batch
        y_hat = self.forward(x)
        # The loss is BCE-with-logits (an independent sigmoid per output), so the
        # matching probabilities are sigmoids. A softmax across outputs would
        # couple them and distort per-class ranking metrics such as ROC AUC.
        y_pred = torch.sigmoid(y_hat).detach().cpu()
        return {'test_loss': self.loss_fn(y_hat, y),'y_pred':y_pred, 'y_true':y.cpu()}

    def test_end(self, outputs):
        """Aggregate test batches into the mean loss and the weighted ROC AUC."""
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        y_preds = torch.cat([x['y_pred'] for x in outputs])
        y_trues = torch.cat([x['y_true'] for x in outputs]).byte()
        roc_auc = weighted_auc(y_preds,y_trues, self.class_weights)

        tensorboard_logs = {'test_loss': avg_loss,'roc_auc':roc_auc}
        return {'avg_test_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Use Adam over all parameters with a fixed learning rate of 1e-4."""
        return torch.optim.Adam(self.parameters(), lr=1e-4)

    @pl.data_loader
    def train_dataloader(self):
        """Load the full training set into memory and return a shuffled loader."""
        print('Loading training dataset')
        # Open read-only and close the file even if reading fails.
        with h5py.File('../data/Processed/train.hdf5', 'r') as train_h5:
            X_train = torch.tensor(train_h5['X_train'][:])
            y_train = torch.tensor(train_h5['y_train'][:])
        trn_ds = CustomDataset(X_train,y_train)
        trn_dl = DataLoader(trn_ds, batch_size=self.bs,shuffle=True, num_workers=6)
        return trn_dl

    @pl.data_loader
    def val_dataloader(self):
        """Load the validation arrays and return an unshuffled loader."""
        # `np.load` on an .npz keeps a file handle open; release it once copied.
        with np.load('../data/Processed/valid.npz') as valid:
            X_valid = torch.tensor(valid['arr_0'][:])
            y_valid = torch.tensor(valid['arr_1'][:])

        vld_ds = CustomDataset(X_valid,y_valid)
        vld_dl = DataLoader(vld_ds, batch_size=self.bs,shuffle=False, num_workers=6)
        return vld_dl

    @pl.data_loader
    def test_dataloader(self):
        """Load the test arrays and return an unshuffled loader."""
        with np.load('../data/Processed/test.npz') as test:
            X_test = torch.tensor(test['arr_0'][:])
            y_test = torch.tensor(test['arr_1'][:])

        tst_ds = CustomDataset(X_test,y_test)
        tst_dl = DataLoader(tst_ds, batch_size=self.bs,shuffle=False, num_workers=6)
        return tst_dl

In [5]:
# Short DanQ training run; the *_percent_check arguments limit how much of
# each split the trainer uses.
exp = Experiment(bs=512)

trainer = pl.Trainer(
    gpus=1,
    max_nb_epochs=1,
    train_percent_check=0.01,
    val_percent_check=0.5,
    default_save_path='../data',
    log_gpu_memory='min_max',
)
trainer.fit(exp)
# #trainer.test()

Loading training dataset


Epoch 1:  91%|█████████▏| 85/93 [00:20<00:01,  4.26batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Validating:   0%|          | 0/8 [00:00<?, ?batch/s][A
Epoch 1:  92%|█████████▏| 86/93 [00:21<00:02,  2.40batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1:  95%|█████████▍| 88/93 [00:21<00:01,  3.22batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1:  97%|█████████▋| 90/93 [00:21<00:00,  4.22batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1: 100%|██████████| 93/93 [00:22<00:00,  5.38batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1: : 94batch [00:23,  4.06batch/s, batch_nb=85, gpu=0, loss=0.132, v_nb=22]                   


1

In [5]:
# Restore the weights of a saved DanQ run and evaluate it on the test split.
exp = Experiment(bs=512)
chpt_path = '../data/lightning_logs/version_4/checkpoints/_ckpt_epoch_1.ckpt'
state_dict = torch.load(chpt_path)['state_dict']
exp.load_state_dict(state_dict)
# exp.cuda()
trainer = pl.Trainer(
    gpus=1,
    max_nb_epochs=1,
    train_percent_check=0.1,
    val_percent_check=0.5,
    test_percent_check=0.1,
    default_save_path='../data',
    log_gpu_memory='min_max',
)
trainer.test(exp)

Loading training dataset


Testing: 100%|██████████| 88/88 [00:08<00:00, 10.53batch/s]


## Transformer-XL

In [2]:
import transformers as ts

In [3]:
class LMDataset(Dataset):
    """Language-modelling dataset that serves windows cut from the rows of ``x``.

    Items are addressed with ``(b_idx, seq_idxs, seq_start)`` keys rather than
    plain integers: ``b_idx`` selects the row, ``seq_idxs`` is the
    ``(start, stop)`` slice inside that row, and ``seq_start`` is passed
    through unchanged.
    """

    def __init__(self, x, y, mem_len):
        super().__init__()
        self.x = x
        self.y = y
        # One window per `mem_len`-sized step over a sequence length that is
        # hard-coded to 1000, for every row of `x`.
        windows_per_row = math.ceil(1000 / mem_len)
        self.n = x.shape[0] * windows_per_row

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        b_idx, seq_idxs, seq_start = i
        lo, hi = seq_idxs[0], seq_idxs[1]
        window = self.x[b_idx, lo:hi].long()
        # Next-token setup: the target is the input shifted by one position.
        return window[:-1], window[1:], seq_start
    
class LMSampleR(Sampler):
    """Yield ``(b_idx, seq_idxs, seq_start)`` keys for ``LMDataset``.

    Rows are processed in groups of ``bs``. For each group, every window
    (stepping by ``mem_len`` over a sequence length hard-coded to 1000) is
    emitted for all ``bs`` rows before moving on to the next window. A
    ``DataLoader`` that batches ``bs`` consecutive keys therefore gets batches
    whose rows all share the same window.

    Only complete groups are emitted. When the number of rows is not a
    multiple of ``bs`` the trailing rows are skipped; previously the last
    group produced row indices past the end of the data. A dataset with fewer
    than ``bs`` rows yields nothing.

    Args:
        ds: dataset exposing its inputs as ``ds.x`` (rows along ``shape[0]``).
        bs: number of rows per group (the DataLoader batch size).
        mem_len: window step; each window spans ``mem_len + 1`` positions so
            that inputs and shifted targets both have up to ``mem_len`` tokens.
    """

    def __init__(self, ds, bs,mem_len):
        self.ds, self.bs = ds, bs
        self.mem_len = mem_len

    def __len__(self):
        # Number of keys `__iter__` actually yields (complete groups only).
        n_windows = len(range(0, 1000, self.mem_len))
        n_rows = self.ds.x.shape[0]
        return (n_rows // self.bs) * self.bs * n_windows

    def __iter__(self):
        n_rows = self.ds.x.shape[0]
        usable_rows = n_rows - n_rows % self.bs  # drop the incomplete last group
        for i in range(0, usable_rows, self.bs):
            for j in range(0, 1000, self.mem_len):
                seq_idxs = (j, j + self.mem_len + 1)
                seq_start = j == 0  # first window of a row: memory should be reset
                for k in range(self.bs):
                    yield (i + k, seq_idxs, seq_start)

In [4]:
class TransXL_LM(nn.Module):
    """Transformer-XL core (``ts.TransfoXLModel``) with a linear LM head.

    The head projects hidden states of width ``cfg.d_model`` to
    ``cfg.vocab_size`` scores.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.core = ts.TransfoXLModel(cfg)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size)

    def forward(self, x, mems=None):
        """Run the core on ``x`` (optionally with ``mems``) and apply the head.

        Returns:
            ``(out, mems)``: the head output and the memory returned by the core.
        """
        core_out = self.core(x, mems)
        hidden, new_mems = core_out
        return self.lm_head(hidden), new_mems

In [5]:
class Experiment(pl.LightningModule):
    """Lightning module that trains ``TransXL_LM`` as a next-token language model.

    Batches come from ``LMDataset``/``LMSampleR`` as ``(x, y, new_mem)``, where
    ``new_mem`` flags batches that start a new sequence. The Transformer-XL
    memory returned by one step is fed into the next step of the same
    sequence and reset whenever ``new_mem`` is set.

    Args:
        bs: batch size used by every dataloader and sampler.
    """

    def __init__(self,bs):
        super(Experiment, self).__init__()
        self.cfg = ts.TransfoXLConfig(vocab_size=4, d_model=64, d_embed=8, n_head=4, d_head=16, d_inner=128, 
                             n_layer=6, tgt_len=0, ext_len=0, mem_len=512, cutoffs=[1], )
        self.model = TransXL_LM(self.cfg)
        self.bs = bs #batch size
        self.loss_fn = nn.CrossEntropyLoss()
        self.class_weights = torch.load('../data/Processed/class_weights')
        # Recurrent memories, kept separately so a validation pass cannot
        # overwrite the memory of the training sequence in progress.
        self.mem = None
        self.val_mem = None

    def forward(self, x, mems=None):
        """Return ``(logits, new_mems)`` for ``x``, optionally continuing from ``mems``."""
        return self.model(x, mems)

    def _lm_loss(self, batch, mems):
        """Run one step and return ``(loss, new_mems)``.

        The stored memory is discarded when the batch starts a new sequence;
        otherwise it is passed to the model (it used to be stored but never
        fed back, so the recurrence was effectively disabled).
        """
        x, y, new_mem = batch
        mems = None if new_mem[0] else mems
        y_hat, new_mems = self.forward(x, mems)
        loss = self.loss_fn(y_hat.view(-1,self.cfg.vocab_size), y.view(-1))
        return loss, new_mems

    def training_step(self, batch, batch_nb):
        """Compute the training loss for one batch and update the training memory."""
        loss, self.mem = self._lm_loss(batch, self.mem)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and update the validation memory."""
        loss, self.val_mem = self._lm_loss(batch, self.val_mem)
        return {'val_loss': loss}

    def validation_end(self, outputs):
        """Average the per-batch validation losses."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    # NOTE: the test hooks below are disabled; they are still written for the
    # multi-label setup and have not been ported to the language-model task.
#     def test_step(self, batch, batch_idx):
#         x, y, new_mem = batch
#         self.mem = None if new_mem[0] else self.mem
#         y_hat, self.mem = self.forward(x)
#         y_pred = F.softmax(y_hat,dim=1).detach().cpu()
#         return {'test_loss': self.loss_fn(y_hat, y),'y_pred':y_pred, 'y_true':y.cpu()}

#     def test_end(self, outputs):
#         avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
#         y_preds = torch.cat([x['y_pred'] for x in outputs])
#         y_trues = torch.cat([x['y_true'] for x in outputs]).byte()
#         roc_auc = weighted_auc(y_preds,y_trues, self.class_weights)
#
#         tensorboard_logs = {'test_loss': avg_loss,'roc_auc':roc_auc}
#         return {'avg_test_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Use Adam over all parameters with a fixed learning rate of 1e-4."""
        return torch.optim.Adam(self.parameters(), lr=1e-4)

    @pl.data_loader
    def train_dataloader(self):
        """Load the training set into memory and return a loader driven by ``LMSampleR``."""
        print('Loading training dataset')
        # Open read-only and close the file even if reading fails.
        with h5py.File('../data/Processed/train.hdf5', 'r') as train_h5:
            X_train = torch.tensor(train_h5['X_train'][:])
            y_train = torch.tensor(train_h5['y_train'][:])

        trn_ds = LMDataset(X_train,y_train,self.cfg.mem_len)
        splr   = LMSampleR(trn_ds,self.bs,self.cfg.mem_len)
        trn_dl = DataLoader(trn_ds, self.bs,sampler=splr,pin_memory=True)
        return trn_dl

    @pl.data_loader
    def val_dataloader(self):
        """Load the validation arrays and return a loader driven by ``LMSampleR``."""
        # `np.load` on an .npz keeps a file handle open; release it once copied.
        with np.load('../data/Processed/valid.npz') as valid:
            X_valid = torch.tensor(valid['arr_0'][:])
            y_valid = torch.tensor(valid['arr_1'][:])

        vld_ds = LMDataset(X_valid,y_valid,self.cfg.mem_len)
        splr   = LMSampleR(vld_ds,self.bs,self.cfg.mem_len)
        vld_dl = DataLoader(vld_ds,self.bs,sampler=splr,pin_memory=True)
        return vld_dl

#     @pl.data_loader
#     def test_dataloader(self):
#         test = np.load('../data/Processed/test.npz')
#         X_test = torch.tensor(test['arr_0'][:])
#         y_test = torch.tensor(test['arr_1'][:])

#         tst_ds = LMDataset(X_test,y_test,self.cfg.mem_len)
#         splr   = LMSampleR(tst_ds,self.bs,self.cfg.mem_len)
#         tst_dl = DataLoader(tst_ds,self.bs,sampler=splr,pin_memory=True)
#         return tst_dl

In [6]:
# Short Transformer-XL language-model training run.
exp = Experiment(bs=32)

trainer = pl.Trainer(
    gpus=1,
    max_nb_epochs=1,
    train_percent_check=0.01,
    val_percent_check=0.5,
    default_save_path='../data',
)

trainer.fit(exp)

Loading training dataset


Epoch 1:  25%|██▍       | 745/3000 [11:39<34:47,  1.08batch/s, batch_nb=744, gpu=0, loss=1.329, v_nb=14]

KeyboardInterrupt: 

In [None]:
# TODO: add an accuracy metric
# NOTE: the dataloader is slow when num_workers > 0
# TODO: add shuffling to the samplers
# TODO: apex -- see https://github.com/adityaiitb/pyprof2
# TODO: ResNet + Transformer-XL

## ResNet + Transformer-XL

In [2]:
import transformers as ts

In [3]:
class CustomDataset(Dataset):
    """Indexable ``(input, target)`` dataset backed by two parallel containers.

    ``__getitem__`` casts the input row with ``.long()`` and the target row
    with ``.float()``, so rows must support both (e.g. torch tensors).
    """

    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def __len__(self):
        # Length follows `x`; `y` is not validated against it.
        return len(self.x)

    def __getitem__(self, i):
        inputs = self.x[i].long()
        labels = self.y[i].float()
        return inputs, labels

In [4]:
# cyclic lr with restart
def lr_i(step_i, cycle_len=1000, lrs=(3e-4, 1e-5), warm_pct=4/20):
    """Learning rate at ``step_i`` for a triangular schedule that restarts every cycle.

    Within each cycle of ``cycle_len`` steps the rate rises linearly from
    ``lrs[1]`` to ``lrs[0]`` over the first ``int(cycle_len * warm_pct)`` steps,
    then falls linearly back towards ``lrs[1]`` over the rest of the cycle.

    Args:
        step_i: global step; only its position inside the cycle matters.
        cycle_len: number of steps in one cycle.
        lrs: ``(peak_lr, base_lr)``.
        warm_pct: fraction of the cycle spent warming up.
    """
    peak_lr = lrs[0]
    base_lr = lrs[1]
    span = peak_lr - base_lr

    pos = step_i % cycle_len
    warm_len = int(cycle_len * warm_pct)

    if pos < warm_len:
        # Warm-up leg: climb from the base rate towards the peak.
        return base_lr + span * (pos / warm_len)

    # Cool-down leg: descend from the peak over the remainder of the cycle.
    cool_len = cycle_len - warm_len
    return peak_lr - span * ((pos - warm_len) / cool_len)
    
# import numpy as np
# import matplotlib.pyplot as plt

# x = np.linspace(0,50*30,10000)
# y = list(map(lr_i,x))
# plt.figure(figsize=(20,10))
# plt.ylim([0,4e-4])
# plt.scatter(x,y)

In [5]:
class Experiment(pl.LightningModule):
    """Lightning module for a multi-label model that returns ``(logits, mem)``.

    The wrapped ``model`` is trained with ``nn.BCEWithLogitsLoss`` and Adam,
    with a linear learning-rate warm-up at the start of training. Validation
    and test epochs report the mean loss, a class-weighted ROC AUC and
    accuracy.

    Args:
        model: network whose forward pass returns ``(y_hat, mem)``.
        hparams: dict with keys ``'bs'`` (batch size) and ``'lr'`` (base
            learning rate).
    """

    def __init__(self,model,hparams):
        super(Experiment, self).__init__()
        self.model = model
        self.bs = hparams['bs'] #batch size
        self.lr = hparams['lr'] #base learning rate
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.class_weights = torch.load('../data/Processed/class_weights')

    def forward(self, x): return self.model(x)

    def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        """Apply the warm-up learning rate, then step and reset the optimizer."""
        # warm up lr: scale linearly up to `self.lr` over the first 200 steps;
        # the scale is clamped to 1, so steps 200-499 simply re-apply `self.lr`.
        if self.trainer.global_step < 500:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 200.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.lr

#         for pg in optimizer.param_groups:
#             pg['lr'] = lr_i(self.trainer.global_step)

        optimizer.step()
        optimizer.zero_grad()

    def _score(self, y_preds, y_trues):
        """Return ``(roc_auc, acc)`` for concatenated predictions and labels.

        The AUC inputs are moved to the device the model lives on instead of
        an unconditional ``.cuda()``, so evaluation follows the model's device.
        """
        device = next(self.parameters()).device
        roc_auc = weighted_auc(y_preds.to(device), y_trues.to(device), self.class_weights.to(device))
        acc = accuracy(y_preds,y_trues)
        return roc_auc, acc

    def training_step(self, batch, batch_nb):
        """Compute the training loss for one ``(x, y)`` batch."""
        x, y = batch
        y_hat, self.mem = self.forward(x)
        loss = self.loss_fn(y_hat, y)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Return loss, sigmoid probabilities and labels for one validation batch."""
        x, y = batch
        y_hat, self.mem = self.forward(x)
        y_pred = torch.sigmoid(y_hat)
        # Predictions are collected on the CPU so a whole epoch of outputs does
        # not accumulate in GPU memory.
        return {'val_loss': self.loss_fn(y_hat, y),'y_pred':y_pred.cpu(), 'y_true':y.cpu()}

    def validation_end(self, outputs):
        """Aggregate validation batches into mean loss, weighted ROC AUC and accuracy."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        y_preds = torch.cat([x['y_pred'] for x in outputs])
        y_trues = torch.cat([x['y_true'] for x in outputs]).byte()

        roc_auc, acc = self._score(y_preds, y_trues)
        tensorboard_logs = {'val_loss': avg_loss,'valid_roc_auc':roc_auc,'valid_acc':acc}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def test_step(self, batch, batch_idx):
        """Return loss, sigmoid probabilities and labels for one test batch."""
        x, y = batch
        y_hat, self.mem = self.forward(x)
        y_pred = torch.sigmoid(y_hat)
        return {'test_loss': self.loss_fn(y_hat, y),'y_pred':y_pred.cpu(), 'y_true':y.cpu()}

    def test_end(self, outputs,save_preds=False):
        """Aggregate test batches; optionally keep the raw predictions.

        When ``save_preds`` is true, ``[y_preds, y_trues]`` is stored on
        ``self.preds`` before the metrics are computed.
        """
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        y_preds = torch.cat([x['y_pred'] for x in outputs])
        y_trues = torch.cat([x['y_true'] for x in outputs]).byte()
        if save_preds: self.preds=[y_preds,y_trues]

        roc_auc, acc = self._score(y_preds, y_trues)
        tensorboard_logs = {'test_loss': avg_loss,'test_roc_auc':roc_auc,'test_acc':acc}
        return {'avg_test_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Use Adam over all parameters, starting at the base learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    @pl.data_loader
    def train_dataloader(self):
        """Load the full training set into memory and return a shuffled loader."""
        print('Loading training dataset')
        # Open read-only and close the file even if reading fails.
        with h5py.File('../data/Processed/train.hdf5', 'r') as train_h5:
            X_train = torch.tensor(train_h5['X_train'][:])
            y_train = torch.tensor(train_h5['y_train'][:])

        trn_ds = CustomDataset(X_train,y_train)
        trn_dl = DataLoader(trn_ds, self.bs,pin_memory=True,shuffle=True)
        return trn_dl

    @pl.data_loader
    def val_dataloader(self):
        """Load the validation arrays and return an unshuffled loader."""
        # `np.load` on an .npz keeps a file handle open; release it once copied.
        with np.load('../data/Processed/valid.npz') as valid:
            X_valid = torch.tensor(valid['arr_0'][:])
            y_valid = torch.tensor(valid['arr_1'][:])

        vld_ds = CustomDataset(X_valid,y_valid)
        vld_dl = DataLoader(vld_ds,self.bs,pin_memory=True)
        return vld_dl

    @pl.data_loader
    def test_dataloader(self):
        """Load the test arrays and return an unshuffled loader."""
        with np.load('../data/Processed/test.npz') as test:
            X_test = torch.tensor(test['arr_0'][:])
            y_test = torch.tensor(test['arr_1'][:])

        tst_ds = CustomDataset(X_test,y_test)
        tst_dl = DataLoader(tst_ds,self.bs,pin_memory=True)
        return tst_dl

In [6]:
# cfg = ts.TransfoXLConfig(vocab_size=4, d_embed=8,d_model=256, n_head=4, d_head=16, d_inner=256, 
#                          n_layer=6, tgt_len=0, ext_len=0, mem_len=256, cutoffs=[1], )

# model = models.ResTransXL(vocab_size=4, d_emb=64, tsfm_cfg=cfg, n_res_blocks=3, res_k=16, 
#                           skip_cnt=True, fc_h_dim=512, lin_p=0.5, WVN=True)
# model.summary()

In [7]:
# Transformer-XL configuration handed to the ResTransXL model below.
cfg = ts.TransfoXLConfig(
    vocab_size=4,
    d_embed=8,
    d_model=352,
    n_head=4,
    d_head=16,
    d_inner=256,
    n_layer=6,
    tgt_len=0,
    ext_len=0,
    mem_len=256,
    cutoffs=[1],
)

model = models.ResTransXL(
    vocab_size=4,
    d_emb=64,
    tsfm_cfg=cfg,
    skip_cnt=True,
    fc_h_dim=256,
    n_res_blocks=3,
    res_k=16,
    LSTM=True,
)
model.summary()

Model parameters:				
Resnet part:		8894k
Transformer-XL part:	1988k
Linear part:		34028k
Total:			44911k


In [8]:
# Batch size and base learning rate for this run.
hparams = {'bs': 25, 'lr': 1e-4}
exp = Experiment(model, hparams)

# chpt_path = '../data/lightning_logs/version_52/checkpoints/_ckpt_epoch_1.ckpt'
# tags_csv = '../data/lightning_logs/version_7/meta_tags.csv'
# exp.load_state_dict(torch.load(chpt_path)['state_dict'])
# exp.load_from_checkpoint(chpt_path)
# exp.load_from_metrics

In [None]:
# Full training run with mixed precision and gradient accumulation.
trainer = pl.Trainer(
    gpus=1,
    fast_dev_run=False,
    max_nb_epochs=10,
    accumulate_grad_batches=4,
    train_percent_check=1,
    val_check_interval=0.1,
    use_amp=True,
    default_save_path='../data',
)
trainer.fit(exp)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Loading training dataset


Epoch 1:   8%|▊         | 14116/179200 [12:53<2:15:44, 20.27batch/s, batch_nb=14115, gpu=0, loss=0.076, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:   9%|▉         | 16199/179200 [14:44<2:25:15, 18.70batch/s, batch_nb=16198, gpu=0, loss=0.077, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  10%|▉         | 17600/179200 [15:59<2:14:42, 19.99batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 1:  10%|▉         | 17610/179200 [16:00<1:42:46, 26.21batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]
Epoch 1:  10%|▉         | 17621/179200 [16:00<1:19:42, 33.78batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]
Epoch 1:  10%|▉         | 17632/179200 [16:00<1:03:35, 42.35batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]
Epoch 1:  10%|▉         | 17643/179200 [16:00<52:16, 51.50batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]  
Epoch 1:  10%|▉         | 17654/179200 [16:00<44:19, 60.74batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]
Epoch 1:  10%|▉         | 17665/179200 [16:00<38:45, 69.47batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]
Epoch 1:  10%|▉         | 17676/179200 [16:00<34:51, 77.23batch/s, batch_nb=17599, gpu=0, loss=0.074, v_nb=55]
Epoch 1:  10%|▉         | 17687/179200 [16:0

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  12%|█▏        | 22167/179200 [19:56<2:15:10, 19.36batch/s, batch_nb=21846, gpu=0, loss=0.074, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  14%|█▎        | 24227/179200 [21:46<2:22:26, 18.13batch/s, batch_nb=23906, gpu=0, loss=0.074, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  15%|█▍        | 26411/179200 [23:45<2:12:17, 19.25batch/s, batch_nb=26090, gpu=0, loss=0.073, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  16%|█▌        | 28803/179200 [25:56<2:18:26, 18.11batch/s, batch_nb=28482, gpu=0, loss=0.074, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  17%|█▋        | 30876/179200 [27:50<2:04:43, 19.82batch/s, batch_nb=30555, gpu=0, loss=0.073, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  18%|█▊        | 32899/179200 [29:41<2:11:48, 18.50batch/s, batch_nb=32578, gpu=0, loss=0.074, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  19%|█▉        | 34903/179200 [31:32<2:17:00, 17.55batch/s, batch_nb=34582, gpu=0, loss=0.073, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  20%|█▉        | 35520/179200 [32:07<2:14:24, 17.82batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]
Epoch 1:  20%|█▉        | 35530/179200 [32:07<1:41:17, 23.64batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]
Epoch 1:  20%|█▉        | 35541/179200 [32:07<1:17:54, 30.73batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]
Epoch 1:  20%|█▉        | 35551/179200 [32:07<1:01:44, 38.77batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]
Epoch 1:  20%|█▉        | 35562/179200 [32:07<50:04, 47.81batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]  
Epoch 1:  20%|█▉        | 35573/179200 [32:07<41:54, 57.13batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]
Epoch 1:  20%|█▉        | 35584/179200 [32:07<36:12, 66.10batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]
Epoch 1:  20%|█▉        | 35595/179200 [32:07<32:12, 74.33batch/s, batch_nb=35199, gpu=0, loss=0.073, v_nb=55]
Epoch 1:  20%|█▉        | 35606/179200 [32:07<29:23, 81.44batch/s, batch_nb=35199, gpu=0, loss=0.073, 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  23%|██▎       | 41419/179200 [37:24<2:06:43, 18.12batch/s, batch_nb=40778, gpu=0, loss=0.071, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  25%|██▍       | 44183/179200 [39:57<2:11:45, 17.08batch/s, batch_nb=43542, gpu=0, loss=0.072, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  27%|██▋       | 48199/179200 [43:49<2:04:48, 17.49batch/s, batch_nb=47558, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  28%|██▊       | 49395/179200 [45:00<2:15:38, 15.95batch/s, batch_nb=48754, gpu=0, loss=0.072, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  30%|██▉       | 53440/179200 [49:02<2:02:48, 17.07batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  30%|██▉       | 53443/179200 [49:02<1:42:20, 20.48batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  30%|██▉       | 53453/179200 [49:03<1:18:21, 26.75batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  30%|██▉       | 53463/179200 [49:03<1:01:32, 34.05batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  30%|██▉       | 53473/179200 [49:03<49:33, 42.28batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]  
Epoch 1:  30%|██▉       | 53483/179200 [49:03<41:13, 50.82batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  30%|██▉       | 53493/179200 [49:03<35:32, 58.94batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  30%|██▉       | 53503/179200 [49:03<31:20, 66.83batch/s, batch_nb=52799, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  30%|██▉       | 53513/179200 [49:03<28:29, 73.54batch/s, batch_nb=52799, gpu=0, loss=0.070, 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  30%|███       | 53911/179200 [49:20<2:03:50, 16.86batch/s, batch_nb=52950, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  31%|███▏      | 56239/179200 [51:48<2:04:25, 16.47batch/s, batch_nb=55278, gpu=0, loss=0.071, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  33%|███▎      | 58407/179200 [54:08<2:04:07, 16.22batch/s, batch_nb=57446, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  35%|███▍      | 62439/179200 [58:29<2:02:31, 15.88batch/s, batch_nb=61478, gpu=0, loss=0.071, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  36%|███▌      | 64459/179200 [1:00:31<1:43:51, 18.41batch/s, batch_nb=63498, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  37%|███▋      | 66561/179200 [1:02:33<1:50:18, 17.02batch/s, batch_nb=65600, gpu=0, loss=0.069, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  38%|███▊      | 68135/179200 [1:04:04<1:45:42, 17.51batch/s, batch_nb=67174, gpu=0, loss=0.071, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  40%|███▉      | 71360/179200 [1:07:12<1:42:29, 17.54batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  40%|███▉      | 71364/179200 [1:07:13<1:23:27, 21.54batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  40%|███▉      | 71375/179200 [1:07:13<1:03:41, 28.21batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  40%|███▉      | 71385/179200 [1:07:13<50:15, 35.75batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]  
Epoch 1:  40%|███▉      | 71396/179200 [1:07:13<40:26, 44.42batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  40%|███▉      | 71407/179200 [1:07:13<33:34, 53.51batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  40%|███▉      | 71418/179200 [1:07:13<28:45, 62.47batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  40%|███▉      | 71429/179200 [1:07:13<25:24, 70.70batch/s, batch_nb=70399, gpu=0, loss=0.070, v_nb=55]
Epoch 1:  40%|███▉      | 71440/179200 [1:07:13<23:01, 77.99batch/s, batch_nb=70399, gpu

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  43%|████▎     | 76303/179200 [1:11:50<1:41:19, 16.92batch/s, batch_nb=75022, gpu=0, loss=0.069, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  44%|████▎     | 78335/179200 [1:13:49<1:35:23, 17.62batch/s, batch_nb=77054, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  44%|████▍     | 79422/179200 [1:14:53<1:42:01, 16.30batch/s, batch_nb=78141, gpu=0, loss=0.072, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  46%|████▌     | 81975/179200 [1:17:23<1:40:07, 16.19batch/s, batch_nb=80694, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  48%|████▊     | 85979/179200 [1:21:20<1:26:49, 17.89batch/s, batch_nb=84698, gpu=0, loss=0.069, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  49%|████▉     | 88119/179200 [1:23:30<1:24:25, 17.98batch/s, batch_nb=86838, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  50%|████▉     | 89280/179200 [1:24:42<1:26:39, 17.29batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  50%|████▉     | 89284/179200 [1:24:43<1:10:06, 21.38batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  50%|████▉     | 89295/179200 [1:24:43<53:23, 28.07batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]  
Epoch 1:  50%|████▉     | 89305/179200 [1:24:43<41:54, 35.75batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  50%|████▉     | 89315/179200 [1:24:43<33:55, 44.16batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  50%|████▉     | 89326/179200 [1:24:43<28:09, 53.19batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  50%|████▉     | 89337/179200 [1:24:43<24:08, 62.05batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  50%|████▉     | 89348/179200 [1:24:43<21:15, 70.46batch/s, batch_nb=87999, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  50%|████▉     | 89359/179200 [1:24:43<19:17, 77.60batch/s, batch_nb=87999, gpu=0

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  52%|█████▏    | 92588/179200 [1:27:56<1:30:21, 15.98batch/s, batch_nb=90987, gpu=0, loss=0.069, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  53%|█████▎    | 94601/179200 [1:29:59<1:32:45, 15.20batch/s, batch_nb=93000, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  53%|█████▎    | 94727/179200 [1:30:07<1:25:44, 16.42batch/s, batch_nb=93126, gpu=0, loss=0.069, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  54%|█████▍    | 96919/179200 [1:32:26<1:25:35, 16.02batch/s, batch_nb=95318, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  56%|█████▋    | 101011/179200 [1:36:46<1:17:20, 16.85batch/s, batch_nb=99410, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  58%|█████▊    | 103075/179200 [1:38:56<1:22:29, 15.38batch/s, batch_nb=101474, gpu=0, loss=0.069, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  58%|█████▊    | 103559/179200 [1:39:28<1:15:02, 16.80batch/s, batch_nb=101958, gpu=0, loss=0.070, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  60%|█████▉    | 107200/179200 [1:43:19<1:06:32, 18.03batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  60%|█████▉    | 107205/179200 [1:43:19<53:00, 22.64batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]  
Epoch 1:  60%|█████▉    | 107216/179200 [1:43:19<40:40, 29.50batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  60%|█████▉    | 107226/179200 [1:43:19<32:06, 37.36batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  60%|█████▉    | 107237/179200 [1:43:19<25:57, 46.21batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  60%|█████▉    | 107248/179200 [1:43:19<21:37, 55.44batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  60%|█████▉    | 107259/179200 [1:43:20<18:36, 64.43batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  60%|█████▉    | 107270/179200 [1:43:20<16:34, 72.29batch/s, batch_nb=105599, gpu=0, loss=0.069, v_nb=55]
Epoch 1:  60%|█████▉    | 107281/179200 [1:43:20<15:04, 79.53batch/s, batch_

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  61%|██████▏   | 110151/179200 [1:46:16<1:05:16, 17.63batch/s, batch_nb=108230, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  63%|██████▎   | 112182/179200 [1:48:27<1:14:57, 14.90batch/s, batch_nb=110261, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  63%|██████▎   | 112662/179200 [1:48:59<1:19:43, 13.91batch/s, batch_nb=110741, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  65%|██████▌   | 116695/179200 [1:53:25<1:09:17, 15.03batch/s, batch_nb=114774, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  65%|██████▌   | 116921/179200 [1:53:40<1:01:36, 16.85batch/s, batch_nb=115000, gpu=0, loss=0.069, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  68%|██████▊   | 121055/179200 [1:58:15<1:05:34, 14.78batch/s, batch_nb=119134, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  69%|██████▊   | 123131/179200 [2:00:29<55:58, 16.70batch/s, batch_nb=121210, gpu=0, loss=0.069, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  70%|██████▉   | 125120/179200 [2:02:32<51:51, 17.38batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]  
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 1:  70%|██████▉   | 125130/179200 [2:02:32<39:08, 23.02batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]
Epoch 1:  70%|██████▉   | 125141/179200 [2:02:32<29:58, 30.05batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]
Epoch 1:  70%|██████▉   | 125152/179200 [2:02:33<23:39, 38.07batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]
Epoch 1:  70%|██████▉   | 125162/179200 [2:02:33<19:16, 46.74batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]
Epoch 1:  70%|██████▉   | 125173/179200 [2:02:33<16:05, 55.95batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]
Epoch 1:  70%|██████▉   | 125184/179200 [2:02:33<13:51, 64.95batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]
Epoch 1:  70%|██████▉   | 125195/179200 [2:02:33<12:17, 73.19batch/s, batch_nb=123199, gpu=0, loss=0.067, v_nb=55]
Epoch 1:  70%|██████

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  71%|███████   | 127662/179200 [2:04:58<56:37, 15.17batch/s, batch_nb=125421, gpu=0, loss=0.068, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  72%|███████▏  | 129602/179200 [2:06:58<51:18, 16.11batch/s, batch_nb=127361, gpu=0, loss=0.066, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  74%|███████▍  | 132875/179200 [2:10:24<47:20, 16.31batch/s, batch_nb=130634, gpu=0, loss=0.068, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  77%|███████▋  | 137211/179200 [2:15:02<39:55, 17.53batch/s, batch_nb=134970, gpu=0, loss=0.068, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  78%|███████▊  | 139387/179200 [2:17:20<40:43, 16.30batch/s, batch_nb=137146, gpu=0, loss=0.067, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  79%|███████▊  | 140761/179200 [2:18:48<36:37, 17.49batch/s, batch_nb=138520, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  80%|███████▉  | 143040/179200 [2:21:19<35:47, 16.84batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 1:  80%|███████▉  | 143050/179200 [2:21:19<27:02, 22.28batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  80%|███████▉  | 143060/179200 [2:21:19<20:54, 28.81batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  80%|███████▉  | 143070/179200 [2:21:19<16:30, 36.48batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  80%|███████▉  | 143080/179200 [2:21:19<13:23, 44.95batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  80%|███████▉  | 143090/179200 [2:21:19<11:15, 53.43batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  80%|███████▉  | 143100/179200 [2:21:19<09:44, 61.74batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  80%|███████▉  | 143110/179200 [2:21:20<08:41, 69.26batch/s, batch_nb=140799, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  80%|███████▉

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  82%|████████▏ | 146351/179200 [2:24:39<32:42, 16.74batch/s, batch_nb=143790, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  83%|████████▎ | 148815/179200 [2:27:17<32:49, 15.42batch/s, batch_nb=146254, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  85%|████████▍ | 151439/179200 [2:30:07<27:07, 17.06batch/s, batch_nb=148878, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  86%|████████▌ | 153661/179200 [2:32:32<24:47, 17.17batch/s, batch_nb=151100, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  88%|████████▊ | 158081/179200 [2:37:34<21:27, 16.40batch/s, batch_nb=155520, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  89%|████████▉ | 160111/179200 [2:39:51<19:13, 16.54batch/s, batch_nb=157550, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  90%|████████▉ | 160960/179200 [2:40:49<19:20, 15.72batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 160963/179200 [2:40:49<15:59, 19.01batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 160973/179200 [2:40:49<12:11, 24.92batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 160982/179200 [2:40:49<09:36, 31.63batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 160991/179200 [2:40:49<07:44, 39.23batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 161000/179200 [2:40:49<06:30, 46.58batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 161010/179200 [2:40:49<05:33, 54.60batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 161019/179200 [2:40:49<04:54, 61.83batch/s, batch_nb=158399, gpu=0, loss=0.068, v_nb=55]
Epoch 1:  90%|████████▉ | 161029/179200 [2:40:49<04:25, 68.37batch/s, batch_nb=1

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  92%|█████████▏| 164887/179200 [2:44:58<16:56, 14.08batch/s, batch_nb=162006, gpu=0, loss=0.068, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  93%|█████████▎| 166891/179200 [2:47:13<13:06, 15.65batch/s, batch_nb=164010, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  93%|█████████▎| 167311/179200 [2:47:40<11:41, 16.96batch/s, batch_nb=164430, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  96%|█████████▌| 171521/179200 [2:52:18<07:08, 17.91batch/s, batch_nb=168640, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  97%|█████████▋| 173521/179200 [2:54:30<05:39, 16.74batch/s, batch_nb=170640, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  99%|█████████▉| 177581/179200 [2:59:01<02:07, 12.68batch/s, batch_nb=174700, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1: 100%|█████████▉| 178880/179200 [3:00:28<00:18, 16.95batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178890/179200 [3:00:28<00:13, 22.57batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178900/179200 [3:00:28<00:10, 29.39batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178911/179200 [3:00:28<00:07, 37.43batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178922/179200 [3:00:28<00:06, 46.30batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178933/179200 [3:00:28<00:04, 55.53batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178944/179200 [3:00:28<00:03, 64.53batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178955/179200 [3:00:28<00:03, 72.83batch/s, batch_nb=175999, gpu=0, loss=0.067, v_nb=55]
Epoch 1: 100%|█████████▉| 178966/179200 [3:00:28<00:02, 79.44batch/s, batch_nb=1

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:   2%|▏         | 3535/179200 [03:59<3:29:57, 13.94batch/s, batch_nb=3534, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:   3%|▎         | 5541/179200 [06:19<3:10:57, 15.16batch/s, batch_nb=5540, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:   4%|▍         | 6819/179200 [07:52<3:04:10, 15.60batch/s, batch_nb=6818, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:   6%|▋         | 11395/179200 [13:19<3:22:42, 13.80batch/s, batch_nb=11394, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:   8%|▊         | 13951/179200 [16:15<2:46:05, 16.58batch/s, batch_nb=13950, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:   8%|▊         | 15051/179200 [17:30<2:45:23, 16.54batch/s, batch_nb=15050, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  10%|▉         | 17600/179200 [20:26<2:48:54, 15.95batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 2:  10%|▉         | 17609/179200 [20:27<2:07:46, 21.08batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  10%|▉         | 17618/179200 [20:27<1:38:37, 27.31batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  10%|▉         | 17628/179200 [20:27<1:18:00, 34.52batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  10%|▉         | 17637/179200 [20:27<1:03:43, 42.26batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  10%|▉         | 17646/179200 [20:27<54:22, 49.53batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]  
Epoch 2:  10%|▉         | 17655/179200 [20:27<47:05, 57.17batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  10%|▉         | 17664/179200 [20:27<42:00, 64.09batch/s, batch_nb=17599, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  10%|▉         | 17675/179200 [20

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  13%|█▎        | 22941/179200 [26:46<2:52:50, 15.07batch/s, batch_nb=22620, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  14%|█▍        | 25435/179200 [29:54<3:18:42, 12.90batch/s, batch_nb=25114, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  16%|█▌        | 28281/179200 [33:30<2:36:53, 16.03batch/s, batch_nb=27960, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  17%|█▋        | 30487/179200 [36:14<2:46:37, 14.88batch/s, batch_nb=30166, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  18%|█▊        | 32895/179200 [39:08<2:46:14, 14.67batch/s, batch_nb=32574, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  18%|█▊        | 32951/179200 [39:12<2:25:40, 16.73batch/s, batch_nb=32630, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  20%|█▉        | 35520/179200 [42:15<2:32:47, 15.67batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  20%|█▉        | 35529/179200 [42:15<1:55:02, 20.82batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  20%|█▉        | 35539/179200 [42:15<1:28:01, 27.20batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  20%|█▉        | 35549/179200 [42:15<1:09:18, 34.55batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  20%|█▉        | 35559/179200 [42:15<56:10, 42.62batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]  
Epoch 2:  20%|█▉        | 35569/179200 [42:15<46:53, 51.05batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  20%|█▉        | 35579/179200 [42:15<40:15, 59.46batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  20%|█▉        | 35589/179200 [42:15<35:37, 67.19batch/s, batch_nb=35199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  20%|█▉        | 35599/179200 [42:15<32:22, 73.91batch/s, batch_nb=35199, gpu=0, loss=0.064, 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  22%|██▏       | 39640/179200 [46:58<2:19:07, 16.72batch/s, batch_nb=38999, gpu=0, loss=0.067, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  23%|██▎       | 40911/179200 [48:31<2:14:55, 17.08batch/s, batch_nb=40270, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  25%|██▌       | 44915/179200 [53:25<2:28:34, 15.06batch/s, batch_nb=44274, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  26%|██▌       | 46671/179200 [55:30<2:13:07, 16.59batch/s, batch_nb=46030, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  28%|██▊       | 50855/179200 [1:00:29<2:53:09, 12.35batch/s, batch_nb=50214, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  30%|██▉       | 53411/179200 [1:03:43<2:06:51, 16.53batch/s, batch_nb=52770, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  30%|██▉       | 53440/179200 [1:03:45<2:17:51, 15.20batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]
Epoch 2:  30%|██▉       | 53443/179200 [1:03:45<1:53:59, 18.39batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]
Epoch 2:  30%|██▉       | 53453/179200 [1:03:45<1:26:35, 24.20batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]
Epoch 2:  30%|██▉       | 53462/179200 [1:03:45<1:07:53, 30.87batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]
Epoch 2:  30%|██▉       | 53471/179200 [1:03:45<55:07, 38.01batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]  
Epoch 2:  30%|██▉       | 53481/179200 [1:03:45<45:20, 46.21batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]
Epoch 2:  30%|██▉       | 53492/179200 [1:03:45<37:53, 55.30batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]
Epoch 2:  30%|██▉       | 53502/179200 [1:03:45<32:55, 63.63batch/s, batch_nb=52799, gpu=0, loss=0.065, v_nb=55]
Epoch 2:  30%|██▉       | 53512/179200 [1:03:45<29:26, 71.15batch/s, batch_nb=52799, g

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  33%|███▎      | 58487/179200 [1:09:54<2:29:48, 13.43batch/s, batch_nb=57526, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  34%|███▍      | 60519/179200 [1:12:37<2:19:35, 14.17batch/s, batch_nb=59558, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  35%|███▌      | 62721/179200 [1:15:35<2:12:41, 14.63batch/s, batch_nb=61760, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  36%|███▋      | 65281/179200 [1:19:03<2:11:11, 14.47batch/s, batch_nb=64320, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  38%|███▊      | 67391/179200 [1:21:55<2:05:00, 14.91batch/s, batch_nb=66430, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  39%|███▉      | 69475/179200 [1:24:45<2:37:04, 11.64batch/s, batch_nb=68514, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  40%|███▉      | 71360/179200 [1:27:19<2:09:26, 13.89batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  40%|███▉      | 71363/179200 [1:27:19<1:45:15, 17.07batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  40%|███▉      | 71372/179200 [1:27:19<1:19:59, 22.46batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  40%|███▉      | 71381/179200 [1:27:19<1:02:02, 28.96batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  40%|███▉      | 71390/179200 [1:27:19<49:42, 36.15batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]  
Epoch 2:  40%|███▉      | 71400/179200 [1:27:19<40:43, 44.12batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  40%|███▉      | 71409/179200 [1:27:19<34:30, 52.06batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  40%|███▉      | 71418/179200 [1:27:20<30:10, 59.54batch/s, batch_nb=70399, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  40%|███▉      | 71427/179200 [1:27:20<27:08, 66.19batch/s, batch_nb=70399, g

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  42%|████▏     | 74827/179200 [1:31:42<2:09:37, 13.42batch/s, batch_nb=73546, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  43%|████▎     | 77235/179200 [1:35:00<2:23:26, 11.85batch/s, batch_nb=75954, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  44%|████▍     | 79467/179200 [1:37:55<2:00:05, 13.84batch/s, batch_nb=78186, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  46%|████▌     | 81575/179200 [1:40:42<2:06:58, 12.81batch/s, batch_nb=80294, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  47%|████▋     | 83741/179200 [1:43:32<1:46:48, 14.89batch/s, batch_nb=82460, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  48%|████▊     | 85795/179200 [1:46:18<2:14:15, 11.60batch/s, batch_nb=84514, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  49%|████▉     | 88127/179200 [1:49:32<1:56:40, 13.01batch/s, batch_nb=86846, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  50%|████▉     | 89280/179200 [1:51:07<1:42:54, 14.56batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 2:  50%|████▉     | 89289/179200 [1:51:07<1:17:20, 19.38batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  50%|████▉     | 89298/179200 [1:51:08<59:15, 25.29batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]  
Epoch 2:  50%|████▉     | 89307/179200 [1:51:08<46:42, 32.07batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  50%|████▉     | 89316/179200 [1:51:08<37:48, 39.63batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  50%|████▉     | 89325/179200 [1:51:08<31:34, 47.43batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  50%|████▉     | 89334/179200 [1:51:08<27:19, 54.82batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  50%|████▉     | 89343/179200 [1:51:08<24:24, 61.36batch/s, batch_nb=87999, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  50%|████▉     | 89353/

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  52%|█████▏    | 93351/179200 [1:56:25<1:36:11, 14.87batch/s, batch_nb=91750, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  52%|█████▏    | 93415/179200 [1:56:30<2:06:36, 11.29batch/s, batch_nb=91814, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  55%|█████▍    | 97695/179200 [2:02:28<1:55:01, 11.81batch/s, batch_nb=96094, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  56%|█████▌    | 99787/179200 [2:05:24<1:35:34, 13.85batch/s, batch_nb=98186, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  57%|█████▋    | 102535/179200 [2:09:18<1:48:45, 11.75batch/s, batch_nb=100934, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  57%|█████▋    | 102879/179200 [2:09:46<1:30:29, 14.06batch/s, batch_nb=101278, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  60%|█████▉    | 107067/179200 [2:15:41<1:35:21, 12.61batch/s, batch_nb=105466, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  60%|█████▉    | 107200/179200 [2:15:53<1:26:45, 13.83batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 2:  60%|█████▉    | 107209/179200 [2:15:53<1:04:56, 18.48batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  60%|█████▉    | 107218/179200 [2:15:53<49:34, 24.20batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]  
Epoch 2:  60%|█████▉    | 107227/179200 [2:15:53<38:44, 30.96batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  60%|█████▉    | 107237/179200 [2:15:53<31:04, 38.59batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  60%|█████▉    | 107246/179200 [2:15:53<25:51, 46.38batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  60%|█████▉    | 107255/179200 [2:15:53<22:12, 53.99batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  60%|█████▉    | 107264/179200 [2:15:53<19:46, 60.61batch/s, batch_nb=105599, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  60%|██

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  62%|██████▏   | 111591/179200 [2:21:47<1:16:35, 14.71batch/s, batch_nb=109670, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  63%|██████▎   | 113671/179200 [2:24:43<1:15:07, 14.54batch/s, batch_nb=111750, gpu=0, loss=0.066, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  64%|██████▍   | 115251/179200 [2:26:58<1:14:29, 14.31batch/s, batch_nb=113330, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  67%|██████▋   | 119291/179200 [2:32:46<1:08:58, 14.48batch/s, batch_nb=117370, gpu=0, loss=0.065, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  68%|██████▊   | 121375/179200 [2:35:48<1:31:07, 10.58batch/s, batch_nb=119454, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  69%|██████▉   | 123635/179200 [2:39:03<1:27:39, 10.56batch/s, batch_nb=121714, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  70%|██████▉   | 125120/179200 [2:41:11<1:03:19, 14.23batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 2:  70%|██████▉   | 125129/179200 [2:41:11<47:35, 18.93batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]  
Epoch 2:  70%|██████▉   | 125138/179200 [2:41:11<36:22, 24.78batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  70%|██████▉   | 125147/179200 [2:41:11<28:27, 31.65batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  70%|██████▉   | 125156/179200 [2:41:12<22:55, 39.30batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  70%|██████▉   | 125166/179200 [2:41:12<18:58, 47.44batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  70%|██████▉   | 125176/179200 [2:41:12<16:18, 55.24batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  70%|██████▉   | 125185/179200 [2:41:12<14:25, 62.40batch/s, batch_nb=123199, gpu=0, loss=0.064, v_nb=55]
Epoch 2:  70%|████

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  72%|███████▏  | 128640/179200 [2:45:54<56:55, 14.80batch/s, batch_nb=126399, gpu=0, loss=0.063, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  73%|███████▎  | 130667/179200 [2:48:49<1:03:22, 12.76batch/s, batch_nb=128426, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  74%|███████▍  | 132679/179200 [2:51:46<58:47, 13.19batch/s, batch_nb=130438, gpu=0, loss=0.065, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  75%|███████▌  | 134721/179200 [2:54:45<52:45, 14.05batch/s, batch_nb=132480, gpu=0, loss=0.065, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  76%|███████▌  | 136155/179200 [2:56:53<1:06:18, 10.82batch/s, batch_nb=133914, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  78%|███████▊  | 140195/179200 [3:02:39<54:47, 11.87batch/s, batch_nb=137954, gpu=0, loss=0.065, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  79%|███████▉  | 142261/179200 [3:05:29<38:26, 16.02batch/s, batch_nb=140020, gpu=0, loss=0.064, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  80%|███████▉  | 142927/179200 [3:06:20<48:37, 12.43batch/s, batch_nb=140686, gpu=0, loss=0.064, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 2:  80%|███████▉  | 143040/179200 [3:06:28<40:35, 14.85batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]  
Epoch 2:  80%|███████▉  | 143044/179200 [3:06:29<32:21, 18.63batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  80%|███████▉  | 143055/179200 [3:06:29<24:23, 24.69batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  80%|███████▉  | 143065/179200 [3:06:29<18:52, 31.89batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  80%|███████▉  | 143076/179200 [3:06:29<14:59, 40.15batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  80%|███████▉  | 143087/179200 [3:06:29<12:15, 49.11batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  80%|███████▉  | 143098/179200 [3:06:29<10:19, 58.23batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  80%|███████▉  | 143109/179200 [3:06:29<08:59, 66.92batch/s, batch_nb=140799, gpu=0, loss=0.066, v_nb=55]
Epoch 2:  80%|███████▉  | 143120/179200 [3:06:29<08:02, 74.72batch/s, batch_nb

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  84%|████████▎ | 149655/179200 [3:14:37<36:31, 13.48batch/s, batch_nb=147094, gpu=0, loss=0.065, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  85%|████████▍ | 152311/179200 [3:18:02<28:51, 15.53batch/s, batch_nb=149750, gpu=0, loss=0.064, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  86%|████████▌ | 154339/179200 [3:20:39<28:14, 14.67batch/s, batch_nb=151778, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  87%|████████▋ | 156441/179200 [3:23:22<24:04, 15.76batch/s, batch_nb=153880, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  89%|████████▊ | 158779/179200 [3:26:24<25:58, 13.10batch/s, batch_nb=156218, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  90%|████████▉ | 160855/179200 [3:29:06<23:18, 13.11batch/s, batch_nb=158294, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  90%|████████▉ | 160960/179200 [3:29:14<22:31, 13.49batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 160964/179200 [3:29:14<17:42, 17.16batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 160975/179200 [3:29:14<13:16, 22.89batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 160986/179200 [3:29:14<10:10, 29.86batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 160997/179200 [3:29:14<07:59, 37.96batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 161008/179200 [3:29:14<06:27, 46.90batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 161019/179200 [3:29:14<05:24, 56.09batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 161030/179200 [3:29:14<04:39, 65.05batch/s, batch_nb=158399, gpu=0, loss=0.063, v_nb=55]
Epoch 2:  90%|████████▉ | 161041/179200 [3:29:15<04:07, 73.26batch/s, batch_nb=1

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  93%|█████████▎| 165767/179200 [3:35:11<16:21, 13.68batch/s, batch_nb=162886, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  94%|█████████▍| 168059/179200 [3:38:12<12:38, 14.68batch/s, batch_nb=165178, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  95%|█████████▍| 170075/179200 [3:40:51<12:31, 12.15batch/s, batch_nb=167194, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  96%|█████████▌| 172415/179200 [3:43:55<08:33, 13.22batch/s, batch_nb=169534, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  97%|█████████▋| 174560/179200 [3:46:44<05:36, 13.77batch/s, batch_nb=171679, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2:  99%|█████████▊| 176571/179200 [3:49:23<02:49, 15.55batch/s, batch_nb=173690, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2: 100%|█████████▉| 178621/179200 [3:52:06<00:39, 14.73batch/s, batch_nb=175740, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 2: 100%|█████████▉| 178880/179200 [3:52:26<00:21, 14.58batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178884/179200 [3:52:26<00:17, 18.42batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178895/179200 [3:52:26<00:12, 24.43batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178906/179200 [3:52:27<00:09, 31.67batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178917/179200 [3:52:27<00:07, 39.96batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178928/179200 [3:52:27<00:05, 48.90batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178939/179200 [3:52:27<00:04, 58.03batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178949/179200 [3:52:27<00:03, 66.11batch/s, batch_nb=175999, gpu=0, loss=0.065, v_nb=55]
Epoch 2: 100%|█████████▉| 178960/179200 [3:52:27<00:03, 74.13batch/s, batch_nb=1

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:   3%|▎         | 4915/179200 [06:30<4:13:37, 11.45batch/s, batch_nb=4914, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:   4%|▍         | 6915/179200 [09:08<3:58:19, 12.05batch/s, batch_nb=6914, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:   5%|▍         | 8947/179200 [11:50<3:30:48, 13.46batch/s, batch_nb=8946, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:   5%|▌         | 9591/179200 [12:41<3:13:43, 14.59batch/s, batch_nb=9590, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:   8%|▊         | 14131/179200 [18:46<3:07:11, 14.70batch/s, batch_nb=14130, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:   9%|▊         | 15427/179200 [20:29<3:29:38, 13.02batch/s, batch_nb=15426, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  10%|▉         | 17600/179200 [23:23<2:53:22, 15.53batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  10%|▉         | 17604/179200 [23:23<2:18:17, 19.47batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  10%|▉         | 17615/179200 [23:24<1:44:39, 25.73batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  10%|▉         | 17626/179200 [23:24<1:21:07, 33.19batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  10%|▉         | 17637/179200 [23:24<1:04:37, 41.67batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  10%|▉         | 17648/179200 [23:24<53:04, 50.74batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]  
Epoch 3:  10%|▉         | 17659/179200 [23:24<44:59, 59.85batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  10%|▉         | 17670/179200 [23:24<39:20, 68.44batch/s, batch_nb=17599, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  10%|▉         | 17681/179200 [23:24<35:23, 76.08batch/s, batch_nb=17599, gpu=0, loss=0.062

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  12%|█▏        | 21995/179200 [28:57<3:43:44, 11.71batch/s, batch_nb=21674, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  13%|█▎        | 23680/179200 [31:13<3:07:18, 13.84batch/s, batch_nb=23359, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  15%|█▌        | 27695/179200 [36:38<3:17:58, 12.75batch/s, batch_nb=27374, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  16%|█▌        | 28511/179200 [37:44<3:01:48, 13.81batch/s, batch_nb=28190, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  18%|█▊        | 32575/179200 [43:19<3:29:59, 11.64batch/s, batch_nb=32254, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  19%|█▊        | 33171/179200 [44:06<2:48:08, 14.48batch/s, batch_nb=32850, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  20%|█▉        | 35520/179200 [47:28<2:51:35, 13.96batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  20%|█▉        | 35530/179200 [47:28<2:07:21, 18.80batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  20%|█▉        | 35541/179200 [47:28<1:36:10, 24.90batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  20%|█▉        | 35552/179200 [47:28<1:14:18, 32.22batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  20%|█▉        | 35563/179200 [47:28<58:57, 40.60batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]  
Epoch 3:  20%|█▉        | 35574/179200 [47:28<48:23, 49.47batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  20%|█▉        | 35585/179200 [47:28<40:56, 58.47batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  20%|█▉        | 35595/179200 [47:28<36:18, 65.91batch/s, batch_nb=35199, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  20%|█▉        | 35606/179200 [47:29<32:26, 73.76batch/s, batch_nb=35199, gpu=0, loss=0.062, 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  23%|██▎       | 41535/179200 [55:31<3:27:02, 11.08batch/s, batch_nb=40894, gpu=0, loss=0.064, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  24%|██▍       | 43567/179200 [58:24<3:15:00, 11.59batch/s, batch_nb=42926, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  26%|██▌       | 45751/179200 [1:01:26<3:03:39, 12.11batch/s, batch_nb=45110, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  27%|██▋       | 47831/179200 [1:04:19<2:33:04, 14.30batch/s, batch_nb=47190, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  28%|██▊       | 49755/179200 [1:07:02<3:52:22,  9.28batch/s, batch_nb=49114, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  30%|██▉       | 53440/179200 [1:12:11<2:18:45, 15.10batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  30%|██▉       | 53449/179200 [1:12:11<1:44:15, 20.10batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  30%|██▉       | 53458/179200 [1:12:11<1:19:59, 26.20batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  30%|██▉       | 53468/179200 [1:12:11<1:02:51, 33.33batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  30%|██▉       | 53479/179200 [1:12:11<50:14, 41.70batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]  
Epoch 3:  30%|██▉       | 53489/179200 [1:12:11<41:42, 50.24batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  30%|██▉       | 53499/179200 [1:12:11<36:10, 57.93batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  30%|██▉       | 53509/179200 [1:12:11<32:01, 65.40batch/s, batch_nb=52799, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  30%|██▉       | 53520/179200 [1:12:11<28:29, 73.52batch/s, batch_nb=52799, g

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  32%|███▏      | 57241/179200 [1:17:09<2:10:23, 15.59batch/s, batch_nb=56280, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  33%|███▎      | 59387/179200 [1:20:09<3:01:33, 11.00batch/s, batch_nb=58426, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  33%|███▎      | 59901/179200 [1:20:52<2:10:58, 15.18batch/s, batch_nb=58940, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  35%|███▌      | 63519/179200 [1:25:59<2:27:11, 13.10batch/s, batch_nb=62558, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  37%|███▋      | 66811/179200 [1:30:38<2:12:07, 14.18batch/s, batch_nb=65850, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  40%|███▉      | 70907/179200 [1:36:31<2:24:30, 12.49batch/s, batch_nb=69946, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  40%|███▉      | 71360/179200 [1:37:10<2:01:40, 14.77batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 3:  40%|███▉      | 71370/179200 [1:37:10<1:30:46, 19.80batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]
Epoch 3:  40%|███▉      | 71381/179200 [1:37:11<1:08:47, 26.12batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]
Epoch 3:  40%|███▉      | 71392/179200 [1:37:11<53:22, 33.66batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]  
Epoch 3:  40%|███▉      | 71402/179200 [1:37:11<42:51, 41.93batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]
Epoch 3:  40%|███▉      | 71412/179200 [1:37:11<35:38, 50.42batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]
Epoch 3:  40%|███▉      | 71422/179200 [1:37:11<30:39, 58.58batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]
Epoch 3:  40%|███▉      | 71432/179200 [1:37:11<27:02, 66.42batch/s, batch_nb=70399, gpu=0, loss=0.064, v_nb=55]
Epoch 3:  40%|███▉      | 7144

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  43%|████▎     | 77141/179200 [1:45:24<1:50:58, 15.33batch/s, batch_nb=75860, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  43%|████▎     | 77901/179200 [1:46:30<1:54:32, 14.74batch/s, batch_nb=76620, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  45%|████▌     | 81295/179200 [1:51:27<2:36:19, 10.44batch/s, batch_nb=80014, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  48%|████▊     | 85407/179200 [1:57:29<2:05:27, 12.46batch/s, batch_nb=84126, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  49%|████▉     | 87661/179200 [2:00:49<1:41:28, 15.03batch/s, batch_nb=86380, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  50%|████▉     | 89280/179200 [2:03:14<1:51:06, 13.49batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]
Epoch 3:  50%|████▉     | 89283/179200 [2:03:14<1:29:22, 16.77batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]
Epoch 3:  50%|████▉     | 89293/179200 [2:03:14<1:07:08, 22.32batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]
Epoch 3:  50%|████▉     | 89303/179200 [2:03:14<51:31, 29.08batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]  
Epoch 3:  50%|████▉     | 89313/179200 [2:03:14<40:41, 36.82batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]
Epoch 3:  50%|████▉     | 89324/179200 [2:03:14<32:56, 45.48batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]
Epoch 3:  50%|████▉     | 89334/179200 [2:03:14<27:37, 54.21batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]
Epoch 3:  50%|████▉     | 89345/179200 [2:03:14<23:45, 63.04batch/s, batch_nb=87999, gpu=0, loss=0.063, v_nb=55]
Epoch 3:  50%|████▉     | 89356/179200 [2:03:15<21:06, 70.92batch/s, batch_nb=87999, gpu

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  52%|█████▏    | 93139/179200 [2:08:33<1:42:56, 13.93batch/s, batch_nb=91538, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  53%|█████▎    | 95355/179200 [2:11:50<1:57:31, 11.89batch/s, batch_nb=93754, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  54%|█████▍    | 97281/179200 [2:14:40<1:33:27, 14.61batch/s, batch_nb=95680, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  57%|█████▋    | 101447/179200 [2:20:57<1:55:32, 11.22batch/s, batch_nb=99846, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  58%|█████▊    | 103561/179200 [2:24:19<1:28:21, 14.27batch/s, batch_nb=101960, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  59%|█████▉    | 105661/179200 [2:27:38<1:25:43, 14.30batch/s, batch_nb=104060, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  60%|█████▉    | 107200/179200 [2:30:02<1:26:52, 13.81batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 3:  60%|█████▉    | 107210/179200 [2:30:02<1:04:50, 18.50batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  60%|█████▉    | 107220/179200 [2:30:03<49:10, 24.39batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]  
Epoch 3:  60%|█████▉    | 107230/179200 [2:30:03<38:07, 31.46batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  60%|█████▉    | 107240/179200 [2:30:03<30:25, 39.41batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  60%|█████▉    | 107250/179200 [2:30:03<25:06, 47.77batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  60%|█████▉    | 107260/179200 [2:30:03<21:24, 56.00batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  60%|█████▉    | 107270/179200 [2:30:03<18:46, 63.87batch/s, batch_nb=105599, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  60%|██

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  61%|██████    | 109407/179200 [2:33:04<1:39:01, 11.75batch/s, batch_nb=107486, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  63%|██████▎   | 112661/179200 [2:38:09<1:18:04, 14.20batch/s, batch_nb=110740, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  65%|██████▌   | 117271/179200 [2:45:24<1:12:26, 14.25batch/s, batch_nb=115350, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  66%|██████▌   | 118151/179200 [2:46:48<1:11:44, 14.18batch/s, batch_nb=116230, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  68%|██████▊   | 120981/179200 [2:51:15<1:15:35, 12.84batch/s, batch_nb=119060, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  70%|██████▉   | 125120/179200 [2:57:58<1:07:03, 13.44batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  70%|██████▉   | 125129/179200 [2:57:58<49:56, 18.04batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]  
Epoch 3:  70%|██████▉   | 125139/179200 [2:57:58<37:45, 23.86batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  70%|██████▉   | 125149/179200 [2:57:58<29:17, 30.76batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  70%|██████▉   | 125159/179200 [2:57:58<23:18, 38.64batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  70%|██████▉   | 125168/179200 [2:57:59<19:19, 46.62batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  70%|██████▉   | 125178/179200 [2:57:59<16:27, 54.72batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  70%|██████▉   | 125188/179200 [2:57:59<14:24, 62.49batch/s, batch_nb=123199, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  70%|██████▉   | 125198/179200 [2:57:59<12:56, 69.52batch/s, batch_

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  71%|███████   | 126759/179200 [3:00:12<1:12:15, 12.09batch/s, batch_nb=124518, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 3:  74%|███████▎  | 131739/179200 [3:08:13<1:00:53, 12.99batch/s, batch_nb=129498, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  75%|███████▍  | 133935/179200 [3:11:46<1:15:28, 10.00batch/s, batch_nb=131694, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  76%|███████▌  | 135959/179200 [3:15:02<58:31, 12.31batch/s, batch_nb=133718, gpu=0, loss=0.063, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  77%|███████▋  | 138171/179200 [3:18:39<50:12, 13.62batch/s, batch_nb=135930, gpu=0, loss=0.062, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  79%|███████▊  | 140739/179200 [3:22:48<50:00, 12.82batch/s, batch_nb=138498, gpu=0, loss=0.061, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  80%|███████▉  | 143040/179200 [3:26:32<44:45, 13.47batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]  
Epoch 3:  80%|███████▉  | 143049/179200 [3:26:33<33:19, 18.08batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  80%|███████▉  | 143059/179200 [3:26:33<25:13, 23.88batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  80%|███████▉  | 143069/179200 [3:26:33<19:29, 30.88batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  80%|███████▉  | 143079/179200 [3:26:33<15:30, 38.83batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  80%|███████▉  | 143089/179200 [3:26:33<12:43, 47.30batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  80%|███████▉  | 143098/179200 [3:26:33<10:55, 55.10batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  80%|███████▉  | 143108/179200 [3:26:33<09:30, 63.24batch/s, batch_nb=140799, gpu=0, loss=0.061, v_nb=55]
Epoch 3:  80%|███████▉  | 143117/179200 [3:26:33<08:52, 67.75batch/s, batch_nb

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  81%|████████▏ | 145699/179200 [3:30:27<42:25, 13.16batch/s, batch_nb=143138, gpu=0, loss=0.061, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  83%|████████▎ | 148071/179200 [3:34:16<42:28, 12.21batch/s, batch_nb=145510, gpu=0, loss=0.062, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  84%|████████▍ | 150551/179200 [3:38:17<34:06, 14.00batch/s, batch_nb=147990, gpu=0, loss=0.061, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  85%|████████▌ | 152939/179200 [3:42:10<32:59, 13.26batch/s, batch_nb=150378, gpu=0, loss=0.063, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  87%|████████▋ | 155141/179200 [3:45:44<29:20, 13.67batch/s, batch_nb=152580, gpu=0, loss=0.062, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  88%|████████▊ | 157741/179200 [3:49:59<26:15, 13.62batch/s, batch_nb=155180, gpu=0, loss=0.064, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  90%|████████▉ | 160851/179200 [3:55:04<22:19, 13.70batch/s, batch_nb=158290, gpu=0, loss=0.061, v_nb=55]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  90%|████████▉ | 160960/179200 [3:55:14<22:44, 13.37batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 3:  90%|████████▉ | 160970/179200 [3:55:14<16:54, 17.97batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  90%|████████▉ | 160980/179200 [3:55:15<12:46, 23.78batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  90%|████████▉ | 160990/179200 [3:55:15<09:52, 30.74batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  90%|████████▉ | 161000/179200 [3:55:15<07:51, 38.60batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  90%|████████▉ | 161010/179200 [3:55:15<06:27, 46.93batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  90%|████████▉ | 161020/179200 [3:55:15<05:28, 55.38batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  90%|████████▉ | 161030/179200 [3:55:15<04:46, 63.36batch/s, batch_nb=158399, gpu=0, loss=0.062, v_nb=55]
Epoch 3:  90%|████████

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  93%|█████████▎| 166471/179200 [4:03:47<15:19, 13.85batch/s, batch_nb=163590, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  94%|█████████▍| 168719/179200 [4:07:26<13:29, 12.94batch/s, batch_nb=165838, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  96%|█████████▌| 171255/179200 [4:11:36<13:57,  9.48batch/s, batch_nb=168374, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  97%|█████████▋| 174255/179200 [4:16:29<07:08, 11.55batch/s, batch_nb=171374, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3:  98%|█████████▊| 176336/179200 [4:19:44<04:13, 11.31batch/s, batch_nb=173455, gpu=0, loss=0.063, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3: 100%|█████████▉| 178431/179200 [4:23:01<00:58, 13.07batch/s, batch_nb=175550, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 3: 100%|█████████▉| 178880/179200 [4:23:45<00:23, 13.43batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 3: 100%|█████████▉| 178890/179200 [4:23:45<00:17, 18.10batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Epoch 3: 100%|█████████▉| 178901/179200 [4:23:45<00:12, 24.03batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Epoch 3: 100%|█████████▉| 178911/179200 [4:23:45<00:09, 30.94batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Epoch 3: 100%|█████████▉| 178922/179200 [4:23:45<00:07, 39.07batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Epoch 3: 100%|█████████▉| 178933/179200 [4:23:45<00:05, 48.03batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Epoch 3: 100%|█████████▉| 178944/179200 [4:23:45<00:04, 57.16batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Epoch 3: 100%|█████████▉| 178955/179200 [4:23:45<00:03, 66.00batch/s, batch_nb=175999, gpu=0, loss=0.061, v_nb=55]
Epoch 3: 100%|████████

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:   3%|▎         | 4579/179200 [07:21<3:42:03, 13.11batch/s, batch_nb=4578, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:   4%|▎         | 6675/179200 [10:42<4:55:07,  9.74batch/s, batch_nb=6674, gpu=0, loss=0.062, v_nb=55] 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:   5%|▍         | 8887/179200 [14:14<4:04:51, 11.59batch/s, batch_nb=8886, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:   6%|▋         | 11641/179200 [18:36<3:24:57, 13.63batch/s, batch_nb=11640, gpu=0, loss=0.060, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:   8%|▊         | 13991/179200 [22:18<3:09:33, 14.53batch/s, batch_nb=13990, gpu=0, loss=0.062, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:   9%|▉         | 16456/179200 [26:12<4:34:36,  9.88batch/s, batch_nb=16455, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  10%|▉         | 17600/179200 [28:00<3:29:10, 12.88batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  10%|▉         | 17610/179200 [28:01<2:34:37, 17.42batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  10%|▉         | 17621/179200 [28:01<1:56:03, 23.20batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  10%|▉         | 17632/179200 [28:01<1:29:03, 30.24batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  10%|▉         | 17643/179200 [28:01<1:10:12, 38.35batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  10%|▉         | 17654/179200 [28:01<57:00, 47.22batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]  
Epoch 4:  10%|▉         | 17665/179200 [28:01<47:46, 56.36batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  10%|▉         | 17676/179200 [28:01<41:18, 65.17batch/s, batch_nb=17599, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  10%|▉         | 17687/179200 [28:01<36:45, 73.24batch/s, batch_nb=17599, gpu=0, loss=0.061

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 4:  13%|█▎        | 22859/179200 [36:01<3:20:11, 13.02batch/s, batch_nb=22538, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  14%|█▍        | 24935/179200 [39:30<4:40:38,  9.16batch/s, batch_nb=24614, gpu=0, loss=0.060, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  15%|█▌        | 27261/179200 [43:19<2:54:08, 14.54batch/s, batch_nb=26940, gpu=0, loss=0.059, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  17%|█▋        | 29999/179200 [47:47<3:12:54, 12.89batch/s, batch_nb=29678, gpu=0, loss=0.060, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  18%|█▊        | 32161/179200 [51:18<2:42:43, 15.06batch/s, batch_nb=31840, gpu=0, loss=0.060, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  19%|█▉        | 34320/179200 [54:50<2:46:22, 14.51batch/s, batch_nb=33999, gpu=0, loss=0.060, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  20%|█▉        | 35520/179200 [56:48<3:29:23, 11.44batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  20%|█▉        | 35524/179200 [56:49<2:41:41, 14.81batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  20%|█▉        | 35534/179200 [56:49<2:00:22, 19.89batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  20%|█▉        | 35545/179200 [56:49<1:31:14, 26.24batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  20%|█▉        | 35556/179200 [56:49<1:10:55, 33.75batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  20%|█▉        | 35567/179200 [56:49<56:44, 42.18batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]  
Epoch 4:  20%|█▉        | 35578/179200 [56:49<46:45, 51.19batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  20%|█▉        | 35589/179200 [56:49<39:40, 60.33batch/s, batch_nb=35199, gpu=0, loss=0.061, v_nb=55]
Epoch 4:  20%|█▉        | 35600/179200 [56:49<34:43, 68.92batch/s, batch_nb=35199, gpu=0, loss=0.061

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  22%|██▏       | 40111/179200 [1:03:58<2:55:37, 13.20batch/s, batch_nb=39470, gpu=0, loss=0.060, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  24%|██▎       | 42279/179200 [1:07:38<3:09:50, 12.02batch/s, batch_nb=41638, gpu=0, loss=0.061, v_nb=55]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 4:  25%|██▍       | 44081/179200 [1:10:40<2:43:49, 13.75batch/s, batch_nb=43440, gpu=0, loss=0.060, v_nb=55]

In [10]:
# Run the trainer's test loop on `exp` (presumably the Experiment instance
# created in an earlier cell — confirm).
trainer.test(exp)

Testing: 100%|██████████| 18201/18201 [09:36<00:00, 31.57batch/s]


In [None]:
# TODO: find the memory leak (loading the train dataset on every run)
# TODO: try torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
# TODO: try label smoothing

In [None]:
# Reference snippet: post-hoc re-scaling ("RS") of the final classifier weights
# for class-imbalanced training, evaluated before and after the re-scaling.
# Paper: https://arxiv.org/pdf/1912.01857.pdf
# Code:  https://github.com/feidfoe/AdjustBnd4Imbalance/blob/master/cifar.py
#
# NOTE(review): `args`, `test`, `testloader`, `model`, `criterion`,
# `start_epoch`, `use_cuda` and `num_classes` are not defined in the visible
# part of this notebook; they presumably come from the linked script and must
# be provided before this cell can run.
gamma = 0.1  # hparams for re_scaling https://arxiv.org/pdf/1912.01857.pdf
if args.evaluate:
    print('\nEvaluation only')

    # Baseline: evaluate with the classifier weights exactly as trained.
    test_loss, test_acc = test(testloader, model, criterion,
                               start_epoch, use_cuda)
    print('[w/o RS] Test Loss: %.8f, Test Acc: %.2f%%' % (test_loss, test_acc))

    current_state = model.state_dict()
    W = current_state['module.fc.weight']

    # Per-class sample counts, decaying geometrically from `img_max` (class 0)
    # down to `img_max / args.imbalance` (last class).
    # 50000 is presumably the CIFAR training-set size (source is cifar.py) — confirm.
    imb_factor = 1. / args.imbalance
    img_max = 50000 / num_classes
    num_sample = [img_max * (imb_factor ** (i / (num_classes - 1)))
                  for i in range(num_classes)]

    # Normalise the counts by the largest class and raise them to `gamma`.
    largest = max(num_sample)
    ns = [(float(n) / largest) ** gamma for n in num_sample]

    # Shape (num_classes, 1) so the division applies one factor per row of W
    # (assumes W has one row per class — TODO confirm).
    # Created on W's device instead of a hard-coded .cuda() so the snippet
    # also works when the model lives on the CPU.
    ns = torch.tensor(ns, dtype=torch.float32, device=W.device).unsqueeze(-1)
    new_W = W / ns

    current_state['module.fc.weight'] = new_W
    model.load_state_dict(current_state)

    # Evaluate again with the re-scaled classifier weights.
    test_loss, test_acc = test(testloader, model, criterion,
                               start_epoch, use_cuda)
    print('[w/  RS] Test Loss: %.8f, Test Acc: %.2f%%' % (test_loss, test_acc))

    # The original snippet ended with a bare `return` (it was lifted from inside
    # a function); at cell level that is a SyntaxError, and nothing follows the
    # evaluation here, so it is simply dropped.