In [1]:
# basic imports
import numpy as np
import sys,os,h5py,math

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Sampler
import pytorch_lightning as pl

# Personal imports
sys.path.append(os.path.dirname(os.getcwd())) #add parent folder to PATH
import lib.models as models
from lib.metrics import accuracy,weighted_aucs

chpt_path = None

## DanQ

In [2]:
class CustomDataset(Dataset):
    """Map-style dataset over a pair of indexable tensors.

    Item ``i`` is ``(x[i], y[i])`` with the input cast via ``.long()`` and the
    label cast via ``.float()``.
    """

    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def __len__(self):
        # Length is driven by the inputs only; `y` is assumed to match.
        return len(self.x)

    def __getitem__(self, i):
        sample = self.x[i]
        label = self.y[i]
        return sample.long(), label.float()

In [3]:
class Experiment(pl.LightningModule):
    """LightningModule that trains and evaluates ``models.DanQ``.

    The loss is ``nn.BCEWithLogitsLoss``, i.e. the model output is treated as
    independent per-class logits. All data is read from fixed paths under
    ``../data/Processed``.

    Args:
        bs: batch size used by the train/val/test dataloaders.
    """

    def __init__(self,bs):
        super(Experiment, self).__init__()
        self.model = models.DanQ()
        self.bs = bs #batch size
        self.loss_fn = nn.BCEWithLogitsLoss()
        # NOTE(review): torch.load unpickles the file - only load trusted artifacts.
        self.class_weights = torch.load('../data/Processed/class_weights')

    def forward(self, x):
        """Return the raw model output (logits) for a batch of inputs."""
        return self.model(x)

    def training_step(self, batch, batch_nb):
        """Compute the BCE-with-logits loss for one training batch."""
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat, y)
        tensorboard_logs = {}#{'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch."""
        x, y = batch
        y_hat = self.forward(x)
        return {'val_loss': self.loss_fn(y_hat, y)}

    def validation_end(self, outputs):
        """Average the per-batch validation losses."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {}#{'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def test_step(self, batch, batch_idx):
        """Compute loss and per-class probabilities for one test batch."""
        x, y = batch
        y_hat = self.forward(x)
        # The loss is BCEWithLogits (independent sigmoid per class), so the
        # matching probabilities come from sigmoid, not a softmax over classes.
        y_pred = torch.sigmoid(y_hat).detach().cpu()
        return {'test_loss': self.loss_fn(y_hat, y),'y_pred':y_pred, 'y_true':y.cpu()}

    def test_end(self, outputs):
        """Aggregate test batches into mean loss and class-weighted ROC/PR AUC."""
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        y_preds = torch.cat([x['y_pred'] for x in outputs])
        y_trues = torch.cat([x['y_true'] for x in outputs]).byte()
        # `weighted_aucs` is the helper imported from lib.metrics (the previous
        # name `weighted_auc` was never imported). The call - including the
        # .cuda() moves and the (roc_auc, pr_auc) return pair - mirrors how the
        # ResTransXL experiment in this notebook invokes it.
        roc_auc, pr_auc = weighted_aucs(y_preds.cuda(), y_trues.cuda(), self.class_weights.cuda())

        tensorboard_logs = {'test_loss': avg_loss,'roc_auc':roc_auc,'pr_auc':pr_auc}
        return {'avg_test_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Adam over all parameters with a fixed learning rate of 1e-4."""
        return torch.optim.Adam(self.parameters(), lr=1e-4)

    @pl.data_loader
    def train_dataloader(self):
        """Load the full training set into memory and return a shuffled loader."""
        print('Loading training dataset')
        # Open read-only and close the handle even if reading fails.
        with h5py.File('../data/Processed/train.hdf5', 'r') as train_h5:
            X_train = torch.tensor(train_h5['X_train'][:])
            y_train = torch.tensor(train_h5['y_train'][:])
        trn_ds = CustomDataset(X_train,y_train)
        trn_dl = DataLoader(trn_ds, batch_size=self.bs,shuffle=True, num_workers=6)
        return trn_dl

    @pl.data_loader
    def val_dataloader(self):
        """Load the validation arrays ('arr_0' inputs, 'arr_1' labels)."""
        # torch.tensor copies the arrays, so the archive can be closed right away.
        with np.load('../data/Processed/valid.npz') as valid:
            X_valid = torch.tensor(valid['arr_0'][:])
            y_valid = torch.tensor(valid['arr_1'][:])

        vld_ds = CustomDataset(X_valid,y_valid)
        vld_dl = DataLoader(vld_ds, batch_size=self.bs,shuffle=False, num_workers=6)
        return vld_dl

    @pl.data_loader
    def test_dataloader(self):
        """Load the test arrays ('arr_0' inputs, 'arr_1' labels)."""
        with np.load('../data/Processed/test.npz') as test:
            X_test = torch.tensor(test['arr_0'][:])
            y_test = torch.tensor(test['arr_1'][:])

        tst_ds = CustomDataset(X_test,y_test)
        tst_dl = DataLoader(tst_ds, batch_size=self.bs,shuffle=False, num_workers=6)
        return tst_dl

In [5]:
# DanQ smoke run: a single epoch on one GPU, with the train/val
# "percent check" settings below (0.01 and 0.5).
exp = Experiment(bs=512)

trainer = pl.Trainer(
    gpus=1,
    max_nb_epochs=1,
    train_percent_check=0.01,
    val_percent_check=0.5,
    default_save_path='../data',
    log_gpu_memory='min_max',
)
trainer.fit(exp)
# #trainer.test()

Loading training dataset


Epoch 1:  91%|█████████▏| 85/93 [00:20<00:01,  4.26batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Validating:   0%|          | 0/8 [00:00<?, ?batch/s][A
Epoch 1:  92%|█████████▏| 86/93 [00:21<00:02,  2.40batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1:  95%|█████████▍| 88/93 [00:21<00:01,  3.22batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1:  97%|█████████▋| 90/93 [00:21<00:00,  4.22batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1: 100%|██████████| 93/93 [00:22<00:00,  5.38batch/s, batch_nb=84, gpu=0, loss=0.133, v_nb=22]
Epoch 1: : 94batch [00:23,  4.06batch/s, batch_nb=85, gpu=0, loss=0.132, v_nb=22]                   


1

In [5]:
# Restore the DanQ weights saved by an earlier run and evaluate on the test loader.
chpt_path = '../data/lightning_logs/version_4/checkpoints/_ckpt_epoch_1.ckpt'
exp = Experiment(bs=512)
exp.load_state_dict(torch.load(chpt_path)['state_dict'])
# exp.cuda()
trainer = pl.Trainer(
    gpus=1,
    max_nb_epochs=1,
    train_percent_check=0.1,
    val_percent_check=0.5,
    test_percent_check=0.1,
    default_save_path='../data',
    log_gpu_memory='min_max',
)
trainer.test(exp)

Loading training dataset


Testing: 100%|██████████| 88/88 [00:08<00:00, 10.53batch/s]


## Transformer XL

In [2]:
import transformers as ts

In [3]:
class LMDataset(Dataset):
    """Language-modelling view over a 2-D token tensor.

    Items are not addressed by an integer but by the tuple
    ``(row, (start, stop), seq_start)`` produced by the companion sampler.
    The row is sliced to ``[start:stop]`` and split into next-token
    ``(input, target)`` pairs.

    The reported length assumes each row is cut into ``ceil(1000 / mem_len)``
    windows (the 1000 is hard-coded here).
    """

    def __init__(self, x, y, mem_len):
        super().__init__()
        self.x = x
        self.y = y
        windows_per_row = math.ceil(1000 / mem_len)
        self.n = x.shape[0] * windows_per_row

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        b_idx, seq_idxs, seq_start = i
        lo, hi = seq_idxs[0], seq_idxs[1]
        window = self.x[b_idx, lo:hi].long()
        # Inputs are every token but the last; targets are shifted by one.
        return window[:-1], window[1:], seq_start
    
class LMSampleR(Sampler):
    """Sampler yielding ``(row, (start, stop), seq_start)`` keys for ``LMDataset``.

    Rows are visited in groups of ``bs``. For each group, every window
    ``[j, j + mem_len + 1)`` with ``j`` in ``range(0, 1000, mem_len)`` is emitted
    for all ``bs`` rows before moving to the next window, so a DataLoader with
    ``batch_size=bs`` receives batches whose items share the same window.
    ``seq_start`` is True only for the first window (``j == 0``).

    Only complete groups of ``bs`` rows are emitted. The previous version kept
    counting ``bs`` rows in the last group even when fewer remained, producing
    row indices past the end of ``ds.x``. A partial group cannot be emitted
    either, since it would shift every following batch across window boundaries.
    Trailing rows that do not fill a group are therefore skipped.

    Args:
        ds: dataset exposing a 2-D ``x`` attribute (rows x tokens).
        bs: number of rows per group; should equal the DataLoader batch size.
        mem_len: window stride in tokens.
    """

    def __init__(self, ds, bs,mem_len):
        self.ds, self.bs = ds, bs
        self.mem_len = mem_len

    def _n_full_rows(self):
        """Number of rows covered by complete groups of ``bs``."""
        n_rows = self.ds.x.shape[0]
        return (n_rows // self.bs) * self.bs

    def __len__(self):
        # Must match what __iter__ yields: one key per (row, window).
        return self._n_full_rows() * math.ceil(1000 / self.mem_len)

    def __iter__(self):
        for i in range(0, self._n_full_rows(), self.bs):
            for j in range(0,1000,self.mem_len):
                # One extra token so the dataset can build shifted targets;
                # slicing past the row end simply yields a shorter window.
                seq_idxs = (j,j+self.mem_len+1)
                seq_start = j==0
                for k in range(self.bs):
                    yield (i+k,seq_idxs, seq_start) # (bs,seq_len)

In [4]:
class TransXL_LM(nn.Module):
    """Transformer-XL core followed by a linear language-model head.

    Args:
        cfg: configuration object passed to ``ts.TransfoXLModel``; its
            ``d_model`` and ``vocab_size`` size the output projection.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.core = ts.TransfoXLModel(cfg)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size)

    def forward(self, x, mems=None):
        """Return ``(logits, new_mems)`` for input ``x`` and optional memory."""
        core_out = self.core(x, mems)
        hidden, mems = core_out
        return self.lm_head(hidden), mems

In [5]:
class Experiment(pl.LightningModule):
    """LightningModule for next-token language modelling with ``TransXL_LM``.

    Batches come from ``LMDataset``/``LMSampleR`` as ``(input, target, new_mem)``,
    where ``new_mem`` flags the first window of a group of sequences. The
    memory returned by the model for one window is fed back in for the next
    window, and reset whenever ``new_mem`` is set.

    Args:
        bs: batch size (also the sampler's row-group size).
    """

    def __init__(self,bs):
        super(Experiment, self).__init__()
        self.cfg = ts.TransfoXLConfig(vocab_size=4, d_model=64, d_embed=8, n_head=4, d_head=16, d_inner=128,
                             n_layer=6, tgt_len=0, ext_len=0, mem_len=512, cutoffs=[1], )
        self.model = TransXL_LM(self.cfg)
        self.bs = bs #batch size
        self.loss_fn = nn.CrossEntropyLoss()
        # NOTE(review): torch.load unpickles the file - only load trusted artifacts.
        self.class_weights = torch.load('../data/Processed/class_weights')
        # Recurrence memory carried between consecutive windows; None = start fresh.
        self.mem = None

    def forward(self, x, mems=None):
        """Return ``(logits, new_mems)``; ``mems`` is forwarded to the model."""
        return self.model(x, mems)

    def _lm_step(self, batch):
        """Run one window through the model, carrying/resetting memory, and return the loss."""
        x, y, new_mem = batch
        # Previously the stored memory was never handed to the model, so every
        # window was processed without context. Reset on the first window of a
        # group, otherwise continue from the previous window's memory.
        mems = None if new_mem[0] else self.mem
        y_hat, self.mem = self.forward(x, mems)
        return self.loss_fn(y_hat.view(-1,self.cfg.vocab_size), y.view(-1))

    def training_step(self, batch, batch_nb):
        """Cross-entropy loss for one training window."""
        loss = self._lm_step(batch)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Cross-entropy loss for one validation window."""
        return {'val_loss': self._lm_step(batch)}

    def validation_end(self, outputs):
        """Average the per-batch validation losses."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    # TODO: test_step / test_end / test_dataloader are not implemented for the
    # LM experiment. The earlier commented-out drafts were copied from the
    # classification experiment and did not match the LM batch format.

    def configure_optimizers(self):
        """Adam over all parameters with a fixed learning rate of 1e-4."""
        return torch.optim.Adam(self.parameters(), lr=1e-4)

    @pl.data_loader
    def train_dataloader(self):
        """Load the training set into memory and return a sampler-driven loader."""
        print('Loading training dataset')
        # Open read-only and close the handle even if reading fails.
        with h5py.File('../data/Processed/train.hdf5', 'r') as train_h5:
            X_train = torch.tensor(train_h5['X_train'][:])
            y_train = torch.tensor(train_h5['y_train'][:])

        trn_ds = LMDataset(X_train,y_train,self.cfg.mem_len)
        splr   = LMSampleR(trn_ds,self.bs,self.cfg.mem_len)
        trn_dl = DataLoader(trn_ds, self.bs,sampler=splr,pin_memory=True)
        return trn_dl

    @pl.data_loader
    def val_dataloader(self):
        """Load the validation arrays and return a sampler-driven loader."""
        # torch.tensor copies the arrays, so the archive can be closed right away.
        with np.load('../data/Processed/valid.npz') as valid:
            X_valid = torch.tensor(valid['arr_0'][:])
            y_valid = torch.tensor(valid['arr_1'][:])

        vld_ds = LMDataset(X_valid,y_valid,self.cfg.mem_len)
        splr   = LMSampleR(vld_ds,self.bs,self.cfg.mem_len)
        vld_dl = DataLoader(vld_ds,self.bs,sampler=splr,pin_memory=True)
        return vld_dl

In [6]:
# Transformer-XL LM smoke run: a single epoch on one GPU, with the train/val
# "percent check" settings below (0.01 and 0.5).
exp = Experiment(bs=32)

trainer = pl.Trainer(
    gpus=1,
    max_nb_epochs=1,
    train_percent_check=0.01,
    val_percent_check=0.5,
    default_save_path='../data',
)

trainer.fit(exp)

Loading training dataset


Epoch 1:  25%|██▍       | 745/3000 [11:39<34:47,  1.08batch/s, batch_nb=744, gpu=0, loss=1.329, v_nb=14]

KeyboardInterrupt: 

In [None]:
#add acc
#dataloader slow when num_workers>0
#add shuffler to samplers
#apex https://github.com/adityaiitb/pyprof2
#resnet+transformerxl

## Resnet + TransformerXL

In [2]:
import transformers as ts

In [3]:
class CustomDataset(Dataset):
    """Pairs inputs ``x`` with labels ``y`` for use with a DataLoader.

    ``ds[i]`` returns ``(x[i].long(), y[i].float())``; ``len(ds)`` is ``len(x)``.
    """

    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        inputs = self.x[i].long()
        targets = self.y[i].float()
        return inputs, targets

In [4]:
# cyclic lr with restart
def lr_i(step_i,cycle_len = 1000, lrs = (3e-4,1e-5),warm_pct = 4/20):
    """Triangular cyclic learning rate with restarts.

    Within each cycle of ``cycle_len`` steps the rate rises linearly from
    ``lrs[1]`` to ``lrs[0]`` over the first ``int(cycle_len * warm_pct)`` steps,
    then decays linearly back towards ``lrs[1]`` over the rest of the cycle.

    Args:
        step_i: global step index.
        cycle_len: number of steps per cycle.
        lrs: ``(peak_lr, base_lr)``.
        warm_pct: fraction of the cycle spent warming up.

    Returns:
        The learning rate for ``step_i``.
    """
    peak_lr, base_lr = lrs[0], lrs[1]
    span = peak_lr - base_lr
    n_warm = int(cycle_len * warm_pct)
    pos = step_i % cycle_len  # position inside the current cycle

    if pos < n_warm:
        # Warm-up leg: base -> peak.
        return base_lr + span * (pos / n_warm)

    # Cool-down leg: peak -> base.
    n_cool = cycle_len - n_warm
    return peak_lr - span * ((pos - n_warm) / n_cool)
    
# import numpy as np
# import matplotlib.pyplot as plt

# x = np.linspace(0,50*30,10000)
# y = list(map(lr_i,x))
# plt.figure(figsize=(20,10))
# plt.ylim([0,4e-4])
# plt.scatter(x,y)

In [5]:
class Experiment(pl.LightningModule):
    """LightningModule for multi-label training/evaluation of an injected model.

    The model is expected to return a pair whose first element is the logits
    (the second element is ignored here). The loss is ``nn.BCEWithLogitsLoss``;
    validation/test additionally report class-weighted ROC/PR AUC and accuracy.

    Args:
        model: the network to train; called as ``model(x)``.
        hparams: mapping with keys ``'bs'`` (batch size) and ``'lr'`` (learning rate).
    """

    def __init__(self,model,hparams):
        super(Experiment, self).__init__()
        self.model = model
        self.bs = hparams['bs'] #batch size
        self.lr = hparams['lr'] #learning rate
        self.loss_fn = nn.BCEWithLogitsLoss()
        # NOTE(review): torch.load unpickles the file - only load trusted artifacts.
        self.class_weights = torch.load('../data/Processed/class_weights')

    def forward(self, x): return self.model(x)

    def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        """Apply linear LR warm-up for the first steps, then step and zero the optimizer."""
        # warm up lr: the scale grows linearly and saturates at 1.0 from step 199;
        # up to step 499 the rate is simply re-pinned to self.lr.
        if self.trainer.global_step < 500:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 200.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.lr

#         for pg in optimizer.param_groups:
#             pg['lr'] = lr_i(self.trainer.global_step)

        optimizer.step()
        optimizer.zero_grad()

    def training_step(self, batch, batch_nb):
        """BCE-with-logits loss for one training batch."""
        x, y = batch
        # The model's second output is never read anywhere in this class, so it
        # is dropped instead of being kept alive on `self` between steps.
        y_hat, _ = self.forward(x)
        loss = self.loss_fn(y_hat, y)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Loss plus sigmoid probabilities and targets for one validation batch."""
        x, y = batch
        y_hat, _ = self.forward(x)
        y_pred = torch.sigmoid(y_hat)
        return {'val_loss': self.loss_fn(y_hat, y),'y_pred':y_pred.cpu(), 'y_true':y.cpu()}

    def validation_end(self, outputs):
        """Aggregate validation batches into mean loss, weighted AUCs and accuracy."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        y_preds = torch.cat([x['y_pred'] for x in outputs])
        y_trues = torch.cat([x['y_true'] for x in outputs]).byte()

        roc_auc, pr_auc = weighted_aucs(y_preds.cuda(),y_trues.cuda(), self.class_weights.cuda())
        acc = accuracy(y_preds,y_trues)
        tensorboard_logs = {'val_loss': avg_loss,'valid_roc_auc':roc_auc,'valid_pr_auc':pr_auc,'valid_acc':acc}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def test_step(self, batch, batch_idx):
        """Loss plus sigmoid probabilities and targets for one test batch."""
        x, y = batch
        y_hat, _ = self.forward(x)
        y_pred = torch.sigmoid(y_hat)
        return {'test_loss': self.loss_fn(y_hat, y),'y_pred':y_pred.cpu(), 'y_true':y.cpu()}

    def test_end(self, outputs,save_preds=True):
        """Aggregate test batches; optionally keep predictions on ``self.preds``.

        Args:
            outputs: list of dicts returned by ``test_step``.
            save_preds: when True, store ``[y_preds, y_trues]`` on ``self.preds``.
        """
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        y_preds = torch.cat([x['y_pred'] for x in outputs])
        y_trues = torch.cat([x['y_true'] for x in outputs]).byte()
        if save_preds: self.preds=[y_preds,y_trues]

        roc_auc, pr_auc = weighted_aucs(y_preds.cuda(),y_trues.cuda(), self.class_weights.cuda())
        acc = accuracy(y_preds,y_trues)
        print('Test ROC AUC:',roc_auc.item(),'Test PR AUC:',pr_auc.item())
        tensorboard_logs = {'test_loss': avg_loss,'test_roc_auc':roc_auc,'test_pr_auc':pr_auc,'test_acc':acc}
        return {'avg_test_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """RMSprop over all parameters at the configured learning rate."""
        return torch.optim.RMSprop(self.parameters(), lr=self.lr)

    @staticmethod
    def _npz_dataset(path):
        """Read arrays 'arr_0' (inputs) and 'arr_1' (labels) from ``path`` into a CustomDataset."""
        # torch.tensor copies the arrays, so the archive can be closed right away.
        with np.load(path) as archive:
            inputs = torch.tensor(archive['arr_0'][:])
            labels = torch.tensor(archive['arr_1'][:])
        return CustomDataset(inputs, labels)

    @pl.data_loader
    def train_dataloader(self):
        """Load the full training set into memory and return a shuffled loader."""
        print('Loading training dataset')
        # Open read-only and close the handle even if reading fails.
        with h5py.File('../data/Processed/train.hdf5', 'r') as train_h5:
            X_train = torch.tensor(train_h5['X_train'][:])
            y_train = torch.tensor(train_h5['y_train'][:])

        trn_ds = CustomDataset(X_train,y_train)
        trn_dl = DataLoader(trn_ds, self.bs,pin_memory=True,shuffle=True)
        return trn_dl

    @pl.data_loader
    def val_dataloader(self):
        """Unshuffled loader over the validation set."""
        vld_ds = self._npz_dataset('../data/Processed/valid.npz')
        vld_dl = DataLoader(vld_ds,self.bs,pin_memory=True)
        return vld_dl

    @pl.data_loader
    def test_dataloader(self):
        """Unshuffled loader over the test set."""
        tst_ds = self._npz_dataset('../data/Processed/test.npz')
        tst_dl = DataLoader(tst_ds,self.bs,pin_memory=True)
        return tst_dl

In [6]:
#best t_xl model
# cfg = ts.TransfoXLConfig(vocab_size=4, d_embed=8,d_model=256, n_head=4, d_head=16, d_inner=256, 
#                          n_layer=6, tgt_len=0, ext_len=0, mem_len=256, cutoffs=[1], )

# model = models.ResTransXL(vocab_size=4, d_emb=64, tsfm_cfg=cfg, n_res_blocks=3, res_k=16, 
#                           skip_cnt=True, fc_h_dim=512, lin_p=0.5, WVN=True)
# chpt_path = '../data/lightning_logs/version_52/checkpoints/_ckpt_epoch_1.ckpt'
# use_amp = False
# model.summary()

In [7]:
# Run configuration for the ResNet + Transformer-XL (LSTM variant) model.
use_amp = True
chpt_path = '../data/lightning_logs/version_67/checkpoints/_ckpt_epoch_1.ckpt' #best LSTM model

cfg = ts.TransfoXLConfig(d_model=352)

model = models.ResTransXL(
    vocab_size=4,
    d_emb=64,
    tsfm_cfg=cfg,
    skip_cnt=True,
    fc_h_dim=925,
    n_res_blocks=3,
    res_k=16,
    LSTM=True,
    LSTM_p=0.25,
    res_p=0.1,
    lin_p=0.05,
)
model.summary()

Model parameters:				
Resnet part:		8894k
Transformer-XL part:	1988k
Linear part:		122952k
Total:			133835k


In [None]:
# Build the experiment, optionally warm-start from `chpt_path`, then train.
exp = Experiment(model,{'bs':25,'lr':1e-4})
if chpt_path:
    # Only the 'state_dict' entry of the saved checkpoint is restored.
    exp.load_state_dict(torch.load(chpt_path)['state_dict'])

trainer = pl.Trainer(
    gpus=1,
    fast_dev_run=False,
    max_nb_epochs=10,
    accumulate_grad_batches=4,
    train_percent_check=1,
    val_check_interval=0.1,
    use_amp=use_amp,
    default_save_path='../data',
)
trainer.fit(exp)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Loading training dataset


Epoch 1:   9%|▉         | 16007/179200 [18:12<2:57:43, 15.30batch/s, batch_nb=16006, gpu=0, loss=0.056, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  10%|▉         | 17600/179200 [20:02<2:59:12, 15.03batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  10%|▉         | 17602/179200 [20:02<2:36:03, 17.26batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  10%|▉         | 17610/179200 [20:02<1:59:49, 22.48batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  10%|▉         | 17618/179200 [20:03<1:34:33, 28.48batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  10%|▉         | 17626/179200 [20:03<1:16:43, 35.10batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  10%|▉         | 17635/179200 [20:03<1:03:18, 42.53batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  10%|▉         | 17644/179200 [20:03<54:19, 49.57batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]  
Epoch 1:  10%|▉         | 17652/179200 [20:03<48:57, 54.99batch/s, batch_nb=17599, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  10%|▉         | 17660/179200 [20:03<44:43, 60.20batch/s, batch_nb=17599, gpu=0, loss=0.0

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  12%|█▏        | 21527/179200 [24:25<2:55:00, 15.02batch/s, batch_nb=21206, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  13%|█▎        | 23911/179200 [27:13<2:53:20, 14.93batch/s, batch_nb=23590, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  14%|█▍        | 25927/179200 [29:35<2:53:53, 14.69batch/s, batch_nb=25606, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  16%|█▌        | 28859/179200 [33:01<2:47:09, 14.99batch/s, batch_nb=28538, gpu=0, loss=0.055, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  18%|█▊        | 32055/179200 [36:46<2:49:52, 14.44batch/s, batch_nb=31734, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  19%|█▉        | 34519/179200 [39:41<2:40:34, 15.02batch/s, batch_nb=34198, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  20%|█▉        | 35520/179200 [40:53<2:44:53, 14.52batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  20%|█▉        | 35521/179200 [40:53<2:33:18, 15.62batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  20%|█▉        | 35530/179200 [40:53<1:56:01, 20.64batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  20%|█▉        | 35538/179200 [40:53<1:30:19, 26.51batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  20%|█▉        | 35547/179200 [40:54<1:11:53, 33.30batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  20%|█▉        | 35556/179200 [40:54<59:07, 40.49batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]  
Epoch 1:  20%|█▉        | 35565/179200 [40:54<50:22, 47.52batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  20%|█▉        | 35573/179200 [40:54<44:23, 53.93batch/s, batch_nb=35199, gpu=0, loss=0.055, v_nb=68]
Epoch 1:  20%|█▉        | 35581/179200 [40:54<40:27, 59.17batch/s, batch_nb=35199, gpu=0, loss=0.055

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  22%|██▏       | 39735/179200 [45:44<2:42:43, 14.28batch/s, batch_nb=39094, gpu=0, loss=0.055, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  24%|██▍       | 43242/179200 [49:56<2:57:11, 12.79batch/s, batch_nb=42600, gpu=0, loss=0.055, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  26%|██▌       | 45859/179200 [53:05<2:33:29, 14.48batch/s, batch_nb=45218, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  27%|██▋       | 48055/179200 [55:44<2:35:04, 14.10batch/s, batch_nb=47414, gpu=0, loss=0.055, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  28%|██▊       | 50511/179200 [58:42<2:22:12, 15.08batch/s, batch_nb=49870, gpu=0, loss=0.056, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  30%|██▉       | 53440/179200 [1:02:16<2:24:14, 14.53batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]
Validating:   0%|          | 0/320 [00:00<?, ?batch/s][A
Epoch 1:  30%|██▉       | 53447/179200 [1:02:16<1:50:50, 18.91batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]
Epoch 1:  30%|██▉       | 53455/179200 [1:02:16<1:25:34, 24.49batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]
Epoch 1:  30%|██▉       | 53463/179200 [1:02:16<1:07:56, 30.84batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]
Epoch 1:  30%|██▉       | 53472/179200 [1:02:16<55:21, 37.85batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]  
Epoch 1:  30%|██▉       | 53481/179200 [1:02:16<46:19, 45.24batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]
Epoch 1:  30%|██▉       | 53490/179200 [1:02:16<40:07, 52.22batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]
Epoch 1:  30%|██▉       | 53499/179200 [1:02:16<35:42, 58.66batch/s, batch_nb=52799, gpu=0, loss=0.054, v_nb=68]
Epoch 1:  30%|██▉       | 53

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  32%|███▏      | 56455/179200 [1:05:36<2:16:06, 15.03batch/s, batch_nb=55494, gpu=0, loss=0.056, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  33%|███▎      | 59047/179200 [1:08:37<2:13:35, 14.99batch/s, batch_nb=58086, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  34%|███▍      | 60555/179200 [1:10:22<2:17:53, 14.34batch/s, batch_nb=59594, gpu=0, loss=0.054, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


Epoch 1:  36%|███▌      | 64807/179200 [1:15:18<2:03:55, 15.38batch/s, batch_nb=63846, gpu=0, loss=0.055, v_nb=68]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8388608.0


Epoch 1:  38%|███▊      | 68838/179200 [1:19:57<2:02:49, 14.97batch/s, batch_nb=67877, gpu=0, loss=0.056, v_nb=68]

In [None]:
# TODO
# find memory leak(loading train dataset on every run)
# save hparams
# torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
# label smoothing
# balanced validation
# grad cam

## Validation

In [10]:
# Evaluate `exp` on its test dataloader, reusing the `trainer` built in the training cell.
trainer.test(exp)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Loading training dataset


Testing: 100%|██████████| 18201/18201 [03:03<00:00, 98.93batch/s] 

Test ROC AUC: 0.889613687992096 Test PR AUC: 0.41424208879470825





In [None]:
#TODO

# https://arxiv.org/pdf/1912.01857.pdf
# https://github.com/feidfoe/AdjustBnd4Imbalance/blob/master/cifar.py
# Reference snippet kept as a TODO, apparently pasted from the repository linked above.
# NOTE(review): not runnable in this notebook as written - `args`, `test`, `testloader`,
# `criterion`, `start_epoch`, `use_cuda` and `num_classes` are not defined in any visible cell.
gamma = 0.1 # hparams for re_scaling https://arxiv.org/pdf/1912.01857.pdf
if args.evaluate:
    print('\nEvaluation only')
    # Baseline evaluation before any weight re-scaling.
    test_loss, test_acc = test(testloader, model, criterion, 
                               start_epoch, use_cuda)
    print('[w/o RS] Test Loss: %.8f, Test Acc: %.2f%%' % (test_loss, test_acc))

    # Grab the final fully-connected layer's weight from the state dict.
    current_state = model.state_dict()
    W = current_state['module.fc.weight']

    # Per-class sample counts following an exponential decay from `img_max`
    # down to `img_max * imb_factor` across the class index.
    imb_factor = 1. / args.imbalance
    img_max = 50000/num_classes
    num_sample = [img_max * (imb_factor**(i/(num_classes - 1))) \
                     for i in range(num_classes)]

    # Normalise counts by the largest class, raise to `gamma`, and divide each
    # class's weight row by the result.
    ns = [ float(n) / max(num_sample) for n in num_sample ]
    ns = [ n**gamma for n in ns ]
    ns = torch.FloatTensor(ns).unsqueeze(-1).cuda()
    new_W = W / ns

    # Write the re-scaled weights back into the model.
    current_state['module.fc.weight'] = new_W
    model.load_state_dict(current_state)

    # Evaluation after re-scaling, for comparison with the baseline above.
    test_loss, test_acc = test(testloader, model, criterion, 
                               start_epoch, use_cuda)
    print('[w/  RS] Test Loss: %.8f, Test Acc: %.2f%%' % (test_loss, test_acc))

## Grad-CAM

In [None]:
# TODO
#https://arxiv.org/pdf/1610.02391.pdf
# https://github.com/HaebinShin/grad-cam-text
# https://course.fast.ai/videos/?lesson=6 1:06:00