Set up an experiment with the given data and architecture by adjusting the config.

In [None]:
import os
import importlib
import neptune
import pandas as pd
import numpy as np
import datetime as dt
import torch
tnn = torch.nn
top = torch.optim

from torch.utils import data as tdt
from src import utils
%matplotlib inline

# Package name the architecture modules are imported from (used as the
# `package` argument of importlib.import_module further down)
ARCHS_DIR = 'archs'
# Directory the serialized dataset file is loaded from
DATA_DIR = 'data'
# Parent directory; each experiment gets a sub-directory named after its ID
EXPERIMENTS_DIR = 'experiments'
# Project identifier handed to neptune.init()
NEPTUNE_PRJ = 'indiacovidseva/covid-net'

### Device Info

In [None]:
# Device identifiers understood by torch.device
CUDA = "cuda:0"
CPU = "cpu"

# Fall back to the CPU when no CUDA device is visible; otherwise select the
# first GPU and report what torch can see.
if not torch.cuda.is_available():
    device = torch.device(CPU)
else:
    device = torch.device(CUDA)
    cd = torch.cuda.current_device()
    print("Num devices:", torch.cuda.device_count())
    print("Current device:", cd)
    print("Device name:", torch.cuda.get_device_name(cd))
    print("Device props:", torch.cuda.get_device_properties(cd))
    # torch.cuda.memory_summary(cd) can be printed here for a memory report
print(device)

### Experiment Config

In [None]:
# Experiment configuration. Edit this dict to define a run.
config = {
    # Id of an existing Neptune experiment to re-attach to; when empty a new
    # Neptune experiment is created and its id is stored back here
    "NEPTUNE_ID": "",
    # Experiment id; also the name of the experiment's sub-directory
    "ID": "0000",
    "DESC": "Check if notebook works.",
    # Name of the module inside ARCHS_DIR that provides CovidNet
    "ARCH": "v3",
    # File name of the dataset inside DATA_DIR
    "DATASET": "ds_cd_p_4020_owid_2020-07-14.csv.pt",
    # Indices used to select columns from the input / target / auxiliary
    # tensors of each batch
    "IP_FEATURES": [0],
    "OP_FEATURES": [0],
    "AUX_FEATURES": [0],
    "BATCH_SIZE": 100,
    # Passed straight through to the CovidNet constructor
    "HIDDEN_SIZE": 20,
    "NUM_LAYERS": 4,
    "DROPOUT": 0.5,
    # Learning rate of the Adam optimizer
    "LEARNING_RATE": 0.001,
    # Training stops once the epoch counter reaches this value
    "NUM_EPOCHS": 101
}

# checkpoint filename to resume training else ""
RESUME_CP = ""

# setup exp
experiment_dir = EXPERIMENTS_DIR + '/' + config['ID']
try:
    # makedirs (rather than mkdir) so that a missing EXPERIMENTS_DIR is
    # created too instead of failing on a fresh checkout
    os.makedirs(experiment_dir)
except FileExistsError:
    # Only a pre-existing directory is reported as "already exists"; any other
    # OSError (e.g. permissions) propagates instead of being mislabelled.
    print("!!WARNING!! EXPERIMENT ALREADY EXISTS:", config['ID'])
else:
    print("Initialising experiment:", config['ID'])
print("Resume:", RESUME_CP if RESUME_CP else False)

# load data
dataset_path = f"{DATA_DIR}/{config['DATASET']}"
ds = torch.load(dataset_path)
print("Dataset loaded")
# keep the settings stored with the dataset next to the experiment config
config['DS'] = ds['config']
print(config['DS'])

# load arch
arch_mod = importlib.import_module(f".{config['ARCH']}", ARCHS_DIR)
importlib.reload(arch_mod)  # ensure changes are imported

# init Net
ds_cfg = config['DS']
model = arch_mod.CovidNet(
    ip_seq_len=ds_cfg['IP_SEQ_LEN'],
    op_seq_len=ds_cfg['OP_SEQ_LEN'],
    ip_size=len(config['IP_FEATURES']),
    op_size=len(config['OP_FEATURES']),
    hidden_size=config['HIDDEN_SIZE'],
    num_layers=config['NUM_LAYERS'],
    dropout=config['DROPOUT'],
    ip_aux_size=len(config['AUX_FEATURES']),
).to(device)
print("Model initialised")
# count only the parameters that will receive gradients
trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
print("Num params:", sum(trainable_sizes))

# init Loss and Optimizer
loss_fn = tnn.L1Loss()
optimizer = top.Adam(model.parameters(), lr=config['LEARNING_RATE'])

# init dataset loaders
# Training batches are reshuffled every epoch. Validation is read in a fixed
# order: shuffling it brings no benefit and, because the epoch metric is a
# mean of per-batch means, a ragged last batch would otherwise make the
# reported validation numbers vary from run to run.
trn_loader = tdt.DataLoader(ds['trn'], shuffle=True, batch_size=config['BATCH_SIZE'])
val_loader = tdt.DataLoader(ds['val'], shuffle=False, batch_size=config['BATCH_SIZE'])

### Training loop

In [None]:
# Per-epoch histories and best-so-far trackers; replaced by the stored values
# when resuming from a checkpoint.
trn_loss_vals = []
val_loss_vals = []
trn_acc_vals = []
val_acc_vals = []
e = 0  # epoch counter
# np.inf (lowercase) works on every NumPy version; the np.Inf alias was
# removed in NumPy 2.0.
min_val_loss = np.inf
max_val_acc = 0

if RESUME_CP:
    # The checkpoint's config replaces the current one below, so remember the
    # epoch budget requested in this session and re-apply it afterwards.
    N = config['NUM_EPOCHS']
    cp = utils.load_checkpoint(experiment_dir, RESUME_CP, device=device)
    config, e, md, od = cp['config'], cp['epoch'], cp['model_state_dict'], cp['optimizer_state_dict']
    trn_loss_vals, val_loss_vals, min_val_loss = cp['trn_losses'], cp['val_losses'], cp['min_val_loss']
    trn_acc_vals, val_acc_vals, max_val_acc = cp['trn_acc'], cp['val_acc'], cp['max_val_acc']
    e+=1  # continue with the epoch after the one stored in the checkpoint
    config['NUM_EPOCHS'] = N
    model.load_state_dict(md)
    optimizer.load_state_dict(od)

# Neptune
# Start a fresh Neptune experiment unless the config already names one, in
# which case that experiment is fetched from the project and reused.
neptune_prj = neptune.init(NEPTUNE_PRJ)
if not config['NEPTUNE_ID']:
    neptune_exp = neptune.create_experiment(name=config['ID'], params=config)
    # remember the id so it is stored with the config from now on
    config['NEPTUNE_ID'] = neptune_exp.id
else:
    neptune_exp = neptune_prj.get_experiments(id=config['NEPTUNE_ID'])[0]

# TRAIN
# Each epoch runs one optimisation pass over trn_loader and one gradient-free
# pass over val_loader, then logs the metrics to Neptune. Histories store the
# loss scaled by 1e4 and the accuracy as a percentage.
print("BEGIN: [", dt.datetime.now(), "]")
while e < config['NUM_EPOCHS']:
    model.train()
    trn_losses = []
    trn_ops = []
    for data in trn_loader:
        ip, ip_aux, op = data
        # keep only the configured feature columns and move them to the device
        ip = ip[:, :, config['IP_FEATURES']].to(device)
        ip_aux = ip_aux[:, config['AUX_FEATURES']].to(device)
        op = op[:, :, config['OP_FEATURES']].to(device)
        optimizer.zero_grad() # set grads to 0
        preds = model(
            ip.view(-1, config['DS']['IP_SEQ_LEN'], len(config['IP_FEATURES'])),
            ip_aux.view(-1, len(config['AUX_FEATURES']))
        ) # predict
        loss = loss_fn(preds, op.view(-1, config['DS']['OP_SEQ_LEN'], len(config['OP_FEATURES']))) # calc loss
        loss.backward() # calc and assign grads
        optimizer.step() # update weights
        # logging: store a detached copy so the autograd graph of every batch
        # is not kept alive until the end of the epoch
        trn_losses.append(loss.detach())
        trn_ops.append(op.mean())
    avg_trn_loss = torch.stack(trn_losses).mean().item()
    # "accuracy" here is 1 - (mean loss / mean target value)
    avg_trn_acc = 1 - avg_trn_loss / torch.stack(trn_ops).mean().item()
    trn_loss_vals.append(avg_trn_loss * 10000)
    trn_acc_vals.append(avg_trn_acc * 100)
    
    model.eval()
    with torch.no_grad():
        val_losses = []
        val_ops = []
        for data in val_loader:
            ip, ip_aux, op = data
            ip = ip[:, :, config['IP_FEATURES']].to(device)
            ip_aux = ip_aux[:, config['AUX_FEATURES']].to(device)
            op = op[:, :, config['OP_FEATURES']].to(device)
            preds = model(
                ip.view(-1, config['DS']['IP_SEQ_LEN'], len(config['IP_FEATURES'])),
                ip_aux.view(-1, len(config['AUX_FEATURES']))
            )
            loss = loss_fn(preds, op.view(-1, config['DS']['OP_SEQ_LEN'], len(config['OP_FEATURES'])))
            val_losses.append(loss)
            val_ops.append(op.mean())
        avg_val_loss = torch.stack(val_losses).mean().item()
        avg_val_acc = 1 - avg_val_loss / torch.stack(val_ops).mean().item()
        val_loss_vals.append(avg_val_loss * 10000)
        val_acc_vals.append(avg_val_acc * 100)
    
    # Track the lowest validation loss seen so far (unscaled, like
    # max_val_acc) so the value written into checkpoints is meaningful
    # instead of staying at its initial infinity.
    min_val_loss = min(min_val_loss, avg_val_loss)
    
    neptune_exp.log_metric('validation accuracy', avg_val_acc*100)
    neptune_exp.log_metric('training accuracy', avg_trn_acc*100)
    neptune_exp.log_metric('validation loss', avg_val_loss*1e4)
    neptune_exp.log_metric('training loss', avg_trn_loss*1e4)
    
    if e%10==0:
        print(
            "[", dt.datetime.now(), "] epoch:", f"{e:5}", 
            "val_acc:", f"{avg_val_acc*100: 4.2f}", "trn_acc:", f"{avg_trn_acc*100: 4.2f}", 
            "val_loss:", f"{avg_val_loss*1e4: 4.2f}", "trn_loss:", f"{avg_trn_loss*1e4: 4.2f}"
        )
        # periodic snapshot every 100 epochs
        if e%100==0:
            utils.save_checkpoint(
                config, e, model, optimizer, 
                trn_loss_vals, val_loss_vals, min_val_loss,
                trn_acc_vals, val_acc_vals, max_val_acc,
                experiment_dir, "/latest-e" + str(e) + ".pt"
            )
        # NOTE(review): this check sits inside the e%10 branch, so a new best
        # model is only considered on every 10th epoch - confirm intended.
        if avg_val_acc >= max_val_acc:
            max_val_acc = avg_val_acc
            utils.save_checkpoint(
                config, e, model, optimizer, 
                trn_loss_vals, val_loss_vals, min_val_loss,
                trn_acc_vals, val_acc_vals, max_val_acc,
                experiment_dir, "/best-e" + str(e) + ".pt"
            )
    e+=1
print("END: [", dt.datetime.now(), "]")

neptune_exp.stop()

### Plot loss & acc

In [None]:
# Gather the per-epoch histories into one frame per metric family.
df_loss = pd.DataFrame({'trn_loss': trn_loss_vals, 'val_loss': val_loss_vals})
df_acc = pd.DataFrame({'trn_acc': trn_acc_vals, 'val_acc': val_acc_vals})

# smoothen: centred 3-epoch rolling mean (shorter windows at the edges)
for frame, columns in (
    (df_loss, ['trn_loss', 'val_loss']),
    (df_acc, ['trn_acc', 'val_acc']),
):
    for column in columns:
        frame[column] = frame[column].rolling(3, min_periods=1, center=True).mean()

# One figure per frame, skipping the first two epochs; only the loss plot
# uses a logarithmic y axis.
for frame, columns, title, log_scale in (
    (df_loss, ['trn_loss', 'val_loss'], 'Loss per epoch', True),
    (df_acc, ['trn_acc', 'val_acc'], 'Acc per epoch', False),
):
    _ = frame[2:].plot(
        y=columns,
        title=title,
        subplots=False,
        figsize=(5,5),
        sharex=False,
        logy=log_scale
    )