In [1]:
from haven import haven_chk as hc
from haven import haven_results as hr
from haven import haven_utils as hu
import torch
import torchvision
import tqdm
import pandas as pd
import pprint
import itertools
import os
import pylab as plt
import exp_configs
import time
import numpy as np

from src import models
from src import datasets


import argparse

from torch.utils.data import sampler
from torch.utils.data.sampler import RandomSampler
from torch.backends import cudnn
from torch.nn import functional as F
from torch.utils.data import DataLoader

# Turn on cuDNN benchmark mode for the whole session (set once, right after the imports).
cudnn.benchmark = True

In [2]:
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    """Train and validate a single experiment, checkpointing after every epoch.

    The experiment directory is ``savedir_base/<hash of exp_dict>``. If a
    ``score_list.pkl`` already exists there, the model weights and the score
    history are loaded and training resumes from the epoch after the last
    recorded one.

    Parameters
    ----------
    exp_dict : dict
        Experiment configuration. The keys read here are ``"dataset"``,
        ``"dataset_size"``, ``"model"``, ``"batch_size"`` and ``"max_epoch"``;
        the whole dict is also forwarded to the dataset and model factories.
    savedir_base : str
        Directory under which the per-experiment folder is created.
    datadir : str
        Dataset location, forwarded to ``datasets.get_dataset``.
    reset : bool, optional
        When truthy, ``hc.delete_and_backup_experiment`` is called on the
        experiment folder before anything else is written.
    num_workers : int, optional
        Worker count for both data loaders.

    Returns
    -------
    None
        Results are written to disk: ``exp_dict.json``, ``model.pth``,
        ``score_list.pkl`` and, for the best epoch so far,
        ``model_best.pth`` / ``score_list_best.pkl``.
    """
    # Bookkeeping
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    # Validation visits every sample once, in order, one image per batch.
    val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=1,
                            num_workers=num_workers)
    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment: weights and history come from the last checkpoint
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    # Each training epoch draws 2 * len(val_set) samples with replacement.
    # NOTE(review): epoch length is tied to the validation set size, not the
    # training set size -- confirm this is intended.
    train_sampler = torch.utils.data.RandomSampler(
        train_set, replacement=True, num_samples=2*len(val_set))

    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True, num_workers=num_workers)

    # Stays None when there is nothing left to run (s_epoch >= max_epoch), so
    # the final report does not touch an unbound loop variable.
    last_epoch = None

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model for one epoch
        train_dict = model.train_on_loader(train_loader)

        # Validate and visualize the model after the training pass
        val_dict = model.val_on_loader(val_loader,
                        savedir_images=os.path.join(savedir, "images"),
                        n_images=3)
        score_dict.update(val_dict)
        # model.vis_on_loader(
        #     vis_loader, savedir=os.path.join(savedir, "images"))

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint
        # The first epoch is always the best so far. Afterwards compare with
        # the best earlier score; a missing "val_score" column counts as 0,
        # mirroring the .get(..., 0) default used for the current epoch.
        is_best = e == 0
        if not is_best:
            if "val_score" in score_df.columns:
                prev_best = score_df["val_score"][:-1].fillna(0).max()
            else:
                prev_best = 0
            is_best = score_dict.get("val_score", 0) > prev_best

        if is_best:
            hu.save_pkl(os.path.join(
                savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

        last_epoch = e

    if last_epoch is None:
        print('No epochs to run: start epoch %d, max_epoch %d'
              % (s_epoch, exp_dict['max_epoch']))
    else:
        print('Experiment completed at epoch %d' % last_epoch)




In [None]:
# Note kept from the original cell (looks like alternative grid values for
# the experiment config -- confirm against exp_configs):
#   "batch_size": [1,5,10],
#   "max_epoch": [100],

# Run configuration
# =================
exp_group_list = ['virtualOR']
savedir_base = '../../../../media/James/BigData/LCFCN_temp_folder'
datadir = '../../../../media/James/BigData/TRANCOS_v3'
# Truthy -> trainval() calls hc.delete_and_backup_experiment on the experiment
# folder before running, so nothing is resumed.
reset = 1
# Set to an experiment hash to re-run one saved experiment instead of a group.
exp_id = None
run_jobs = False
num_workers = 0


# Collect experiments
# ===================
if exp_id is not None:
    # select one experiment
    # (the original read `args.exp_id`, but no `args` exists in this notebook)
    savedir = os.path.join(savedir_base, exp_id)
    exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json"))

    exp_list = [exp_dict]

else:
    # select exp group
    print('here')
    exp_list = []
    for exp_group_name in exp_group_list:
        exp_list += exp_configs.EXP_GROUPS[exp_group_name]

    print(exp_list)

# Run experiments
# ===============
if run_jobs:
    print('running')
    # Imported lazily: only needed when dispatching through the job manager.
    from haven import haven_jobs as hjb
    jm = hjb.JobManager(exp_list=exp_list, savedir_base=savedir_base)
    jm_summary_list = jm.get_summary()
    #print(jm.get_summary()['status'])

    import usr_configs as uc
    uc.run_jobs(exp_list, savedir_base, datadir)

else:
    print('running alt')
    for exp_dict in exp_list:
        # do trainval
        trainval(exp_dict=exp_dict,
                savedir_base=savedir_base,
                datadir=datadir,
                reset=reset,
                num_workers=num_workers)


here
[{'dataset': {'name': 'virtualOR', 'transform': 'rgb_normalize'}, 'model': {'name': 'lcfcn', 'base': 'fcn8_vgg16'}, 'batch_size': 1, 'max_epoch': 1, 'dataset_size': {'train': 'all', 'val': 'all'}, 'optimizer': 'adam', 'lr': 1e-05}]
running alt
{'batch_size': 1,
 'dataset': {'name': 'virtualOR', 'transform': 'rgb_normalize'},
 'dataset_size': {'train': 'all', 'val': 'all'},
 'lr': 1e-05,
 'max_epoch': 1,
 'model': {'base': 'fcn8_vgg16', 'name': 'lcfcn'},
 'optimizer': 'adam'}
Experiment saved in ../../../../media/James/BigData/LCFCN_temp_folder/f8f5c56297038020cd7d7f8b9035d31d


  0%|          | 0/840 [00:00<?, ?it/s]

Starting experiment at epoch 0


Training. Loss: 85.9867:   0%|          | 2/840 [00:10<1:28:48,  6.36s/it]

In [None]:
# Disabled cell: everything below is commented out, so running it does nothing.
# It sketches how to open a haven results dashboard over `savedir_base`.
# NOTE(review): the example filter below uses dataset name 'trancos', while the
# experiment group run above is 'virtualOR' -- adjust before re-enabling.
# from haven import haven_jupyter as hj
# from haven import haven_results as hr

# # path to where the experiments got saved
# savedir_base = '../../../../media/James/BigData/LCFCN_temp_folder/'

# # filter exps
# filterby_list = [('dataset.name','trancos')]
# # get experiments
# rm = hr.ResultManager(savedir_base=savedir_base )
# # rm = hr.ResultManager(savedir_base=savedir_base, filterby_list=filterby_list, verbose=0)
# # dashboard variables
# legend_list = ['model.base']
# title_list = ['dataset', 'model']
# y_metrics = ['val_mae']

# # launch dashboard
# hj.get_dashboard(rm, vars(), show_jobs=False,wide_display=True)