In [1]:
import os
# Expose only GPU 0 to this process. This is set before `import torch` below.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import sys
import time
import datetime
import argparse
import tensorboardX
import torch
import yaml
import shutil
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn
# Add the parent directory to the import path (presumably where the
# `pycode` package imported in the next cell lives - confirm).
sys.path.append('../')

In [2]:
from pycode.dataset import RLBench_dataset, Softargmax_dataset, imageaug_full_transform, train_val_split, pose_aug
from pycode.config import _C as cfg
from pycode.model.Hourglass import stacked_hourglass_model
from pycode.loss.mp_loss import Train_Loss_sequence_hourglass
from pycode.misc import save_outputs, build_model_MP, build_dataset_MP, build_optimizer, str2bool, save_args, save_checkpoint, load_checkpoint

kornia requires version >= 3.6. your version 3.6


In [3]:
# ---------------------------------------------------------------------------
# Run configuration: parse arguments, merge the YAML config, create the
# output directories, seed the RNGs and (optionally) start a wandb run.
# ---------------------------------------------------------------------------

# Command-line style options. In the notebook they are supplied through the
# explicit `args=[...]` list below instead of sys.argv.
#   --log_step  : print/log the training loss every N iterations
#   --eval_step : run validation every N iterations
#   --save_step : write a checkpoint (and sample outputs) every N iterations
parser = argparse.ArgumentParser(description='parser for image generator')
parser.add_argument('--config_file', type=str, default='', metavar='FILE', help='path to config file')
parser.add_argument('--log_step', type=int, default=10, help='')
parser.add_argument('--save_step', type=int, default=100, help='')
parser.add_argument('--eval_step', type=int, default=100, help='')
parser.add_argument('--output_dirname', type=str, default='', help='')
parser.add_argument('--checkpoint_path', type=str, default=None, help='')
parser.add_argument('--log2wandb', type=str2bool, default=True)
args = parser.parse_args(args=['--config_file','../configs/RLBench_MP.yaml','--output_dirname','hoge', '--log2wandb','False'])
# args = parser.parse_args(args=['--checkpoint_path','output/2020-04-02_18:28:18.736004/model_log/checkpoint_epoch9_iter11'])

has_config_file = len(args.config_file) > 0

# Overlay the YAML file on top of the default config.
if has_config_file:
    print('Loaded configration file {}'.format(args.config_file))
    cfg.merge_from_file(args.config_file)

# Resolve the run directory name: use the explicit one, else a timestamp.
if len(args.output_dirname) == 0:
    dt_now = datetime.datetime.now()
    output_dirname = str(dt_now.date()) + '_' + str(dt_now.time())
else:
    output_dirname = args.output_dirname

cfg.BASIC.OUTPUT_DIR = os.path.join(cfg.BASIC.OUTPUT_DIR, cfg.DATASET.NAME, output_dirname)
cfg.freeze()

# Checkpoints are written below <OUTPUT_DIR>/model_log.
model_path = os.path.join(cfg.BASIC.OUTPUT_DIR, 'model_log')

os.makedirs(cfg.BASIC.OUTPUT_DIR, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

# Keep a copy of the config file next to the results.
if has_config_file:
    shutil.copy(args.config_file, cfg.BASIC.OUTPUT_DIR)

# Persist the parsed arguments.
argsfile_path = os.path.join(cfg.BASIC.OUTPUT_DIR, "args.txt")
save_args(args, argsfile_path)

# Seed and device selection.
torch.manual_seed(cfg.BASIC.SEED)
cuda = torch.cuda.is_available()
device = torch.device(cfg.BASIC.DEVICE)

if cuda:
    # NOTE(review): benchmark=True lets cuDNN auto-tune its algorithm choice,
    # which may differ between runs; if strict run-to-run reproducibility is
    # required, consider benchmark=False. Left unchanged here - confirm intent.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(cfg.BASIC.SEED)

# Raw YAML contents, forwarded to wandb as the run config. Guarded like the
# other config-file uses so that running without --config_file (default '')
# no longer fails in open('').
if has_config_file:
    with open(args.config_file) as file:
        obj = yaml.safe_load(file)
else:
    obj = {}

if args.log2wandb:
    # Imported lazily so wandb is only required when logging is enabled.
    import wandb
    wandb.login()
    # Use the resolved directory name so auto-generated (timestamp) runs get a
    # meaningful name instead of the empty --output_dirname default.
    run = wandb.init(project='MotionPrediction-{}'.format(cfg.DATASET.NAME), entity='tendon', config=obj, save_code=True, name=output_dirname)

Loaded configration file ../configs/RLBench_MP.yaml


In [4]:
# ---------------------------------------------------------------------------
# Build datasets, dataloaders, model, losses, optimizer and scheduler, then
# optionally restore state from a checkpoint. Defines `start_epoch` and
# `start_iter`, which the training loop uses to decide where to begin.
# ---------------------------------------------------------------------------

# set dataset
train_dataset = build_dataset_MP(cfg, save_dataset=False, mode='train')
val_dataset = build_dataset_MP(cfg, save_dataset=False, mode='val')

# set dataloader (only the training data is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=cfg.BASIC.BATCH_SIZE, shuffle=True, num_workers=cfg.BASIC.WORKERS)
val_dataloader = DataLoader(val_dataset, batch_size=cfg.BASIC.BATCH_SIZE, shuffle=False, num_workers=cfg.BASIC.WORKERS)

# set model
model = build_model_MP(cfg)
# wandb.watch(model, log_freq=5000)
model = torch.nn.DataParallel(model, device_ids = list(range(cfg.BASIC.NUM_GPU)))
# model = convert_model(model)
model = model.to(device)

# set loss (separate instances so train and validation logs do not mix)
train_loss = Train_Loss_sequence_hourglass(cfg, device)
val_loss = Train_Loss_sequence_hourglass(cfg, device)

# set optimizer
optimizer = build_optimizer(cfg, model, 'mp')
scheduler = StepLR(optimizer, step_size=cfg.SCHEDULER.STEPLR.STEP_SIZE, gamma=cfg.SCHEDULER.STEPLR.GAMMA)

# load checkpoint
if args.checkpoint_path is not None:
    checkpoint_path = os.path.join(args.checkpoint_path, 'mp.pth')

    if cfg.LOAD_MODEL == 'all':
        # Restore model, optimizer and scheduler, and resume from the saved position.
        model, optimizer, start_epoch, start_iter, scheduler = load_checkpoint(model, checkpoint_path, optimizer=optimizer, scheduler=scheduler)
    elif cfg.LOAD_MODEL == 'model_only':
        # Restore weights only; training restarts from the beginning.
        model, _, _, _, _ = load_checkpoint(model, checkpoint_path)
        start_epoch, start_iter = 0, 1
    else:
        # Previously this fell through silently and left start_epoch/start_iter
        # undefined, which surfaced later as a NameError in the training loop.
        raise ValueError(
            "Unsupported cfg.LOAD_MODEL value {!r}; expected 'all' or 'model_only'".format(cfg.LOAD_MODEL))
else:
    start_epoch, start_iter = 0, 1

length of future is 1 frame
load json data
length of future is 1 frame
load json data


In [5]:
# ---------------------------------------------------------------------------
# Training loop with periodic logging, validation and checkpointing.
#
# Relies on objects created in the earlier cells: cfg, args, model, optimizer,
# scheduler, train_loss, val_loss, the two dataloaders, model_path,
# start_epoch and start_iter.
# ---------------------------------------------------------------------------

def _save_sample_outputs(dataloader, output_dir, mode, max_batches=5):
    """Run the model on the first `max_batches` batches of `dataloader` and
    pass inputs and outputs to `save_outputs` (batch index is 1-based)."""
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader, 1):
            save_outputs(batch, model(batch), output_dir, batch_idx, cfg, mode=mode)
            if batch_idx >= max_batches:
                break


tic = time.time()            # start of the current logging window
end = time.time()            # timestamp of the previous completed step
trained_time = 0             # wall-clock seconds accumulated in this session
trained_iterations = 0       # optimizer steps actually taken in this session
iters_per_epoch = len(train_dataloader)
max_iter = cfg.BASIC.MAX_EPOCH * iters_per_epoch

for epoch in range(start_epoch, cfg.BASIC.MAX_EPOCH):
    for iteration, inputs in enumerate(train_dataloader, 1):
        total_iteration = iters_per_epoch * epoch + iteration

        # When resuming, skip batches until the saved position is reached.
        if iteration < start_iter:
            continue

        # One optimization step. The scheduler is stepped per iteration, so
        # STEPLR.STEP_SIZE is counted in iterations, not epochs.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = train_loss(inputs, outputs)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # time setting
        trained_time += time.time() - end
        end = time.time()
        trained_iterations += 1

        # save and print log
        if total_iteration % args.log_step == 0:
            log = train_loss.get_log()
            # Average over the steps taken in this session; dividing by
            # total_iteration underestimates the ETA after a resume.
            eta_seconds = int((trained_time / trained_iterations) * (max_iter - total_iteration))

            if args.log2wandb:
                wandb.log(log,step=total_iteration)

            print('===> Iter: {:06d}/{:06d}, LR: {:.5f}, Cost: {:.2f}s, Eta: {}, Loss: {:.6f}'.format(total_iteration, 
                max_iter, optimizer.param_groups[0]['lr'], time.time() - tic, 
                str(datetime.timedelta(seconds=eta_seconds)), log['train/weight_loss']))

            train_loss.reset_log()
            tic = time.time()

        # validation
        if total_iteration % args.eval_step == 0:
            print('validataion start')
            # Switch to inference behaviour for the evaluation pass and restore
            # training mode afterwards.
            model.eval()
            with torch.no_grad():
                # Distinct names: reusing `iteration`/`inputs` here clobbered the
                # training-loop variables and corrupted the checkpoint name and
                # the resume position saved below.
                for val_inputs in val_dataloader:
                    val_outputs = model(val_inputs)
                    _ = val_loss(val_inputs, val_outputs, mode='val')
            model.train()

            val_log = val_loss.get_log()
            if args.log2wandb:
                wandb.log(val_log,step=total_iteration)

            print('===> Iter: {:06d}/{:06d}, VAL Loss: {:.6f}'.format(total_iteration, max_iter, val_log['val/weight_loss']))
            print('')
            val_loss.reset_log()

        # save checkpoint
        if total_iteration % args.save_step == 0:
            checkpoint_dir = os.path.join(model_path,'checkpoint_epoch{}_iter{}'.format(epoch,iteration))
            os.makedirs(checkpoint_dir, exist_ok=True)
            cp_path = os.path.join(checkpoint_dir, 'mp.pth')
            save_checkpoint(model, optimizer, epoch, iteration, cp_path, scheduler)

            # Save outputs for the first few train and val batches.
            model.eval()
            _save_sample_outputs(train_dataloader, checkpoint_dir, mode='train')
            _save_sample_outputs(val_dataloader, checkpoint_dir, mode='val')
            model.train()

    train_dataset.update_seed()
    print("seed: {}".format(train_dataset.seed))
    # Only the first (resumed) epoch starts mid-way; later epochs start at 1.
    start_iter = 1

===> Iter: 000010/550928000, LR: 0.00010, Cost: 10.28s, Eta: 6555 days, 22:50:01, Loss: 55.946618
===> Iter: 000020/550928000, LR: 0.00010, Cost: 5.92s, Eta: 5165 days, 22:01:23, Loss: 55.901848
===> Iter: 000030/550928000, LR: 0.00010, Cost: 5.81s, Eta: 4677 days, 23:41:22, Loss: 55.279611
===> Iter: 000040/550928000, LR: 0.00010, Cost: 5.92s, Eta: 4451 days, 21:24:38, Loss: 54.814889
===> Iter: 000050/550928000, LR: 0.00010, Cost: 5.87s, Eta: 4309 days, 23:50:14, Loss: 54.771715
===> Iter: 000060/550928000, LR: 0.00010, Cost: 5.95s, Eta: 4224 days, 4:29:15, Loss: 52.478517
===> Iter: 000070/550928000, LR: 0.00010, Cost: 5.88s, Eta: 4156 days, 15:31:18, Loss: 51.198046
===> Iter: 000080/550928000, LR: 0.00010, Cost: 5.90s, Eta: 4107 days, 16:06:26, Loss: 48.254823
===> Iter: 000090/550928000, LR: 0.00010, Cost: 5.92s, Eta: 4071 days, 0:25:39, Loss: 42.386750
===> Iter: 000100/550928000, LR: 0.00010, Cost: 5.91s, Eta: 4040 days, 13:26:06, Loss: 40.930115
validataion start


Process Process-4:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 261, in _bootstrap
    util._exit_function()
  File "/usr/lib/python3.6/multiprocessing/util.py", line 322, in _exit_function
    _run_finalizers()
  File "/usr/lib/python3.6/multiprocessing/util.py", line 262, in _run_finalizers
    finalizer()
  File "/usr/lib/python3.6/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 191, in _finalize_join
    thread.join()
  File "/usr/lib/python3.6/threading.py", line 1056, in join
    self._wait_for_tstate_lock()
  File "/usr/lib/python3.6/threading.py", line 1072, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
KeyboardInterrupt
Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fcf1bc0d588>>
Traceback (most

KeyboardInterrupt: 

In [7]:
# Debug check: dtype of the 'rotation_matrix' entry of a batch.
# NOTE(review): `inputs` is not defined in this cell - it is whatever batch the
# loops in the training cell last assigned, so this depends on leftover kernel
# state (the recorded run was interrupted). Fetch a batch explicitly if this
# check needs to be reproducible.
inputs['rotation_matrix'].dtype

torch.float64