In [1]:
import os
import time
import csv
import yaml
import torch
import cv2
import argparse
import numpy as np

from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn
import torch.optim
from datetime import timedelta
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from torchvision.transforms import Compose

from experiment import DPT
from util.callbacks import TensorCheckpoint
from dpt.models import DPTDepthModel
from dataloader.nyu_loader import NYUDataset


In [2]:
# Restrict this process to the first GPU only (`os` is imported in the first cell).
# NOTE(review): this has to run before CUDA is initialised to have any effect --
# confirm nothing in the import cell touches the GPU.
os.environ.update(CUDA_VISIBLE_DEVICES="0")


In [3]:
# parser = argparse.ArgumentParser(description='Generic runner model')
# parser.add_argument('--config',  '-c',
#                     dest="filename",
#                     metavar='FILE',
#                     help =  'path to the config file',
#                     default='config/train.yaml')

config_path = "config/train.yaml"

# Load the experiment configuration. A parse failure is re-raised with the
# file name attached instead of being printed and ignored: every later cell
# indexes into `config`, so continuing without it only produces a confusing
# NameError further down the notebook.
with open(config_path, 'r') as file:
    try:
        config = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        raise RuntimeError(f"Could not parse config file '{config_path}': {exc}") from exc

# yaml.safe_load returns None for an empty document; the cells below expect
# a mapping with 'experiment', 'dataset' and 'model' sections.
if not isinstance(config, dict):
    raise ValueError(
        f"Config file '{config_path}' must contain a YAML mapping, "
        f"got {type(config).__name__}"
    )

# Fix the RNG seeds for torch and numpy so runs are repeatable.
seed = 1234
torch.manual_seed(seed)
np.random.seed(seed)

In [4]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("DEVICE: ", device)

# Hyper-parameters and output location, all read from the 'experiment'
# section of the loaded config.
batch_size, epochs, lr, logs_path = (
    config['experiment'][key]
    for key in ('batch_size', 'epochs', 'learning_rate', 'logs_path')
)

# Network input resolution as width x height.
# NOTE(review): presumably in pixels and matching the NYU frames -- confirm.
net_w, net_h = 640, 480

DEVICE:  cuda


In [5]:
# Train / validation splits, both read from the same data root in the config.
train_dataset = NYUDataset(config['dataset']['data_path'], type='train')
val_dataset = NYUDataset(config['dataset']['data_path'], type='val')

# Four loader workers per visible GPU; load in the main process on CPU-only hosts.
num_workers = 4 * torch.cuda.device_count() if torch.cuda.is_available() else 0

# Settings shared by both loaders.
loader_kwargs = dict(
    batch_size=config['experiment']['batch_size'],
    # Pinned host memory only helps host-to-GPU copies; without CUDA it just warns.
    pin_memory=torch.cuda.is_available(),
    num_workers=num_workers,
)
if num_workers > 0:
    # Batches loaded ahead of time by each worker. These are held in host
    # memory, so tune this to the available RAM. DataLoader rejects
    # prefetch_factor when num_workers == 0, hence the guard.
    loader_kwargs['prefetch_factor'] = 10

# Only the training data is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

 

Found 1004 images in train folder.
Found 1004 images in val folder.


In [6]:
# LightningModule wrapping the DPT depth network, configured from the YAML config.
model = DPT(config)

# TensorBoard run directory: <logs_path>/finetune/version_N.
logger = TensorBoardLogger(
    save_dir=logs_path,
    name='finetune',
    log_graph=True,
)

# Checkpoint policy: one file every 5 epochs, named after the epoch.
# save_top_k=-1 disables pruning so every saved checkpoint is kept (see the
# Lightning ModelCheckpoint docs).
# NOTE(review): this callback only takes effect once it is handed to the
# Trainer through `callbacks=[...]`.
model_ckpt = ModelCheckpoint(
    filename='dpt-finetune-{epoch}',
    save_top_k=-1,
    every_n_epochs=5,
)


In [7]:
# Resume from the configured checkpoint only when the config asks for it.
if config['model']['load_ckpt']:
    path = config['model']['model_path']
else:
    path = None

if torch.cuda.is_available():
    # A single Trainer definition covers one or many GPUs. Previously the
    # multi-GPU branch computed `path` but never passed it on, so resuming
    # silently did nothing there, and the `model_ckpt` callback was never
    # registered on any Trainer.
    trainer = pl.Trainer(resume_from_checkpoint=path,
                         gpus=torch.cuda.device_count(),
                         max_epochs=model.epochs,
                         logger=logger,
                         callbacks=[model_ckpt],
                         # Skip the pre-training validation sanity check.
                         num_sanity_val_steps=0,
                         # None keeps Lightning's default refresh rate; 0 disables the bar.
                         progress_bar_refresh_rate=None if config['experiment']['verbose'] else 0)
else:
    # CPU fallback: a single-epoch smoke run with default Trainer settings.
    trainer = pl.Trainer(max_epochs=1, logger=logger)

print('Training')


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Training


In [8]:
import matplotlib.pyplot as plt

# Colormap applied by colored_depthmap() to the normalised depth values.
cmap = plt.cm.viridis
def colored_depthmap(depth, d_min=None, d_max=None):
    """Render a 2-D depth map as an RGB image using the module-level ``cmap``.

    Args:
        depth: 2-D array-like (H, W) of depth values.
        d_min: Depth mapped to the low end of the colormap. Defaults to the
            minimum of ``depth``.
        d_max: Depth mapped to the high end of the colormap. Defaults to the
            maximum of ``depth``.

    Returns:
        Float array of shape (H, W, 3): the colormap's RGB channels scaled by
        255 (the alpha channel is dropped).
    """
    # Work in floating point so unsigned-integer depth maps cannot wrap
    # around when d_min is subtracted.
    depth = np.asarray(depth, dtype=float)
    if d_min is None:
        d_min = np.min(depth)
    if d_max is None:
        d_max = np.max(depth)
    span = d_max - d_min
    if span == 0:
        # Constant depth (or d_min == d_max): normalising would be 0/0 and
        # fill the image with NaN. Map everything to the low end instead.
        depth_relative = np.zeros_like(depth)
    else:
        depth_relative = (depth - d_min) / span
    return 255 * cmap(depth_relative)[:,:,:3] # H, W, C



In [12]:
# import cv2
# cv2.setNumThreads(0)
# from dpt.models import DPTDepthModel
# test_model = DPTDepthModel(
#             path= config['model']['model_path'],
#             scale=0.00006016,
#             shift=0.00579,
#             invert=True,
#             backbone="vitb_rn50_384",
#             non_negative=True,
#             enable_attention_hooks=False,
#             load_ckpt=config['model']['load_ckpt']
        # )


In [10]:
# for inp,dep in train_loader:
#     samp_inp = inp
#     samp_dep = dep
#     yhat = test_model(samp_inp)
#     yhat = torch.unsqueeze(yhat,axis=1)
#     print(torch.max(yhat),torch.max(samp_dep))
#     print(yhat[0,0,0,0])
#     loss = MaskedL1Loss()
#     loss_val = loss(yhat, samp_dep)
#     print(loss_val)

#     break

In [11]:
# Turn off OpenCV's own thread pool.
# NOTE(review): presumably to avoid thread oversubscription or hangs inside
# the DataLoader worker processes -- confirm.
cv2.setNumThreads(0)

# Wall-clock timer around the whole fit.
start = time.time()
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)
elapsed = timedelta(seconds=round(time.time() - start, 2))

print(f'Training completed in {elapsed}')
print(f'Training checkpoints and logs are saved in {trainer.log_dir}')


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(

  | Name     | Type          | Params
-------------------------------------------
0 | criteria | MaskedL1Loss  | 0     
1 | model    | DPTDepthModel | 123 M 
-------------------------------------------
123 M     Trainable params
0         Non-trainable params
123 M     Total params
492.588   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Epoch 0




Validating: 0it [00:00, ?it/s]

--- Epoch 0 training ---
train_loss: 3.871
train_absrel: 0.3385
train_mae: 0.6405
train_delta1: 0.4995
-------------------------
--- Epoch 0 validation ---
val_loss: 1.052
val_absrel: 4.524
val_mae: 10.92
val_delta1: 1.0
-------------------------
Epoch 1


Validating: 0it [00:00, ?it/s]

--- Epoch 1 training ---
train_loss: 0.8086
train_absrel: 0.3054
train_mae: 0.5782
train_delta1: 0.554
-------------------------
--- Epoch 1 validation ---
val_loss: 1.226
val_absrel: 0.3478
val_mae: 1.064
val_delta1: 0.2876
-------------------------
Training completed in 0:11:03.880000
Training checkpoints and logs are saved in /ssd_scratch/cvit/shan/logs/finetune/version_3
Final trained weights saved in model.pt
