# Annotate Homographies
Large-scale homography annotation for extraction of next best views.

In [1]:
import os
import cv2
import copy
import torch
import torchvision
import torchvision.io as io
from torchvision.datasets.video_utils import VideoClips
from torch.utils.data import DataLoader
import sys

sys.path.append('../')

from utils import load_yaml, save_yaml, dictListToCompose, recursiveMethodCallFromDictList

# Server-specific settings; later cells read the dataset location from the
# 'local' entry (server['database']['location']).
servers = load_yaml('../configs/servers.yml')
server = servers['local']

# Cholec80 transform configuration in its original, class-style form
# ('Crop' / 'Resize'). `database` is the first entry of its 'databases' list.
databases = load_yaml('../configs/cholec80_transforms.yml')
database = databases['databases'][0]

In [2]:
# Load transforms and convert them to torchvision.transforms.functional-style
# entries: each class-style key ('Crop', 'Resize') becomes the lower-case
# function name with the keyword arguments that function expects. The converted
# configuration is written to a separate YAML file.
key_dict = {
    'Crop': 'crop',
    'Resize': 'resize'
}


def _to_functional_transform(transform):
    """Convert one class-style transform dict into its functional counterpart.

    Args:
        transform: Mapping from a transform name ('Crop' or 'Resize') to its
            parameters. 'Crop' needs 'shape' and 'top_left_corner', 'Resize'
            needs 'dsize'.

    Returns:
        dict mapping the functional name ('crop' / 'resize') to its keyword
        arguments.

    Raises:
        ValueError: If a transform name other than 'Crop' or 'Resize' is found.
    """
    functional_transform = {}
    for key, value in transform.items():
        if key == 'Crop':
            # shape -> height, width; top_left_corner -> top, left
            functional_transform[key_dict[key]] = {
                'height': value['shape'][0],
                'width': value['shape'][1],
                'top': value['top_left_corner'][0],
                'left': value['top_left_corner'][1]
            }
        elif key == 'Resize':
            # dsize -> size, with the two entries swapped.
            # NOTE(review): presumably dsize is (width, height) while the
            # functional resize expects (height, width) -- confirm.
            functional_transform[key_dict[key]] = {
                'size': value['dsize'][::-1]
            }
        else:
            # Name the offending key so a bad config entry can be located.
            raise ValueError('Key not known: {!r}'.format(key))
    return functional_transform


functional_databases = copy.deepcopy(databases)
for db_idx, db in enumerate(databases['databases']):
    # Replace the copied class-style transforms with the converted ones while
    # keeping the nesting: one list of transforms per entry of db['transforms'].
    functional_databases['databases'][db_idx]['transforms'] = [
        [_to_functional_transform(transform) for transform in transforms]
        for transforms in db['transforms']
    ]

save_yaml('../configs/cholec80_transforms_functional.yml', functional_databases)

In [2]:
import kornia
import cv2
from kornia import warp_perspective

from utils.processing import image_edges, four_pt_to_matrix_homography_representation
from lightning_modules import DeepImageHomographyEstimationModuleBackbone

# Restore the trained homography estimation backbone from its log directory.
model_prefix = '/home/martin/Tresors/homography_imitation_learning_logs/deep_image_homography_estimation_backbone/version_2'
# alternative run (unsupervised backbone), kept for reference:
# prefix = '/home/martin/Tresors/homography_imitation_learning_logs/unsupervised_deep_homography_estimation_backbone/version_0'
configs = load_yaml(os.path.join(model_prefix, 'configs.yml'))
model = DeepImageHomographyEstimationModuleBackbone.load_from_checkpoint(
    os.path.join(model_prefix, 'checkpoints/epoch=49.ckpt'),
    shape=configs['model']['shape']
)

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    print('Running with CUDA backend.')

# Move the weights to the selected device and switch to evaluation mode.
model = model.to(device).eval()

def forward_model(frame_i, frame_ip1):
    """Estimate the homography between two frame batches and warp the first one.

    Uses the module-level `model` to predict `duv` from the two inputs, turns
    `image_edges(frame_i)` together with `duv` into a matrix `H`, and warps
    `frame_i` with the inverse of `H`, keeping the spatial size of `frame_i`.

    Args:
        frame_i: Frames at time i.
            NOTE(review): the notebook passes float batches in (N, C, H, W)
            layout on the model's device -- confirm for other callers.
        frame_ip1: Frames at time i+1, same layout as `frame_i`.

    Returns:
        Tuple `(wrp, H, duv)`: the warped `frame_i`, the homography matrix/
        matrices, and the raw model output.
        NOTE(review): `duv` is presumably the four-point corner displacement
        that parameterises `H` -- confirm against the helper's definition.
    """
    corner_offsets = model(frame_i, frame_ip1)

    homography = four_pt_to_matrix_homography_representation(
        image_edges(frame_i), corner_offsets
    )

    # Output size is the input's spatial size (last two dimensions).
    out_size = frame_i.shape[-2:]
    warped = warp_perspective(frame_i, torch.inverse(homography), out_size)

    return warped, homography, corner_offsets

Running with CUDA backend.


In [3]:
import pandas as pd
from utils.viz import yt_alpha_blend
import time

# Read back the functional transform configuration and select its first
# database entry.
functional_databases = load_yaml('../configs/cholec80_transforms_functional.yml')
functional_database = functional_databases['databases'][0]

# Assemble video paths from the server and database configs; only the first
# listed file is taken.
paths = []
for video_file in functional_database['videos']['files'][:1]:
    paths.append(
        os.path.join(
            server['database']['location'],
            functional_database['prefix'],
            functional_database['videos']['prefix'],
            video_file
        )
    )

# NOTE(review): the config-derived list above is discarded here in favour of a
# fixed, machine-specific clip -- confirm this override is still intended.
paths = ['/media/martin/Samsung_T5/data/endoscopic_data/cholec80/videos/video01_short.mp4']

# video reader not compiled yet https://github.com/pytorch/vision/issues/1446
# video reading https://github.com/pytorch/vision/blob/ed5b2dc3a5e7411d8b40cc7e526e151983e99cf9/torchvision/datasets/video_utils.py#L45-L69
# dataset example check https://github.com/pytorch/vision/blob/ed5b2dc3a5e7411d8b40cc7e526e151983e99cf9/torchvision/datasets/kinetics.py#L50-L78

N = 100 # already 4 second preview horizon via N = 100 @ full res, image downscale 2x2 -> factor 4, easily 16 seconds preview horizon with 8GB memory
step = 5 # frame distance between the two images of each pair

# Clips hold N+1 frames and start every N frames, so the last frame of one clip
# is the first frame of the next and frame pairs continue across clip borders.
vc = VideoClips(paths, clip_length_in_frames=N+1, frames_between_clips=N)

# Results are gathered as plain dicts and turned into a DataFrame once at the
# end: growing a DataFrame row by row is quadratic, and DataFrame.append no
# longer exists in pandas >= 2.0.
records = []
global_idx = 0 # index of the first frame of the current pair, advanced by `step`
max_clips = 150 # vc.num_clips()
max_clips = int(min(max_clips, vc.num_clips()))

try:
    with torch.no_grad():
        for i in range(max_clips):
            now = time.time_ns()
            video, audio, info, video_idx = vc.get_clip(i)
            print('\nLoading time: {} ms'.format((time.time_ns() - now)/1.e6))

            # Move the channel dimension in front of the two spatial ones, apply
            # the per-video functional transforms, then scale to [0, 1] floats.
            video = video.permute(0, 3, 1, 2)
            transforms = functional_database['transforms'][video_idx]
            video = recursiveMethodCallFromDictList(video, transforms, torchvision.transforms.functional)
            video = video.float()/255.

            # re-sort images into i and i+1 (frames `step` apart)
            frames_i   = video[:-step:step]
            frames_ip1 = video[step::step]

            frames_i, frames_ip1 = frames_i.to(device), frames_ip1.to(device)

            now = time.time_ns()
            wrps, Hs, duvs = forward_model(frames_i, frames_ip1)
            print('Forwarding time: {} ms'.format((time.time_ns() - now)/1.e6))

            # One record per frame pair.
            for H, duv in zip(Hs, duvs):
                records.append({
                    't': global_idx,
                    'duv': duv.squeeze().cpu().numpy(),
                    'H': H.squeeze().cpu().numpy()
                })
                global_idx += step
                print('\r{}/{}'.format(global_idx, max_clips*N), end='')

            # Convert to image arrays for display with OpenCV.
            frames_i = kornia.tensor_to_image(frames_i)
            frames_ip1 = kornia.tensor_to_image(frames_ip1)
            wrps = kornia.tensor_to_image(wrps)

            for idx, w in enumerate(wrps):
                blend = yt_alpha_blend(frames_ip1[idx], w)
                cv2.imshow('blend', blend)
                cv2.imshow('img', frames_i[idx])
                cv2.imshow('wrp', w)
                cv2.waitKey() # blocks until a key is pressed
finally:
    # Close the preview windows even if loading or inference fails midway.
    cv2.destroyAllWindows()

df = pd.DataFrame(records, columns=['t', 'duv', 'H']) # track results
df.to_pickle('H_{}.pkl'.format(step))


100%|██████████| 1/1 [00:01<00:00,  1.66s/it]

Loading time: 121.820683 ms
Forwarding time: 693.775682 ms
25/750
Loading time: 230.959205 ms
Forwarding time: 132.851764 ms
50/750
Loading time: 289.967595 ms
Forwarding time: 130.481043 ms
75/750
Loading time: 339.831279 ms
Forwarding time: 146.180757 ms
100/750
Loading time: 405.020069 ms
Forwarding time: 139.023227 ms
125/750
Loading time: 447.353381 ms
Forwarding time: 134.475889 ms
150/750
Loading time: 519.782804 ms
Forwarding time: 137.43477 ms
175/750
Loading time: 576.220349 ms
Forwarding time: 134.853256 ms
200/750
Loading time: 666.922047 ms
Forwarding time: 133.928237 ms
225/750
Loading time: 719.402735 ms
Forwarding time: 144.60392 ms
250/750
Loading time: 759.020998 ms
Forwarding time: 136.39896 ms
275/750
Loading time: 204.74555 ms
Forwarding time: 141.343443 ms
300/750
Loading time: 269.396871 ms
Forwarding time: 139.789613 ms
325/750
Loading time: 338.602279 ms
Forwarding time: 142.911266 ms
350/750
Loading time: 382.0469