# Homography Regression Qualitative Evaluation
A homography regression network that was trained on synthetically generated camera motion data is evaluated on data where camera motion occurs naturally.

## Load Model

In [1]:
import os
import cv2
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from kornia import warp_perspective, tensor_to_image
from torchvision.transforms import ToTensor

from datasets import ImageSequenceDataset
from lightning_modules import DeepImageHomographyEstimationModuleBackbone
from utils.io import load_yaml
from utils.processing import image_edges, four_pt_to_matrix_homography_representation
from utils.viz import yt_alpha_blend

# Load the best model from its checkpoint, using the configs.yml stored under the same prefix.
prefix = '/home/martin/Tresors/homography_imitation_learning_logs/deep_image_homography_estimation_backbone/version_0'
# prefix = '/home/martin/Tresors/homography_imitation_learning_logs/unsupervised_deep_homography_estimation_backbone/version_0'
configs = load_yaml(os.path.join(prefix, 'configs.yml'))
model = DeepImageHomographyEstimationModuleBackbone.load_from_checkpoint(
    os.path.join(prefix, 'checkpoints/epoch=19.ckpt'),
    shape=configs['model']['shape'],
)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    print('Running with CUDA backend.')

# Move the model to the selected device and put it into evaluation mode.
model = model.to(device).eval()

# Create the datasets and evaluate the model on
#   - the dataset with camera motion (DaVinci)
#   - Cholec80
def eval(img_seq, init_frame=None, init=False, h_memory=torch.eye(3).unsqueeze(0), shape=(640, 480), show_image=True, track=False):
    """Estimate the homography between two frames and build a comparison image.

    The module-level ``model`` predicts the four-point displacement between
    ``img_seq[0]`` and ``img_seq[1]``. The displacement is converted into a
    3x3 homography ``H`` and ``img_seq[0]`` is warped with ``inverse(H)``.
    The result is arranged as a 2x3 grid:

    * top row:    frame 0 | abs(frame 1 - frame 0)  | blend(frame 1, frame 0)
    * bottom row: frame 1 | abs(frame 1 - warped 0) | blend(frame 1, warped 0)

    With ``track=True`` the bottom-left tile instead shows frame 1 warped by
    the accumulated homography, blended with the initial frame.

    NOTE: this function shadows the builtin ``eval``; the name is kept because
    the cells below call it.

    Args:
        img_seq: indexable with two image tensors at positions 0 and 1 (the
            cells below pass one ``DataLoader`` batch with ``batch_size=1``).
            The sequence itself is not modified.
        init_frame: display image of the first frame of the tracked sequence,
            as returned by a previous call. Only used when ``track`` is True.
        init: whether ``init_frame`` has been set already. When False (and
            ``track`` is True) the current frame 0 becomes ``init_frame``.
        h_memory: accumulated homography from previous calls. It is moved to
            the device of the estimated homography before being multiplied.
            Only used when ``track`` is True.
        shape: size every tile is resized to; passed to ``cv2.resize`` as
            ``dsize``, i.e. ``(width, height)``.
        show_image: if True, show the composite with ``cv2.imshow`` and block
            in ``cv2.waitKey()`` until a key is pressed.
        track: if True, accumulate homographies across calls and return the
            tracking state along with the composite.

    Returns:
        ``composite`` if ``track`` is False, otherwise the tuple
        ``(composite, init_frame, init, h_memory)``.
    """

    def to_display(tensor):
        # Tensor -> numpy image, resized to the tile size, with the last
        # (channel) axis reversed -- presumably RGB -> BGR for OpenCV.
        return cv2.resize(tensor_to_image(tensor), shape)[..., ::-1]

    # Local tensors instead of writing back into ``img_seq``: the caller's
    # batch stays untouched and tuples are accepted as well.
    frame0 = img_seq[0].to(device).squeeze()
    frame1 = img_seq[1].to(device).squeeze()

    # Pure inference: skip building an autograd graph for every frame.
    with torch.no_grad():
        duv = model(frame0.unsqueeze(0), frame1.unsqueeze(0))

        uv = image_edges(frame0.unsqueeze(0))
        H = four_pt_to_matrix_homography_representation(uv, duv)

        wrp = warp_perspective(frame0.unsqueeze(0), torch.inverse(H), frame0.shape[-2:])

        if track:
            # The default ``h_memory`` lives on the CPU; move it next to H so
            # the product does not fail when the model runs on CUDA.
            h_memory = h_memory.to(H.device).matmul(H)  # inverse order
            wrp_memory = warp_perspective(frame1.unsqueeze(0), h_memory, frame1.shape[-2:])

    wrp = to_display(wrp)
    img0 = to_display(frame0)
    img1 = to_display(frame1)

    l1_img0_img1 = np.abs(img1 - img0)
    l1_img1_wrp = np.abs(img1 - wrp)
    blend_img0_img1 = yt_alpha_blend(img1, img0)
    blend_img1_wrp = yt_alpha_blend(img1, wrp)

    if track:
        if not init:
            # First call of a tracked sequence: remember frame 0 as reference.
            init_frame = img0
            init = True
        bottom_left = yt_alpha_blend(to_display(wrp_memory), init_frame)
    else:
        bottom_left = img1

    top_row = np.concatenate([img0, l1_img0_img1, blend_img0_img1], axis=1)
    bottom_row = np.concatenate([bottom_left, l1_img1_wrp, blend_img1_wrp], axis=1)
    composite = np.concatenate([top_row, bottom_row], axis=0)

    if show_image:
        cv2.imshow('composite', composite)
        cv2.waitKey()

    if track:
        return composite, init_frame, init, h_memory
    return composite


Running with CUDA backend.


## With Camera Motion - DaVinci

In [2]:
# motion dataset (sequences with camera motion)
prefix = '/media/martin/Samsung_T5/data/endoscopic_data/camera_motion_separated_png/with_camera_motion'
df_name = 'log_with_camera_motion_seq_len_2.pkl'
df = pd.read_pickle(os.path.join(prefix, df_name))
ds = ImageSequenceDataset(df, prefix, ToTensor())
dl = DataLoader(ds, batch_size=1)

# Tracking state that is threaded through successive eval() calls.
init_frame = None
init = False
# Batched identity (1x3x3), matching the seed used by the video-writing cell
# instead of relying on broadcasting in the first matmul.
h_memory = torch.eye(3, device=device).unsqueeze(0)
try:
    for img_seq in dl:
        # Each call shows the composite and waits for a key press.
        composite, init_frame, init, h_memory = eval(img_seq, init_frame, init, h_memory, show_image=True, track=True)
finally:
    # Close the HighGUI window even when the loop is interrupted.
    cv2.destroyAllWindows()

## With Camera Motion - Cholec80
### Image Shapes

In [2]:
# cholec80, analyze image shape: show the first frame of every listed video
cholec_configs = load_yaml('configs/cholec80_transforms.yml')

database_prefix = '/media/martin/Samsung_T5/data/endoscopic_data'

try:
    for database in cholec_configs['databases']:
        for file in database['videos']['files']:
            path = os.path.join(database_prefix, database['prefix'], database['videos']['prefix'], file)
            vc = cv2.VideoCapture(path)
            try:
                success, img = vc.read()
            finally:
                # Only the first frame is needed; free the capture right away.
                vc.release()

            if not success:
                # read() returns (False, None) when no frame could be decoded;
                # passing None to imshow would raise.
                print('Could not read a frame from', path)
                continue

            cv2.imshow('img', img)
            cv2.waitKey()
finally:
    # Close the HighGUI window even when the loop is interrupted.
    cv2.destroyAllWindows()

### Homography Estimation

In [2]:
# cholec80, evaluate model
prefix = '/media/martin/Samsung_T5/data/endoscopic_data/tmp'
df_name = 'cholec80_seq_len_2.pkl'
df = pd.read_pickle(os.path.join(prefix, df_name))
ds = ImageSequenceDataset(df, prefix, ToTensor())
dl = DataLoader(ds, batch_size=1)

# Timing note: 44.7 s ± 419 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) (divide by 1000)
# -> in between ~ 22 and 50 fps

# Set track to True to accumulate homographies relative to the first frame.
track = False
init_frame, init = None, False
h_memory = torch.eye(3, device=device)
for img_seq in dl:
    if not track:
        # Pairwise comparison only, shown at a smaller tile size.
        composite = eval(img_seq, shape=(320, 240), show_image=True)
        continue
    composite, init_frame, init, h_memory = eval(img_seq, init_frame, init, h_memory, show_image=True, track=track)

#### Create Video

In [3]:
# cholec80, evaluate model and write the composites to a video file
prefix = '/media/martin/Samsung_T5/data/endoscopic_data/tmp'
df_name = 'cholec80_seq_len_2.pkl'
df = pd.read_pickle(os.path.join(prefix, df_name))
ds = ImageSequenceDataset(df, prefix, ToTensor())
dl = DataLoader(ds, batch_size=1)

# 44.7 s ± 419 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) (divide by 1000)
# in between ~ 22 and 50 fps

# Evaluate a single pair up front to learn the composite's (height, width).
shape = eval(next(iter(dl)), show_image=False, track=False).shape[:2]
# VideoWriter expects frameSize as (width, height).
out = cv2.VideoWriter('composite_stride_10.avi', fourcc=cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), fps=25, frameSize=(shape[1], shape[0]))
if not out.isOpened():
    # A writer that failed to open drops every frame silently.
    raise RuntimeError('Could not open composite_stride_10.avi for writing.')

init_frame = None
init = False
h_memory = torch.eye(3, device=device).unsqueeze(0)
try:
    for img_seq in dl:
        composite, init_frame, init, h_memory = eval(img_seq, init_frame, init, h_memory, show_image=False, track=True)
        # Scale the float composite to 8 bit; clip first so that any value
        # outside [0, 1] saturates instead of wrapping around in the cast.
        composite = np.clip(composite*255, 0, 255).astype(np.uint8)
        out.write(composite)
finally:
    # Finalize the file even when the loop fails part-way through.
    out.release()

# Image Reflections
Generate masks of reflections by thresholding images

In [5]:
# # motion dataset
# prefix = '/media/martin/Samsung_T5/data/endoscopic_data/camera_motion_separated_png/without_camera_motion'
# df_name = 'log_without_camera_motion_seq_len_2.pkl'
# df = pd.read_pickle(os.path.join(prefix, df_name))
# ds = ImageSequenceDataset(df, prefix, ToTensor())
# dl = DataLoader(ds, batch_size=1)

# for img_seq in dl:
#     img0 = tensor_to_image(img_seq[0])
#     cv2.imshow('img0', img0)
#     cv2.waitKey()