# Estimating space and time needed for PDD inference

In [None]:
import numpy as np
import cv2
from PIL import Image
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from math import ceil

from utils.preprocessing import preprocessing_flownet, preprocessing_pwc
from utils.load_models import load_flownet2, load_pwcnet, init_weights
from utils.plotting import flow2img, overlaySegment, showFlow
from utils.layers import warp, warpImage
from utils.encoding import labelMatrixOneHot, dice_coeff

# Expose only the GPU with index 3 to this process. This is set before the
# first torch.cuda query below.
os.environ["CUDA_VISIBLE_DEVICES"] = '3'
# Collect a (device context, device name) pair for every GPU torch can see.
available_gpus = [(torch.cuda.device(i),torch.cuda.get_device_name(i)) for i in range(torch.cuda.device_count())]
print(available_gpus)
# Use the first visible GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression: shown as the cell output when run in a notebook.
device

In [None]:
# Input image size (width, height) used throughout the benchmarks.
W,H = (150,150)
# Feature-map size read by OBELISK2d.forward: a quarter of the input size plus one.
o_m = H//4 +1
o_n = W//4 +1
# Identity sampling grid with o_m*o_n points, reshaped to (1, 1, o_m*o_n, 2).
# OBELISK2d.forward adds its learned offsets to this grid.
# NOTE(review): created with .cuda(), so it needs a GPU and lives on the GPU even
# when a model is moved to the CPU - confirm the intended device for CPU runs.
# NOTE(review): align_corners is not passed here, while meanfield() passes
# align_corners=True - confirm this difference is intended.
ogrid_xy = F.affine_grid(torch.eye(2,3).unsqueeze(0),(1,1,o_m,o_n)).view(1,1,-1,2).cuda()
# The next three names and grid_xy are not referenced by the other code visible in this file.
disp_range = 0.25#0.25
displacement_width = 15#15#11#17
grid_size = 32#25#30
# Module-level value; the functions below recompute their own local disp_hw.
disp_hw = 5
# Side length of the square displacement search window (11*11 candidate displacements).
displace_range = 11
# Identity grid with grid_size*grid_size points, reshaped to (1, grid_size**2, 1, 2).
grid_xy = F.affine_grid(torch.eye(2,3).unsqueeze(0),(1,1,grid_size,grid_size)).view(1,-1,1,2).cuda()


def init_weights(m):
    """Initialise a linear or convolutional layer in place.

    Meant to be passed to ``module.apply(init_weights)``. Layers of type
    ``nn.Linear``, ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d`` or
    ``nn.ConvTranspose2d`` get Xavier-normal weights and, if they have a
    bias, a zero bias. Every other module is left untouched.

    Note: this definition shadows the ``init_weights`` imported from
    ``utils.load_models`` at the top of the file.

    Args:
        m: the module to initialise.
    """
    target_layers = (nn.Linear, nn.Conv3d, nn.ConvTranspose2d, nn.Conv2d, nn.Conv1d)
    if isinstance(m, target_layers):
        # Use the in-place "_" initialisers; the names without the trailing
        # underscore are deprecated aliases.
        nn.init.xavier_normal_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.0)

class OBELISK2d(nn.Module):
    """2D feature extractor built on learned pairs of sampling offsets.

    The input is downsampled by an average pool and a strided convolution.
    The resulting 4-channel map is sampled at two learned offset sets around
    the module-level grid ``ogrid_xy``, and the difference of the two samples
    goes through a small stack of 1x1 / 3x3 convolutions.

    The forward pass reads the module-level names ``ogrid_xy``, ``o_m`` and
    ``o_n`` and reshapes with a leading 1, so it assumes batch size 1.

    Args:
        chan: number of output feature channels (default 16).
    """

    def __init__(self, chan = 16):
        super().__init__()
        width = chan

        # Two sets of 2*chan learnable 2D sampling offsets, drawn with a small scale.
        self.offsets = nn.Parameter(0.05 * torch.randn(2, 2 * width, 2))

        # Stem: one strided convolution producing 4 channels.
        self.layer0 = nn.Conv2d(1, 4, 5, stride=2, bias=False, padding=2)
        self.batch0 = nn.BatchNorm2d(4)

        # Head: 4 stem channels * 2*chan offsets = 8*chan input channels.
        self.layer1 = nn.Conv2d(8 * width, 4 * width, 1, bias=False, groups=1)
        self.batch1 = nn.BatchNorm2d(4 * width)
        self.layer2 = nn.Conv2d(4 * width, 4 * width, 3, bias=False, padding=1)
        self.batch2 = nn.BatchNorm2d(4 * width)
        self.layer3 = nn.Conv2d(4 * width, width, 1)

    def forward(self, input_img):
        """Return the ``(1, chan, o_m, o_n)`` feature map for ``input_img``."""
        pooled = F.avg_pool2d(input_img, 3, padding=1, stride=2)
        stem = F.relu(self.batch0(self.layer0(pooled)))

        # Shift the shared grid by each offset set and sample the stem features.
        grid_first = ogrid_xy + self.offsets[0].view(1, -1, 1, 2)
        sample_first = F.grid_sample(stem, grid_first).view(1, -1, o_m, o_n)
        grid_second = ogrid_xy + self.offsets[1].view(1, -1, 1, 2)
        sample_second = F.grid_sample(stem, grid_second).view(1, -1, o_m, o_n)
        difference = sample_first - sample_second

        hidden = F.relu(self.batch1(self.layer1(difference)))
        hidden = F.relu(self.batch2(self.layer2(hidden)))
        return self.layer3(hidden)

In [None]:
def min_convolution(ssd_distance, displace_range, H, W):
    """Smooth a dense cost volume in displacement space, then in image space.

    This approximates a min-convolution (displacement compatibility) with a
    min filter followed by box filters.

    Args:
        ssd_distance: cost volume that is reshaped to one
            ``displace_range x displace_range`` cost map per pixel; the
            reshape uses a leading 1, so batch size 1 is assumed.
        displace_range: side length of the square displacement window.
        H, W: spatial size of the feature maps the costs were computed on.

    Returns:
        Tensor of shape ``(1, displace_range**2, H, W)`` with the smoothed costs.
    """
    box_filter = nn.AvgPool2d(5, stride=1)
    max_filter = nn.MaxPool2d(3, stride=1)
    # Pad 5 = 1 (3-wide min) + 2 + 2 (two 5-wide boxes): displacement maps keep their size.
    pad_displacement = nn.ReplicationPad2d(5)
    # Pad 6 = 3 * 2 (three 5-wide boxes): spatial maps keep their size.
    pad_spatial = nn.ReplicationPad2d(6)

    # One displacement cost map per pixel: (1, H*W, displace_range, displace_range).
    per_pixel = ssd_distance.permute(0, 2, 3, 1).reshape(1, -1, displace_range, displace_range)

    # The min filter is written as a negated max pool of the negated costs.
    lower_envelope = -max_filter(-pad_displacement(per_pixel))
    regularised = box_filter(box_filter(lower_envelope))

    # Back to one spatial map per displacement, then smooth across pixels.
    result = pad_spatial(regularised.permute(0, 2, 3, 1).view(1, -1, H, W))
    for _ in range(3):
        result = box_filter(result)

    return result

def meanfield(ssd_distance,img_fixed,displace_range,H,W):
    """Turn a cost volume into soft assignments and an expected displacement field.

    The costs are smoothed with ``min_convolution`` and passed through a
    softmax over the displacement candidates (with a factor of -10). The
    displacement at each pixel is the probability-weighted mean of the
    candidate offsets.

    Args:
        ssd_distance: cost volume; its device decides where the result lives.
        img_fixed: not used by this function; kept in the signature for callers.
        displace_range: side length of the square displacement window.
        H, W: spatial size of the cost maps.

    Returns:
        ``(soft_cost, disp_xy)`` where ``soft_cost`` has shape
        ``(H*W, displace_range**2)`` and ``disp_xy`` has shape ``(1, 2, H, W)``.
    """
    target_device = ssd_distance.device

    smoothed_cost = min_convolution(ssd_distance, displace_range, H, W)

    # One row per pixel, one column per displacement candidate.
    n_candidates = displace_range**2
    logits = -10*smoothed_cost.view(n_candidates,-1).t()
    soft_cost = F.softmax(logits,1)

    # Candidate offsets span [-half_range, half_range] and are then divided by
    # half of the (W-1, H-1) extent.
    half_range = (displace_range-1)//2
    identity = torch.eye(2,3).unsqueeze(0)
    candidates = half_range*F.affine_grid(identity,(1,1,displace_range,displace_range),align_corners=True)
    candidates /= torch.Tensor([(W-1)*.5,(H-1)*.5])
    candidates = candidates.view(1,1,1,-1,2).to(target_device)

    # Expected displacement: sum over candidates, then move x/y to the channel axis.
    weights = soft_cost.view(1,H,W,-1,1)
    disp_xy = torch.sum(weights*candidates,3).permute(0,3,1,2)

    return soft_cost,disp_xy

def correlation_layer(displace_range, feat_moving, feat_fixed):
    """Compute a dense sum-of-squared-differences cost volume.

    For every pixel of ``feat_fixed`` and every displacement inside a
    ``displace_range x displace_range`` window, the squared feature difference
    to the displaced (zero-padded) ``feat_moving`` is summed over channels.

    Args:
        displace_range: side length of the square displacement window.
        feat_moving: feature map of shape ``(1, C, H, W)``.
        feat_fixed: feature map of shape ``(1, C, H, W)``. The reshape below
            only works for batch size 1.

    Returns:
        Tensor of shape ``(1, displace_range**2, H, W)``.
    """
    _, C, H, W = feat_fixed.size()
    half_range = (displace_range-1)//2
    window = (displace_range, displace_range)

    # Channels become the batch axis, so unfold yields (C, displace_range**2, H*W).
    patches = F.unfold(feat_moving.transpose(1,0), window, padding=half_range)

    difference = patches - feat_fixed.view(C,1,-1)
    ssd_distance = difference.pow(2).sum(0)

    return ssd_distance.view(1, displace_range**2, H, W)

In [None]:
# Load the trained weights of the 24-channel and the 16-channel OBELISK2d models.
# map_location="cpu" lets a checkpoint saved from GPU tensors load on a CPU-only
# host as well. load_state_dict copies the values into the (CPU) model
# parameters either way, so the loaded models are unchanged.
path_to_state_dict = "models/Experiment_3/14_10_21-16-40/student_0.pth"
model_24 = OBELISK2d(24)
model_24.load_state_dict(torch.load(path_to_state_dict, map_location="cpu"))

path_to_state_dict = "models/Experiment_2/obel16_ensemble_13_10_21-21-30.pth"
model_16 = OBELISK2d(16)
model_16.load_state_dict(torch.load(path_to_state_dict, map_location="cpu"))

# Space usage of different models
Comparing the inference time of models with 16 and 24 feature channels and the baseline algorithm.

In [None]:
from torchsummary import summary

In [None]:
# Summarise the 16-channel model for a single-channel 150x150 dummy input.
# model_16.cuda() moves the model to the GPU in place before it is passed on.
# NOTE(review): the model is on CUDA while device='cpu' is passed to summary -
# confirm this combination is intended.
summary(model_16.cuda(), torch.zeros(1,1,150,150), verbose=False, device='cpu')

In [None]:
# Summarise the 24-channel model for a single-channel 150x150 dummy input.
# model_24.cuda() moves the model to the GPU in place before it is passed on.
# NOTE(review): the model is on CUDA while device='cpu' is passed to summary -
# confirm this combination is intended.
summary(model_24.cuda(), torch.zeros(1,1,150,150), verbose=False, device='cpu')

# Time 

In [None]:
import torch.utils.benchmark as benchmark
import time

Time of a 24 feature channel model

In [None]:
# Benchmark one full PDD inference pass (two feature extractions, correlation,
# mean-field step, upsampling to 150x150) with the 24-channel model on the CPU.
# The spatial size passed to meanfield is H//4 + 1 (= o_m), which is the size of
# the feature maps OBELISK2d.forward returns. Plain H//4 does not match them.
# NOTE(review): OBELISK2d.forward adds the module-level ogrid_xy, which is
# created with .cuda() - confirm the grid is on the CPU for this CPU run.
t0 = benchmark.Timer(
    stmt=(
        'feat1 = model(img1)\n'
        'feat2 = model(img2)\n'
        'ssd_distance = correlation_layer(displace_range, feat2, feat1)\n'
        'soft_cost,disp_xy = meanfield(ssd_distance, img1, displace_range, H//4 + 1, W//4 + 1)\n'
        'flow=interpolate(disp_xy,size=(150,150))'
    ),
    globals={'model': model_24.cpu(),
             'img1': torch.rand(1,1,150,150).cpu(),
             'img2': torch.rand(1,1,150,150).cpu(),
             'correlation_layer': correlation_layer,
             'displace_range': displace_range,
             'meanfield': meanfield,
             'H': 150,
             'W': 150,
             'interpolate': F.interpolate})
t0.timeit(100)

In [None]:
# Time 100 individual inference passes of the 24-channel model by hand, so the
# per-run times can be plotted together with their mean.
times = []
for i in range(100):

    # create pseudo imgs
    img1 = torch.rand(1,1,150,150)
    img2 = torch.rand(1,1,150,150)

    # measure time; perf_counter is a monotonic, high-resolution clock, unlike
    # time.time(), which follows the adjustable wall clock
    start = time.perf_counter()

    # forward pass
    feat1 = model_24(img2)
    feat2 = model_24(img1)
    ssd_distance = correlation_layer(displace_range, feat2, feat1)
    # H//4 + 1 (= o_m) is the feature-map size produced by OBELISK2d.forward;
    # plain H//4 does not match it.
    soft_cost,disp_xy = meanfield(ssd_distance, img1, displace_range, H//4 + 1, W//4 + 1)
    # scaling
    flow=F.interpolate(disp_xy,size=(150,150))

    # end measurement
    end = time.perf_counter()
    times.append(end-start)

In [None]:
# Scatter the 100 measured run times and overlay their mean as a red line.
mean_time = np.mean(times)
run_index = np.arange(100)
plt.scatter(x=run_index, y=times)
plt.plot(np.tile(mean_time, 100), color='r', label=f'mean = {round(mean_time, 5)}')
plt.legend()
plt.savefig('plots/runtime_model24_cpu.png')

Time of a 16 feature channel model

In [None]:
# Benchmark one full PDD inference pass (two feature extractions, correlation,
# mean-field step, upsampling to 150x150) with the 16-channel model on the CPU.
# The second feature extraction uses img2, as in the 24-channel benchmark; the
# earlier statement ran the model on img1 twice and never used img2.
# The spatial size passed to meanfield is H//4 + 1 (= o_m), which is the size of
# the feature maps OBELISK2d.forward returns. Plain H//4 does not match them.
# NOTE(review): OBELISK2d.forward adds the module-level ogrid_xy, which is
# created with .cuda() - confirm the grid is on the CPU for this CPU run.
t0 = benchmark.Timer(
    stmt=(
        'feat1 = model(img1)\n'
        'feat2 = model(img2)\n'
        'ssd_distance = correlation_layer(displace_range, feat2, feat1)\n'
        'soft_cost,disp_xy = meanfield(ssd_distance, img1, displace_range, H//4 + 1, W//4 + 1)\n'
        'flow=interpolate(disp_xy,size=(150,150))'
    ),
    globals={'model': model_16.cpu(),
             'img1': torch.rand(1,1,150,150).cpu(),
             'img2': torch.rand(1,1,150,150).cpu(),
             'correlation_layer': correlation_layer,
             'displace_range': displace_range,
             'meanfield': meanfield,
             'H': 150,
             'W': 150,
             'interpolate': F.interpolate})
t0.timeit(100)

In [None]:
# Time 100 individual inference passes of the 16-channel model by hand, so the
# per-run times can be plotted together with their mean.
times = []
for i in range(100):

    # create pseudo imgs
    img1 = torch.rand(1,1,150,150)
    img2 = torch.rand(1,1,150,150)

    # measure time; perf_counter is a monotonic, high-resolution clock, unlike
    # time.time(), which follows the adjustable wall clock
    start = time.perf_counter()

    # forward pass
    feat1 = model_16(img2)
    feat2 = model_16(img1)
    ssd_distance = correlation_layer(displace_range, feat2, feat1)
    # H//4 + 1 (= o_m) is the feature-map size produced by OBELISK2d.forward;
    # plain H//4 does not match it.
    soft_cost,disp_xy = meanfield(ssd_distance, img1, displace_range, H//4 + 1, W//4 + 1)
    # scaling
    flow=F.interpolate(disp_xy,size=(150,150))

    # end measurement
    end = time.perf_counter()
    times.append(end-start)

In [None]:
# Scatter the 100 measured run times and overlay their mean as a red line.
mean_time = np.mean(times)
run_index = np.arange(100)
plt.scatter(x=run_index, y=times)
plt.plot(np.tile(mean_time, 100), color='r', label=f'mean = {round(mean_time, 5)}')
plt.legend()
plt.savefig('plots/runtime_model16_cpu.png')

Time of the baseline

In [None]:
# Baseline: OpenCV's Dual TV-L1 optical flow with its default settings.
baseline = cv2.optflow.DualTVL1OpticalFlow_create()
# Report the iteration counts the baseline is configured with.
print("Inner iterations: ", baseline.getInnerIterations())
print("Outer iterations: ", baseline.getOuterIterations())

In [None]:
# Benchmark the baseline on two random 150x150x1 float32 images with num_threads=10.
# NOTE(review): num_threads presumably only sets PyTorch's own thread pool and
# may not affect OpenCV's threading - confirm before comparing thread counts.
t0 = benchmark.Timer(
    stmt='calc(in1,in2,None)',
    num_threads=10,
    globals={'calc': baseline.calc, 
            'in1': np.random.uniform(size=(150,150,1)).astype(np.float32),
             'in2': np.random.uniform(size=(150,150,1)).astype(np.float32)})
t0.timeit(100)

In [None]:
# Benchmark the baseline on two random 150x150x1 float32 images with the
# Timer's default thread setting (num_threads is not passed).
t0 = benchmark.Timer(
    stmt='calc(in1,in2,None)',
    globals={'calc': baseline.calc, 
            'in1': np.random.uniform(size=(150,150,1)).astype(np.float32),
             'in2': np.random.uniform(size=(150,150,1)).astype(np.float32)})
t0.timeit(100)

In [None]:
# Benchmark the baseline on two random 150x150x1 float32 images with num_threads=50.
# NOTE(review): num_threads presumably only sets PyTorch's own thread pool and
# may not affect OpenCV's threading - confirm before comparing thread counts.
t0 = benchmark.Timer(
    stmt='calc(in1,in2,None)',
    num_threads=50,
    globals={'calc': baseline.calc, 
            'in1': np.random.uniform(size=(150,150,1)).astype(np.float32),
             'in2': np.random.uniform(size=(150,150,1)).astype(np.float32)})
t0.timeit(100)

In [None]:
# Time 100 individual baseline runs by hand, so the per-run times can be
# plotted together with their mean.
times = []
for i in range(100):

    # create pseudo imgs
    img1 = np.random.uniform(size=(150,150,1)).astype(np.float32)
    img2 = np.random.uniform(size=(150,150,1)).astype(np.float32)

    # measure time; perf_counter is a monotonic, high-resolution clock, unlike
    # time.time(), which follows the adjustable wall clock
    start = time.perf_counter()
    flow = baseline.calc(img1, img2, None)

    # end measurement
    end = time.perf_counter()
    times.append(end-start)

In [None]:
# Scatter the 100 measured run times and overlay their mean as a red line.
mean_time = np.mean(times)
run_index = np.arange(100)
plt.scatter(x=run_index, y=times)
plt.plot(np.tile(mean_time, 100), color='r', label=f'mean = {round(mean_time, 5)}')
plt.legend()
plt.savefig('plots/runtime_baseline_cpu.png')

# Verdict
Running it on the server on Tue 26.10.2021 around 22:20 - 22:25 with no other thread using the CPUs and GPUs.

I am getting 358 ms with 10 threads, 365 ms with 1 thread and 362 ms with 50 threads for DualTVL1, and 91.05 ms and 74 ms for the two PDD-Nets with 24 and 16 feature channels, respectively.
With GPU acceleration, these numbers drop to 2.8 and 2.1 ms respectively. 
Taking the mean of about 80ms, and 360ms for the baseline, the PDD-Net is about 4.5 times faster in computation. 

In [None]:
import time

In [None]:
# Freshly initialised (untrained) 16-channel OBELISK2d model, moved to the GPU.
model = OBELISK2d(16)
model.cuda()

# Plain convolutional feature extractor used as the comparison for OBELISK2d.
# It has two stride-2 convolutions and ends in 16 sigmoid-activated channels.
# By the convolution arithmetic, a 150x150 input yields a 38x38 map, the same
# size as o_m x o_n.
seq = torch.nn.Sequential(torch.nn.Conv2d(1,32,kernel_size=5,stride=2,padding=4,dilation=2),
                          torch.nn.BatchNorm2d(32),
                          torch.nn.PReLU(),
                          torch.nn.Conv2d(32,32,kernel_size=3,stride=1,padding=1,dilation=1),
                          torch.nn.BatchNorm2d(32),
                          torch.nn.PReLU(),
                          torch.nn.Conv2d(32,64,kernel_size=3,stride=2,padding=1,dilation=1),
                          torch.nn.BatchNorm2d(64),
                          torch.nn.PReLU(),
                          torch.nn.Conv2d(64,16,kernel_size=1,stride=1,padding=0,dilation=1),
                          torch.nn.Sigmoid())
# Move the comparison network to the GPU as well.
seq.cuda()

In [None]:
# Compare the GPU run time (in ms) of the OBELISK2d model against the plain
# convolutional network. For each of 100 iterations the first feature map is
# computed outside the timed region. The timed region covers the second feature
# extraction, the correlation layer and the mean-field step.
#
# CUDA kernels are launched asynchronously, so the GPU is synchronised before
# the clock starts and again before it stops. Without that, the interval could
# end before the queued GPU work has finished.
obel_time = []
seq_time = []
for i in range(100):
    in1 = torch.rand(1,1,150,150)
    in2 = torch.rand(1,1,150,150)

    feat1 = model(in1.cuda())
    torch.cuda.synchronize()  # finish the untimed warm-up work first
    start = time.perf_counter()
    feat2 = model(in2.cuda())
    ssd_distance = correlation_layer(displace_range, feat1, feat2)
    soft_cost,disp_xy = meanfield(ssd_distance, in1, displace_range, H//4 +1, W//4 +1)
    torch.cuda.synchronize()  # wait for all timed GPU work to complete
    end = time.perf_counter()

    obel_time.append((end-start)*1000)


    in1 = torch.rand(1,1,150,150)
    in2 = torch.rand(1,1,150,150)

    feat1 = seq(in1.cuda())
    torch.cuda.synchronize()  # finish the untimed warm-up work first
    start = time.perf_counter()

    feat2 = seq(in2.cuda())
    ssd_distance = correlation_layer(displace_range, feat1, feat2)
    soft_cost,disp_xy = meanfield(ssd_distance, in1, displace_range, H//4 +1, W//4 +1)
    torch.cuda.synchronize()  # wait for all timed GPU work to complete
    end = time.perf_counter()

    seq_time.append((end-start)*1000)

In [None]:
# Scatter the per-iteration run times of both feature extractors in one figure.
# Each legend entry shows the mean time rounded to three decimals.
fonts = dict(fontsize=22, family='Latin Modern Roman')
obel_mean = round(np.mean(obel_time), 3)
seq_mean = round(np.mean(seq_time), 3)
iteration_index = np.arange(100)

plt.figure(figsize=(12,8))
plt.scatter(iteration_index, obel_time, label=f'Obelisk: {obel_mean}', alpha=0.5)
plt.scatter(iteration_index, seq_time, label=f'Sequential: {seq_mean}', alpha=0.5)
plt.xlabel('# of Iterations', fontdict=fonts)
plt.ylabel('Time [ms]', fontdict=fonts)
plt.legend(fontsize=20)
plt.show()