In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import BasicConvLSTMCell

import torchvision
# Multiplier applied to the output of the final 'disp1' layer in the depth networks below.
DISP_SCALING_RESNET50 = 10.0
# Constant offset added to the scaled 'disp1' output in the depth networks below.
MIN_DISP = 0.01


In [4]:
def resize_like(inputs, ref):
    """Crop ``inputs`` so that dimensions 2 and 3 match those of ``ref``.

    Only the leading ``ref.size(2)`` x ``ref.size(3)`` corner is kept; no
    interpolation is performed. For an NCHW tensor these two dimensions are
    height and width.

    Args:
        inputs: 4-D tensor to crop. Must be at least as large as ``ref`` along
            dimensions 2 and 3.
        ref: 4-D tensor whose sizes along dimensions 2 and 3 define the target.

    Returns:
        A view of ``inputs`` with shape
        ``(inputs.size(0), inputs.size(1), ref.size(2), ref.size(3))``.

    Raises:
        AssertionError: if ``inputs`` is smaller than ``ref`` along dimension
            2 or 3 (cropping cannot enlarge a tensor).
    """
    # The parameter is `inputs`; the previous body referenced `input`, which
    # resolves to the Python builtin and has no `.size`.
    assert inputs.size(2) >= ref.size(2) and inputs.size(3) >= ref.size(3), (
        "resize_like can only crop: inputs must be >= ref along dims 2 and 3"
    )

    return inputs[:, :, :ref.size(2), :ref.size(3)]
     

In [5]:
def convLSTM(input, hidden, filters, kernel, scope):
    """Run a single ``BasicConvLSTMCell`` step on ``input``.

    A new cell is constructed on every call from ``input.shape[1]``,
    ``input.shape[2]``, ``kernel`` and ``filters``. When ``hidden`` is ``None``
    the state is initialised with ``cell.zero_state(batch).float()`` where
    ``batch`` is ``input.shape[0]``.

    Args:
        input: Input tensor for this step; only ``.shape`` is read here before
            it is handed to the cell.
        hidden: Previous cell state, or ``None`` to start from the cell's zero
            state.
        filters: Passed to the cell constructor as its third argument.
        kernel: Passed to the cell constructor as its second argument.
        scope: Unused; kept so existing call sites keep working.

    Returns:
        ``(y_, hidden)``: the two values returned by calling the cell, i.e.
        the step output and the *updated* state.
    """
    # `.shape` works for PyTorch tensors, which have no `get_shape()` method.
    # NOTE(review): dims 1 and 2 are height/width only for an NHWC tensor; for
    # NCHW they would be dims 2 and 3 -- confirm against BasicConvLSTMCell.
    # NOTE(review): if the cell owns learnable weights they are re-created on
    # every call and never shared across time steps -- TODO confirm intent.
    cell = BasicConvLSTMCell.BasicConvLSTMCell([input.shape[1], input.shape[2]], kernel, filters)

    if hidden is None:
        hidden = cell.zero_state(input.shape[0]).float()

    # Rebind `hidden` to the state produced by the cell; the previous code
    # assigned it to a misspelled name and returned the stale input state.
    y_, hidden = cell(input, hidden)

    return y_, hidden

In [9]:
def rnn_depth_net_decoderlstm(self, current_input, hidden_state, is_training=True):
    """Depth network: convolutional encoder, ConvLSTM decoder.

    As written, the graph is:

    * seven encoder stages, each a stride-2 3x3 conv followed by a stride-1
      3x3 conv with ``rate=2`` (``cnv1``/``cnv1b`` ... ``cnv7``/``cnv7b``);
    * seven decoder stages, each a stride-2 transposed conv followed by a
      ``convLSTM`` step; stages 7 down to 2 first concatenate the matching
      encoder feature map, and stages 7, 6 and 5 crop with ``resize_like``
      before concatenating;
    * a final ``disp1`` layer whose output is multiplied by
      ``DISP_SCALING_RESNET50`` and offset by ``MIN_DISP``.

    Args:
        self: Unused. NOTE(review): this is a module-level function, so the
            leading ``self`` looks like a leftover -- confirm with callers.
        current_input: Input to the first convolution.
        hidden_state: Indexable with entries 0..6; entry ``k`` is passed to
            the ``convLSTM`` of decoder stage ``k + 1`` as its previous state.
        is_training: Unused.

    Returns:
        ``(depth, [hidden1, ..., hidden7])`` -- the scaled ``disp1`` output and
        the seven state values returned by the ``convLSTM`` calls.

    NOTE(review): the layer calls below are still in TF-slim style and will
    raise under PyTorch until ported:
      * ``nn.Conv2d`` / ``nn.ConvTranspose2d`` are module constructors taking
        ``(in_channels, out_channels, kernel_size, ...)``; here they receive a
        tensor as first argument plus ``rate``/``scope``/``activation``
        keywords that PyTorch does not accept (``rate`` corresponds to
        ``dilation``).
      * ``nn.sigmoid`` does not exist (``torch.sigmoid`` / ``nn.Sigmoid``).
      * concatenation uses ``axis=3`` while ``resize_like`` crops dims 2 and
        3 -- the intended tensor layout (NHWC vs NCHW) needs confirming.
    """
    def forward():
        # Encoder: stride-2 conv followed by a rate-2 conv at each scale.
        cnv1  = nn.Conv2d(current_input, 32, [3, 3], stride=2, scope='cnv1')
        cnv1b = nn.Conv2d(cnv1,  32,  [3, 3], rate=2, stride=1, scope='cnv1b')
        cnv2  = nn.Conv2d(cnv1b, 64,  [3, 3], stride=2, scope='cnv2')
        cnv2b = nn.Conv2d(cnv2,  64,  [3, 3], rate=2, stride=1, scope='cnv2b')
        cnv3  = nn.Conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
        cnv3b = nn.Conv2d(cnv3,  128, [3, 3], rate=2, stride=1, scope='cnv3b')
        cnv4  = nn.Conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
        cnv4b = nn.Conv2d(cnv4,  256, [3, 3], rate=2, stride=1, scope='cnv4b')
        cnv5  = nn.Conv2d(cnv4b, 256, [3, 3], stride=2, scope='cnv5')
        cnv5b = nn.Conv2d(cnv5,  256, [3, 3], rate=2, stride=1, scope='cnv5b')
        cnv6  = nn.Conv2d(cnv5b, 256, [3, 3], stride=2, scope='cnv6')
        cnv6b = nn.Conv2d(cnv6,  256, [3, 3], rate=2, stride=1, scope='cnv6b')
        cnv7  = nn.Conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
        cnv7b = nn.Conv2d(cnv7,  512, [3, 3], rate=2, stride=1, scope='cnv7b')

        # Decoder: upsample, merge with the encoder skip, then one ConvLSTM step.
        upcnv7 = nn.ConvTranspose2d(cnv7b, 256, [3, 3], stride=2, scope='upcnv7')
        # There might be dimension mismatch due to uneven down/up-sampling
        upcnv7 = resize_like(upcnv7, cnv6b)
        i7_in  = torch.cat((upcnv7, cnv6b), axis=3)
        icnv7, hidden7 = convLSTM(i7_in, hidden_state[6], 256, [3, 3], scope='icnv7_lstm')

        upcnv6 = nn.ConvTranspose2d(icnv7, 128, [3, 3], stride=2, scope='upcnv6')
        upcnv6 = resize_like(upcnv6, cnv5b)
        i6_in  = torch.cat((upcnv6, cnv5b), axis=3)
        icnv6, hidden6 = convLSTM(i6_in, hidden_state[5], 128, [3, 3], scope='icnv6_lstm')

        upcnv5 = nn.ConvTranspose2d(icnv6, 128, [3, 3], stride=2, scope='upcnv5')
        upcnv5 = resize_like(upcnv5, cnv4b)
        i5_in  = torch.cat((upcnv5, cnv4b), axis=3)
        icnv5, hidden5 = convLSTM(i5_in, hidden_state[4], 128, [3, 3], scope='icnv5_lstm')

        # NOTE(review): stages 4..2 concatenate without a resize_like crop,
        # unlike stages 7..5 -- confirm that sizes always match here.
        upcnv4 = nn.ConvTranspose2d(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
        i4_in  = torch.cat((upcnv4, cnv3b), axis=3)
        icnv4, hidden4 = convLSTM(i4_in, hidden_state[3], 128, [3, 3], scope='icnv4_lstm')

        upcnv3 = nn.ConvTranspose2d(icnv4, 64,  [3, 3], stride=2, scope='upcnv3')
        i3_in  = torch.cat((upcnv3, cnv2b), axis=3)
        icnv3, hidden3 = convLSTM(i3_in, hidden_state[2], 64, [3, 3], scope='icnv3_lstm')

        upcnv2 = nn.ConvTranspose2d(icnv3, 32,  [3, 3], stride=2, scope='upcnv2')
        i2_in  = torch.cat((upcnv2, cnv1b), axis=3)
        icnv2, hidden2 = convLSTM(i2_in, hidden_state[1], 32, [3, 3], scope='icnv2_lstm')

        # Last stage has no encoder skip to merge with.
        upcnv1 = nn.ConvTranspose2d(icnv2, 16,  [3, 3], stride=2, scope='upcnv1')
        icnv1, hidden1 = convLSTM(upcnv1, hidden_state[0], 16, [3, 3], scope='icnv1_lstm')
        depth = nn.Conv2d(icnv1, 1, [3, 3], stride=1, activation=nn.sigmoid, scope='disp1') * DISP_SCALING_RESNET50 + MIN_DISP

        return depth, [hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7]

    # The nested graph was previously defined but never invoked, so the
    # function always returned None.
    return forward()


In [1]:
def pose_net(posenet_inputs, hidden_state, is_training=True):
    """Pose network: seven conv + ConvLSTM stages followed by a 6-channel head.

    As written, each stage applies a stride-2 convolution (kernel 7, 5, then
    3) followed by a ``convLSTM`` step. A final 1x1 convolution produces 6
    channels; the result is averaged over dimensions 1 and 2, reshaped to
    ``(-1, 6)`` and multiplied by 0.01.

    Args:
        posenet_inputs: Input to the first convolution.
        hidden_state: Indexable with entries 0..6; entry ``k`` is the previous
            state for the ``convLSTM`` of stage ``k + 1``.
        is_training: Unused.

    Returns:
        ``(pose_final, [hidden1, ..., hidden7])`` -- the scaled pose output and
        the seven state values returned by the ``convLSTM`` calls.

    NOTE(review): the layer calls below are still in TF-slim style and will
    raise under PyTorch until ported:
      * ``nn.Conv2d`` is a module constructor taking
        ``(in_channels, out_channels, kernel_size, stride, ...)``; here it
        receives a tensor as first argument, and ``activation`` is not an
        accepted keyword.
      * averaging over dims ``[1, 2]`` yields one 6-vector per sample only
        for an NHWC tensor; for NCHW the spatial dims are 2 and 3 -- confirm
        the intended layout.
      * the conv output widths (32, 64, 128, 256, 256, 256 for stages 2-7)
        differ from the ``filters`` given to the following ``convLSTM``
        (64, 128, 256, 256, 256, 512) -- presumably intentional, verify.
    """
    def forward():
        conv1  = nn.Conv2d(posenet_inputs, 16,  7, 2)
        cnv1b, hidden1 = convLSTM(conv1, hidden_state[0], 16, [3, 3], scope='cnv1_lstm')
        conv2  = nn.Conv2d(cnv1b, 32,  5, 2)
        cnv2b, hidden2 = convLSTM(conv2, hidden_state[1], 64, [3, 3], scope='cnv2_lstm')
        conv3  = nn.Conv2d(cnv2b, 64,  3, 2)
        cnv3b, hidden3 = convLSTM(conv3, hidden_state[2], 128, [3, 3], scope='cnv3_lstm')
        conv4  = nn.Conv2d(cnv3b, 128, 3, 2)
        cnv4b, hidden4 = convLSTM(conv4, hidden_state[3], 256, [3, 3], scope='cnv4_lstm')
        conv5  = nn.Conv2d(cnv4b, 256, 3, 2)
        cnv5b, hidden5 = convLSTM(conv5, hidden_state[4], 256, [3, 3], scope='cnv5_lstm')
        conv6  = nn.Conv2d(cnv5b, 256, 3, 2)
        cnv6b, hidden6 = convLSTM(conv6, hidden_state[5], 256, [3, 3], scope='cnv6_lstm')
        conv7  = nn.Conv2d(cnv6b, 256, 3, 2)
        cnv7b, hidden7 = convLSTM(conv7, hidden_state[6], 512, [3, 3], scope='cnv7_lstm')

        # 6-channel head, averaged down to one 6-vector per sample and scaled.
        pose_pred = nn.Conv2d(cnv7b, 6, 1, 1, activation=None)
        pose_avg = torch.mean(pose_pred, [1, 2])
        pose_final = torch.reshape(pose_avg, [-1, 6]) * 0.01

        return pose_final, [hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7]

    # The nested graph was previously defined but never invoked, so the
    # function always returned None.
    return forward()

In [2]:
def rnn_depth_net_encoderlstm(current_input, hidden_state, is_training=True):
    """Depth network: ConvLSTM encoder, plain convolutional decoder.

    As written, the graph is:

    * seven encoder stages, each a stride-2 3x3 conv followed by a
      ``convLSTM`` step (``cnvK`` / ``cnvK_lstm``); the ConvLSTM takes the
      place of the second, rate-2 convolution that earlier slim code used at
      each stage;
    * seven decoder stages, each a stride-2 transposed conv followed by a
      stride-1 3x3 conv; stages 7 down to 2 first crop with ``resize_like``
      and concatenate the matching encoder feature map;
    * a final 1x1 ``disp1`` layer whose output is multiplied by
      ``DISP_SCALING_RESNET50`` and offset by ``MIN_DISP``.

    Args:
        current_input: Input to the first convolution.
        hidden_state: Indexable with entries 0..6; entry ``k`` is the previous
            state for the ``convLSTM`` of encoder stage ``k + 1``.
        is_training: Unused.

    Returns:
        ``(depth, [hidden1, ..., hidden7])`` -- the scaled ``disp1`` output and
        the seven state values returned by the ``convLSTM`` calls.

    NOTE(review): the layer calls below are still in TF-slim style and will
    raise under PyTorch until ported:
      * ``nn.Conv2d`` / ``nn.ConvTranspose2d`` are module constructors taking
        ``(in_channels, out_channels, kernel_size, ...)``; here they receive a
        tensor as first argument plus ``scope``/``activation`` keywords that
        PyTorch does not accept.
      * ``nn.sigmoid`` does not exist (``torch.sigmoid`` / ``nn.Sigmoid``).
      * concatenation uses ``axis=3`` while ``resize_like`` crops dims 2 and
        3 -- the intended tensor layout (NHWC vs NCHW) needs confirming.
    """
    def forward():
        # Encoder: stride-2 conv followed by one ConvLSTM step at each scale.
        cnv1  = nn.Conv2d(current_input, 32,  [3, 3], stride=2, scope='cnv1')
        cnv1b, hidden1 = convLSTM(cnv1, hidden_state[0], 32, [3, 3], scope='cnv1_lstm')
        cnv2  = nn.Conv2d(cnv1b, 64,  [3, 3], stride=2, scope='cnv2')
        cnv2b, hidden2 = convLSTM(cnv2, hidden_state[1], 64, [3, 3], scope='cnv2_lstm')
        cnv3  = nn.Conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
        cnv3b, hidden3 = convLSTM(cnv3, hidden_state[2], 128, [3, 3], scope='cnv3_lstm')
        cnv4  = nn.Conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
        cnv4b, hidden4 = convLSTM(cnv4, hidden_state[3], 256, [3, 3], scope='cnv4_lstm')
        cnv5  = nn.Conv2d(cnv4b, 256, [3, 3], stride=2, scope='cnv5')
        cnv5b, hidden5 = convLSTM(cnv5, hidden_state[4], 256, [3, 3], scope='cnv5_lstm')
        cnv6  = nn.Conv2d(cnv5b, 256, [3, 3], stride=2, scope='cnv6')
        cnv6b, hidden6 = convLSTM(cnv6, hidden_state[5], 256, [3, 3], scope='cnv6_lstm')
        cnv7  = nn.Conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
        cnv7b, hidden7 = convLSTM(cnv7, hidden_state[6], 512, [3, 3], scope='cnv7_lstm')

        # Decoder: upsample, crop to the skip's size, concatenate, convolve.
        upcnv7 = nn.ConvTranspose2d(cnv7b, 256, [3, 3], stride=2, scope='upcnv7')
        # There might be dimension mismatch due to uneven down/up-sampling
        upcnv7 = resize_like(upcnv7, cnv6b)
        i7_in  = torch.cat((upcnv7, cnv6b), axis=3)
        icnv7  = nn.Conv2d(i7_in, 256, [3, 3], stride=1, scope='icnv7')

        upcnv6 = nn.ConvTranspose2d(icnv7, 128, [3, 3], stride=2, scope='upcnv6')
        upcnv6 = resize_like(upcnv6, cnv5b)
        i6_in  = torch.cat((upcnv6, cnv5b), axis=3)
        icnv6  = nn.Conv2d(i6_in, 128, [3, 3], stride=1, scope='icnv6')

        upcnv5 = nn.ConvTranspose2d(icnv6, 128, [3, 3], stride=2, scope='upcnv5')
        upcnv5 = resize_like(upcnv5, cnv4b)
        i5_in  = torch.cat([upcnv5, cnv4b], axis=3)
        icnv5  = nn.Conv2d(i5_in, 128, [3, 3], stride=1, scope='icnv5')

        upcnv4 = nn.ConvTranspose2d(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
        upcnv4 = resize_like(upcnv4, cnv3b)
        i4_in  = torch.cat((upcnv4, cnv3b), axis=3)
        icnv4  = nn.Conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')

        upcnv3 = nn.ConvTranspose2d(icnv4, 64,  [3, 3], stride=2, scope='upcnv3')
        upcnv3 = resize_like(upcnv3, cnv2b)
        i3_in  = torch.cat((upcnv3, cnv2b), axis=3)
        icnv3  = nn.Conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')

        upcnv2 = nn.ConvTranspose2d(icnv3, 32,  [3, 3], stride=2, scope='upcnv2')
        upcnv2 = resize_like(upcnv2, cnv1b)
        i2_in  = torch.cat([upcnv2, cnv1b], axis=3)
        icnv2  = nn.Conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')

        # Last stage has no encoder skip to merge with.
        upcnv1 = nn.ConvTranspose2d(icnv2, 16,  [3, 3], stride=2, scope='upcnv1')
        icnv1  = nn.Conv2d(upcnv1, 16,  [3, 3], stride=1, scope='icnv1')
        depth  = nn.Conv2d(icnv1, 1, [1, 1], stride=1, activation=nn.sigmoid, scope='disp1') * DISP_SCALING_RESNET50 + MIN_DISP

        return depth, [hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7]

    # The nested graph was previously defined but never invoked, so the
    # function always returned None.
    return forward()
