# Step by step reconstruction of the model

In [31]:
import numpy as np
import tensorflow as tf
import os
import cv2
import matplotlib.pyplot as plt

%matplotlib inline

In [32]:
def load_data(img_dir, ending):
    return np.array([cv2.imread(os.path.join(img_dir, img)) for img in os.listdir(img_dir) if img.endswith(ending)])


In [33]:
X_ORG = load_data("3Shapes2_large/", "im1.png")
Y_ORG = load_data("3Shapes2_large/","im2.png")

In [34]:
X_ORG.shape

(8015, 128, 128, 3)

In [35]:
X = X_ORG[:10]
Y = Y_ORG[:10]

In [36]:
def motion_encoder(img1, img2, batch_size):
    img1_64 = tf.nn.max_pool(img1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID')
    img2_64 = tf.nn.max_pool(img2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID')
    
    #Motion encoder

    #First convolution: 5x5x96
    weights = tf.Variable(tf.random_normal([5,5,3,96]))
    bias = tf.Variable(tf.zeros([96,]))
    logits = tf.nn.conv2d(img1_64, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.relu(logits)
    
    #Second convolution: 5x5x96
    weights = tf.Variable(tf.random_normal([5,5,96,96]))
    bias = tf.Variable(tf.zeros([96,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.max_pool(logits, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID')
    logits = tf.nn.relu(logits)
    
    #Third convolution: 5x5x128
    weights = tf.Variable(tf.random_normal([5,5,96,128])) # 128
    bias = tf.Variable(tf.zeros([128,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.max_pool(logits, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID')
    logits = tf.nn.relu(logits)
    
    #Forth convolution: 5x5x128
    weights = tf.Variable(tf.random_normal([5,5,128,128]))
    bias = tf.Variable(tf.zeros([128,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='VALID')
    logits = tf.add(logits, bias)
    logits = tf.nn.max_pool(logits, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID')
    logits = tf.nn.relu(logits)

    #Fifth convolution: 5x5x256
    weights = tf.Variable(tf.random_normal([5,5,128,256])) #256
    bias = tf.Variable(tf.zeros([256,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='VALID')
    logits = tf.add(logits, bias)
    logits = tf.nn.max_pool(logits, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID')
    logits = tf.nn.relu(logits)

    #Sixth convolution: 5x5x256
    weights = tf.Variable(tf.random_normal([5,5,256,256])) # TODO change 3 for 256
    bias = tf.Variable(tf.zeros([256,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.relu(logits)
    
    logits = tf.image.resize_images(logits, [64,128])
    
    #flatten
    logits_flatten = tf.reshape(logits, shape=[logits.shape.as_list()[0],-1])
    
    # mean and std TODO: consider using direclty the random_normal with mean=mean and std=std and not using one with 0,1
    mean, std = tf.split(logits_flatten, 2, axis=1)
    epsilon = tf.random_normal(mean.shape.as_list(),0,1, dtype=tf.float32)
    kernel = mean + tf.multiply(std,epsilon)
    
    kernel = tf.reshape(kernel, shape=[batch_size,64,64,3])
    kernel = tf.nn.sigmoid(kernel)
    
    return img1_64, img2_64, kernel


In [92]:
def image_encoder(img1, batch_size):
    #Motion encoder
    img1_64 = tf.nn.max_pool(img1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID')

    #First convolution: 5x5x96
    weights = tf.Variable(tf.random_normal([5,5,3,96]))
    bias = tf.Variable(tf.zeros([96,]))
    logits = tf.nn.conv2d(img1, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.relu(logits)
    
    #Second convolution: 5x5x96
    weights = tf.Variable(tf.random_normal([5,5,96,64]))
    bias = tf.Variable(tf.zeros([64,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.max_pool(logits, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
    logits = tf.nn.relu(logits)
    
    #Third convolution: 5x5x128
    weights = tf.Variable(tf.random_normal([5,5,64,64])) # 128
    bias = tf.Variable(tf.zeros([64,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.max_pool(logits, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
    logits = tf.nn.relu(logits)
    
    #Forth convolution: 5x5x128
    weights = tf.Variable(tf.random_normal([5,5,64,32]))
    bias = tf.Variable(tf.zeros([32,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.relu(logits)
    
    return logits, img1_64

In [97]:
def motion_decoder(img):
    logits = tf.image.resize_images(img, size = [64,64])
    
    #First convolution: 5x5x128
    weights = tf.Variable(tf.random_normal([9,9,32,128]))
    bias = tf.Variable(tf.zeros([128,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.relu(logits)
    
    #Second convolution: 5x5x128
    weights = tf.Variable(tf.random_normal([1,1,128,128]))
    bias = tf.Variable(tf.zeros([128,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.relu(logits)
    
    #Third convolution: 5x5x128
    weights = tf.Variable(tf.random_normal([1,1,128,3]))
    bias = tf.Variable(tf.zeros([3,]))
    logits = tf.nn.conv2d(logits, filter=weights, strides=[1,1,1,1], padding='SAME')
    logits = tf.add(logits, bias)
    logits = tf.nn.sigmoid(logits) #64x64x3

In [98]:
def train(prediction, y):
    l2_loss = tf.reduce_mean(tf.square(prediction - y))
    optimizer = tf.train.AdamOptimizer(0.01)
    train = optimizer.minimize(l2_loss)
    
    return l2_loss, train

In [101]:
# run the model
def run(X, Y, n_epochs = 10, batch_size = 30):
    img1 = tf.placeholder(shape=(batch_size,128,128,3), dtype=tf.float32, name="s1s")
    img2 = tf.placeholder(shape=(batch_size,128,128,3), dtype=tf.float32)
    
    coded_img, img1_64 = image_encoder(img1, batch_size)
    output = motion_decoder(coded_img)
    loss, training = train(output, img1_64)
    
    with tf.Session() as sess:
        
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(n_epochs):
            
            print('Epoch %i/%i' % (epoch+1, n_epochs))
            cumulative_loss = 0.0
            
            for batch_num in range(int(X.shape[0]/batch_size)):
                # get x and y
                from_index = batch_size*batch_num
                to_index = batch_size*batch_num + batch_size if batch_size*(batch_num+1) < X.shape[0] else X.shape[0]
                x = X[from_index:to_index]
                y = Y[from_index:to_index]
                
                # run train and loss
                _, batch_loss = sess.run([training,loss], feed_dict={img1:x, img2:y})
                print("\t\tbatch_loss:", batch_loss)
                cumulative_loss += batch_loss
            
            print("\tEpoch's loss:", cumulative_loss)
        
        final = output.eval(feed_dict={img1:X_ORG[11:13],img2:Y_ORG[11:13]})[0]
        
        fig, axs = plt.subplots(2)
        axs[0].imshow(X_ORG[11])
        axs[1].imshow(final)

                

# TODO
1. Solve the nan problem
2. Integrate different sizes

In [102]:
run(X, Y, 10, 2)

ValueError: Dimensions must be equal, but are 3 and 32 for 'Conv2D_131' (op: 'Conv2D') with input shapes: [2,64,64,3], [9,9,32,128].