In [1]:
import tensorflow as tf 
import numpy as np
import cv2
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Tensorboard setup: each run writes its events to a separate directory whose
# name carries the UTC start time (YYYYmmddHHMMSS), so runs never overwrite
# each other.
root_logdir = "tf_logs"
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
logdir = root_logdir + "/run-" + now + "/"

In [3]:
# Convolution-pooling architecture; the list order is the order in which the
# layers are stacked.
#   convolution layer -> tuple (filter_height, filter_width, depth)
#   pooling layer     -> tuple (pooling type, kernel_size, strides)
_pool_2x2 = ('max', (1, 2, 2, 1), (1, 2, 2, 1))  # pooling spec shared by all three stages
layers = [
    (5, 5, 6), _pool_2x2,
    (5, 5, 16), _pool_2x2,
    (5, 5, 60), _pool_2x2,
]

def conv_pool(x, layers):
    """Stack convolution and pooling layers on top of `x`.

    Args:
        x: 4-D input; `x.shape[3]` is used as the input depth of the first
            convolution (presumably NHWC layout -- TODO confirm).
        layers: sequence of layer specs, applied in order. A convolution is a
            tuple `(filter_height, filter_width, depth)` whose first element is
            an int; a pooling layer is `('max' | 'avg', kernel_size, strides)`.

    Returns:
        The output of the last layer, or `x` itself when `layers` is empty.

    Raises:
        ValueError: if a spec is neither a convolution nor a 'max'/'avg'
            pooling layer. Such specs used to be skipped silently, which built
            a different network than the one described.
    """
    out = x
    n_conv, n_pool = 0, 0
    prev_depth = int(x.shape[3])
    for spec in layers:
        kind = spec[0]
        if isinstance(kind, int):
            # Convolution: stride 1, SAME padding, followed by ReLU.
            n_conv += 1
            filter_h, filter_w, depth = spec[0], spec[1], spec[2]
            # AUTO_REUSE makes a second call reuse the 'conv_<n>' variables
            # instead of failing on a duplicate name.
            with tf.variable_scope('conv_{}'.format(n_conv), reuse = tf.AUTO_REUSE):
                w = tf.get_variable(
                    'filter',
                    initializer=tf.truncated_normal((filter_h, filter_w, prev_depth, depth), 0, 0.1))
                b = tf.get_variable('bias', initializer=tf.zeros(depth))
                out = tf.nn.relu(tf.nn.conv2d(out, w, strides=(1,1,1,1), padding='SAME') + b)
            prev_depth = depth
        elif kind in ('max', 'avg'):
            # Pooling leaves the depth unchanged, so prev_depth is not touched.
            n_pool += 1
            pool = tf.nn.max_pool if kind == 'max' else tf.nn.avg_pool
            out = pool(out, spec[1], spec[2], padding='SAME', name='pool_{}'.format(n_pool))
        else:
            raise ValueError(
                "unsupported layer spec {!r}: expected (filter_height, filter_width, depth) "
                "or ('max'|'avg', kernel_size, strides)".format(spec))
    return out

def get_frames(path, n_frames, downscale_factor):
    """Read the leading frames of a video, each downscaled by a constant factor.

    Args:
        path: location of the video, passed to `cv2.VideoCapture`.
        n_frames: number of frames to collect. Reading stops earlier if the
            video runs out of frames. A value that a frame count can never
            equal (e.g. a negative number or None) reads the whole video.
        downscale_factor: divisor applied to the frame width and height.

    Returns:
        ndarray of shape (frames_read, height, width, channels).

    Raises:
        ValueError: if not a single frame could be read (missing/unreadable
            file, or `n_frames == 0`).
    """
    cap = cv2.VideoCapture(path)
    seq = []
    try:
        # `!=` rather than `<` keeps the "never matches -> read everything"
        # behaviour for negative / None n_frames.
        while len(seq) != n_frames:
            success, frame = cap.read()
            if not success:
                break
            # downscale frame; cv2.resize takes the target size as (width, height)
            width = int(frame.shape[1] / downscale_factor)
            height = int(frame.shape[0] / downscale_factor)
            seq.append(cv2.resize(frame, (width, height), interpolation = cv2.INTER_AREA))
    finally:
        # Always hand the capture handle back, even if decoding/resizing fails.
        cap.release()
    if not seq:
        raise ValueError("no frames could be read from {!r}".format(path))
    return np.stack(seq)

def next_batch(path, labels, n_batches, batch_size, n_frames, downscale_factor):
    """Mini-batch generator.

    Yields `n_batches` pairs `(x_batch, y_batch)`. Videos are addressed by a
    running index that is substituted into `path` via `str.format`; batch `i`
    covers indices `i*batch_size .. (i+1)*batch_size - 1`. `x_batch` is the
    stacked result of `get_frames` for those videos, `y_batch` is a plain list
    holding the matching entries of `labels`.
    """
    for batch_idx in range(n_batches):
        first = batch_idx * batch_size
        clips, targets = [], []
        for video_idx in range(first, first + batch_size):
            clips.append(get_frames(path.format(video_idx), n_frames, downscale_factor))
            targets.append(labels[video_idx])
        yield np.stack(clips), targets
        
def get_feature_maps(x):
    """Generate flattened feature maps for each video in a mini-batch.

    `x` has shape (batch_size, n_frames, height, width, channels). Every video
    (one slice along axis 0) is passed through `conv_pool` with the module-level
    `layers` spec, treating its frames as the batch dimension; the result is
    flattened per frame and the per-video tensors are stacked along axis 0.
    """
    per_video = [
        tf.contrib.layers.flatten(conv_pool(x[video_idx, :, :, :, :], layers))
        for video_idx in range(x.shape[0])
    ]
    return tf.stack(per_video, axis=0)

In [4]:
# Template for the input videos; next_batch fills '{}' with the video index.
path = '/home/ysqyang/Dropbox/BenchPress ({}).avi'

# Frame geometry before downscaling, and the factor each side is divided by.
height = 240
width = 320
channels = 3
downscale_factor = 2

n_frames = 50   # frames taken from each video
n_classes = 3   # size of the output layer
n_batches = 4
batch_size = 5
n_hidden = 200  # number of hidden cells in LSTM

# Graph inputs: one mini-batch of downscaled clips and one integer class id per clip.
X = tf.placeholder(dtype=tf.float32,
                   shape=(batch_size, n_frames, int(height/downscale_factor), int(width/downscale_factor), channels))
y = tf.placeholder(dtype=tf.int32, shape=(batch_size,))

# Random stand-in labels, one per video consumed by the training loop.
# randint's `high` bound is exclusive, so it has to be n_classes (it was 2,
# which never produced the last class); the count follows the batch layout
# instead of a hard-coded 20.
labels = np.random.randint(0, high=n_classes, size=n_batches * batch_size)

# Per-frame CNN features, stacked per video; the printed shape is
# (batch_size, n_frames, features_per_frame).
X_features = get_feature_maps(X)
print(X_features.shape)

# LSTM over the frame sequence, starting from an all-zero state.
cell = tf.contrib.rnn.BasicLSTMCell(n_hidden)
output, _ = tf.nn.dynamic_rnn(cell, X_features, initial_state = cell.zero_state(batch_size, dtype=tf.float32))

# Linear read-out of the last time step's LSTM output -> class logits.
with tf.variable_scope('out', reuse = tf.AUTO_REUSE):
    w = tf.get_variable('weight', shape=(n_hidden, n_classes))
    b = tf.get_variable('bias', initializer=tf.zeros(n_classes))
    pred = tf.matmul(output[:,-1,:], w) + b

# Mean softmax cross-entropy against the integer labels, minimized with Adam
# (default hyper-parameters).
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer()
training_op = optimizer.minimize(loss)

# Tensorboard: scalar loss summary plus a dump of the graph into this run's log dir.
loss_summary = tf.summary.scalar('loss', loss)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

Instructions for updating:
Use the retry module or similar alternatives.
(5, 50, 18000)


In [5]:
# Train for one pass over the mini-batches, logging the loss to Tensorboard and
# checkpointing after every batch.
saver = tf.train.Saver()
try:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Use the configured clip length / downscale factor (previously repeated
        # here as the literals 50 and 2) so the batches always match the shape
        # of the X placeholder.
        batches = next_batch(path, labels, n_batches, batch_size, n_frames, downscale_factor)
        for batch_num, (X_batch, y_batch) in enumerate(batches, start=1):
            print(X_batch.shape)
            feed = {X: X_batch, y: y_batch}
            # A single run both fetches the loss summary and applies the update,
            # instead of a separate forward pass for each.
            summary_str, _train_out = sess.run([loss_summary, training_op], feed_dict=feed)
            file_writer.add_summary(summary_str, batch_num)
            saver.save(sess, '/tmp/after_batch_{}.ckpt'.format(batch_num))
            # Logits and loss on the same batch after the update, fetched together.
            batch_pred, batch_loss = sess.run([pred, loss], feed_dict=feed)
            print(batch_pred)
            print(batch_loss)

        saver.save(sess, '/tmp/final.ckpt')
finally:
    # Flush and close the event file even if training fails part-way.
    file_writer.close()

(5, 50, 120, 160, 3)
[[ 0.29842168 -0.09643143  0.00508654]
 [ 0.5214707   0.47156107  0.1378365 ]
 [ 0.18558514  0.2355089   0.29197365]
 [ 0.42201093  0.23631538  0.06043892]
 [ 0.28713176  0.2371669  -0.12906916]]
1.0569654
(5, 50, 120, 160, 3)
[[ 1.1509163   0.4360936   0.43221557]
 [ 1.03426     0.4587123   0.29691675]
 [ 1.0846407   0.38448873  0.3850946 ]
 [ 1.0154705  -0.57733554  0.43001953]
 [ 0.89260465 -0.2639995   0.37984625]]
0.942766
(5, 50, 120, 160, 3)
[[0.80285484 0.48759004 0.6547006 ]
 [0.9006973  1.0073901  0.8847123 ]
 [0.7121032  0.75908434 0.5425482 ]
 [0.64985144 0.67178065 0.464167  ]
 [0.7529783  0.5797617  0.7808617 ]]
1.0583283
(5, 50, 120, 160, 3)
[[0.85404426 0.5370007  0.5224493 ]
 [0.81446964 0.5452881  0.52517176]
 [0.82173425 0.5437668  0.524672  ]
 [0.81446964 0.5452881  0.52517176]
 [0.8144697  0.545288   0.52517194]]
1.0765269
