In [1]:
import tensorflow as tf 
import numpy as np
import cv2
import matplotlib.pyplot as plt
from datetime import datetime

  from ._conv import register_converters as _register_converters


In [2]:
# TensorBoard setup: every run writes its event files into a separate
# sub-directory of `root_logdir`, named after the UTC time this cell ran.
root_logdir = "tf_logs"
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
logdir = root_logdir + "/run-" + now + "/"

In [3]:
# Convolution/pooling architecture: a list of layer specs whose index is the
# layer's position in the stack.
#   convolution layer -> tuple (filter_height, filter_width, depth)
#   pooling layer     -> tuple (pooling type, kernel_size, strides)
# Here: three 5x5 convolutions of depth 6, 16 and 60, each followed by a
# max-pooling layer with kernel (1,2,2,1) and strides (1,2,2,1).
layers = [
    spec
    for depth in (6, 16, 60)
    for spec in ((5, 5, depth), ('max', (1, 2, 2, 1), (1, 2, 2, 1)))
]

def conv_pool(x, layers):
    """Run an input through a stack of convolution and pooling layers.

    Args:
        x: 4-D input whose axis 3 holds the channel depth (it is read via
            ``x.shape[3]`` to size the first filter).
        layers: sequence of layer specs, applied in order.
            * convolution: ``(filter_height, filter_width, depth)`` -- the
              first element is an int;
            * pooling: ``(kind, kernel_size, strides)`` with ``kind`` equal to
              ``'max'`` or ``'avg'``.

    Returns:
        The output of the last layer, or ``x`` itself if ``layers`` is empty.

    Raises:
        ValueError: if a spec is neither a convolution nor a known pooling
            kind (such specs used to be skipped silently, producing a
            different network than the one requested).
    """
    out = x
    n_conv, n_pool = 0, 0
    prev_depth = int(x.shape[3])
    for spec in layers:
        kind = spec[0]
        if isinstance(kind, int):
            n_conv += 1
            # AUTO_REUSE: a later call to conv_pool fetches the existing
            # 'conv_<n>' variables instead of creating new ones, so repeated
            # calls share the same filters and biases.
            with tf.variable_scope('conv_{}'.format(n_conv), reuse = tf.AUTO_REUSE):
                w = tf.get_variable('filter', initializer=tf.truncated_normal((spec[0], spec[1], prev_depth, spec[2]),0,0.1))
                b = tf.get_variable('bias', initializer=tf.zeros(spec[2]))
                out = tf.nn.relu(tf.nn.conv2d(out, w, strides=(1,1,1,1), padding='SAME') + b)
            # the next convolution consumes this layer's output depth
            prev_depth = spec[2]
        elif kind == 'max':
            n_pool += 1
            out = tf.nn.max_pool(out, spec[1], spec[2], padding='SAME', name='pool_{}'.format(n_pool))
        elif kind == 'avg':
            n_pool += 1
            out = tf.nn.avg_pool(out, spec[1], spec[2], padding='SAME', name='pool_{}'.format(n_pool))
        else:
            raise ValueError(
                "unknown layer spec {!r}: expected (filter_height, filter_width, depth) "
                "or ('max'|'avg', kernel_size, strides)".format(spec))
    return out

# get the first n_frames frames of a video, each downscaled by a factor
# return an ndarray of shape (frames_read, height, width, channels)
def get_frames(path, n_frames, downscale_factor):
    """Read the leading frames of a video as one stacked, downscaled array.

    Args:
        path: location of the video, handed to ``cv2.VideoCapture``.
        n_frames: stop after this many frames; fewer are returned if the
            video ends first.
        downscale_factor: both frame dimensions are divided by this value
            (truncated to int) before resizing with ``cv2.INTER_AREA``.

    Returns:
        ``np.ndarray`` built with ``np.stack`` from the frames that were read,
        i.e. shape ``(frames_read, height, width, channels)``.

    Raises:
        ValueError: if not a single frame could be collected (for example the
            file could not be opened, or ``n_frames`` is 0).
    """
    cap = cv2.VideoCapture(path)
    seq = []
    try:
        # Check the limit *before* decoding, so no frame is read and thrown away.
        while len(seq) != n_frames:
            success, frame = cap.read()
            if not success:
                break
            # downscale frame
            width = int(frame.shape[1] / downscale_factor)
            height = int(frame.shape[0] / downscale_factor)
            resized = cv2.resize(frame, (width, height), interpolation = cv2.INTER_AREA)
            # Taller-than-wide frames get their two spatial axes swapped so
            # every frame ends up with height <= width.
            if resized.shape[0] > resized.shape[1]:
                resized = np.transpose(resized, (1,0,2))
            seq.append(resized)
    finally:
        # Always hand the capture back to OpenCV, even if resizing fails.
        cap.release()
    if not seq:
        raise ValueError("no frames could be read from {!r}".format(path))
    return np.stack(seq)

# mini-batch generator
def next_batch(path, labels, n_batches, batch_size, n_frames, downscale_factor, n_videos=300):
    """Yield ``n_batches`` random mini-batches of videos with their labels.

    A random permutation of ``range(n_videos)`` is drawn once per call and
    consumed ``batch_size`` entries at a time, so no video is repeated within
    one pass.

    Args:
        path: format string for the video files; ``path.format(k)`` is opened
            for the 1-based video number ``k``.
        labels: indexable collection; ``labels[k - 1]`` is the label of video
            number ``k``.
        n_batches: number of batches to yield.
        batch_size: number of videos per batch.
        n_frames: forwarded to ``get_frames``.
        downscale_factor: forwarded to ``get_frames``.
        n_videos: size of the pool the permutation is drawn from. Defaults to
            300, the value that used to be hard-coded.

    Yields:
        ``(x_batch, y_batch)`` where ``x_batch`` is the ``np.stack`` of the
        per-video frame arrays and ``y_batch`` is a list of the matching labels.

    Raises:
        IndexError: once ``n_batches * batch_size`` exceeds ``n_videos``.
    """
    perm = np.random.permutation(n_videos)
    for i in range(n_batches):
        x_batch, y_batch = [], []
        for j in range(batch_size):
            idx = perm[i * batch_size + j]
            # file names are numbered from 1, label rows from 0
            x_batch.append(get_frames(path.format(idx + 1), n_frames, downscale_factor))
            y_batch.append(labels[idx])
        yield np.stack(x_batch), y_batch
        
# turn every video of a mini-batch into a sequence of flattened feature maps
# x has shape (batch_size, n_frames, height, width, channels)
def get_feature_maps(x):
    """Apply the module-level ``layers`` stack to each video in ``x``.

    Each video ``x[i]`` is passed to ``conv_pool`` as one 4-D input, the
    result is flattened with ``tf.contrib.layers.flatten`` and the per-video
    results are stacked along a new leading axis.
    """
    per_video = [
        tf.contrib.layers.flatten(conv_pool(x[idx, :, :, :, :], layers))
        for idx in range(x.shape[0])
    ]
    return tf.stack(per_video, axis=0)

def score_to_label(scores, thresh_1, thresh_2):
    """Bucket scores into the class labels 0, 1 and 2, in place.

    An element becomes 0 if it is below ``thresh_1``, otherwise 1 if it is
    below ``thresh_2``, otherwise 2.

    Args:
        scores: NumPy array of scores. It is OVERWRITTEN with the labels and
            keeps its dtype (a float array therefore holds 0.0 / 1.0 / 2.0).
        thresh_1: upper bound (exclusive) of class 0.
        thresh_2: upper bound (exclusive) of class 1.

    Returns:
        The same ``scores`` object, now holding the labels.
    """
    # Evaluate both comparisons on the original values before writing back;
    # the nesting mirrors an if / elif / else chain.
    labels = np.where(scores < thresh_1, 0, np.where(scores < thresh_2, 1, 2))
    scores[...] = labels
    return scores

In [None]:
# Template for the video files; next_batch fills '{}' with a 1-based video number.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
path = '/home/mallesh/video-qoe-labeling_1/VQA-Deep-Learning/data/set1-4/trace_{}.mp4'

# Frame geometry used to size the input placeholder.
# NOTE(review): assumes every (landscape-oriented) frame is 1080x1920x3 -- confirm against the data.
height, width, n_channels = 1080, 1920, 3
downscale_factor = 8
n_frames = 100
n_classes = 3
# 30 batches of 10 videos = 300 videos per pass, matching the permutation of 300 drawn in next_batch
n_batches, batch_size = 30, 10
n_hidden = 100 # number of hidden cells in LSTM

# Input videos: (batch_size, n_frames, height/8, width/8, channels) = (10, 100, 135, 240, 3)
X = tf.placeholder(tf.float32, shape=
                   (batch_size, n_frames, int(height/downscale_factor), int(width/downscale_factor), n_channels))
# One integer class label per video
y = tf.placeholder(tf.int32, shape=(batch_size,))

# Scores from the text file are bucketed into classes: < 2 -> 0, < 3.8 -> 1, otherwise 2.
# NOTE(review): absolute, machine-specific path here as well.
labels = score_to_label(np.loadtxt('/home/mallesh/video-qoe-labeling_1/VQA-Deep-Learning/data/set1-4.txt'), 2, 3.8)

# Per-frame CNN features for every video, stacked per video;
# the printed shape is (batch_size, n_frames, features).
X_features = get_feature_maps(X)
print(X_features.shape)

# The LSTM consumes each video's sequence of frame features, starting from a zero state.
cell = tf.contrib.rnn.BasicLSTMCell(n_hidden)
output, _ = tf.nn.dynamic_rnn(cell, X_features, initial_state = cell.zero_state(batch_size, dtype=tf.float32))

# Linear read-out from the LSTM output at the last time step to n_classes logits.
with tf.variable_scope('out', reuse = tf.AUTO_REUSE):
    # no initializer is passed for 'weight', so get_variable falls back to its default
    w = tf.get_variable('weight', shape=(n_hidden, n_classes))
    b = tf.get_variable('bias', initializer=tf.zeros(n_classes))
    pred = tf.matmul(output[:,-1,:], w) + b

# Mean softmax cross-entropy between the logits and the integer labels, minimised with Adam (default settings).
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer()
training_op = optimizer.minimize(loss)
# TensorBoard: scalar summary of the loss; the writer is also given the default graph.
loss_summary = tf.summary.scalar('loss', loss)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

Instructions for updating:
Use the retry module or similar alternatives.
(10, 100, 30600)


In [None]:
# Training: one pass over the mini-batches produced by next_batch.
# For every batch the loss summary is logged, one optimisation step is taken,
# a checkpoint is written, and the post-update logits and loss are printed.
saver = tf.train.Saver()
try:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch_num = 0
        for X_batch, y_batch in next_batch(path, labels, n_batches, batch_size, n_frames, downscale_factor):
            print(X_batch.shape)
            batch_num += 1
            feed = {X: X_batch, y: y_batch}

            # One graph execution yields both the loss summary and the weight
            # update; the summary comes from the same forward pass that
            # produces the gradients, instead of a separate run beforehand.
            summary_str, _ = sess.run([loss_summary, training_op], feed_dict=feed)
            file_writer.add_summary(summary_str, batch_num)
            saver.save(sess, '/tmp/after_batch_{}.ckpt'.format(batch_num))

            # One more execution gives logits and loss on the same batch
            # *after* the update (previously two separate forward passes).
            pred_val, loss_val = sess.run([pred, loss], feed_dict=feed)
            print(pred_val)
            print(loss_val)

        saver.save(sess, '/tmp/final.ckpt')
finally:
    # Flush and close the event file even if training is interrupted.
    file_writer.close()

(10, 100, 135, 240, 3)
[[ 0.4174981  -1.0018234  -0.6342016 ]
 [ 0.6103216  -0.51104087  0.01119593]
 [ 0.40747958 -0.27480763 -0.74518925]
 [ 0.20012076 -0.6476899   0.07149868]
 [ 0.45678324 -0.28826642 -0.25301382]
 [ 0.54582065 -0.24579637 -0.26788443]
 [ 0.6002198  -0.05261083 -0.03244453]
 [ 0.54348373 -0.9414733  -1.0088395 ]
 [ 0.22715957 -0.23753387 -0.16339573]
 [ 0.1973783  -0.64249223 -0.4302865 ]]
1.1603103
(10, 100, 135, 240, 3)
[[ 0.19832787  0.8647749   0.37825832]
 [ 0.2931341   1.0047479   0.19141704]
 [ 0.2007918   0.88519174  0.42430237]
 [ 0.4827534   0.4256199   0.44925913]
 [ 0.28557256  0.85731757  0.32078674]
 [ 0.31966028  1.0215192   0.22625132]
 [ 0.3346729   0.5464126  -0.07177597]
 [ 0.10489751  0.7356182   0.10347681]
 [ 0.283435    0.84828556  0.3380723 ]
 [ 0.25059026  1.0959179   0.0278153 ]]
0.99492186
(10, 100, 135, 240, 3)
[[ 0.14351866  1.2301936   0.6916934 ]
 [ 0.14351866  1.2301936   0.6916934 ]
 [-0.11161161  0.8429192   0.2947703 ]
 [ 0.182602