TODO:
- Figure out how to deal with epochs
- Ensemble - figure out how to do this
- Checkpoints
- How to involve test-set
- Early-stopping

[03/06/2017] I have made the following changes to my code:
- Added steps to save to and restore from a checkpoint
- Added a global step that keeps track of how many iterations have been run in total (even if the model is paused and restored)

I now need to:
- Add global step to the graph
- Figure out how to deal with epochs
- Figure out which checkpoint is restored - it seems that by default the file 'checkpoint' points to the latest version. Also, by default all variables are stored (this seems to be the recommended behaviour).

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
import os

In [3]:
# Model / input dimensions.
# EMBEDDING_DIM is the width of each embedding vector; every CSV row holds
# 2 * N_FEATURES token ids (plus two lengths and a label).
EMBEDDING_DIM, N_FEATURES = 300, 30

# Number of examples dequeued per training / validation step.
BATCH_SIZE = 2 ** 11

# Example counts used to derive the number of batches per epoch.
TRAIN_SIZE, VAL_SIZE = 727722, 80858

In [4]:
# Randomly sampled hyper-parameters (unseeded, so they differ between runs).
# The draw order is: LSTM size, dense size, LSTM drop rate, dense drop rate.
num_lstm, num_dense = (
    np.random.randint(low, high) for low, high in ((175, 275), (100, 150))
)
# Drop rates are sampled uniformly from [0.15, 0.40).
rate_drop_lstm, rate_drop_dense = (
    0.15 + np.random.rand() * 0.25 for _ in range(2)
)

In [5]:
# Load the embedding matrix from disk.
# NOTE: unpickling can execute arbitrary code, so 'embeddings.pkl' must come
# from a trusted source.
with open('embeddings.pkl', mode='rb') as embeddings_file:
    embedding_matrix = pickle.load(embeddings_file)

In [6]:
# Vocabulary size: one row of the embedding matrix per word id.
nb_words = embedding_matrix.shape[0]

# Settings that are not referenced by any other code visible in this notebook.
act, re_weight = 'relu', True

In [7]:
# Run identifier that encodes the sampled hyper-parameters.
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (
    num_lstm,
    num_dense,
    rate_drop_lstm,
    rate_drop_dense,
)

In [8]:
def build_queues(filename,batch_size=BATCH_SIZE):
    """Create a shuffling example queue fed line-by-line from a CSV file.

    Every CSV line (the header line is skipped) is parsed as
    2*N_FEATURES + 3 integer columns: the first 2*N_FEATURES form the
    sequence vector, the next two form the length vector and the last one
    is the label.  A single-threaded queue runner that keeps the queue
    filled is registered with the default graph.

    Args:
        filename: path of the CSV file to read.
        batch_size: only used to size the queue (capacity is 20 batches,
            at least 10 batches stay queued to keep the shuffle effective).

    Returns:
        The tf.RandomShuffleQueue holding (seq, length, label) examples.
    """
    n_token_columns = 2 * N_FEATURES

    filename_q = tf.train.string_input_producer([filename])
    line_reader = tf.TextLineReader(skip_header_lines=1)
    _, csv_line = line_reader.read(filename_q)

    # Integer defaults make decode_csv parse every column as an integer.
    columns = tf.decode_csv(
        csv_line,
        record_defaults=[[1] for _ in range(n_token_columns + 3)],
    )

    seq = tf.stack(columns[:n_token_columns])
    length = tf.stack(columns[n_token_columns:-1])
    label = columns[-1]
    example = [seq, length, label]

    q = tf.RandomShuffleQueue(
        capacity=20 * batch_size,
        min_after_dequeue=10 * batch_size,
        dtypes=[component.dtype for component in example],
        shapes=[component.shape for component in example],
    )

    # One enqueue op means one background thread fills the queue.
    runner = tf.train.QueueRunner(q, [q.enqueue(example)])
    tf.train.add_queue_runner(runner)

    return q

def batch_generator(batch_size):
    """Build a batch source that can switch between train and validation data.

    Args:
        batch_size: number of examples dequeued per batch.

    Returns:
        (data_batch, length_batch, label_batch, select_q) where select_q is
        an int32 placeholder choosing the source queue: 0 selects
        'seqs_train.csv', 1 selects 'seqs_val.csv'.
    """
    select_q = tf.placeholder(tf.int32,name='select_q')

    # Queue order defines the meaning of select_q: index 0 = train, 1 = val.
    source_queues = []
    for split_name in ('train', 'val'):
        source_queues.append(build_queues('seqs_%s.csv'%split_name, batch_size))

    switchable_q = tf.QueueBase.from_list(select_q, source_queues)

    batch = switchable_q.dequeue_many(batch_size)
    data_batch, length_batch, label_batch = batch
    return data_batch, length_batch, label_batch, select_q


In [24]:
def get_test_input(test_size, filename='test'):
    """Read up to `test_size` CSV rows in one go for test-time inference.

    Rows use the same layout as the training files: 2*N_FEATURES token
    columns, two length columns and a final label column, all integers.

    Args:
        test_size: maximum number of rows returned per evaluation.
        filename: CSV file to read (defaults to the original hard-coded
            name 'test').

    Returns:
        (seq, length, label) with shapes (rows, 2*N_FEATURES), (rows, 2)
        and (rows,), i.e. one example per row like a dequeued training
        batch.
    """
    file_q = tf.train.string_input_producer([filename])

    reader = tf.TextLineReader(skip_header_lines=1)
    _, value = reader.read_up_to(file_q, test_size)

    record_defaults = [[1] for _ in range(2*N_FEATURES+3)]

    # `value` is a vector of lines, so every decoded column is itself a
    # vector with one entry per row.
    content = tf.decode_csv(value, record_defaults=record_defaults)

    # Stack the column vectors along axis 1 so that rows stay on axis 0;
    # stacking on the default axis 0 would give the transposed
    # (columns, rows) layout.
    seq = tf.stack(content[:2*N_FEATURES], axis=1)

    length = tf.stack(content[2*N_FEATURES:-1], axis=1)

    label = content[-1]

    return seq, length, label

In [25]:
def batch_normalize(X,is_tr,eps=1e-5,momentum=0.9):
    """Batch-normalise `X` over axis 0 and track running moments.

    Variables 'rm', 'rv', 'gamma' and 'beta' are created in the current
    variable scope, so each call needs its own scope.

    Args:
        X: tensor whose last dimension is statically known.
        is_tr: scalar bool tensor; True normalises with the statistics of
            the current batch, False with the stored running moments.
        eps: variance epsilon passed to tf.nn.batch_normalization.
        momentum: weight of the old running value in the moving average.

    Returns:
        The normalised tensor.  The ops that refresh the running moments
        are registered in tf.GraphKeys.UPDATE_OPS and only execute when
        something depends on that collection (e.g. the train op).
    """
    moments_shape = (1,X.get_shape()[-1].value)
    running_mean = tf.get_variable('rm',shape=moments_shape,trainable=False,
                                   initializer=tf.zeros_initializer())
    running_var = tf.get_variable('rv',shape=moments_shape,trainable=False,
                                  initializer=tf.ones_initializer())

    mu,var = tf.nn.moments(X,axes=[0],keep_dims=True)

    gamma = tf.get_variable('gamma',shape=moments_shape)
    beta = tf.get_variable('beta',shape=moments_shape)

    # Moments actually used for normalisation in the current mode.
    mean = tf.cond(is_tr,lambda:mu,lambda:running_mean)
    variance = tf.cond(is_tr,lambda:var,lambda:running_var)

    # NOTE(review): tf.nn.batch_normalization takes (offset, scale) in that
    # order, so 'gamma' acts as the offset and 'beta' as the scale here.
    # The order is kept as-is so existing variables keep their role; confirm
    # whether the names should be swapped.
    bn = tf.nn.batch_normalization(
            X,
            mean,
            variance,
            gamma,
            beta,
            eps
        )

    # Register the moving-average updates unconditionally instead of inside
    # a tf.cond whose result is discarded.  Outside training `mean` and
    # `variance` are the running moments themselves, so the update leaves
    # them (numerically) unchanged.
    update_moments(running_mean,running_var,mean,variance,momentum)

    return bn

def update_moments(running_mean,running_var,mu,var,momentum):
    """Register exponential-moving-average updates of the running moments.

    Args:
        running_mean, running_var: variables holding the running moments.
        mu, var: new moments to blend in.
        momentum: weight kept for the old running value.

    Returns:
        A tf.no_op(); the assign ops are exposed via the
        tf.GraphKeys.UPDATE_OPS collection.
    """
    old_weight = momentum
    new_weight = 1-momentum
    # tf.add_to_collection expects (collection_name, value).
    tf.add_to_collection(
        tf.GraphKeys.UPDATE_OPS,
        tf.assign(running_mean,old_weight*running_mean + new_weight*mu)
        )
    tf.add_to_collection(
        tf.GraphKeys.UPDATE_OPS,
        tf.assign(running_var, old_weight*running_var + new_weight*var)
        )
    return tf.no_op()

In [26]:
def reset_graph():
    """Close a leftover global `sess` (if any) and reset the default graph."""
    leftover_session = globals().get('sess')
    if leftover_session:
        leftover_session.close()
    tf.reset_default_graph()

In [29]:
def build_graph(
    vocab_size = nb_words,
    state_size = num_lstm,
    embed_size = EMBEDDING_DIM,
    fc_size = num_dense,
    batch_size = BATCH_SIZE,
    num_classes = 2,
    lr = 1e-4,
    test_size = None,
):
    """Build the siamese LSTM classifier graph and return its handles.

    Each example holds two token sequences side by side.  Both halves are
    run through one shared LSTM, the last valid output of each half is
    concatenated, and the result goes through batch-norm, a dense layer,
    batch-norm again and a softmax layer.

    Args:
        vocab_size: number of rows of the embedding variable.
        state_size: LSTM state size.
        embed_size: embedding width.
        fc_size: width of the dense layer.
        batch_size: examples dequeued per step from the train/val queues.
        num_classes: number of output classes.
        lr: Adam learning rate.
        test_size: when None (default) the graph reads from the switchable
            train/validation queues.  When set, the graph reads up to
            `test_size` rows from get_test_input instead.

    Returns:
        dict of placeholders, tensors and ops: 'select_q',
        'rate_drop_dense', 'rate_drop_lstm', 'loss', 'ts', 'preds',
        'accuracy', 'is_train', 'embeddings', 'global_step', 'y'.
    """
    reset_graph()

    is_tr = tf.placeholder(tf.bool)

    # The input source is chosen when the graph is built.  A run-time
    # tf.cond switch would need callables and a test flag exposed to
    # callers, and the two sources return different numbers of values.
    if test_size is None:
        # Use the placeholder created by batch_generator: it is the one
        # actually wired to the train/val queue switch.
        data_batch, length_batch, y, select_q = batch_generator(batch_size)
    else:
        # NOTE(review): this path assumes get_test_input yields tensors laid
        # out one example per row, like a dequeued batch - confirm.
        data_batch, length_batch, y = get_test_input(test_size)
        # No queue to select in test mode; kept so the returned dict always
        # has the same keys.
        select_q = tf.placeholder(tf.int32, name='select_q')

    # Move the second sequence of every pair below the first one so both
    # share the same LSTM.
    X,seqlens = (tf.concat(tf.split(orig,2,axis=1),axis=0) for orig in
                   [data_batch, length_batch])

    seqlens = tf.reshape(seqlens,[-1])

    rate_drop_dense = tf.placeholder(tf.float32)
    # NOTE(review): accepted as a feed but not applied anywhere yet.
    rate_drop_lstm = tf.placeholder(tf.float32)

    global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    with tf.variable_scope('embed'):

        embeddings = tf.get_variable('embeddings',
                                    (vocab_size, embed_size),
                                    dtype=tf.float32,
                                    trainable=False)
        rnn_inputs = tf.nn.embedding_lookup(embeddings, X)

    with tf.variable_scope('lstm'):
        cell = tf.contrib.rnn.LSTMCell(state_size)
        rnn_outputs, _ = tf.nn.dynamic_rnn(
                                           cell=cell,
                                           inputs=rnn_inputs,
                                           dtype=tf.float32,
                                           sequence_length = seqlens)

        # tf.nn.dropout takes the probability of KEEPING a unit, while the
        # fed value is a drop rate.
        rnn_outputs = tf.nn.dropout(rnn_outputs, 1.0 - rate_drop_dense)

        # Flat index of the last valid time step of every sequence.  The
        # row count is read from the tensor so it is right for any input
        # source.
        # NOTE(review): a sequence length of 0 would index the step before
        # the start of its own row.
        out_shape = tf.shape(rnn_outputs)
        idx = tf.range(out_shape[0])*out_shape[1] + (seqlens - 1)
        last_rnn_outputs = tf.gather(tf.reshape(rnn_outputs,
                                               [-1, state_size]), idx)
        # Put the two halves of each pair back side by side.
        split = tf.split(last_rnn_outputs,2)
        merged = tf.concat(split,axis=1)

    with tf.variable_scope('bn1'):
        merged = batch_normalize(merged,is_tr)

    with tf.variable_scope('fc1'):
        W1 = tf.get_variable('W1',shape=[2*state_size,fc_size])
        b1 = tf.get_variable('b1',shape=[fc_size])

        merged = tf.nn.xw_plus_b(merged,W1,b1)
        merged = tf.nn.dropout(merged, 1.0 - rate_drop_dense)

    with tf.variable_scope('bn2'):
        merged = batch_normalize(merged,is_tr)

    with tf.variable_scope('softmax'):
        W_out = tf.get_variable('W_out', [fc_size, num_classes])
        b_out = tf.get_variable('b_out', [num_classes],
            initializer=tf.constant_initializer(0.0))
        logits = tf.nn.xw_plus_b(merged,W_out,b_out)

        preds = tf.nn.softmax(logits)
        correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32),y)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=y)
        )

    # Run whatever is registered in UPDATE_OPS together with each train step.
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(extra_update_ops):
        train_step = tf.train.AdamOptimizer(lr).minimize(loss,
                                                         global_step=global_step)

    return {
        'select_q':select_q,
        'rate_drop_dense' : rate_drop_dense,
        'rate_drop_lstm' : rate_drop_lstm,
        'loss' : loss,
        'ts' : train_step,
        'preds' : preds,
        'accuracy' : accuracy,
        'is_train':is_tr,
        'embeddings':embeddings,
        'global_step':global_step,
        'y':y
    }

In [28]:
import timeit

In [13]:
def evaluate(sess,fetch,feed,batches_per_epoch):
    """Average two fetched metrics over `batches_per_epoch` session runs.

    Args:
        sess: object whose run(fetch, feed_dict=...) returns two values.
        fetch: passed through to sess.run; callers pass [accuracy, loss].
        feed: feed dict used for every run.
        batches_per_epoch: number of runs to average over.

    Returns:
        (mean of the first value, mean of the second value).
    """
    acc_total, loss_total = 0, 0
    for _ in range(batches_per_epoch):
        batch_acc, batch_loss = sess.run(fetch, feed_dict=feed)
        acc_total = acc_total + batch_acc
        loss_total = loss_total + batch_loss

    mean_acc = acc_total / batches_per_epoch
    mean_loss = loss_total / batches_per_epoch
    return mean_acc, mean_loss

In [14]:
def train_graph(graph,
                filename,
                dir_,
                train_size = TRAIN_SIZE,
                val_size = VAL_SIZE,
                batch_size = BATCH_SIZE,
                num_epochs = 10, 
                loss_every=50):
    """Train the graph and evaluate on the validation queue after each epoch.

    Args:
        graph: dict of handles as returned by build_graph.
        filename: checkpoint file name (only used by the checkpoint code,
            which is currently commented out).
        dir_: sub-directory of 'checkpoints/' for this run.
        train_size: number of training examples.
        val_size: number of validation examples.
        batch_size: examples per step; with the sizes above it fixes the
            number of steps per epoch.
        num_epochs: number of epochs, counted in global steps.
        loss_every: print the running average loss every this many steps.

    Returns:
        (tr_losses, te_losses, tr_accs, te_accs): per-epoch averages for
        training and validation.
    """
    batches_per_epoch = {'train':int(np.round(train_size/batch_size)),
                         'test':int(np.round(val_size/batch_size))}
    
    g = graph
    
    path = 'checkpoints/'+ dir_+'/%s'
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
#         saver = tf.train.Saver()
#         ckpt = tf.train.get_checkpoint_state(os.path.dirname(path%'checkpoint'))
#         if ckpt and ckpt.model_checkpoint_path:
#             saver.restore(sess, ckpt.model_checkpoint_path)
                
        initial_step = g['global_step'].eval()
        
        print(initial_step)
        
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        accuracy, loss = 0, 0
        
        tr_accs, te_accs = [], []
        tr_losses, te_losses = [], []
        
        current_epoch = 0
        
        # Steps accumulated since the running sums were last reset.
        skip_step = 0
        
        try:
            for step in range(initial_step+1,batches_per_epoch['train']*num_epochs+1):
                # Epoch and logging boundaries are counted relative to the
                # step this run started from, so a restored run averages
                # only over steps it actually executed.
                rel_step = step-initial_step 
                skip_step += 1
                
                feed = {g['select_q']:0,
                        g['is_train']:True, 
                        g['embeddings']:embedding_matrix,
                        g['rate_drop_dense']:rate_drop_dense,
                        g['rate_drop_lstm']:rate_drop_lstm }
                accuracy_, _,loss_ = sess.run([g['accuracy'], 
                                               g['ts'],g['loss']], 
                                             feed_dict=feed)
                accuracy += accuracy_
                loss += loss_
                
                if (rel_step%loss_every) == 0: 
                    step_string= '%d step%s'%(int(step),'s'*(int(step)>1))
                    print('Average loss at %s = %f'%(step_string,loss / skip_step))
#                     saver.save(sess, path%filename, step)
                
                
                if (rel_step%batches_per_epoch['train']) == 0:
                    current_epoch += 1
                    # The sums are reset every epoch, so divide by the steps
                    # of this epoch, not by all steps since the start.
                    tr_accs.append(accuracy / skip_step)
                    tr_losses.append(loss / skip_step)
                    loss, accuracy = 0, 0
                    skip_step = 0
                    
                    fetch = [g['accuracy'],g['loss']]
                    # NOTE(review): validation runs with is_train=True and
                    # the training drop rates - confirm this is intended.
                    feed = {g['select_q']:1, 
                            g['is_train']:True, 
                            g['embeddings']:embedding_matrix,
                            g['rate_drop_dense']:rate_drop_dense,
                            g['rate_drop_lstm']:rate_drop_lstm }

                    avg_val_acc, avg_val_loss = evaluate(sess,fetch,feed,batches_per_epoch['test'])
                    te_accs.append(avg_val_acc)
                    te_losses.append(avg_val_loss)
                    print("Accuracy after epoch", current_epoch, " - tr:", 
                          tr_accs[-1], "- te:", te_accs[-1])
        finally:
            # Always stop the queue-runner threads, even when a step fails
            # (e.g. an input file is missing), so they do not linger.
            coord.request_stop()
            coord.join(threads)
        
    return tr_losses, te_losses, tr_accs, te_accs

In [15]:
# Build the model graph with the default arguments, i.e. the hyper-parameters
# sampled at the top of the notebook.
graph = build_graph()

In [33]:
# Number of data rows in the training CSV (displayed as the cell output).
len(pd.read_csv('seqs_train.csv'))

FileNotFoundError: File b'seqs_train.csv' does not exist

In [23]:
import timeit
# Time a full training run: 10 epochs, average loss printed every 50 steps.
start = timeit.default_timer()
tr_losses, te_losses, tr_accs, te_accs = train_graph(graph,'lstm','lstm_v',
                                                     num_epochs = 10,
                                                     loss_every=50)
# Elapsed wall-clock time in seconds.
print(timeit.default_timer()-start)

INFO:tensorflow:Restoring parameters from checkpoints/lstm_v/lstm-700


INFO:tensorflow:Restoring parameters from checkpoints/lstm_v/lstm-700


700
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.NotFoundError'>, seqs_train.csv
	 [[Node: ReaderReadV2 = ReaderReadV2[_device="/job:localhost/replica:0/task:0/cpu:0"](TextLineReaderV2, input_producer)]]

Caused by op 'ReaderReadV2', defined at:
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/traitlets/config/application.py", line 592, in launch_instance
    app.start()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 405, in start
    ioloop.IOLoop.instance().start

INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.NotFoundError'>, seqs_train.csv
	 [[Node: ReaderReadV2 = ReaderReadV2[_device="/job:localhost/replica:0/task:0/cpu:0"](TextLineReaderV2, input_producer)]]

Caused by op 'ReaderReadV2', defined at:
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/traitlets/config/application.py", line 592, in launch_instance
    app.start()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 405, in start
    ioloop.IOLoop.instance().start()
 

OutOfRangeError: RandomShuffleQueue '_13_random_shuffle_queue' is closed and has insufficient elements (requested 2048, current size 0)
	 [[Node: Gather_DequeueMany = QueueDequeueManyV2[component_types=[DT_INT32, DT_INT32, DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](Gather, Gather_DequeueMany/n)]]

Caused by op 'Gather_DequeueMany', defined at:
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/traitlets/config/application.py", line 592, in launch_instance
    app.start()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 405, in start
    ioloop.IOLoop.instance().start()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tornado/ioloop.py", line 827, in start
    self._run_callback(callback)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tornado/ioloop.py", line 600, in _run_callback
    ret = callback()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 242, in enter_eventloop
    self.eventloop(self)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/eventloops.py", line 241, in loop_cocoa
    show.mainloop()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/matplotlib/backends/backend_macosx.py", line 26, in mainloop
    _macosx.show()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/matplotlib/backend_bases.py", line 1311, in _on_timer
    ret = func(*args, **kwargs)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/eventloops.py", line 218, in doi
    kernel.do_one_iteration()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 275, in do_one_iteration
    stream.flush(zmq.POLLIN, 1)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 352, in flush
    self._handle_recv()
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 260, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 212, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 370, in execute_request
    user_expressions, allow_stdin)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 175, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2902, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 3006, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-cf9f68c3d8b9>", line 1, in <module>
    graph = build_graph()
  File "<ipython-input-11-6fea49f57213>", line 14, in build_graph
    data_batch, length_batch, y, select_q = batch_generator(batch_size)
  File "<ipython-input-8-dde9e1a413e0>", line 43, in batch_generator
    data_batch, length_batch, label_batch = q.dequeue_many(batch_size)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tensorflow/python/ops/data_flow_ops.py", line 458, in dequeue_many
    self._queue_ref, n=n, component_types=self._dtypes, name=name)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 1328, in _queue_dequeue_many_v2
    timeout_ms=timeout_ms, name=name)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/user/anaconda/envs/tensorflow3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

OutOfRangeError (see above for traceback): RandomShuffleQueue '_13_random_shuffle_queue' is closed and has insufficient elements (requested 2048, current size 0)
	 [[Node: Gather_DequeueMany = QueueDequeueManyV2[component_types=[DT_INT32, DT_INT32, DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](Gather, Gather_DequeueMany/n)]]
