In [1]:
# Working with TF commit 24466c2e6d32621cd85f0a78d47df6eed2c5c5a6

import math

import numpy as np
import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.layers import safe_embedding_lookup_sparse as embedding_lookup_unique
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell

import helpers


In [2]:

class Seq2SeqNetwork():
    ''' 
     Basic unidirectional seq2seq model with no attention
     For now we assume all sequences are of the same length
     This is a simplification offered to us by the Fibonacci task
    '''
    PAD = 0
    EOS = 1
    EOS_SLICE = tf.ones([1, self.batch_size], dtype=tf.int32) * EOS
    PAD_SLICE = tf.ones([1, self.batch_size], dtype=tf.int32) * PAD
    
    
    def __init__(self, model_config, data_config):
        
        self.encoder_cell = model_config.encoder_cell
        self.decoder_cell = model_config.decoder_cell
        
        self.sequence_length = data_config.sequence_length
        self.embedding_size = model_config.embedding_size
        self.alphabet_size = data_config.alphabet_size
        
        self._make_graph()
        
        
    @property
    def decoder_hidden_units(self):
        return self.decoder_cell.output_size
    
    
    def _set_target(self, target):
        """
        During training, `decoder_targets`
        and decoder logits. This means that their shapes should be compatible.

        Here we do a bit of plumbing to set this up.
        """
        target = tf.placeholder()
        
        with tf.name_scope('DecoderTrainFeeds'):
            
            # Put the end of sentence token on top of all the decoder_targets
            # For the copying task these are the reverted inputs following EOS
            decoder_input = tf.concat([EOS_SLICE, target], axis=0)

            decoder_target = tf.concat([target, PAD_SLICE], axis=0)
            
            eos_mask = tf.one_hot(self.sequence_length,
                                  self.sequence_length + 1,
                                  on_value = EOS, 
                                  off_value = PAD,
                                  dtype = tf.int32)
            
            eos_mask = tf.transpose(eos_mask, [1, 0])

            # hacky way using one_hot to put EOS symbol at the end of target sequence
            decoder_target = tf.add(decoder_target, eos_mask)

            loss_weights = tf.ones(shape=(self.batch_size, self.sequence_length), 
                                   dtype=tf.float32, 
                                   name="loss_weights")
            
            self.decoder_input = decoder_input
            self.decoder_train_target = decoder_train_target
            self.loss_weights = loss_weights
             
            
    def _set_decoder():
        
        with tf.variable_scope("Decoder") as scope:
            
            def output_fn(outputs):
                return tf.contrib.layers.linear(outputs, self.alphabet_size, scope=scope)
            
            
            decode = seq2seq.simple_decoder_fn_train(encoder_state = self.encoder_state)
            sample = seq2seq.simple_decoder_fn_inference(output_fn=output_fn,
                                                        encoder_state = self.encoder_state,
                                                        embeddings = self.embedding_matrix,
                                                        start_of_sequence_id = self.EOS
                                                        end_of_sequence_id = self.EOS, 
                                                        maximum_length = tf.reduce_max(self.encoder_inputs) + 3,
                                                        num_decoder_symbols = self.alphabet_size)
        
            decoder_train = (
        
                 seq2seq.dynamic_rnn_decoder(cell = self.decoder_cell,
                                        decoder_fn = decode,
                                        inputs = self.decoder_embedding
                                        sequence_length = self.sequence_length,
                                        time_major = True,
                                        scope = scope)
            )
        
            self.decoder_output_train = decoder_train[0]
            self.decoder_state_train = decoder_train[1]
            self.decoder_context_state_train = decoder_train[2]
            
            self.decoder_logits_train = output_fn(self.decoder_output)
            self.decoder_prediction_train = tf.argmax(self.decoder_logits,
                                                axis = -1,
                                                name = 'decoder_prediction')
            # explain
            scope.reuse_variables()
            
            decoder_infer = (
                
                seq2seq.dynamic_rnn_decoder(cell = self.decoder_cell,
                                            decoder_fn = sample,
                                            time_major = True,
                                            scope = scope)
                
            )
            
            self.decoder_logits_inference = decoder_infer[0]
            self.decoder_state_inference = decoder_infer[1]
            self.decoder_context_state_inference = decoder_infer[2]
            
            self.decoder_prediction_inference = tf.argmax(self.decoder_logits_inference,
                                                          axis = -1,
                                                          name = 'decoder_prediction_inference')
            
    
    def _init_optimizer(self):
        logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
        targets = tf.transpose(self.decoder_train_target, [1, 0])
        self.loss = seq2seq.sequence_loss(logits = logits,
                                          targets = targets,
                                          weights = self.loss_weights)
        
        self.train_op = tf.train.AdamOptimizer().minimize(self.loss)
        
        
    def make_train_inputs(self, input_seq, target_seq):
        inputs_, inputs_length_ = helpers.batch(input_seq)
        targets_, targets_length_ = helpers.batch(target_seq)
        return {
            self.encoder_inputs: inputs_,
            self.encoder_inputs_length: inputs_length_,
            self.decoder_targets: targets_,
            self.decoder_targets_length: targets_length_,
        }

    def make_inference_inputs(self, input_seq):
        inputs_, inputs_length_ = helpers.batch(input_seq)
        return {
            self.encoder_inputs: inputs_,
            self.encoder_inputs_length: inputs_length_,
        }


    def _set_encoder():
        
        with tf.variable_scope('Encoder') as scope:
            (self.encoder_output, self.encoder_state) = (
                    
                tf.nn.dynamic_rnn(cell = self.encoder_cell,
                                  inputs = self.encoder_embedding,
                                  sequence_length = self.sequence_length,
                                  time_major = True,
                                  dtype = tf.float32)
            )
            
    def _set_embeddings(self):
        with tf.variable_scope("embedding") as scope:

            # Uniform(-sqrt(3), sqrt(3)) has variance=1.
            sqrt3 = math.sqrt(3)
            initializer = tf.random_uniform_initializer(-sqrt3, sqrt3)

            self.embedding_matrix = tf.get_variable(name="embedding_matrix",
                                        shape=(self.alphabet_size, self.embedding_size),
                                        initializer=initializer,
                                        dtype=tf.float32)

            self.encoder_embedded = tf.nn.embedding_lookup(self.embedding_matrix, 
                                                           self.encoder_inputs)

            self.decoder_embedded = tf.nn.embedding_lookup(self.embedding_matrix,
                                                           self.decoder_inputs)

            
    def _make_graph(self):
        
        # init placeholders
        self.input = tf.placeholder(shape = (self.sequence_length, self.batch_size),
                                            dtype = tf.int32,
                                            name  = 'encoder_inputs')
        
        self.target = tf.placeholder(shape = (self.sequence_length, self.batch_size),
                                             dtype = tf.int32,
                                             name = 'decoder_target')
        
        self._set_embeddings()
        
        self._set_encoder()
        
        self._set_decoder()
        
        self._set_optimizer()
        

IndentationError: expected an indented block (<ipython-input-2-0e532c7d4a60>, line 70)

In [None]:
def make_seq2seq_model(**kwargs):
    args = dict(encoder_cell=LSTMCell(10),
                decoder_cell=LSTMCell(20),
                vocab_size=10,
                embedding_size=10,
                attention=True,
                bidirectional=True,
                debug=False)
    args.update(kwargs)
    return Seq2SeqModel(**args)



In [None]:
def train_on_copy_task(session, model,
                       length_from=3, length_to=8,
                       vocab_lower=2, vocab_upper=10,
                       batch_size=100,
                       max_batches=5000,
                       batches_in_epoch=1000,
                       verbose=True):

    batches = helpers.random_sequences(length_from=length_from, length_to=length_to,
                                       vocab_lower=vocab_lower, vocab_upper=vocab_upper,
                                       batch_size=batch_size)
    loss_track = []
    try:
        for batch in range(max_batches+1):
            batch_data = next(batches)
            fd = model.make_train_inputs(batch_data, batch_data)
            _, l = session.run([model.train_op, model.loss], fd)
            loss_track.append(l)

            if verbose:
                if batch == 0 or batch % batches_in_epoch == 0:
                    print('batch {}'.format(batch))
                    print('  minibatch loss: {}'.format(session.run(model.loss, fd)))
                    for i, (e_in, dt_pred) in enumerate(zip(
                            fd[model.encoder_inputs].T,
                            session.run(model.decoder_prediction_train, fd).T
                        )):
                        print('  sample {}:'.format(i + 1))
                        print('    enc input           > {}'.format(e_in))
                        print('    dec train predicted > {}'.format(dt_pred))
                        if i >= 2:
                            break
                    print()
    except KeyboardInterrupt:
        print('training interrupted')

    return loss_track


In [None]:
    import sys

    if 'fw-debug' in sys.argv:
        tf.reset_default_graph()
        with tf.Session() as session:
            model = make_seq2seq_model(debug=True)
            session.run(tf.global_variables_initializer())
            session.run(model.decoder_prediction_train)
            session.run(model.decoder_prediction_train)

    elif 'fw-inf' in sys.argv:
        tf.reset_default_graph()
        with tf.Session() as session:
            model = make_seq2seq_model()
            session.run(tf.global_variables_initializer())
            fd = model.make_inference_inputs([[5, 4, 6, 7], [6, 6]])
            inf_out = session.run(model.decoder_prediction_inference, fd)
            print(inf_out)

    elif 'train' in sys.argv:
        tracks = {}

        tf.reset_default_graph()

        with tf.Session() as session:
            model = make_seq2seq_model(attention=True)
            session.run(tf.global_variables_initializer())
            loss_track_attention = train_on_copy_task(session, model)

        tf.reset_default_graph()

        with tf.Session() as session:
            model = make_seq2seq_model(attention=False)
            session.run(tf.global_variables_initializer())
            loss_track_no_attention = train_on_copy_task(session, model)

        import matplotlib.pyplot as plt
        plt.plot(loss_track)
        print('loss {:.4f} after {} examples (batch_size={})'.format(loss_track[-1], len(loss_track)*batch_size, batch_size))

    else:
        tf.reset_default_graph()
        session = tf.InteractiveSession()
        model = make_seq2seq_model(debug=False)
        session.run(tf.global_variables_initializer())

        fd = model.make_inference_inputs([[5, 4, 6, 7], [6, 6]])

        inf_out = session.run(model.decoder_prediction_inference, fd)


In [None]:
%matplotlib inline

import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, GRUCell
from model_new import Seq2SeqModel, train_on_copy_task
import pandas as pd
import helpers

import warnings
warnings.filterwarnings("ignore")

In [None]:
tf.__version__

In [None]:
tf.reset_default_graph()
tf.set_random_seed(1)

with tf.Session() as session:

    # with bidirectional encoder, decoder state size should be
    # 2x encoder state size
    model = Seq2SeqModel(encoder_cell=LSTMCell(10),
                         decoder_cell=LSTMCell(20), 
                         vocab_size=10,
                         embedding_size=10,
                         attention=True,
                         bidirectional=True,
                         debug=False)

    session.run(tf.global_variables_initializer())

    train_on_copy_task(session, model,
                       length_from=3, length_to=8,
                       vocab_lower=2, vocab_upper=10,
                       batch_size=100,
                       max_batches=3000,
                       batches_in_epoch=1000,
                       verbose=True)

In [None]:
loss_tracks = dict()

def do_train(session, model):
    return train_on_copy_task(session, model,
                              length_from=3, length_to=8,
                              vocab_lower=2, vocab_upper=10,
                              batch_size=100,
                              max_batches=5000,
                              batches_in_epoch=1000,
                              verbose=False)

def make_model(**kwa):
    args = dict(cell_class=LSTMCell,
                num_units_encoder=10,
                vocab_size=10,
                embedding_size=10,
                attention=False,
                bidirectional=False,
                debug=False)
    args.update(kwa)
    
    cell_class = args.pop('cell_class')
    
    num_units_encoder = args.pop('num_units_encoder')
    num_units_decoder = num_units_encoder
    
    if args['bidirectional']:
        num_units_decoder *= 2
    
    args['encoder_cell'] = cell_class(num_units_encoder)
    args['decoder_cell'] = cell_class(num_units_decoder)
    
    return Seq2SeqModel(**args)

In [None]:
import time

tf.reset_default_graph()
tf.set_random_seed(1)
with tf.Session() as session:
    model = make_model(bidirectional=True, attention=True, cell_class=LSTMCell)
    session.run(tf.global_variables_initializer())
    t0 = time.time()
    lstm_track = do_train(session, model)
    lstm_took = time.time() - t0

tf.reset_default_graph()
tf.set_random_seed(1)
with tf.Session() as session:
    model = make_model(bidirectional=True, attention=True, cell_class=GRUCell)
    session.run(tf.global_variables_initializer())
    t0 = time.time()
    gru_track = do_train(session, model)
    gru_took = time.time() - t0
    
gru = pd.Series(gru_track, name='gru')
lstm = pd.Series(lstm_track, name='lstm')
tracks_batch = pd.DataFrame(dict(lstm=lstm, gru=gru))
tracks_batch.index.name = 'batch'

gru.index = gru.index / gru_took
lstm.index = lstm.index / lstm_took
tracks_time = pd.DataFrame(dict(lstm=lstm, gru=gru)).ffill()
tracks_time.index.name = 'time (seconds)'

In [None]:
tf.reset_default_graph()
tf.set_random_seed(1)
with tf.Session() as session:
    model = make_model(bidirectional=False, attention=False)
    session.run(tf.global_variables_initializer())
    loss_tracks['forward encoder, no attention'] = do_train(session, model)


tf.reset_default_graph()
tf.set_random_seed(1)
with tf.Session() as session:
    model = make_model(bidirectional=True, attention=False)
    session.run(tf.global_variables_initializer())
    loss_tracks['bidirectional encoder, no attention'] = do_train(session, model)


tf.reset_default_graph()
tf.set_random_seed(1)
with tf.Session() as session:
    model = make_model(bidirectional=False, attention=True)
    session.run(tf.global_variables_initializer())
    loss_tracks['forward encoder, with attention'] = do_train(session, model)

    
tf.reset_default_graph()
tf.set_random_seed(1)
with tf.Session() as session:
    model = make_model(bidirectional=True, attention=True)
    session.run(tf.global_variables_initializer())
    loss_tracks['bidirectional encoder, with attention'] = do_train(session, model)

pd.DataFrame(loss_tracks).plot(figsize=(13, 8))