In [2]:
import numpy as np
import time

import helper

source_path = 'data/letters_source.txt'
target_path = 'data/letters_target.txt'

source_sentences = helper.load_data(source_path)
target_sentences = helper.load_data(target_path)

In [3]:
source_sentences[:50].split('\n')

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 '']

In [4]:
target_sentences[:50].split('\n')

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 '']

In [5]:
# turn the each string into a list of characters
#then convert the characters to their int values as declared in our vocabulary

def extract_character_vocab(data):
    special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    
    set_words = set([character for line in data.split('\n') for character in line])
    int_to_vocab = {word_i: word for word_i, word in enumerate(special_words + list(set_words))}
    vocab_to_int = {word: word_i for word_i, word in int_to_vocab.items()}
    
    return int_to_vocab, vocab_to_int

# buils int2letter and letter2int dicts
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_sentences)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_sentences)

# convert character to ids
source_letter_ids = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>']) for letter in line] for line in source_sentences.split('\n')]
target_letter_ids = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>']) for letter in line] + [target_letter_to_int['<EOS>']] for line in target_sentences.split('\n')]

print('Example source sequence')
print(source_letter_ids[:3])
print('\n')
print('Example target sequence')
print(target_letter_ids[:3])

Example source sequence
[[22, 10, 11, 28, 28], [18, 27, 25], [4, 22, 29, 23, 13]]


Example target sequence
[[11, 22, 28, 28, 10, 3], [18, 27, 25, 3], [22, 13, 4, 23, 29, 3]]


In [6]:
from distutils.version import LooseVersion
import tensorflow as tf
from tensorflow.python.layers.core import Dense

# check the tensorflow version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use Tensorflow version 1.1 or newer'
print('Tensorflow Version: {}'.format(tf.__version__))

Tensorflow Version: 1.1.0


In [7]:
# number of epochs
epochs = 60
# batch size
batch_size = 128
# RNN size
rnn_size = 50
# Number of Layer
num_layers = 2
# Embedding size
encoding_embedding_size = 15
decoding_embedding_size = 15
# Learning rate
learning_rate = 0.001

In [8]:
def get_model_input():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    
    return input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length

In [9]:
def encoding_layer(input_data, rnn_size, num_layers, source_sequence_length, 
                  source_vocab_size, encoding_embedding_size):
    
    # Encoder embedding
    enc_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)
    
    # RNN cell
    def make_cell(rnn_size):
        enc_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return enc_cell
    
    enc_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
    
    enc_output, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_embed_input, sequence_length=source_sequence_length, dtype=tf.float32)
    
    return enc_output, enc_state

In [10]:
# Process the input to feed to the decoder
def process_decoder_input(target_data, vocab_to_int, batch_size):
    # remove the last word id from each batch and concat the <GO> to the beginning of each batch
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return dec_input

In [14]:
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                  target_sequence_length, max_target_sequence_length, enc_state, dec_input):
    
    #decoder embedding
    target_vocab_size = len(target_letter_to_int)
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    #construct the decoder cell
    def make_cell(rnn_size):
        dec_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        
        return dec_cell
    
    dec_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
    
    #dense layer to translate the decoder's output at each time step
    output_layer = Dense(target_vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    #training decoder
    with tf.variable_scope('decode'):
        #helper for training process. used by basic decoder to read inputs
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input, sequence_length=target_sequence_length, time_major=False)
        
        #basic decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, training_helper, enc_state, output_layer)
        
        #perform dynamic decoding using the decoder
        training_decoder_output, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
        
        #inference decoder
        #reuse the same parameters trained by the training process
    with tf.variable_scope('decode', reuse=True):
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')
            
        #helper for the inference process
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings, start_tokens, target_letter_to_int['<EOS>'])
            
        #basic decoder
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, inference_helper, enc_state, output_layer)
            
        #perform dynamic decodingusing decoder
        inference_decoder_output, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
            
    return training_decoder_output, inference_decoder_output

In [15]:
def seq2seq_model(input_data, targets, lr, target_sequence_length, max_target_sequence_length,
                 source_sequence_length, source_vocab_size, target_vocab_size, enc_embedding_size,
                 dec_embedding_size, rnn_size, num_layers):
    
    #pass the input data through the encoder.
    #we'll ignore the encoder output.
    _, enc_state = encoding_layer(input_data, rnn_size, num_layers,
                                  source_sequence_length, source_vocab_size,
                                 encoding_embedding_size)
    
    #prepare the target sequences to feed to the decoder in training mode
    dec_input = process_decoder_input(targets, target_letter_to_int, batch_size)
    
    #pass encoder state and decoder inputs to the decoders
    training_decoder_output, inference_decoder_output = decoding_layer(target_letter_to_int, decoding_embedding_size, 
                                                                      num_layers, rnn_size, target_sequence_length, 
                                                                      max_target_sequence_length, enc_state, dec_input)
    
    return training_decoder_output, inference_decoder_output

In [16]:
# build the graph
train_graph = tf.Graph()
#set the graph to default to ensure it is ready fot training.
with train_graph.as_default():
    #load model inputs
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_model_input()
    
    # create the training and inference logits
    training_decoder_output, inference_decoder_output = seq2seq_model(input_data, targets, lr, target_sequence_length, 
                                                                     max_target_sequence_length, 
                                                                     source_sequence_length, 
                                                                     len(source_letter_to_int), 
                                                                     len(target_letter_to_int), 
                                                                     encoding_embedding_size, 
                                                                     decoding_embedding_size, 
                                                                     rnn_size, num_layers)
    
    #create tensors for the training logits and inference logits
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    inference_logits = tf.identity(inference_decoder_output.sample_id, name='predictions')
    
    # create the weights for sequence_loss
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
    
    with tf.name_scope('optimization'):
        
        # loss function
        cost = tf.contrib.seq2seq.sequence_loss(
        training_logits, 
        targets, 
        masks)
        
        # optimizer
        optimizer = tf.train.AdamOptimizer(lr)
        
        # gradient clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [17]:
def pad_sentence_batch(sentence_batch, pad_int):
    #pad sentences with <PAD> so that all the sentences are of equal length
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [18]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    #batch targets, sources and lengths of their sentences together
    for batch_i in range(0, len(sources) // batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i:start_i+batch_size]
        targets_batch = targets[start_i:start_i+batch_size]
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        #need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in pad_targets_batch:
            pad_targets_lengths.append(len(target))
            
        pad_source_lengths = []
        for source in pad_sources_batch:
            pad_source_lengths.append(len(source))
            
        yield pad_targets_batch, pad_sources_batch, pad_targets_lengths, pad_source_lengths

In [23]:
# split data to training and validation sets
import os
save_path = 'checkpoints/'
model_name = 'my_model'
if not os.path.exists(save_path):
    os.makedirs(save_path)
train_source = source_letter_ids[batch_size:]
train_target = target_letter_ids[batch_size:]
valid_source = source_letter_ids[:batch_size]
valid_target = target_letter_ids[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size, source_letter_to_int['<PAD>'], target_letter_to_int['<PAD>']))

display_step = 20 # check training loss after every 20 batches

#checkpoint = 'best_model.ckpt'
save_path_full = os.path.join(save_path, model_name)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size, source_letter_to_int['<PAD>'], target_letter_to_int['<PAD>'])):
            
            #training step
            _, loss = sess.run([train_op, cost], {input_data: sources_batch,
                                                 targets: targets_batch, 
                                                 lr: learning_rate, 
                                                 target_sequence_length: targets_lengths, 
                                                 source_sequence_length: sources_lengths})
            
            #debug message updating us on the status of the training 
            if batch_i % display_step == 0 and batch_i > 0:
                
                #calculate validation cost
                validation_loss = sess.run([cost], {input_data: valid_sources_batch, 
                                                   targets: valid_targets_batch, 
                                                   lr: learning_rate, 
                                                   target_sequence_length: valid_targets_lengths, 
                                                   source_sequence_length: valid_sources_lengths})
                
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f} - Validation Loss: {:>6.3f}'.format(epoch_i, 
                                                                                                         epochs, 
                                                                                                         batch_i, 
                                                                                                         len(train_source) // batch_size, 
                                                                                                         loss, 
                                                                                                         validation_loss[0]))
                
    #save model
    saver = tf.train.Saver()
    saver.save(sess, save_path_full)
    #saver.save(sess, checkpoint)
    #saver.save(sess, os.path.join(os.getcwd(), checkpoint))
    print('Model Trained and Saved') 

Epoch   1/60 Batch   20/77 - Loss:  2.362 - Validation Loss:  2.408
Epoch   1/60 Batch   40/77 - Loss:  2.292 - Validation Loss:  2.254
Epoch   1/60 Batch   60/77 - Loss:  1.977 - Validation Loss:  2.021
Epoch   2/60 Batch   20/77 - Loss:  1.679 - Validation Loss:  1.758
Epoch   2/60 Batch   40/77 - Loss:  1.669 - Validation Loss:  1.633
Epoch   2/60 Batch   60/77 - Loss:  1.494 - Validation Loss:  1.530
Epoch   3/60 Batch   20/77 - Loss:  1.362 - Validation Loss:  1.431
Epoch   3/60 Batch   40/77 - Loss:  1.427 - Validation Loss:  1.393
Epoch   3/60 Batch   60/77 - Loss:  1.283 - Validation Loss:  1.334
Epoch   4/60 Batch   20/77 - Loss:  1.185 - Validation Loss:  1.247
Epoch   4/60 Batch   40/77 - Loss:  1.247 - Validation Loss:  1.214
Epoch   4/60 Batch   60/77 - Loss:  1.134 - Validation Loss:  1.178
Epoch   5/60 Batch   20/77 - Loss:  1.082 - Validation Loss:  1.119
Epoch   5/60 Batch   40/77 - Loss:  1.131 - Validation Loss:  1.088
Epoch   5/60 Batch   60/77 - Loss:  1.021 - Vali

Epoch  41/60 Batch   40/77 - Loss:  0.026 - Validation Loss:  0.028
Epoch  41/60 Batch   60/77 - Loss:  0.028 - Validation Loss:  0.028
Epoch  42/60 Batch   20/77 - Loss:  0.017 - Validation Loss:  0.026
Epoch  42/60 Batch   40/77 - Loss:  0.024 - Validation Loss:  0.026
Epoch  42/60 Batch   60/77 - Loss:  0.026 - Validation Loss:  0.026
Epoch  43/60 Batch   20/77 - Loss:  0.016 - Validation Loss:  0.024
Epoch  43/60 Batch   40/77 - Loss:  0.022 - Validation Loss:  0.025
Epoch  43/60 Batch   60/77 - Loss:  0.024 - Validation Loss:  0.025
Epoch  44/60 Batch   20/77 - Loss:  0.015 - Validation Loss:  0.023
Epoch  44/60 Batch   40/77 - Loss:  0.021 - Validation Loss:  0.023
Epoch  44/60 Batch   60/77 - Loss:  0.022 - Validation Loss:  0.024
Epoch  45/60 Batch   20/77 - Loss:  0.014 - Validation Loss:  0.021
Epoch  45/60 Batch   40/77 - Loss:  0.020 - Validation Loss:  0.022
Epoch  45/60 Batch   60/77 - Loss:  0.020 - Validation Loss:  0.022
Epoch  46/60 Batch   20/77 - Loss:  0.013 - Vali

In [24]:
# Prediction
def source_to_seq(text):
    #prepare the text for the model
    sequence_length = 7
    return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text] + [source_letter_to_int['<PAD>']]*(sequence_length-len(text))

In [28]:
input_sentence = 'exactly'
text = source_to_seq(input_sentence)

checkpoint = 'checkpoints/my_model'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    
    #multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                     target_sequence_length: [len(text)]*batch_size, 
                                     source_sequence_length: [len(text)]*batch_size})[0]
    
pad = source_letter_to_int['<PAD>']

print('Original Text:', input_sentence)

print('\nSource')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(' '.join([source_int_to_letter[i] for i in text])))

print('\nTarget')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(' '.join([target_int_to_letter[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from checkpoints/my_model
Original Text: exactly

Source
  Word Ids:    [8, 6, 11, 5, 15, 4, 25]
  Input Words: e x a c t l y

Target
  Word Ids:       [11, 5, 8, 4, 15, 6, 25]
  Response Words: a c e l t x y
