In [3]:
import tensorflow as tf
import numpy as np
from tensorflow.python.layers.core import Dense
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)


In [4]:
tf.__version__

'1.4.0'

In [5]:
def read_file_by_line(fname, sentence=False):
    ret = []
    with open(fname, encoding='utf-8') as f:
        for line in f:
            line = line.replace('\n', '')
            if sentence:
                line = line.split(' ')
            ret.append(line)
    return ret

In [6]:
FLAG = 'char'
if FLAG is 'nmt':
    train_input = read_file_by_line('./data/train.en', sentence=True)
    train_target = read_file_by_line('./data/train.vi', sentence=True)
    vocab_input = read_file_by_line('./data/vocab.en')
    vocab_target = read_file_by_line('./data/vocab.vi')
    vocab_input = ['<pad>'] + vocab_input
    vocab_target = ['<pad>'] + vocab_target
else:
    train_input = read_file_by_line('./char-data/letters_source.txt')
    train_input = [list(word) for word in train_input]
    train_target = read_file_by_line('./char-data/letters_target.txt')
    train_target = [list(word) for word in train_target]
    
    input_set_words = list(set([c for word in train_input for c in word]))
    target_set_words = list(set([c for word in train_input for c in word]))
    vocab_input = ['<pad>', '<unk>', '<s>', '</s>'] + input_set_words
    vocab_target = ['<pad>', '<unk>', '<s>', '</s>'] + target_set_words

In [7]:
vocab_input_len = len(vocab_input)
vocab_target_len = len(vocab_target)

In [8]:
vocab_inp_to_int = {word: i for i, word in enumerate(vocab_input)}
vocab_tgt_to_int = {word: i for i, word in enumerate(vocab_target)}
tgt_int_to_char = { i: c for c, i in vocab_tgt_to_int.items()}

In [9]:
print('<pad>: ', vocab_inp_to_int.get('<pad>'))
print('<unk> :', vocab_inp_to_int.get('<unk>'))
print('<s>: ', vocab_inp_to_int.get('<s>'))
print('</s>: ', vocab_inp_to_int.get('</s>'))

<pad>:  0
<unk> : 1
<s>:  2
</s>:  3


In [10]:
enc_input = [[vocab_inp_to_int.get(token, vocab_inp_to_int.get('<unk>')) for token in sentence] for sentence in train_input]
enc_target = [[vocab_tgt_to_int.get(token, vocab_tgt_to_int.get('<unk>')) for token in sentence] for sentence in train_target]

enc_input, enc_input_test, enc_target, enc_target_test = train_test_split(enc_input, enc_target, test_size=.1)

inp_max_len = np.max([len(sentence) for sentence in enc_input])
tgt_max_len = np.max([len(sentence) for sentence in enc_target])

In [21]:
emb_size = 20
rnn_size = 50
rnn_layers = 2
learning_rate = 0.001
epoch = 80
batch_size = 128
use_attention = True

TGT_START_TOKEN = vocab_tgt_to_int.get('<s>')
TGT_END_TOKEN = vocab_tgt_to_int.get('</s>')

In [22]:
def get_data():
    enc_input_p = tf.placeholder(tf.int32, [None, None], name="input")
    enc_input_len_p = tf.placeholder(tf.int32, [None], name="input_length")
    
    enc_target_p = tf.placeholder(tf.int32, [None, None])
    enc_target_len_p = tf.placeholder(tf.int32, [None], name="target_length")
    max_enc_target_len = tf.reduce_max(enc_target_len_p)
    
    return enc_input_p, enc_input_len_p, enc_target_p, enc_target_len_p, max_enc_target_len

In [23]:
def make_cell(rnn_size):
    return tf.contrib.rnn.LSTMCell(num_units=rnn_size,
                                           initializer=tf.random_uniform_initializer(-.1, .1))

def build_graph(input_data, input_len, target_data, target_len, max_target_len):
    # Encoding
    enc_embeddings = tf.get_variable('enc_embeddings',
                                     [vocab_input_len, emb_size])
    enc_emb_output = tf.nn.embedding_lookup(enc_embeddings, input_data)
    
    encoder_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(rnn_layers)])
    encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell,
                                                      enc_emb_output,
                                                      sequence_length=input_len,
                                                      dtype=tf.float32)
    
    decoder_init_state = encoder_state
    
    # Decoding
    batch_size = tf.shape(input_data)[0]
    
    decoding_input = tf.strided_slice(target_data, [0,0], [batch_size, -1], [1,1])
    decoding_input = tf.concat([tf.fill([batch_size, 1], TGT_START_TOKEN), decoding_input], axis=1)
    
    dec_embeddings = tf.get_variable('dec_embeddings', 
                                     [vocab_target_len, emb_size], 
                                     initializer=tf.random_uniform_initializer(-.1, .1))
    dec_emb_output = tf.nn.embedding_lookup(dec_embeddings, decoding_input)
    
    decoder_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(rnn_layers)])

    # Attention
    if use_attention:
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(rnn_size, 
                                                                encoder_output, 
                                                                memory_sequence_length=input_len)
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell,
                                                           attention_mechanism, 
                                                           attention_layer_size=rnn_size)
        decoder_init_state = decoder_cell.zero_state(batch_size, dtype=tf.float32)
    
    projection_layer = Dense(vocab_target_len, kernel_initializer=tf.truncated_normal_initializer(0.0, 0.1))
    
    with tf.variable_scope("decode"):
        helper = tf.contrib.seq2seq.TrainingHelper(dec_emb_output, target_len)
        decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, 
                                                  helper, 
                                                  decoder_init_state,
                                                  output_layer=projection_layer)

        train_outputs = tf.contrib.seq2seq.dynamic_decode(decoder,
                                                          impute_finished=True,
                                                          maximum_iterations=max_target_len)[0]
    with tf.variable_scope("decode", reuse=True):
        start_tokens = tf.fill([batch_size], TGT_START_TOKEN)
        greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings, start_tokens, TGT_END_TOKEN)
        infer_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, 
                                                        greedy_helper, 
                                                        decoder_init_state, 
                                                        output_layer=projection_layer)
        
        infer_outputs = tf.contrib.seq2seq.dynamic_decode(infer_decoder, 
                                                          impute_finished=True, 
                                                          maximum_iterations=max_target_len)[0]
    
    return train_outputs, infer_outputs
    

In [24]:
def get_batches(X, y, batch_size, pad_value):
    def pad(a, max_len, value):
        return [i + [value for j in range(max_len - len(i))] for i in a]
    
    for i in range(0, len(X), batch_size):
        X_batch = X[i:i+batch_size]
        X_batch_len = [len(X_item) for X_item in X_batch]
        X_batch = pad(X_batch, max(X_batch_len), pad_value)
        
        y_batch = y[i:i+batch_size]
        y_batch_len = [len(y_item) for y_item in y_batch]
        y_batch = pad(y_batch, max(y_batch_len), pad_value)
        
        yield X_batch, X_batch_len, y_batch, y_batch_len

In [25]:
ckpt = './model.ckpt'
if use_attention:
    ckpt = './model-attention.ckpt'
    
tf.reset_default_graph()
test_inp_data_batch, test_inp_data_len, test_tgt_data_batch, test_tgt_data_len = next(
    get_batches(enc_input_test, enc_target_test, batch_size, vocab_inp_to_int.get('<pad>')))
with tf.Session() as sess:
    input_data, input_len, target_data, target_len, max_target_len = get_data()
    
    train_outputs, infer_outputs = build_graph(input_data, input_len, target_data, target_len, max_target_len)
    
    # For training
    masks = tf.sequence_mask(target_len, max_target_len, dtype=tf.float32)
    loss = tf.contrib.seq2seq.sequence_loss(train_outputs.rnn_output, target_data, masks)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss)
    clipped_grad = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(clipped_grad)
    
    # For inference
    tf.identity(infer_outputs.sample_id, name="inference_logit")
    
    saver = tf.train.Saver()
    try:
        saver.restore(sess, ckpt)
    except tf.errors.NotFoundError:
        print('No model saved')
    
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(1, epoch+1):        
        for i, (inp_data_batch, inp_data_len, tgt_data_batch, tgt_data_len) in enumerate(
            get_batches(enc_input, enc_target, batch_size, vocab_inp_to_int.get('<pad>'))):
            loss_eval, _ = sess.run([loss, train_op], 
                                    feed_dict={
                                        input_data: inp_data_batch,
                                        input_len: inp_data_len,
                                        target_data: tgt_data_batch,
                                        target_len: tgt_data_len})
            if i % 50 == 0:
                valid_loss_eval = sess.run(loss, feed_dict={
                    input_data: test_inp_data_batch,
                    input_len: test_inp_data_len,
                    target_data: test_tgt_data_batch,
                    target_len: test_tgt_data_len
                })
                print("Loss: ", loss_eval, "/ Validation Loss: ", valid_loss_eval, " / Epoch: ", epoch_i)
    
    saver.save(sess, ckpt)
    print('################## Saved!! ##################')            

INFO:tensorflow:Restoring parameters from ./model-attention.ckpt
Loss:  3.40126 / Validation Loss:  3.40066  / Epoch:  1
Loss:  3.07162 / Validation Loss:  3.08143  / Epoch:  1
Loss:  2.77428 / Validation Loss:  2.7651  / Epoch:  2
Loss:  2.39495 / Validation Loss:  2.39593  / Epoch:  2
Loss:  2.25402 / Validation Loss:  2.26641  / Epoch:  3
Loss:  2.04217 / Validation Loss:  2.04087  / Epoch:  3
Loss:  1.94127 / Validation Loss:  1.96947  / Epoch:  4
Loss:  1.77986 / Validation Loss:  1.79311  / Epoch:  4
Loss:  1.66698 / Validation Loss:  1.72994  / Epoch:  5
Loss:  1.54523 / Validation Loss:  1.57456  / Epoch:  5
Loss:  1.44711 / Validation Loss:  1.51305  / Epoch:  6
Loss:  1.33793 / Validation Loss:  1.35126  / Epoch:  6
Loss:  1.30794 / Validation Loss:  1.32147  / Epoch:  7
Loss:  1.19095 / Validation Loss:  1.23141  / Epoch:  7
Loss:  1.12008 / Validation Loss:  1.14377  / Epoch:  8
Loss:  1.08758 / Validation Loss:  1.12511  / Epoch:  8
Loss:  1.01002 / Validation Loss:  1.039

In [26]:
def predict(text):
    text_tokens = list(text)
    if FLAG == 'nmt':
        text_tokens = text.split(' ')
    text_ids = [vocab_tgt_to_int.get(c, vocab_tgt_to_int['<unk>']) for c in text_tokens]
    text_ids_len = [len(text_ids)]
    
    prediction_graph = tf.Graph()
    with tf.Session(graph=prediction_graph) as sess:
        loader = tf.train.import_meta_graph(ckpt + '.meta')
        loader.restore(sess, ckpt)

        # Placeholder
        input_data = prediction_graph.get_tensor_by_name("input:0")
        input_length = prediction_graph.get_tensor_by_name("input_length:0")
        target_length = prediction_graph.get_tensor_by_name("target_length:0")
        
        # Logit
        inference_logit = prediction_graph.get_tensor_by_name("inference_logit:0")
        
        sample_ids = sess.run(inference_logit, feed_dict={
            input_data: [text_ids]*batch_size,
            input_length: [len(text_ids)]*batch_size,
            target_length: [len(text_ids)]*batch_size
        })
        
        return ' '.join([tgt_int_to_char[i] for i in sample_ids[0]])


In [28]:
predict('qmwoihg')

INFO:tensorflow:Restoring parameters from ./model-attention.ckpt


'g h i m o q w'