In [6]:
import os

import numpy as np
import tensorflow as tf
from tensorflow.python.layers import core as layers_core

In [7]:
# ---- Model / training hyperparameters ----
BATCH_SIZE = 64            # sequences per batch (Dataloader and Generator)
HIDDEN_SIZE = 128          # units per LSTM layer
EMB_DIM = 128              # embedding width
ENCODER_MAX_LENGTH = 3     # encoder sequences are padded/truncated to this
DECODER_MAX_LENGTH = 20    # decoder sequences are padded/truncated to this
# NOTE(review): EPOCH_NUM and VOCAB_SIZE are not referenced by the cells
# visible in this notebook (the vocabulary size comes from the vocab file).
EPOCH_NUM = 10000
VOCAB_SIZE = 5000

# ---- Input files ----
data_path = "data/seq2seq/imdb/"
# Alternate inputs in the same directory: "source_id_20.txt" / "target_id_20.txt".
source_file, target_file, vocab_file = (
    data_path + file_name
    for file_name in ("source_id.txt", "target_id.txt", "vocab.txt")
)

# ---- Output locations ----
save_path = "model/save/implementation_4/model"    # checkpoint prefix
graph_path = "./graphs/implementation_4_graph"     # summary/graph directory

In [8]:
def load_emb_data(emb_dict_file):
    """Read a one-token-per-line vocabulary file.

    Every line is stripped of surrounding whitespace and given its 0-based
    line number as id.

    Args:
        emb_dict_file: Path of a UTF-8 text file holding one token per line.

    Returns:
        A ``(word_dict, length, word_list)`` tuple where

        * ``word_dict`` maps token -> id (when a token occurs on several
          lines, the last occurrence wins),
        * ``length`` is the number of ids handed out, i.e. the number of
          lines read,
        * ``word_list`` holds the tokens in file order, so
          ``word_list[id]`` is the token of that id.
    """
    word_dict = {}
    word_list = []
    with open(emb_dict_file, 'r', encoding="utf-8") as f:
        # Iterate the file lazily; there is no need to hold all raw lines.
        for item, line in enumerate(f):
            word = line.strip()
            word_dict[word] = item
            word_list.append(word)
    # Size by the ids assigned, not by the distinct tokens: with repeated
    # tokens len(word_dict) is smaller than the largest id + 1, and a table
    # sized by it could be indexed out of range.
    length = len(word_list)
    print("Load embedding success! Num: %d" % length)
    return word_dict, length, word_list

In [9]:
class Dataloader:
    """Loads id sequences from text files and serves them as fixed-size batches.

    Each input line is a whitespace-separated list of integer token ids. Lines
    are truncated or right-padded with ``vocab_dict['<PAD>']`` to a fixed
    length and grouped into batches of ``batch_size`` rows; a trailing partial
    batch is dropped. Encoder and decoder data are loaded and iterated
    independently, each with its own wrapping pointer.

    Args:
        batch_size: Number of sequences per batch.
        encoder_max_length: Fixed length of every encoder sequence.
        decoder_max_length: Fixed length of every decoder sequence (before the
            ``<GO>``/``<EOS>`` column is added by ``next_decoder_batch``).
        vocab_dict: Mapping that provides the ids of ``'<PAD>'``, ``'<GO>'``
            and ``'<EOS>'``.
    """

    # The last few batches are never served by next_*_batch().
    # NOTE(review): presumably a held-out tail of the data -- confirm.
    _SKIPPED_TAIL_BATCHES = 5

    def __init__(self, batch_size, encoder_max_length, decoder_max_length, vocab_dict):
        self.batch_size = batch_size
        self.encoder_max_length = encoder_max_length
        self.decoder_max_length = decoder_max_length
        self.vocab_dict = vocab_dict

    def _load_batches(self, data_file_list, max_length):
        """Read id sequences, force them to ``max_length`` and cut into batches.

        Returns:
            ``(token_stream, num_batch, sequence_batch)``: the kept rows as a
            list of lists, the number of full batches, and a list of
            ``num_batch`` arrays of shape ``(batch_size, max_length)``.

        Raises:
            ValueError: if the files hold fewer than ``batch_size`` sequences,
                so that not even one full batch can be formed.
        """
        token_stream = []
        print("load %s file data.." % ' '.join(data_file_list))
        for data_file in data_file_list:
            with open(data_file, 'r', encoding="utf-8") as f:
                for line in f:
                    parse_line = [int(x) for x in line.split()][:max_length]
                    shortfall = max_length - len(parse_line)
                    if shortfall > 0:
                        parse_line.extend([self.vocab_dict['<PAD>']] * shortfall)  # padding
                    token_stream.append(parse_line)
        num_batch = len(token_stream) // self.batch_size
        if num_batch == 0:
            raise ValueError(
                "%s: found %d sequence(s), need at least batch_size=%d to form a batch"
                % (' '.join(data_file_list), len(token_stream), self.batch_size))
        # Cut the stream so its length is exactly num_batch * batch_size.
        token_stream = token_stream[:num_batch * self.batch_size]
        sequence_batch = np.split(np.array(token_stream), num_batch, 0)
        return token_stream, num_batch, sequence_batch

    def _cycle_length(self, num_batch):
        """Number of leading batches the pointer cycles through.

        Normally all but the last ``_SKIPPED_TAIL_BATCHES`` batches. When the
        data set is too small for that, every batch is served instead of
        dividing by zero or wrapping with a negative modulus.
        """
        served = num_batch - self._SKIPPED_TAIL_BATCHES
        return served if served > 0 else num_batch

    def create_encoder_batches(self, data_file_list):
        """Load the encoder sequences from ``data_file_list`` and reset the encoder pointer."""
        (self.encoder_token_stream,
         self.encoder_num_batch,
         self.encoder_sequence_batch) = self._load_batches(data_file_list, self.encoder_max_length)
        self.encoder_pointer = 0
        print("      Load %d * %d encoder batches" % (self.encoder_num_batch, self.batch_size))

    def create_decoder_batches(self, data_file_list):
        """Load the decoder sequences from ``data_file_list`` and reset the decoder pointer."""
        (self.decoder_token_stream,
         self.decoder_num_batch,
         self.decoder_sequence_batch) = self._load_batches(data_file_list, self.decoder_max_length)
        self.decoder_pointer = 0
        print("      Load %d * %d decoder batches" % (self.decoder_num_batch, self.batch_size))

    def next_encoder_batch(self):
        """Return the next encoder batch and advance the (wrapping) pointer.

        Returns:
            A new integer array of shape ``(batch_size, encoder_max_length)``.
        """
        ret = self.encoder_sequence_batch[self.encoder_pointer]
        self.encoder_pointer = (self.encoder_pointer + 1) % self._cycle_length(self.encoder_num_batch)
        # Hand out a copy so callers cannot modify the stored batch.
        return np.array(ret)

    def next_decoder_batch(self):
        """Return the next decoder ``(inputs, targets)`` pair and advance the pointer.

        Returns:
            ``(x, y)``, both of shape ``(batch_size, decoder_max_length + 1)``:
            ``x`` is the stored batch with a ``<GO>`` column prepended and
            ``y`` is the stored batch with an ``<EOS>`` column appended.
        """
        ret = self.decoder_sequence_batch[self.decoder_pointer]
        self.decoder_pointer = (self.decoder_pointer + 1) % self._cycle_length(self.decoder_num_batch)
        x = np.column_stack((np.tile(self.vocab_dict['<GO>'], self.batch_size), ret))
        # NOTE: rows were already padded to decoder_max_length, so <EOS> lands
        # in the last column, after any <PAD> filler.
        y = np.column_stack((ret, np.tile(self.vocab_dict['<EOS>'], self.batch_size)))
        return x, y

In [10]:
class Generator:
    """TF 1.x graph of an attention-based sequence-to-sequence model.

    Constructing an instance adds to the default graph: a shared embedding
    table, a multi-layer LSTM encoder, a multi-layer LSTM decoder wrapped with
    Bahdanau attention over the encoder outputs, a teacher-forced training
    decoder, a greedy inference decoder built from the same cell and output
    layer, a masked sequence loss and an Adam update with global-norm
    gradient clipping.

    Args:
        batch_size: Fixed batch size; it is baked into every placeholder shape.
        hidden_size: Units per LSTM layer; also used as the attention
            ``num_units`` and ``attention_layer_size``.
        emb_dim: Width of the embedding table.
        vocab_size: Rows of the embedding table and width of the output
            projection.
        encoder_max_length: Time dimension of the encoder input placeholder.
        decoder_max_length: Time dimension of the decoder input/target
            placeholders and cap on the number of decoding steps.
        vocab_dict: Mapping providing the ids of ``'<GO>'`` and ``'<EOS>'``
            for greedy inference.
        learning_rate: Learning rate handed to the Adam optimizer.
        num_layers: Number of stacked LSTM layers in encoder and decoder.

    Main graph endpoints:
        x, y, targets, sequence_lengths: input placeholders.
        training_logits: per-step logits of the training decoder.
        prediction: token ids sampled by the greedy inference decoder.
        loss: masked sequence loss on ``training_logits`` vs ``targets``.
        train: op applying the clipped gradients.
    """
    def __init__(self, batch_size, hidden_size, emb_dim, vocab_size, encoder_max_length, decoder_max_length, vocab_dict, learning_rate=0.01, num_layers = 2):
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.encoder_max_length = encoder_max_length
        self.decoder_max_length = decoder_max_length
        self.learning_rate = learning_rate
        self.num_layers = num_layers
        # Only consumed by a DropoutWrapper, which is currently not applied to the cells.
        self.output_keep_prob = 0.6
        with tf.variable_scope("placeholder"):
            self.x = tf.placeholder(shape=[self.batch_size, self.encoder_max_length], dtype=tf.int32, name="encoder_inputs")
            self.y = tf.placeholder(shape=[self.batch_size, self.decoder_max_length], dtype=tf.int32, name="decoder_inputs")
            self.targets = tf.placeholder(shape=[self.batch_size, self.decoder_max_length], dtype=tf.int32, name="decoder_targets")
            # Per-row count of target positions that contribute to the loss.
            self.sequence_lengths = tf.placeholder(shape=[self.batch_size], dtype=tf.int32)
        # The scope name is misspelled; it is kept because renaming it would
        # change the variable names stored in existing checkpoints.
        with tf.variable_scope('embeeding'):
            self.embedding = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_dim], -1, 1), name="embedding")
            # Batch-major lookups: (batch_size, max_length, emb_dim).
            self.encoder_inputs = tf.nn.embedding_lookup(self.embedding, self.x)
            self.decoder_inputs = tf.nn.embedding_lookup(self.embedding, self.y)
        with tf.variable_scope("rnn"):
            def get_lstm_cell(hidden_size):
                lstm_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size, state_is_tuple = False, name="lstm_cell")
                return lstm_cell
            self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell([get_lstm_cell(self.hidden_size) for _ in range(self.num_layers)] ,state_is_tuple=False) #multi layer
        with tf.variable_scope("encoder"):
            # Outputs are (batch_size, encoder_max_length, hidden_size). The
            # state is a single concatenated tensor because both the LSTM cells
            # and the MultiRNNCell use state_is_tuple=False.
            self.encoder_output, self.encoder_state = tf.nn.dynamic_rnn(cell=self.lstm_cell, inputs=self.encoder_inputs, dtype=tf.float32)
        with tf.variable_scope("decoder"):
            self.decoder_cell = tf.nn.rnn_cell.MultiRNNCell([get_lstm_cell(self.hidden_size) for _ in range(self.num_layers)],state_is_tuple=False) #multi layer
            self.initial_state = self.encoder_state
            # Projects decoder outputs to vocabulary logits.
            self.output_layer = layers_core.Dense(vocab_size,kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
        with tf.variable_scope("attention"):
            # The attention memory is the encoder output, so its length is the
            # encoder length. Using the decoder length here would mask out
            # valid encoder steps whenever the decoder is the shorter one.
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=self.hidden_size, memory=self.encoder_output, memory_sequence_length=[encoder_max_length] * batch_size)
            self.decoder_cell = tf.contrib.seq2seq.AttentionWrapper(cell=self.decoder_cell,attention_mechanism=attention_mechanism,attention_layer_size=self.hidden_size, name='Attention_Wrapper')
            # Start from the wrapper's zero state, with the wrapped cell's state
            # replaced by the final encoder state.
            self.initial_state = self.decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32).clone(cell_state=self.initial_state)
        with tf.variable_scope("decoder_train"):
            # Teacher forcing: the embedded decoder inputs are fed at every step.
            self.training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=self.decoder_inputs, sequence_length=[decoder_max_length] * self.batch_size,time_major=False)
            self.training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=self.decoder_cell,helper=self.training_helper,initial_state=self.initial_state,output_layer=self.output_layer)
            self.training_decoder_outputs, self.training_decoder_state, self.training_decoder_sequence_length =  tf.contrib.seq2seq.dynamic_decode(decoder=self.training_decoder, output_time_major=False,impute_finished=True, maximum_iterations=self.decoder_max_length, swap_memory=True)
        with tf.variable_scope("decoder_inference", reuse=True):
            # Greedy decoding: starts from <GO> and feeds back the embedding of
            # each sampled id; uses the same cell and output layer objects as
            # the training decoder.
            self.inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.embedding, start_tokens=tf.fill([self.batch_size], vocab_dict['<GO>']), end_token=vocab_dict['<EOS>'])
            self.inference_decoder = tf.contrib.seq2seq.BasicDecoder(cell=self.decoder_cell,helper=self.inference_helper, initial_state=self.initial_state, output_layer=self.output_layer)
            self.inference_decoder_outputs, self.inference_decoder_state, self.inference_decoder_sequence_length = tf.contrib.seq2seq.dynamic_decode(decoder=self.inference_decoder, output_time_major=False, impute_finished=True, maximum_iterations=self.decoder_max_length, swap_memory=True)
        with tf.variable_scope("train"):
            self.training_logits = tf.identity(self.training_decoder_outputs.rnn_output, name="logits")
            self.prediction = tf.identity(self.inference_decoder_outputs.sample_id, name="prediction")
            # Positions at or beyond each row's fed sequence length get weight 0.
            masks = tf.sequence_mask(self.sequence_lengths, self.decoder_max_length, dtype=tf.float32, name='masks')
            self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.training_logits, targets=self.targets, weights=masks)
            # Clip by global norm, then apply the clipped gradients with Adam.
            tvars = tf.trainable_variables()
            max_grad_norm = 5
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), max_grad_norm)
            gradients = list(zip(grads, tvars))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train = self.optimizer.apply_gradients(gradients)

In [None]:
if __name__ == "__main__":
    # Build the vocabulary, the data pipeline and the model graph.
    vocab_dict, vocab_size, word_list = load_emb_data(vocab_file)
    dataloader = Dataloader(BATCH_SIZE, ENCODER_MAX_LENGTH, DECODER_MAX_LENGTH,vocab_dict)
    # The decoder length is +1 because next_decoder_batch() adds one column
    # (<GO> in front of the inputs, <EOS> behind the targets).
    generator = Generator(BATCH_SIZE, HIDDEN_SIZE, EMB_DIM, vocab_size, ENCODER_MAX_LENGTH, DECODER_MAX_LENGTH+1, vocab_dict)
    dataloader.create_encoder_batches([source_file])
    dataloader.create_decoder_batches([target_file])

    # Saving a checkpoint can fail when its parent directory is missing, so
    # make sure it exists before training starts.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    saver = tf.train.Saver(max_to_keep=1)
    # Every row is declared full length, so the loss mask is all ones and
    # padded target positions count toward the loss as well.
    full_lengths = [DECODER_MAX_LENGTH+1] * BATCH_SIZE
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Dump the graph once for TensorBoard; no further summaries are written.
        writer = tf.summary.FileWriter(graph_path, sess.graph)
        writer.close()
        for i in range(dataloader.encoder_num_batch * 50):
            encoder_inputs = dataloader.next_encoder_batch()
            decoder_inputs, decoder_targets = dataloader.next_decoder_batch()
            # training_logits is deliberately not fetched: it is unused here
            # and is a [batch, time, vocab] tensor that would be copied to the
            # host on every step.
            loss, _ = sess.run(
                [generator.loss, generator.train],
                feed_dict={
                    generator.x: encoder_inputs,
                    generator.y: decoder_inputs,
                    generator.targets: decoder_targets,
                    generator.sequence_lengths: full_lengths,
                })
            if (i % 100 == 0):
                print(i, loss)
                saver.save(sess, save_path, global_step=i)

Load embedding success! Num: 75941
Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
load data/seq2seq/imdb/source_id.txt file data..
      Load 6526 * 64 encoder batches
load data/seq2seq/imdb/target_id.txt file data..
      Load 6526 * 64 decoder batches
0 11.229811
100 2.9158914
200 2.310601
300 2.3741493
400 2.0285466
500 2.256639
