In [1]:
import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.layers.core import Dense

In [2]:
class Seq2seq():
    """Attention-based encoder/decoder model built with the TensorFlow 1.x graph API.

    The class only *builds* a graph (see ``graph_construct``); running it is
    left to the caller.

    ``params`` is a dict. The keys read by this class are ``num_units``,
    ``num_layers``, ``cell_type``, ``bidirectional``, ``attention``,
    ``batch_size``, ``beam_size``, ``vocab_size``, ``start_token``,
    ``end_token`` and ``learning_rate``.
    """

    # Accepted (case-insensitive) values of params['cell_type'].
    _SUPPORTED_CELLS = ('LSTM', 'GRU', 'RNN', 'GLSTM')
    # Accepted values of params['attention'].
    _SUPPORTED_ATTENTION = ('bahdanau', 'bahdanau_norm', 'luong', 'luong_scaled')

    def __init__(self,params,vocab_to_int,embeddings):
        """Store the configuration; no TensorFlow ops are created here.

        Args:
            params: hyper-parameter dict (see the class docstring for the keys).
            vocab_to_int: mapping from token string to integer id; must contain
                the tokens named by ``params['start_token']`` and
                ``params['end_token']``.
            embeddings: embedding table handed to ``tf.nn.embedding_lookup``.
        """
        self.params = params
        self.vocab_to_int = vocab_to_int
        self.embeddings = embeddings

    def _init_placeholders(self):
        """Create the input placeholders and the derived ``max_dec_length`` tensor."""
        with tf.name_scope("Input_Placeholder"):
            # Rank-2 int32 id matrices; presumably (batch, time) since dynamic_rnn
            # is used with its default batch-major layout.
            self.encoder_inputs = tf.placeholder(dtype=tf.int32, shape=(None,None), name='encoder_inputs')
            self.decoder_targets = tf.placeholder(dtype=tf.int32, shape=(None,None), name='decoder_targets')
            self.encoder_lengths = tf.placeholder(dtype=tf.int32, shape=(None,), name='encoder_lengths')
            self.decoder_lengths = tf.placeholder(dtype=tf.int32, shape=(None,), name='decoder_lengths')
            self.max_dec_length = tf.reduce_max(self.decoder_lengths, name='max_dec_len')
            # NOTE(review): learning_rate and keep_prob are not consumed by any
            # method of this class (the optimizer reads params['learning_rate']
            # and no dropout is applied). They are kept so existing feed_dicts
            # that reference them by name keep working.
            self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    def _single_rnn_(self, cell_type, num_units):
        """Return one recurrent cell of the requested kind.

        Args:
            cell_type: one of 'LSTM', 'GRU', 'RNN', 'GLSTM' (case-insensitive).
            num_units: hidden size of the cell.

        Raises:
            ValueError: if ``cell_type`` is not supported (previously this
                silently returned ``None``).
        """
        kind = cell_type.upper()
        if kind not in self._SUPPORTED_CELLS:
            raise ValueError("Unsupported cell_type %r; expected one of %s"
                             % (cell_type, ', '.join(self._SUPPORTED_CELLS)))
        # truncated_normal_initializer takes (mean, stddev) positionally, so the
        # former call (-0.1, 0.1) asked for a mean of -0.1. Use a zero mean, the
        # same setting the output projection layer uses.
        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=2)
        if kind == 'LSTM':
            return tf.contrib.rnn.LSTMCell(num_units, initializer=initializer)
        if kind == 'GRU':
            # GRUCell names this argument kernel_initializer, not initializer.
            return tf.contrib.rnn.GRUCell(num_units, kernel_initializer=initializer)
        if kind == 'GLSTM':
            return tf.contrib.rnn.GLSTMCell(num_units, initializer=initializer)
        # 'RNN': RNNCell is the abstract base class; BasicRNNCell is the concrete
        # vanilla cell. It exposes no initializer argument.
        return tf.contrib.rnn.BasicRNNCell(num_units)

    def _multi_rnn_(self, cell_type, num_units, num_layers):
        """Return a list of ``num_layers`` independent cells of the same kind."""
        return [self._single_rnn_(cell_type, num_units) for _ in range(num_layers)]

    def _encoder_(self):
        """Build the unidirectional encoder.

        Sets ``self.encoder_outputs`` and ``self.encoder_final_state``. For
        stacked encoders the per-layer states are also kept in
        ``self.encoder_states`` and the last layer's state is used as the
        final state.
        """
        with tf.variable_scope("Simple_Encoder",reuse=tf.AUTO_REUSE):
            enc_embed_input = tf.nn.embedding_lookup(self.embeddings, self.encoder_inputs)
            if self.params['num_layers'] == 1:
                cell = self._single_rnn_(self.params['cell_type'],self.params['num_units'])
                self.encoder_outputs, self.encoder_final_state = tf.nn.dynamic_rnn(
                    cell=cell,inputs=enc_embed_input,
                    sequence_length=self.encoder_lengths,dtype=tf.float32)
            else:
                stacked = tf.contrib.rnn.MultiRNNCell(
                    self._multi_rnn_(self.params['cell_type'],self.params['num_units'],self.params['num_layers']))
                self.encoder_outputs, self.encoder_states = tf.nn.dynamic_rnn(
                    cell=stacked,inputs=enc_embed_input,
                    sequence_length=self.encoder_lengths,dtype=tf.float32)
                # Was `states[-1]`, an undefined name (NameError at build time).
                self.encoder_final_state = self.encoder_states[-1]

    def _bidirectional_encoder_(self):
        """Build the stacked bidirectional encoder.

        Sets ``self.encoder_outputs`` and ``self.encoder_final_state``. The
        final state concatenates forward/backward ``c`` and ``h`` per layer and
        averages them over layers, so it reads ``.c``/``.h`` from the cell
        states: only cell types whose state is an ``LSTMStateTuple`` work here.
        """
        with tf.variable_scope("Bidirectional_Encoder",reuse=tf.AUTO_REUSE):
            enc_embed_input = tf.nn.embedding_lookup(self.embeddings, self.encoder_inputs)
            num_layers = self.params['num_layers']
            # stack_bidirectional_dynamic_rnn wants *lists* of cells, one per
            # layer, even for a single layer; _multi_rnn_ provides that.
            cells_fw = self._multi_rnn_(self.params['cell_type'],self.params['num_units'],num_layers)
            cells_bw = self._multi_rnn_(self.params['cell_type'],self.params['num_units'],num_layers)
            outputs, encoder_final_fw_states, encoder_final_bw_states = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=cells_fw,cells_bw=cells_bw,inputs=enc_embed_input,
                sequence_length=self.encoder_lengths,dtype=tf.float32)
            self.encoder_outputs = tf.concat(outputs,axis=2,name='Encoder_BiDirectional_Output_Concat')
            c = tf.reduce_mean(
                [tf.concat((encoder_final_fw_states[i].c, encoder_final_bw_states[i].c),1) for i in range(num_layers)],
                axis=0,name='Encoder_Bidirectional_State_Concat_c')
            h = tf.reduce_mean(
                [tf.concat((encoder_final_fw_states[i].h, encoder_final_bw_states[i].h),1) for i in range(num_layers)],
                axis=0,name='Encoder_Bidirectional_State_Concat_h')
            self.encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c,h)

    def _decoder_with_attention_(self,infer=False):
        """Build the decoder cell stack, its initial state and the output layer.

        The first decoder layer is wrapped in an ``AttentionWrapper`` over the
        encoder outputs. Sets ``self.decoder_cells``,
        ``self.decoder_initial_state`` and ``self.output_layer``.

        Args:
            infer: when True, encoder outputs, lengths and final state are tiled
                ``params['beam_size']`` times for beam search.

        Raises:
            ValueError: if ``params['attention']`` is not a supported name
                (previously this surfaced as an unbound-variable error).
        """
        attention = self.params['attention']
        if attention not in self._SUPPORTED_ATTENTION:
            raise ValueError("Unsupported attention %r; expected one of %s"
                             % (attention, ', '.join(self._SUPPORTED_ATTENTION)))

        num_layers = self.params['num_layers']
        # The bidirectional encoder state is a fw/bw concatenation, so the
        # decoder cells must be twice as wide to accept it as initial state.
        if self.params['bidirectional']:
            dec_units = self.params['num_units']*2
        else:
            dec_units = self.params['num_units']
        # Always a list (also for one layer) so that cells[0] can be replaced
        # below and MultiRNNCell receives what it expects.
        cells = self._multi_rnn_(self.params['cell_type'],dec_units,num_layers)

        # Work on a local copy: overwriting self.encoder_final_state with the
        # tiled version would make the result depend on build order.
        encoder_state = self.encoder_final_state
        if infer:
            beam_size = self.params['beam_size']
            memory = tf.contrib.seq2seq.tile_batch(self.encoder_outputs, beam_size)
            memory_sequence_length = tf.contrib.seq2seq.tile_batch(self.encoder_lengths, beam_size)
            encoder_state = tf.contrib.seq2seq.tile_batch(encoder_state, beam_size)
            batch_size = self.params['batch_size']*beam_size
        else:
            memory = self.encoder_outputs
            memory_sequence_length = self.encoder_lengths
            batch_size = self.params['batch_size']

        if attention in ('bahdanau', 'bahdanau_norm'):
            attn_mech = tf.contrib.seq2seq.BahdanauAttention(
                num_units=self.params['num_units'],memory=memory,
                memory_sequence_length=memory_sequence_length,
                normalize=(attention == 'bahdanau_norm'))
        else:
            # Luong scoring is a dot product between the projected memory and the
            # decoder cell output, so its depth has to equal the decoder width
            # (2*num_units when the encoder is bidirectional).
            attn_mech = tf.contrib.seq2seq.LuongAttention(
                num_units=dec_units,memory=memory,
                memory_sequence_length=memory_sequence_length,
                scale=(attention == 'luong_scaled'))

        cells[0] = tf.contrib.seq2seq.AttentionWrapper(cell=cells[0],attention_mechanism=attn_mech,attention_layer_size=self.params['num_units'])

        # Every decoder layer starts from the same encoder state; the attention
        # layer additionally needs its wrapper state around it.
        initial_state = [encoder_state for _ in range(num_layers)]
        attention_cell_state = cells[0].zero_state(dtype=tf.float32,batch_size=batch_size)
        initial_state[0] = attention_cell_state.clone(cell_state=initial_state[0])
        self.decoder_initial_state = tuple(initial_state)
        self.decoder_cells = tf.contrib.rnn.MultiRNNCell(cells)
        self.output_layer = Dense(self.params['vocab_size'],kernel_initializer=tf.truncated_normal_initializer(mean = 0.0, stddev=0.1),name='Output')

    def _train_decoder_(self):
        """Build the teacher-forced training decoder; sets ``self.training_logits``."""
        with tf.variable_scope("Decode",reuse=tf.AUTO_REUSE):
            self._decoder_with_attention_()
            # Decoder input = start token followed by the targets without their
            # last column.
            ending = tf.strided_slice(self.decoder_targets,begin=[0,0],end=[self.params['batch_size'],-1],strides=[1,1])
            # Use the configured start token, as the inference decoder does,
            # instead of a hard-coded '<GO>'.
            start_id = self.vocab_to_int[self.params['start_token']]
            self.decoder_inputs = tf.concat([tf.fill([self.params['batch_size'], 1],start_id),ending],1)
            dec_embed_input = tf.nn.embedding_lookup(self.embeddings, self.decoder_inputs)

            train_helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input,sequence_length=self.decoder_lengths,name='Train_Helper')
            self.train_decoder = tf.contrib.seq2seq.BasicDecoder(cell=self.decoder_cells,helper=train_helper,initial_state=self.decoder_initial_state,output_layer=self.output_layer)
            self.training_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(self.train_decoder,impute_finished=True,maximum_iterations=self.max_dec_length,swap_memory=True)

    def _inference_decoder_(self):
        """Build the beam-search decoder; sets ``self.inference_logits``.

        Decoding is capped at ``self.max_dec_length``, which is derived from the
        ``decoder_lengths`` placeholder, so that placeholder must be fed at
        inference time as well.
        """
        with tf.variable_scope("Decode",reuse=tf.AUTO_REUSE):
            self._decoder_with_attention_(infer=True)
            start_tokens = tf.fill([self.params['batch_size']], self.vocab_to_int[self.params['start_token']])
            beam_search_decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=self.decoder_cells,embedding=self.embeddings,start_tokens=start_tokens,end_token=self.vocab_to_int[self.params['end_token']],initial_state=self.decoder_initial_state,beam_width=self.params['beam_size'],output_layer=self.output_layer)
            final_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(beam_search_decoder,maximum_iterations=self.max_dec_length)
            beam_predictions = final_outputs.predicted_ids
            # Swap the last two axes of the predicted ids.
            self.inference_logits = tf.transpose(beam_predictions, perm=[0, 2, 1])

    def graph_construct(self):
        """Build the full train + inference graph and return it.

        Besides returning the ``tf.Graph``, the loss and the training op are
        stored on the instance as ``self.cost`` and ``self.train_op`` (and the
        graph as ``self.graph``) so that a caller can actually run a training
        step; previously they were local variables and unreachable.

        Returns:
            The newly created ``tf.Graph``.
        """
        train_graph = tf.Graph()
        with train_graph.as_default():
            self._init_placeholders()
            print("Initialized Placeholders")
            if self.params['bidirectional']:
                self._bidirectional_encoder_()
            else:
                self._encoder_()
            print("Encoder Netwrok Constructed")
            self._train_decoder_()
            print("Train Decoder Netwrok Constructed")
            self._inference_decoder_()
            print("Infer Decoder Netwrok Constructed")
            # Named identity ops so the tensors can be fetched by name
            # ('logits', 'predictions') from a restored graph.
            training_logits = tf.identity(self.training_logits.rnn_output, 'logits')
            inference_logits = tf.identity(self.inference_logits, name='predictions')
            masks = tf.sequence_mask(self.decoder_lengths,self.max_dec_length,dtype=tf.float32,name='masks')
            with tf.name_scope("optimization"):
                cost = tf.contrib.seq2seq.sequence_loss(training_logits,self.decoder_targets,masks)
                optimizer = tf.train.AdamOptimizer(self.params['learning_rate'])
                # Gradient Clipping
                gradients = optimizer.compute_gradients(cost)
                capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
                train_op = optimizer.apply_gradients(capped_gradients)
            self.predictions = inference_logits
            self.cost = cost
            self.train_op = train_op
        self.graph = train_graph
        print("Graph is built.")
        return train_graph

In [3]:
# Hyper-parameters consumed by Seq2seq (model size, decoding and optimizer settings).
params = dict(
    num_units=128,
    num_layers=3,
    vocab_size=1000,
    embed_dim=300,
    cell_type='LSTM',
    attention='bahdanau',
    batch_size=32,
    bidirectional=True,
    beam_size=10,
    end_token='<EOS>',
    start_token='<GO>',
    learning_rate=0.001,
)

# Minimal vocabulary holding only the special tokens, numbered from 1.
word2int = {token: index for index, token in enumerate(('<GO>', '<EOS>', '<PAD>'), start=1)}

# Placeholder embedding table: every entry is 1.0, one row per vocabulary id.
embedding_shape = (params['vocab_size'], params['embed_dim'])
embedding_matrix = np.ones(embedding_shape, dtype=np.float32)

In [4]:
# Instantiate the model wrapper; this only stores the configuration.
seqob = Seq2seq(params=params, vocab_to_int=word2int, embeddings=embedding_matrix)

In [5]:
# Build the complete train + inference graph; progress messages are printed as each part is constructed.
graph = seqob.graph_construct()

Initialized Placeholders
Encoder Netwrok Constructed
Train Decoder Netwrok Constructed
Infer Decoder Netwrok Constructed
Graph is built.


In [6]:
# Initialise the variables once and dump the graph definition for TensorBoard.
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter("./logs/seq2seq_2/run1", sess.graph)
    # Close the writer so the graph event is flushed to disk instead of
    # sitting in the writer's queue until the kernel exits.
    writer.close()