In [1]:
import tensorflow as tf
import numpy as np
import copy

  from ._conv import register_converters as _register_converters


In [2]:
# --- Data and output locations ------------------------------------------
# All paths are relative to the notebook's working directory.
data_path = "data/sentiment_analysis/"
positive_file = data_path + "sst_pos_sentences_id.txt"
negative_file = data_path + "sst_neg_sentences_id.txt"
vocab_file = data_path + "sst_vocab.txt"
save_path = "model/save/implementation_3/model"  # checkpoint prefix passed to the Saver
graph_path = "./graphs/implementation_3_graph"   # directory for the graph summary

# --- Hyper-parameters ----------------------------------------------------
BATCH_SIZE = 64
HIDDEN_SIZE = 64
EMB_DIM = 16
# NOTE: not referenced by the code visible in this notebook; the vocabulary
# size actually given to the model is the one returned by load_emb_data.
VOCAB_SIZE = 5000
MAX_LENGTH = 20
# Despite the name this is the number of training steps (one batch per step),
# not the number of passes over the data set.
EPOCH_NUM = 10000

In [3]:
def load_emb_data(emb_dict_file):
    """Load a vocabulary file that holds one token per line.

    The id of a token is its zero-based line number.

    Args:
        emb_dict_file: path of a UTF-8 text file, one token per line.

    Returns:
        A tuple ``(word_dict, length, word_list)``:

        * ``word_dict`` maps token -> id. If a token appears on several
          lines, the id of its last occurrence is kept.
        * ``length`` is the number of ids handed out (number of lines read),
          so every id in ``word_dict`` is strictly smaller than ``length``.
        * ``word_list`` maps id -> token.
    """
    word_dict = {}
    word_list = []
    with open(emb_dict_file, 'r', encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for item, line in enumerate(f):
            word = line.strip()
            word_dict[word] = item
            word_list.append(word)
    # Count ids, not distinct tokens: with repeated lines len(word_dict) would
    # be smaller than the largest id, and callers size lookup tables with it.
    length = len(word_list)
    print("Load embedding success! Num: %d" % length)
    return word_dict, length, word_list

In [4]:
class Dataloader:
    """Serve id-encoded sentences as fixed-size (input, target) batches.

    Each data file holds one sentence per line, written as space-separated
    integer token ids.

    Args:
        batch_size: number of sentences per batch.
        max_length: every sentence is padded with ``<PAD>`` or truncated to
            exactly this many ids.
        vocab_dict: token -> id mapping; must contain ``'<PAD>'``, ``'<GO>'``
            and ``'<EOS>'``.
        reserved_batches: number of trailing batches that ``next_batch`` never
            returns. Defaults to 5, the value that used to be hard-coded
            (presumably a held-out split -- confirm with the author).

    Raises:
        ValueError: if ``reserved_batches`` is negative.
    """

    def __init__(self, batch_size, max_length, vocab_dict, reserved_batches=5):
        if reserved_batches < 0:
            raise ValueError("reserved_batches must be >= 0, got %d" % reserved_batches)
        self.batch_size = batch_size
        self.max_length = max_length
        self.vocab_dict = vocab_dict
        self.reserved_batches = reserved_batches

    def create_batches(self, data_file_list):
        """Read the data files and split the sentences into batches.

        Sets ``self.token_stream`` (list of padded id lists),
        ``self.num_batch``, ``self.sequence_batch`` (list of
        ``[batch_size, max_length]`` arrays) and resets ``self.pointer``.

        Args:
            data_file_list: list of paths of UTF-8 text files to read.

        Raises:
            ValueError: if the files hold fewer than ``batch_size`` sentences.
        """
        self.token_stream = []
        print("load %s file data.." % ' '.join(data_file_list))
        pad_id = self.vocab_dict['<PAD>']
        for data_file in data_file_list:
            with open(data_file, 'r', encoding="utf-8") as f:
                for line in f:
                    # Truncate over-long sentences so that every row has the
                    # same length; ragged rows cannot be stacked into a batch.
                    parse_line = [int(x) for x in line.split()][:self.max_length]
                    parse_line.extend([pad_id] * (self.max_length - len(parse_line)))  # padding
                    self.token_stream.append(parse_line)
        self.num_batch = len(self.token_stream) // self.batch_size
        if self.num_batch == 0:
            raise ValueError(
                "Need at least %d sentences to build one batch, got %d"
                % (self.batch_size, len(self.token_stream)))
        # Drop the tail so the stream length is exactly num_batch * batch_size.
        self.token_stream = self.token_stream[:self.num_batch * self.batch_size]
        self.sequence_batch = np.split(np.array(self.token_stream), self.num_batch, 0)
        self.pointer = 0
        print("      Load %d * %d batches" % (self.num_batch, self.batch_size))

    def next_batch(self):
        """Return the next ``(x, y)`` pair and advance the batch pointer.

        ``x`` is the batch with a ``<GO>`` column prepended and ``y`` is the
        batch with an ``<EOS>`` column appended, so both have shape
        ``[batch_size, max_length + 1]`` and ``y`` is ``x`` shifted by one.
        NOTE: ``<EOS>`` is placed after the padding, not directly after the
        last real token of each sentence.

        The pointer cycles over the first ``num_batch - reserved_batches``
        batches. If that leaves nothing to cycle over, all batches are used.
        """
        ret = self.sequence_batch[self.pointer]
        cycle_length = self.num_batch - self.reserved_batches
        if cycle_length <= 0:
            # A zero modulus would raise and a negative one would yield
            # negative pointers; fall back to cycling over every batch.
            cycle_length = self.num_batch
        self.pointer = (self.pointer + 1) % cycle_length
        x = np.column_stack((np.tile(self.vocab_dict['<GO>'], self.batch_size), ret))
        y = np.column_stack((ret, np.tile(self.vocab_dict['<EOS>'], self.batch_size)))
        return x, y

In [5]:
class Generator:
    """LSTM language model built as a TensorFlow 1.x graph.

    The graph has two paths that share the same cell and output projection:

    * a training path fed through ``self.x`` / ``self.y`` that produces
      ``self.loss`` and the update op ``self.train``;
    * a single-step inference path fed through ``self.token`` and
      ``self.next_state`` that produces ``self.prediction`` and
      ``self.infer_states``.

    Args:
        batch_size: number of sequences per training batch.
        hidden_size: number of units of each LSTM layer.
        emb_dim: dimensionality of the token embeddings.
        vocab_size: number of rows of the embedding table and of output classes.
        max_length: number of time steps of ``x`` and ``y``.
        vocab_dice: unused by this class; kept so existing call sites keep working.
        learning_rate: learning rate of the Adam optimizer.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, batch_size, hidden_size, emb_dim, vocab_size, max_length, vocab_dice, learning_rate=0.01, num_layers = 1):
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.num_layers = num_layers
        self.output_keep_prob = 0.6 #to prevent overfit
        # The cells are built with state_is_tuple=False, so the state of the
        # stacked cell is one flat row: 2 * hidden_size columns per layer.
        state_width = 2 * self.hidden_size * self.num_layers
        with tf.variable_scope("placeholder"):
            self.x = tf.placeholder(shape=[self.batch_size, self.max_length], dtype=tf.int32, name="inputs")#train_input
            self.y = tf.placeholder(shape=[self.batch_size, self.max_length], dtype=tf.int32, name="targets")#train_label
            self.token = tf.placeholder(shape=[None], dtype=tf.int32, name="token") #prediction
            self.next_state = tf.placeholder(shape=[None, state_width], dtype=tf.float32, name="next_state")
            # Dropout keep probability. Defaults to output_keep_prob so training
            # is unchanged; feed 1.0 when running the inference ops to switch
            # dropout off.
            self.keep_prob = tf.placeholder_with_default(self.output_keep_prob, shape=[], name="keep_prob")
        # NOTE: the scope name is misspelled, but it is part of the variable
        # names stored in checkpoints, so it must not be "fixed" casually.
        with tf.variable_scope('embeeding'):
            self.embedding = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_dim], -1, 1), name="embedding")
            self.inputs = tf.nn.embedding_lookup(self.embedding, self.x)# [batch_size, max_length, emb_dim]
            self.token_input = tf.nn.embedding_lookup(self.embedding, self.token)# [num_tokens, emb_dim]
        with tf.variable_scope("rnn"):
            def get_lstm_cell(hidden_size):
                """Build one LSTM layer with dropout on its outputs."""
                lstm_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size, state_is_tuple = False, name="lstm_cell")
                lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob)
                return lstm_cell
            self.lstm_cell = tf.nn.rnn_cell.MultiRNNCell([get_lstm_cell(self.hidden_size) for _ in range(self.num_layers)],state_is_tuple=False) #multi layer
        with tf.variable_scope("output"):
            self.outputs, self.states = tf.nn.dynamic_rnn(cell=self.lstm_cell, inputs=self.inputs, dtype=tf.float32)#outputs [batch_size,max_length,hidden_size]
            self.outputs = tf.reshape(self.outputs, shape=[-1, self.hidden_size])#[batch_size*max_length, hidden_size]
            self.output_weights = tf.get_variable(name="weights", shape=[self.hidden_size, self.vocab_size], initializer=tf.random_normal_initializer())
            # Create variable named "biases".
            self.output_biases = tf.get_variable(name="biases", shape=[self.vocab_size], initializer=tf.constant_initializer(0.0))
            self.logits = tf.matmul(self.outputs, self.output_weights) + self.output_biases#[batch_size*max_length, vocab_size]
        with tf.variable_scope("train"):
            # y is already int32, so it can be one-hot encoded directly.
            self.label = tf.one_hot(tf.reshape(self.y, [-1]), self.vocab_size, 1.0, 0.0)
            # The _v2 op replaces the deprecated one; stop_gradient keeps the
            # old semantics of never back-propagating into the labels.
            self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=tf.stop_gradient(self.label))
            # Mean over every position of every sequence (padding included).
            self.loss = tf.reduce_mean(self.loss)
            # We clip the gradients to prevent explosion
            tvars = tf.trainable_variables()
            max_grad_norm = 5
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), max_grad_norm)
            gradients = list(zip(grads, tvars))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train = self.optimizer.apply_gradients(gradients)
        with tf.variable_scope("inference"):
            # Single-step decoding: one token and the previous state in, the
            # arg-max token id and the next state out.
            self.initial_state = tf.identity(self.lstm_cell.zero_state(1, dtype=tf.float32), name="initial_state")
            self.infer_outputs, self.infer_states = self.lstm_cell.call(inputs=self.token_input, state=self.next_state)
            self.infer_states = tf.identity(self.infer_states, name="states")
            self.infer_logits = tf.matmul(self.infer_outputs, self.output_weights) + self.output_biases
            self.infer_softmax = tf.nn.softmax(self.infer_logits, name="softmax")
            self.prediction = tf.argmax(self.infer_softmax, axis=1, name="prediction")

In [None]:
if __name__ == "__main__":
    # Train the generator on the positive sentences and checkpoint it
    # periodically.
    import os

    vocab_dict, vocab_size, word_list = load_emb_data(vocab_file)
    # The loader adds one token (<GO> / <EOS>) to every sequence, hence the
    # model is built for MAX_LENGTH + 1 time steps.
    generator = Generator(BATCH_SIZE, HIDDEN_SIZE, EMB_DIM, vocab_size, MAX_LENGTH+1,vocab_dict)
    dataloader = Dataloader(BATCH_SIZE, MAX_LENGTH, vocab_dict)
    dataloader.create_batches([positive_file])
    saver = tf.train.Saver(max_to_keep=1)
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Dump the graph definition once so it can be inspected in TensorBoard.
        writer = tf.summary.FileWriter(graph_path, sess.graph)
        writer.close()
        # EPOCH_NUM counts training steps; each step consumes one batch.
        for i in range(EPOCH_NUM):
            x, y = dataloader.next_batch()
            loss, _ = sess.run([generator.loss, generator.train], feed_dict={generator.x:x, generator.y:y})
            # Checkpoint every 100 steps and also after the very last step, so
            # the final weights are never lost.
            if i % 100 == 0 or i == EPOCH_NUM - 1:
                saver.save(sess, save_path, global_step=i, write_meta_graph=True)
                print(i, loss)
        

Load embedding success! Num: 4734
Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

load data/sentiment_analysis/sst_pos_sentences_id.txt file data..
      Load 33 * 64 batches
0 9.071622
100 2.439889
200 2.8991222
300 2.1354482
400 2.4451084
500 2.0592806
600 1.9721216
700 1.5632784
800 1.383038
900 1.2891471
1000 1.0794462
1100 1.2707294
1200 1.1351669
1300 1.1745154
1400 0.96032315
1500 1.0961546
1600 0.9556165
1700 0.8583081
1800 1.0001655
1900 0.9101052
2000 0.98084795
2100 0.7820507
2200 0.9451951
2300 0.7967659
2400 0.74684983
2500 0.8375371
2600 0.7995469
2700 0.7901979
2800 0.7404907
2900 0.8742024
3000 0.69954