In [1]:
import tensorflow as tf
import numpy as np
import copy

  from ._conv import register_converters as _register_converters


In [2]:
# --- Input files -----------------------------------------------------------
# All data files live under one directory; the trailing slash is part of the
# prefix so file names can be appended directly.
data_path = "data/sentiment_analysis/"
positive_file = "%s%s" % (data_path, "sst_pos_sentences_id.txt")
negative_file = "%s%s" % (data_path, "sst_neg_sentences_id.txt")
vocab = "%s%s" % (data_path, "sst_vocab.txt")

# --- Hyper-parameters ------------------------------------------------------
# Shared by the data loader and the model.
BATCH_SIZE = 12    # sentences per batch
MAX_LENGTH = 20    # tokens per sentence after padding

# Model-only settings.
VOCAB_SIZE = 5000  # rows in the embedding matrix / width of the logits
EMB_DIM = 64       # embedding vector size
HIDDEN_SIZE = 50   # LSTM hidden units

In [3]:
class Dataloader:
    """Load token-id sentences from text files and serve them in fixed-size batches.

    Each input line is a whitespace-separated list of integer token ids. Every
    sentence is truncated or padded with ``PAD_ID`` to exactly ``max_length``
    tokens and then turned into an (input, target) pair one token longer:

    * input  row = ``[START_ID] + tokens``
    * target row = ``tokens + [END_ID]``

    so both arrays returned by :meth:`next_batch` have shape
    ``(batch_size, max_length + 1)``.
    """

    PAD_ID = 0    # filler appended to short sentences
    START_ID = 1  # prepended to every input row (presumably the start-of-sequence id)
    END_ID = 2    # appended to every target row (presumably the end-of-sequence id)

    def __init__(self, batch_size, max_length):
        """Store the batching parameters; no data is read until create_batches().

        Args:
            batch_size: number of sentences per batch.
            max_length: number of tokens each sentence is truncated/padded to,
                not counting the extra start/end token.
        """
        self.batch_size = batch_size
        self.max_length = max_length
        # Filled in by create_batches().
        self.x_token_stream = []
        self.y_token_stream = []
        self.x_sequence_batch = []
        self.y_sequence_batch = []
        self.num_batch = 0
        self.pointer = 0

    def create_batches(self, data_file_list):
        """Read the files and split their sentences into equally sized batches.

        Blank lines are skipped. Sentences that do not fill a final complete
        batch are dropped so that every batch has exactly ``batch_size`` rows.

        Args:
            data_file_list: list of paths to UTF-8 text files, one sentence of
                integer token ids per line.

        Raises:
            ValueError: if a token is not an integer, or if the files contain
                fewer than ``batch_size`` sentences in total.
        """
        self.x_token_stream = []
        self.y_token_stream = []
        self.pointer = 0
        print("load %s file data.." % ' '.join(data_file_list))
        for data_file in data_file_list:
            with open(data_file, 'r', encoding="utf-8") as f:
                for line in f:
                    tokens = [int(tok) for tok in line.split()]
                    if not tokens:
                        continue  # a blank line would become an all-padding sample
                    # Truncate first so over-long sentences cannot produce
                    # ragged rows; then pad short ones up to max_length.
                    tokens = tokens[:self.max_length]
                    tokens.extend([self.PAD_ID] * (self.max_length - len(tokens)))
                    self.x_token_stream.append([self.START_ID] + tokens)
                    self.y_token_stream.append(tokens + [self.END_ID])
        self.num_batch = len(self.x_token_stream) // self.batch_size
        if self.num_batch == 0:
            raise ValueError(
                "need at least %d sentences to form one batch, got %d"
                % (self.batch_size, len(self.x_token_stream)))
        # Keep only as many sentences as fit into whole batches.
        usable = self.num_batch * self.batch_size
        self.x_token_stream = self.x_token_stream[:usable]
        self.y_token_stream = self.y_token_stream[:usable]
        self.x_sequence_batch = np.split(np.array(self.x_token_stream), self.num_batch, 0)
        self.y_sequence_batch = np.split(np.array(self.y_token_stream), self.num_batch, 0)
        print("      Load %d * %d batches" % (self.num_batch, self.batch_size))

    def next_batch(self):
        """Return the next (input, target) batch, cycling back to the first at the end.

        Returns:
            Tuple ``(x, y)`` of integer arrays, each of shape
            ``(batch_size, max_length + 1)``.

        Raises:
            RuntimeError: if create_batches() has not loaded any batch yet.
        """
        if not self.num_batch:
            raise RuntimeError("create_batches() must be called before next_batch()")
        x_ret = self.x_sequence_batch[self.pointer]
        y_ret = self.y_sequence_batch[self.pointer]
        # Wrap at num_batch so every loaded batch is visited and the modulus
        # can never be zero or negative, however few batches were loaded.
        self.pointer = (self.pointer + 1) % self.num_batch
        return x_ret, y_ret

In [4]:
class Generator:
    """LSTM language model over token ids, built as a TensorFlow 1.x graph.

    Constructing an instance adds the whole model to the default graph:
    embedding lookup -> dropout-wrapped LSTM -> dense projection to vocabulary
    logits, followed by a mean softmax cross-entropy loss and an Adam training
    op with global-norm gradient clipping.

    Attributes set by the constructor (graph tensors/ops):
        x, y: int32 placeholders of shape (batch_size, max_length) for the
            input and target token ids.
        embedding: trainable (vocab_size, emb_dim) embedding matrix.
        inputs: embedded ``x``, shape (batch_size, max_length, emb_dim).
        outputs: LSTM outputs flattened to (batch_size * max_length, hidden_size).
        states: final LSTM state returned by ``tf.nn.dynamic_rnn``.
        logits, softmax: per-position vocabulary scores / probabilities,
            shape (batch_size * max_length, vocab_size).
        label: one-hot encoding of the flattened ``y``.
        loss: scalar mean cross-entropy over all positions.
        train: op applying one clipped Adam update.
    """

    def __init__(self, batch_size, hidden_size, emb_dim, vocab_size, max_length, learning_rate=0.01,
                 output_keep_prob=0.6):
        """Build the model graph.

        Args:
            batch_size: number of sequences per batch (fixed in the placeholders).
            hidden_size: number of LSTM units.
            emb_dim: size of each token embedding.
            vocab_size: number of distinct token ids.
            max_length: number of time steps in each fed sequence.
            learning_rate: learning rate passed to the Adam optimizer.
            output_keep_prob: keep probability for dropout on the LSTM outputs.
                It is a plain Python value baked into the graph, so dropout is
                applied every time the graph is run.
        """
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.output_keep_prob = output_keep_prob  # to prevent overfit
        # NOTE: the scope names below (including their spelling) are part of
        # the graph's variable/op names, so they are deliberately left as-is.
        with tf.variable_scope("placeholder"):
            self.x = tf.placeholder(shape=[self.batch_size, self.max_length], dtype=tf.int32, name="inputs")
            self.y = tf.placeholder(shape=[self.batch_size, self.max_length], dtype=tf.int32, name="targets")
        with tf.variable_scope('embeeding'):
            self.embedding = tf.Variable(tf.random.uniform([self.vocab_size, self.emb_dim], -1, 1), name="embedding")
            # (batch_size, max_length) ids -> (batch_size, max_length, emb_dim) vectors
            self.inputs = tf.nn.embedding_lookup(self.embedding, self.x)
        with tf.variable_scope("rnn"):
            self.lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_size, state_is_tuple = False, name="lstm_cell")
            self.lstm_cell = tf.contrib.rnn.DropoutWrapper(self.lstm_cell, output_keep_prob=self.output_keep_prob)
        with tf.variable_scope("ouput"):
            # outputs: (batch_size, max_length, hidden_size)
            self.outputs, self.states = tf.nn.dynamic_rnn(cell=self.lstm_cell, inputs=self.inputs, dtype=tf.float32)
            # Flatten batch and time so one dense layer scores every position:
            # (batch_size * max_length, hidden_size)
            self.outputs = tf.reshape(self.outputs, shape=[-1, self.hidden_size])
            # (batch_size * max_length, vocab_size)
            self.logits = tf.layers.dense(self.outputs, self.vocab_size, name="logits")
            self.softmax = tf.nn.softmax(self.logits, name="softmax")
        with tf.variable_scope("train"):
            # y is already int32, so it can be flattened and one-hot encoded directly.
            self.label = tf.one_hot(tf.reshape(self.y, [-1]), self.vocab_size, 1.0, 0.0)
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.label)
            self.loss = tf.reduce_mean(self.loss)
            # We clip the gradients to prevent explosion
            tvars = tf.trainable_variables()
            max_grad_norm = 5
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), max_grad_norm)
            gradients = list(zip(grads, tvars))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train = self.optimizer.apply_gradients(gradients)

In [None]:
if __name__ == "__main__":
    NUM_TRAIN_STEPS = 1000  # number of mini-batch updates to run

    # The loader adds one extra token to every sequence (a leading marker on
    # the inputs, a trailing marker on the targets), hence MAX_LENGTH + 1.
    generator = Generator(BATCH_SIZE, HIDDEN_SIZE, EMB_DIM, VOCAB_SIZE, MAX_LENGTH + 1)
    dataloader = Dataloader(BATCH_SIZE, MAX_LENGTH)
    dataloader.create_batches([positive_file])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Batches are fetched only inside the loop so that training starts
        # with the first batch instead of silently discarding it.
        for step in range(NUM_TRAIN_STEPS):
            x, y = dataloader.next_batch()
            loss, _ = sess.run([generator.loss, generator.train], feed_dict={generator.x:x, generator.y:y})
            print(loss)