In [1]:
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
class Dataloader():
    """Load tokenised sentences from two text files and serve fixed-size batches.

    Each input line is a whitespace-separated sequence of integer token ids.
    Sequences are right-padded with 0 up to ``max_length``; sequences longer
    than ``max_length`` are dropped. Examples from the positive file are
    labelled ``[0, 1]`` and examples from the negative file ``[1, 0]``.
    """

    # Number of trailing batches that next_batch() never serves, so that the
    # batch returned by test_batch() (the last one) stays unseen in training.
    NUM_HELD_OUT_BATCHES = 5

    def __init__(self, batch_size, max_length = 100):
        """
        Args:
            batch_size: number of examples per batch.
            max_length: padded sequence length; longer sequences are discarded.
        """
        self.batch_size = batch_size
        self.max_length = max_length
        self.sentences = np.array([])
        self.labels = np.array([])
        # Batch state is initialised here so the attributes always exist,
        # even before load_train_data() has been called.
        self.num_batch = 0
        self.sentences_batches = []
        self.labels_batches = []
        self.pointer = 0

    def _read_examples(self, path):
        """Return the padded token-id lists parsed from ``path``."""
        examples = []
        with open(path) as fin:
            for line in fin:
                token_ids = [int(token) for token in line.split()]
                # Blank lines would otherwise turn into all-padding examples,
                # and over-long lines cannot be padded to max_length.
                if not token_ids or len(token_ids) > self.max_length:
                    continue
                token_ids.extend([0] * (self.max_length - len(token_ids)))
                examples.append(token_ids)
        return examples

    def load_train_data(self, positive_file, negative_file):
        """Read both files, shuffle the examples and split them into batches.

        Examples that do not fill a whole batch are discarded.

        Raises:
            ValueError: if ``batch_size`` is not positive or the files hold
                fewer usable examples than one batch.
        """
        if self.batch_size <= 0:
            raise ValueError("batch_size must be positive, got %r" % (self.batch_size,))

        # Load data
        positive_examples = self._read_examples(positive_file)
        negative_examples = self._read_examples(negative_file)
        num_examples = len(positive_examples) + len(negative_examples)
        num_batch = num_examples // self.batch_size
        if num_batch == 0:
            raise ValueError(
                "need at least batch_size=%d usable examples, found %d"
                % (self.batch_size, num_examples))
        sentences = np.array(positive_examples + negative_examples)

        # Generate one-hot labels. The Python lists are joined before the
        # conversion so that a class with no examples does not break the
        # array construction.
        labels = np.array(
            [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples))

        # Shuffle the data
        shuffle_indices = np.random.permutation(np.arange(len(labels)))
        sentences = sentences[shuffle_indices]
        labels = labels[shuffle_indices]

        # Split batches (state is only published once everything succeeded)
        usable = num_batch * self.batch_size
        self.num_batch = num_batch
        self.sentences = sentences[:usable]
        self.labels = labels[:usable]
        self.sentences_batches = np.split(self.sentences, self.num_batch, 0)
        self.labels_batches = np.split(self.labels, self.num_batch, 0)

        self.pointer = 0

    def _require_loaded(self):
        """Raise a clear error when batches are requested before loading."""
        if self.num_batch == 0:
            raise RuntimeError("load_train_data() must be called before requesting batches")

    def next_batch(self):
        """Return the next ``(sentences, labels)`` training batch, cycling forever.

        The last NUM_HELD_OUT_BATCHES batches are skipped. When the data set
        has no more batches than that, only the first batch is served, so the
        pointer can never become negative or hit a modulo-by-zero.
        """
        self._require_loaded()
        ret = self.sentences_batches[self.pointer], self.labels_batches[self.pointer]
        num_train_batches = max(1, self.num_batch - self.NUM_HELD_OUT_BATCHES)
        self.pointer = (self.pointer + 1) % num_train_batches
        return ret

    def test_batch(self):#Preserve part of dataset for testing
        """Return the last ``(sentences, labels)`` batch, reserved for testing."""
        self._require_loaded()
        ret = self.sentences_batches[self.num_batch - 1], self.labels_batches[self.num_batch - 1]
        return ret

    def reset_pointer(self):
        """Restart the training-batch cycle at the first batch."""
        self.pointer = 0

In [3]:
class Detection:
    """Two-class sequence classifier: embedding -> bidirectional LSTM -> dense.

    The graph is built in the default TensorFlow graph at construction time.
    Per-direction LSTM outputs are averaged over time, the two directions are
    summed, and a dense layer maps the result to ``output_dim`` logits.
    """

    def __init__(self, sequence_length, batch_size, vocab_size, emb_dim, hidden_dim = 128):
        """
        Args:
            sequence_length: number of token ids per input sequence.
            batch_size: fixed batch size baked into the placeholders.
            vocab_size: number of rows of the embedding matrix.
            emb_dim: embedding dimension.
            hidden_dim: LSTM hidden size (per direction).
        """
        self.num_emb = vocab_size  # vocab size
        self.batch_size = batch_size  # batch size
        self.emb_dim = emb_dim  # dimension of embedding
        self.hidden_dim = hidden_dim  # hidden size
        self.sequence_length = sequence_length  # sequence length
        self.output_dim = 2
        with tf.variable_scope('embedding'):
            self.g_embeddings = tf.Variable(tf.random_uniform([self.num_emb, self.emb_dim], -1.0, 1.0), name="W_text")
        self.x = tf.placeholder(shape=[self.batch_size, self.sequence_length], dtype=tf.int32)
        self.inputs = tf.nn.embedding_lookup(self.g_embeddings, self.x)  # batch_size x seq_length x emb_dim
        self.targets = tf.placeholder(shape=[self.batch_size, self.output_dim], dtype=tf.int64)

        # Dropout keep probability used while training (to prevent overfit).
        self.output_keep_prob = 0.7
        # The graph reads the keep probability from this placeholder, which
        # defaults to 1.0 (no dropout). Only train() feeds output_keep_prob,
        # so predict() and get_accuracy() are evaluated without dropout noise.
        self.keep_prob = tf.placeholder_with_default(1.0, shape=[], name="output_keep_prob")

        cell_fw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=False)  # single lstm unit
        cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, output_keep_prob=self.keep_prob)
        cell_bw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=False)  # single lstm unit
        cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, output_keep_prob=self.keep_prob)
        # bidirectional_dynamic_rnn takes (cell_fw, cell_bw, inputs) and returns
        # an (output_fw, output_bw) pair, each batch_size x seq_length x hidden_dim.
        (outputs_fw, outputs_bw), self.states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, self.inputs, dtype=tf.float32)
        # Mean over the time axis for each direction, then sum the directions:
        # batch_size x hidden_dim.
        self.outputs = tf.reduce_mean(outputs_fw, 1) + tf.reduce_mean(outputs_bw, 1)
        self.logits = tf.layers.dense(self.outputs, self.output_dim, name="logits")
        self.prob = tf.nn.softmax(self.logits, name="softmax_output")

        # Per-example boolean: predicted class equals target class.
        self.accuracy = tf.equal(tf.argmax(self.targets, axis=1), tf.argmax(self.prob, axis=1))
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.targets, logits=self.logits))
        tvars = tf.trainable_variables()
        max_grad_norm = 5
        # We clip the gradients to prevent explosion
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), max_grad_norm)
        gradients = list(zip(grads, tvars))
        self.train_op = tf.train.AdamOptimizer(0.005).apply_gradients(gradients)

    def train(self, sess, x_batch, y_batch):
        """Run one optimisation step with dropout enabled and return the batch loss."""
        feed_dict = {
            self.x: x_batch,
            self.targets: y_batch,
            self.keep_prob: self.output_keep_prob,
        }
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        return loss

    def predict(self, sess, x_batch):
        """Return a one-element list holding the softmax probabilities for ``x_batch``.

        Dropout is disabled because the keep-probability placeholder is not fed.
        """
        prob = sess.run([self.prob], feed_dict={self.x:x_batch})
        return prob

    def get_accuracy(self, sess, x_batch, y_batch):
        """Return the fraction of ``x_batch`` classified correctly, without dropout."""
        accuracy = sess.run([self.accuracy], feed_dict={self.x: x_batch, self.targets: y_batch})
        return (accuracy[0].tolist().count(True) / len(x_batch))

In [4]:
# --- Model / training hyper-parameters ---
BATCH_SIZE = 48
SEQ_LENGTH = 100
EMB_DIM = 32  # embedding dimension
HIDDEN_DIM = 64  # hidden state dimension of lstm cell
EPOCH_NUM = 1000

# --- Input files, all located under data_path ---
data_path = "data/sentiment_analysis/"
emb_dict_file, positive_file, negative_file = (
    data_path + file_name
    for file_name in (
        "sst_vocab.txt",
        "sst_pos_sentences_id.txt",
        "sst_neg_sentences_id.txt",
    )
)

In [5]:
def load_emb_data(emb_dict_file):
    """Read a vocabulary file that lists one word per line.

    Args:
        emb_dict_file: path of a UTF-8 text file; surrounding whitespace of
            every line is stripped.

    Returns:
        A ``(word_dict, length, word_list)`` tuple: ``word_list`` holds the
        words in file order, ``word_dict`` maps each word to its line index
        (the last occurrence wins for repeated words) and ``length`` is the
        number of entries in ``word_dict``.
    """
    with open(emb_dict_file, 'r', encoding="utf-8") as vocab_file:
        word_list = [raw_line.strip() for raw_line in vocab_file]
    # Later duplicates overwrite earlier indices, keys keep first-seen order.
    word_dict = {word: index for index, word in enumerate(word_list)}
    length = len(word_dict)
    print("Load embedding success! Num: %d" % length)
    return word_dict, length, word_list

In [6]:
if __name__ == "__main__":
    # Vocabulary, batched data and model graph.
    vocab_dict, vocab_size, vocab_list = load_emb_data(emb_dict_file)
    dis_data_loader = Dataloader(BATCH_SIZE, SEQ_LENGTH)
    dis_data_loader.load_train_data(positive_file, negative_file)
    detection = Detection(SEQ_LENGTH, BATCH_SIZE, vocab_size, EMB_DIM, HIDDEN_DIM)

    # One fixed batch is used for every accuracy report below.
    test_x_batch, test_y_batch = dis_data_loader.test_batch()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Each iteration trains on a single batch, EPOCH_NUM iterations in total.
        for i in range(EPOCH_NUM):
            x_batch, y_batch = dis_data_loader.next_batch()
            loss = detection.train(sess, x_batch, y_batch)
            # Only every 20th iteration is evaluated and logged.
            if i % 20 != 0:
                continue
            accuracy = detection.get_accuracy(sess, test_x_batch, test_y_batch)
            print("%d, loss:%f, accuracy:%f" % (i, loss, accuracy))
        # Dump the graph definition once training has finished.
        writer = tf.summary.FileWriter("graphs/implementation2_graph", sess.graph)
        writer.close()

Load embedding success! Num: 4734
Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

0, loss:0.681424, accuracy:0.479167
20, loss:0.707668, accuracy:0.520833
40, loss:0.680380, accuracy:0.625000
60, loss:0.548028, accuracy:0.854167
80, loss:0.649473, accuracy:0.833333
100, loss:0.581155, accuracy:0.875000
120, loss:0.248631, accuracy:0.916667
140, loss:0.137719, accuracy:0.937500
160, loss:0.343888, accuracy:0.937500
180, loss:0.343734, accuracy:0.937500
200, loss:0.287875, accuracy:0.958333
220, loss:0.292525, accuracy:0.958333
240, loss:0.259251, accuracy:0.958333
260, loss:0.396529, accuracy:0.937500
280, loss:0.27060