In [1]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import pickle as pkl
import tensorflow as tf
import numpy as np
import pandas as pd
import itertools
from random import random

In [2]:
'''
This methods accepts:
    data - a list of words
    labels - a list of tags (as #s) corresponding to data (defaults to none for test data)
    tag_delim - the tag corresponding to -DOCSTART- that we split the tags on
    word_delim - the word to split the sentences on
Returns:
    sentences: a list of lists. Each sublist holds the words of a given sentence
    sentence_labels: a list of lists of the corresponding tags (as #s) to the sentences

NOTE: This is called within read_data
'''

def get_sentences(data_file, word_delim = '-DOCSTART-'):
    data = pd.read_csv(data_file)
    del data['id']
    sentences = []
    for x, y in itertools.groupby(list(data['word']), lambda z: z == word_delim):
        if x: sentences.append([])
        sentences[-1].extend(y)
    return sentences

def get_labels(label_file):
    labels = pd.read_csv(label_file)
    del labels['id']
    # convert labels to numbers and store the conversion from # back to tag in a dictionary tag_list
    labels['tag'] = labels['tag'].astype('category')
    tag_list = list(labels['tag'].cat.categories)
    tag_delim = tag_list.index('O') # get # corresponding to tag 'O'
    labels = np.array(labels['tag'].cat.codes)
    sentence_labels = []
    for x, y in itertools.groupby(labels, lambda z: z == tag_delim):
        if x:
            sentence_labels.append([])
        sentence_labels[-1].extend(y)
    return sentence_labels, tag_list
'''
This method accepts:
    data_file and label_file (optional) - file names for words and corresponding tags
Returns:
    sentences - a list of sentences, where each sentence is represented as a list of words and begins with -DOCSTART-
    sentences_tags - a list of lists of tags corresponding to the sentences, where tags are represented as integers
    tag_list - a list of the unique tags. The index of each tag is what we replace all tags with.
            Later, we will use this list to convert number tags back to actual tags:
            tags = [tag_list[x] for x in tags]

'''
def read_data(data_file, label_file = None):
    if label_file is None:
        return get_sentences(data_file)
    else:
        sentences, (sentences_tags, tag_list) = get_sentences(data_file), get_labels(label_file)
        return sentences, sentences_tags, tag_list

In [3]:
class Params(object):
    pass

In [4]:
class RandomVec:
    def __init__(self, dim):
        self.dim = dim
        self.vocab = {}
        self.vec = []

    def __getitem__(self, word):
        ind = self.vocab.get(word, -1)
        if ind == -1:
            new_vec = np.array([random() for i in range(self.dim)])
            self.vocab[word] = len(self.vocab)
            self.vec.append(new_vec)
            return new_vec
        else:
            return self.vec[ind]

In [5]:
class WordVec:
    def __init__(self, args):
        print('processing corpus')
        if args.restore is None:
            sentences =  read_data(args.corpus)
            print('training')
            self.wvec_model = Word2Vec(sentences=sentences, size=args.dimension, window=args.window,
                                       workers=args.workers,
                                       sg=args.sg,
                                       batch_words=args.batch_size, min_count=1, max_vocab_size=args.vocab_size)
            self.wvec_model.save('wordvec_model_train_' + str(args.dimension) + '.pkl')
        else:
            #self.wvec_model = KeyedVectors.load_word2vec_format(args.restore, binary=True)
            print('loading model')
            self.wvec_model = Word2Vec.load(args.restore)
        self.rand_model = RandomVec(args.dimension)

    def __getitem__(self, word):
        #word = word.lower()
        try:
            return self.wvec_model[word]
        except KeyError:
            #print("Don't found!")
            return self.rand_model[word]

In [6]:
word_dim = 100

In [7]:
args = Params()
args.corpus = 'data/train_x.csv'
args.dimension = word_dim
args.window = 5
args.vocab_size = 10000
args.workers = 3
args.sg = 1
args.batch_size = 10000
args.restore = None
w2vmodel = WordVec(args)

processing corpus
training


In [8]:
'Load train and dev data'
train_x, train_y, tag_list = read_data('data/train_x.csv', 'data/train_y.csv')
#dev_x, dev_y, tag_list = read_data('data/dev_x.csv', 'data/dev_y.csv')

In [9]:
#test_x = read_data('data/test_x.csv')

In [10]:
def embed_dataset(dataset, max_sentence_length=5000):
    output = np.zeros((len(dataset), max_sentence_length, word_dim))
    count = 0
    length = np.zeros(len(dataset))
    for sentence in dataset:
        size = min(len(sentence), max_sentence_length)
        output[count,:size] = [w2vmodel[i] for i in sentence[:size]]
        if (max_sentence_length > size):
            output[count,size:max_sentence_length] = [[0] * word_dim] * (max_sentence_length - size)
        length[count] = len(sentence)
        count = count + 1
    return output, length

In [11]:
def embed_labels(labels, max_sentence_length=5000):
    output = np.zeros((len(labels), max_sentence_length))
    count = 0
    for sentence in labels:
        size = min(len(sentence), max_sentence_length)
        output[count,:size] = sentence[:size]
        if (max_sentence_length > size):
            output[count,size:max_sentence_length] = [0] * (max_sentence_length - size)
        count = count + 1
    return output

In [12]:
embed_train_x, length_train_x = embed_dataset(train_x, max_sentence_length=500)

In [13]:
embed_train_y = embed_labels(train_y, max_sentence_length=500)

In [14]:
embed_train_input, embed_test_input = embed_train_x[:1100], embed_train_x[1100:]
embed_train_output, embed_test_output = embed_train_y[:1100], embed_train_y[1100:]

In [15]:
embed_test_output = np.array(embed_test_output, dtype=np.int32)

In [16]:
#embed_test_x, length_test_x = embed_dataset(test_x, max_sentence_length=500)

In [17]:
class Model:
    def __init__(self, args):
        self.args = args
        # Define the input placeholders
        
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.input_data = tf.placeholder(tf.float32, [None, args.sentence_length, args.word_dim])
            self.output_data = tf.placeholder(tf.int32, [None, args.sentence_length])
            self.one_hot_output_data = tf.one_hot(self.output_data, args.class_size)
            
            if args1.cell_type == 1:
                # Define the forward cell
                fw_cell = tf.nn.rnn_cell.GRUCell(args.rnn_size, activation=tf.nn.tanh)
                # Define the backward cell
                bw_cell = tf.nn.rnn_cell.GRUCell(args.rnn_size, activation=tf.nn.tanh)
            else:
                # Define the forward cell
                fw_cell = tf.nn.rnn_cell.LSTMCell(args.rnn_size, state_is_tuple=True)
                # Define the backward cell
                bw_cell = tf.nn.rnn_cell.LSTMCell(args.rnn_size, state_is_tuple=True)
            
            # Add Dropout to the forward and backward cell
            if args1.dropout_layers == True:
                fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=0.5)
                bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=0.5)          
            
            # Add multilayers of the forward and backward layers
            if args1.num_layers > 1:            
                fw_cell = tf.nn.rnn_cell.MultiRNNCell([fw_cell] * args.num_layers, state_is_tuple=True)
                bw_cell = tf.nn.rnn_cell.MultiRNNCell([bw_cell] * args.num_layers, state_is_tuple=True)

            # Get the max sequence length for each sequence to adjust the size of the RNN and later on compute the score.
            words_used_in_sent = tf.sign(tf.reduce_max(tf.abs(self.input_data), reduction_indices=2))
            self.length = tf.cast(tf.reduce_sum(words_used_in_sent, reduction_indices=1), tf.int32)

            # Create the bidirectional RNN. The input has to be unpack the input into a list of tensors of 2D, corresponding
            # to each element of the batch thus, the permutation of dimensions.
            output, _, _ = tf.nn.bidirectional_rnn(fw_cell, bw_cell,
                                                   tf.unpack(tf.transpose(self.input_data, perm=[1, 0, 2])),
                                                   dtype=tf.float32, sequence_length=self.length)

            # Define the Weight and Bias of the FC layer for the prediction. The number of nodes is two times the
            # size of the RNN to use the forward and backward direction.
            weight, bias = self.weight_and_bias(2 * args.rnn_size, args.class_size)

            # First, we permute the output to the original order of dimensions, we stack them back together into a single tensor.
            # Finally we reshape it to concatenate the output of both directions of the Bi-RNN to operate on the last layer.
            output = tf.reshape(tf.transpose(tf.pack(output), perm=[1, 0, 2]), [-1, 2 * args.rnn_size])

            # FC calculating the prediction
            pred = tf.matmul(output, weight) + bias
            prediction = tf.nn.softmax(pred)

            # Reshape the prediction to the maximum length of the sentence and the number of classes
            self.prediction = tf.reshape(prediction, [-1, args.sentence_length, args.class_size])
            
            # Compute the cost excluding the padding elements
            self.loss = self.cost()
            
            # We use the optimizer recommended in the paper with the corresponding parameters
            optimizer = tf.train.AdamOptimizer(0.003)
            
            # We manually compute the gradients and clip them to avoid vanishing and exploding gradients.
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), 10)
            
            # We apply the computed gradients to the optimizer. 
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    def cost(self):
        cross_entropy = self.one_hot_output_data * tf.log(self.prediction)
        cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
        mask = tf.sign(tf.reduce_max(tf.abs(self.one_hot_output_data), reduction_indices=2))
        cross_entropy *= mask
        cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
        cross_entropy /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(cross_entropy)

    @staticmethod
    def weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)


In [18]:
args1 = Params()
args1.sentence_length = 500
args1.word_dim = word_dim
args1.class_size = len(tag_list)
args1.rnn_size = 10
args1.num_layers = 1
args1.batch_size = 110
args1.epoch = 51
args1.cell_type = 2 # 1 = GRU, 2 = LSTM
args1.dropout_layers = None
args1.restore = None
model = Model(args1)

In [None]:
def score(args, prediction, target, length):
    prediction = np.argmax(prediction, 2)
    acum = 0
    for i in range(len(target)):
        for j in range(length[i]):
            if target[i, j] == prediction[i, j]:
                acum += 1
    return (acum / float(np.sum(length)))

In [None]:
#train_inp, train_out = get_train_data()
train_inp, train_out = embed_train_input, embed_train_output
test_a_inp, test_a_out = embed_test_input, embed_test_output
#test_b_inp, test_b_out = get_test_b_data()
maximum = 0
with tf.Session(graph=model.graph) as sess:
    sess.run(tf.global_variables_initializer())
    print("Variables Initialized")
    saver = tf.train.Saver()
    if args1.restore is not None:
        saver.restore(sess, args1.restore)
        print("model restored")
    for e in range(args1.epoch):
        print("Epoch: " + str(e))
        for ptr in range(0, len(train_inp), args1.batch_size):
            feed_dict = {model.input_data: train_inp[ptr:ptr + args1.batch_size],
                                      model.output_data: train_out[ptr:ptr + args1.batch_size]}
            _, loss = sess.run([model.train_op, model.loss], feed_dict=feed_dict)
            if e % 10 == 0 and ptr == 0:
                #save_path = saver.save(sess, "model.ckpt")
                #print("model saved in file: %s" % save_path)
                print("Iter " + str(ptr) + ", Minibatch Loss= " + str(loss))
                pred, length = sess.run([model.prediction, model.length], {model.input_data: test_a_inp,
                                                                           model.output_data: test_a_out})
                print('test_a score:' + str(score(args1, pred, test_a_out, length)))

Variables Initialized
Epoch: 0
Iter 0, Minibatch Loss= 15.8843
test_a score:0.106987967885
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Iter 0, Minibatch Loss= 12.9405
test_a score:0.245657540645
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Iter 0, Minibatch Loss= 10.3745
test_a score:0.531514801059
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Iter 0, Minibatch Loss= 8.23315
test_a score:0.632186464426
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Iter 0, Minibatch Loss= 6.47642
test_a score:0.70976135934
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45


## Testset and output results

In [None]:
with tf.Session(graph=model.graph) as sess:
    sess.run(tf.global_variables_initializer())
    print("Variables Initialized")
    saver = tf.train.Saver()
    saver.restore(sess, "./model.ckpt")
    print("model restored")
    test_prediction, length = sess.run([model.prediction, model.length], {model.input_data: embed_test_x})


In [None]:
final_prediction = np.argmax(test_prediction, 2)

In [None]:
def generate_file(output, length, filename):
    f = open(filename,'w')
    f.write("id,tag\n")
    count = 0
    for key, sentence in enumerate(output):
        size = int(min(len(sentence), length[key]))
        for word in range(size):
            f.write(str(count) + ",\"" + tag_list[sentence[word]] + "\"\n")
            count += 1
        if len(sentence) < length[key]:
            for i in range(int(length[key])-len(sentence)):
                f.write(str(count) + ",\"" + tag_list[int(random() * len(tag_list))] + "\"\n")
                count += 1
    f.close()

In [None]:
generate_file(final_prediction, length_test_x, "output.csv")