### TensorFlow-based morphological tagger experiment for Latvian
Repository at https://github.com/PeterisP/tf-morphotagger  
Appendix to the paper "Deep neural learning approaches for Latvian morphological tagging" at Baltic HLT 2016 (http://hlt2016.tilde.eu/)

In [1]:
# Required libraries
import json, os, pickle, datetime, collections, itertools
import tensorflow as tf
import numpy as np
from gensim.models import Word2Vec
from tensorflow.contrib.learn.python.learn.preprocessing import CategoricalVocabulary

In [2]:
# Experiment parametrization
default_epochs = 20
development = False        # True: train on the training set and evaluate on the development set
remove_rare_words = True   # trim rarely seen wordforms from the wordform vocabulary
max_ngrams = 4             # longest word-final character n-gram used as a feature

# Switches selecting which input representations are built and fed to the network
input_features = dict(
    wordform_onehot=True,
    wordform_embeddings=True,
    analyzer_nhot=True,
    ngrams=True,
    wordshape=True,
)
# Switches selecting which outputs the network is trained to predict
output_features = dict(
    pos_onehot=True,
    tag_onehot=True,
    attribute_nhot=True,
)

# When training the final system, add the development set to the training set
train_data_filename, eval_data_filename = (
    ('data/train.json', 'data/dev.json') if development
    else ('data/train_dev.json', 'data/test.json')
)
embeddings_filename = 'embeddings/lvnews2.c0p0d0.shuf.txt.we216-200-ssg-w5-m10.20160516233643.bin'
# This embeddings file can be downloaded from https://dl.dropboxusercontent.com/u/9455117/lvnews2.c0p0d0.shuf.txt.we216-200-ssg-w5-m10.20160516233643.bin

# Names of the per-token fields in the JSON data
wordform_key = 'wordform'
wordform_original_key = 'wordform_original'
tag_key = 'gold_tag_simple'
attribute_key = 'gold_attributes'
pos_key = 'pos'
options_key = 'options'

unk = '_UNK_'              # token used for out-of-vocabulary items
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Vocabulary management - mapping between words and numeric identifiers/positions in vectors
# Currently implements 5 different vocabularies and can populate them from a document (training data)
class Vocabularies(object):
    """Mappings between symbols and their numeric identifiers / vector positions.

    Holds five vocabularies as attributes: `voc_wordforms`, `voc_tags`, `voc_pos`,
    `voc_ngrams` and `voc_attributes`.

    Args:
        document: optional document whose tokens are added to the vocabularies.
        folder: optional folder with vocabularies previously written by `dump()`;
            when not given, empty vocabularies are created.
    """

    # Attribute names of the vocabularies; each one is pickled to '<folder>/<name>.p'
    _VOCABULARY_NAMES = ('voc_wordforms', 'voc_tags', 'voc_pos', 'voc_ngrams', 'voc_attributes')

    def __init__(self, document=None, folder=None):
        if folder:
            # NOTE: unpickling can execute arbitrary code - only load folders you trust
            for name in self._VOCABULARY_NAMES:
                with open(os.path.join(folder, name + '.p'), 'rb') as f:
                    setattr(self, name, pickle.load(f))
        else:
            for name in self._VOCABULARY_NAMES:
                setattr(self, name, CategoricalVocabulary(unk))

        if document:
            self.add_document(document)

    def dump(self, folder):
        """Pickle every vocabulary into `folder` (which must already exist)."""
        for name in self._VOCABULARY_NAMES:
            with open(os.path.join(folder, name + '.p'), 'wb') as f:
                pickle.dump(getattr(self, name), f)

    def add_document(self, document):
        """Add all tokens of `document` to the vocabularies, then trim and freeze them."""
        # Unfreeze all five vocabularies (not only wordforms and tags), so that
        # vocabularies loaded from a folder can be extended consistently
        for name in self._VOCABULARY_NAMES:
            getattr(self, name).freeze(False)

        for sentence in document.sentences:
            for token in sentence:
                wordform = token[wordform_key]
                self.voc_wordforms.add(wordform)
                self.voc_tags.add(token[tag_key])
                self.voc_pos.add(token[pos_key])
                for option in token[options_key]:
                    # We assume that analyzer options have the same set of possible values as tags
                    # If that's not the case, modifications will be required
                    self.voc_tags.add(option)
                for i in range(1, max_ngrams+1):
                    self.voc_ngrams.add(wordform[-i:])  # word-final character n-grams
                for key, value in token[attribute_key].items():
                    self.voc_attributes.add('{}:{}'.format(key, value))

        # NOTE(review): the intent is to treat once-seen items as OOV; confirm the exact
        # frequency threshold that CategoricalVocabulary.trim(2) applies
        if remove_rare_words:
            self.voc_wordforms.trim(2)  # Remove rare wordforms
        self.voc_ngrams.trim(2)  # Ignore rare suffix ngrams

        for name in self._VOCABULARY_NAMES:
            getattr(self, name).freeze()

In [4]:
# A helper class for accuracy evaluations
class AccuracyCounter():
    """Running tally of correct and incorrect outcomes."""

    def __init__(self):
        # Counter keyed by the outcome: True for a match, False for a mismatch
        self.c = collections.Counter()

    def add_b(self, boolean):
        """Record one outcome that has already been decided."""
        self.c.update([boolean])

    def add(self, gold, silver):
        """Record whether the predicted (silver) value equals the gold value."""
        self.add_b(gold == silver)

    def average(self):
        """Return the share of True outcomes, or 0 when nothing has been recorded."""
        total = sum(self.c.values())
        return self.c[True] / total if total else 0

In [5]:
# A wrapper for JSON document used for input of training and test data. Includes also the evaluation measurement functionality
# The data needs to be sentence-split, tokenized and (for best results) with data from the lexicon/paradigm based morphological analyzer
# File format is generated by https://github.com/PeterisP/LVTagger/blob/master/src/main/java/lv/lumii/morphotagger/MorphoConverter.java
class Document(object):
    """A JSON document of sentences used as training or evaluation data.

    self.sentences - list of sentences, each sentence is a list of tokens, each token
    is a dict of fields with key names defined in the configuration cell above.
    """

    # Lexical properties that shouldn't be tagged; skipped when scoring attributes
    _IGNORED_ATTRIBUTES = ('Skaitlis 2', 'Locījums 2', 'Rekcija')

    def __init__(self, filename, limit=None):
        """Load the sentences from `filename`, keeping at most `limit` sentences if given."""
        # Explicit encoding: the data contains non-ASCII wordforms and must not
        # depend on the platform's default encoding
        with open(filename, 'r', encoding='utf-8') as f:
            self.sentences = json.load(f)
        if limit and limit < len(self.sentences):
            self.sentences = self.sentences[:limit]
        self._preprocess()

    @staticmethod
    def _simplify_wordform(word):
        """Return the normalized (lowercased) form of a word."""
        return word.lower()

    def _preprocess(self):
        """Validate every token and derive the original-wordform, wordform and POS fields.

        Raises:
            AssertionError: if a token has no gold tag or no wordform.
        """
        for sentence in self.sentences:
            for token in sentence:
                if not token.get(tag_key) or not token.get(wordform_key):
                    print(json.dumps(token))
                    # Raised explicitly so that the check also survives `python -O`
                    raise AssertionError('Token lacks a gold tag or a wordform')
                token[wordform_original_key] = token[wordform_key]
                token[wordform_key] = Document._simplify_wordform(token[wordform_key])
                token[pos_key] = token[tag_key][0]  # the POS is the first character of the tag

    # Outputs the tagged results and also calculates the accuracy metrics
    def output_tagged(self, silver_tags, silver_poses, silver_attributes, filename, evaluate = True, vocabularies = None):
        """Write the tagging results to `filename` and print accuracy metrics.

        Args:
            silver_tags: per sentence, the predicted tag of each token (or None).
            silver_poses: per sentence, the predicted POS of each token (or None).
            silver_attributes: per sentence, for each token a dict mapping
                'attribute:value' strings to confidences (or None).
            filename: output file; one tab-separated line per token with the original
                wordform, gold tag, predicted tag, gold attributes and attribute errors.
            evaluate: currently unused.
            vocabularies: if given, accuracies for tokens whose wordform is not in
                `voc_wordforms` are tallied separately and printed in parentheses.
        """
        tag = AccuracyCounter()
        oov_tag = AccuracyCounter()
        tag_pos = AccuracyCounter()
        oov_tag_pos = AccuracyCounter()
        direct_pos = AccuracyCounter()
        oov_direct_pos = AccuracyCounter()
        attributes = AccuracyCounter()
        oov_attributes = AccuracyCounter()
        per_attribute = collections.defaultdict(AccuracyCounter)
        attribute_errors = collections.Counter()
        if not silver_tags:
            silver_tags = []
        if not silver_poses:
            silver_poses = []
        if not silver_attributes:
            silver_attributes = []

        with open(filename, 'w', encoding='utf-8') as f:
            for sentence, sentence_tags, sentence_poses, sentence_attributes in itertools.zip_longest(self.sentences, silver_tags, silver_poses, silver_attributes):
                if not sentence_tags:
                    sentence_tags = []
                if not sentence_poses:
                    sentence_poses = []
                if not sentence_attributes:
                    sentence_attributes = []
                for token, silver_tag, silver_pos, silver_token_attributes in itertools.zip_longest(sentence, sentence_tags, sentence_poses, sentence_attributes):
                    gold_tag = token.get(tag_key)
                    silver_tag_pos = silver_tag[0] if silver_tag else None
                    # The vocabulary lookup yields a falsy id for unknown wordforms
                    is_oov = vocabularies and not vocabularies.voc_wordforms.get(token.get(wordform_key))

                    tag.add(gold_tag, silver_tag)
                    tag_pos.add(gold_tag[0], silver_tag_pos)
                    direct_pos.add(gold_tag[0], silver_pos)
                    if is_oov:
                        oov_tag.add(gold_tag, silver_tag)
                        oov_tag_pos.add(gold_tag[0], silver_tag_pos)
                        oov_direct_pos.add(gold_tag[0], silver_pos)

                    gold_attrs = ','.join('{}:{}'.format(key, value) for key, value in token.get(attribute_key).items())
                    silver_attrs = ''
                    # Check the accuracy of predicted attributes
                    if silver_token_attributes:
                        errors = []
                        for key, gold_value in token.get(attribute_key).items():
                            if key in self._IGNORED_ATTRIBUTES:
                                continue
                            # Pick the most confident predicted value for this attribute
                            silver_value = ''
                            best_confidence = 0
                            for silver_attribute, confidence in silver_token_attributes.items():
                                if silver_attribute.split(':')[0] != key:
                                    continue  # NB! we simply ignore the tagger's opinion on any attributes that are not relevant for this POS
                                if confidence > best_confidence:
                                    best_confidence = confidence
                                    silver_value = silver_attribute.split(':', maxsplit=1)[1]
                            per_attribute[key].add(gold_value, silver_value)
                            if gold_value != silver_value:
                                error = '{}:{} nevis {}'.format(key, silver_value, gold_value)
                                errors.append(error)
                                attribute_errors[error] += 1

                        # A token counts as correct only if none of its attributes is wrong
                        attributes.add_b(not errors)
                        if is_oov:
                            oov_attributes.add_b(not errors)
                        silver_attrs = '\t'.join(errors)

                    # Fall back to the predicted POS, then to an empty string, for the output column
                    if not silver_tag:
                        silver_tag = silver_pos
                    if not silver_tag:
                        silver_tag = ''
                    f.write('\t'.join([token.get(wordform_original_key), gold_tag, silver_tag, gold_attrs, silver_attrs]) + '\n')
        print('Test set tag accuracy:        {:.2%} ({:.2%})'.format(tag.average(), oov_tag.average()))
        print('Attribute accuracy:           {:.2%} ({:.2%})'.format(attributes.average(), oov_attributes.average()))
        print('Test set tag POS accuracy:    {:.2%} ({:.2%})'.format(tag_pos.average(), oov_tag_pos.average()))
        print('Test set direct POS accuracy: {:.2%} ({:.2%})'.format(direct_pos.average(), oov_direct_pos.average()))
        for key, counter in per_attribute.items():
            print('    {}: {:.2%}'.format(key, counter.average()))
        print(attribute_errors)

In [6]:
# A helper method to allow altering word embedding models loaded in gensim library, so that we can add an all-zero vector for OOV words
# Vocab class copied from gensim project
class Vocab(object):
    """
    A single vocabulary item, used internally for collecting per-word frequency/sampling info,
    and for constructing binary trees (incl. both word leaves and inner nodes).
    """
    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute; `count` defaults to 0
        self.count = 0
        vars(self).update(kwargs)

    def __lt__(self, other):
        # Ordering by count, used for sorting in a priority queue
        return self.count < other.count

    def __str__(self):
        # Lists the attributes in name order, leaving out the underscore-prefixed ones
        shown = (
            '{}:{!r}'.format(name, value)
            for name, value in sorted(vars(self).items())
            if not name.startswith('_')
        )
        return '{}({})'.format(type(self).__name__, ', '.join(shown))
    
def gensim_add_zero_word(model, word):
    """Register `word` in a loaded gensim model, giving it an all-zero vector.

    The model is modified in place: its `vocab`, `syn0` and `index2word` are extended.
    """
    zero_row = np.zeros((1, model.vector_size), dtype=np.float32)
    # The new word takes the next free index, i.e. the vocabulary size before insertion
    model.vocab[word] = Vocab(index=len(model.vocab), count=1)
    model.syn0 = np.concatenate((model.syn0, zero_row), axis=0)
    model.index2word.append(word)

In [7]:
# class to build input/output vectors for each sentence from the raw data.
# All input and gold-output vectors of a single sentence
FeatureVectors = collections.namedtuple(
    'FeatureVectors',
    'wordform_ids wordshape wordform_embeddings analyzer_options ngrams tag_ids pos_ids attribute_ids',
)
class FeatureFactory(object):
    """Converts the sentences of a document into numeric input and output vectors.

    The optional vectors are built only when the matching switch in the module-level
    `input_features` / `output_features` is on; otherwise they are None.
    """

    def __init__(self, vocabularies, embeddings):
        self._vocabularies = vocabularies
        self._embeddings = embeddings
        # gensim models fail when asked for OOV words, so make sure the model
        # contains a zero vector for the unknown-word token
        if self._embeddings and unk not in self._embeddings:
            gensim_add_zero_word(self._embeddings, unk)

    def wordform_vector_size(self):
        """Width of the one-hot vector describing the wordform."""
        return len(self._vocabularies.voc_wordforms)

    def embedding_vector_size(self):
        """Width of a word embedding vector."""
        return self._embeddings.vector_size

    def tag_vector_size(self):
        """Width of the one-hot vector mapping to individual tags."""
        return len(self._vocabularies.voc_tags)

    def pos_vector_size(self):
        """Width of the one-hot vector mapping to POS tags."""
        return len(self._vocabularies.voc_pos)

    def ngram_vector_size(self):
        """Width of the n-hot vector mapping to ngrams."""
        return len(self._vocabularies.voc_ngrams)

    def attribute_vector_size(self):
        """Width of the n-hot vector mapping to attribute-value pairs."""
        return len(self._vocabularies.voc_attributes)

    def wordshape_vector_size(self):
        """Width of the wordshape vector (a single capitalization flag)."""
        return 1

    def vectorize(self, document):
        """Return one FeatureVectors tuple per sentence of `document`."""
        return [self._vectorize_sentence(sentence) for sentence in document.sentences]

    def _vectorize_sentence(self, sentence):
        """Build the FeatureVectors of one sentence (a list of token dicts)."""
        vocs = self._vocabularies
        n_tokens = len(sentence)

        wordform_ids = np.array([vocs.voc_wordforms.get(token[wordform_key]) for token in sentence])

        wordform_embeddings = None
        if input_features.get('wordform_embeddings'):
            # Words missing from the embedding model are looked up as the unknown-word token
            lookup_words = [
                token[wordform_key] if token[wordform_key] in self._embeddings else unk
                for token in sentence
            ]
            # tanh squashes the embedding values into the [-1, 1] range
            wordform_embeddings = np.tanh(self._embeddings[lookup_words])

        wordshape = None
        if input_features.get('wordshape'):
            wordshape = np.zeros([n_tokens, 1], dtype = np.float32)
            for row, token in enumerate(sentence):
                # For now we only check whether the word starts with a capital letter
                if token[wordform_original_key][0].isupper():
                    wordshape[row, 0] = 1

        analyzer_options = None
        if input_features.get('analyzer_nhot'):
            # TODO - this can surely be optimized somehow if data preparation becomes too slow
            analyzer_options = np.zeros([n_tokens, len(vocs.voc_tags)], dtype = np.float32)
            for row, token in enumerate(sentence):
                for option in token[options_key]:
                    analyzer_options[row, vocs.voc_tags.get(option)] = 1

        ngrams = None
        if input_features.get('ngrams'):
            ngrams = np.zeros([n_tokens, len(vocs.voc_ngrams)], dtype = np.float32)
            for row, token in enumerate(sentence):
                wordform = token.get(wordform_key)
                for length in range(1, max_ngrams+1):
                    suffix = wordform[-length:]
                    ngrams[row, vocs.voc_ngrams.get(suffix)] = 1

        tag_ids = np.array([vocs.voc_tags.get(token[tag_key]) for token in sentence], dtype = np.float32)
        pos_ids = np.array([vocs.voc_pos.get(token[pos_key]) for token in sentence], dtype = np.float32)

        attribute_ids = None
        if output_features.get('attribute_nhot'):
            attribute_ids = np.zeros([n_tokens, len(vocs.voc_attributes)], dtype = np.float32)
            for row, token in enumerate(sentence):
                for key, value in token[attribute_key].items():
                    attribute_ids[row, vocs.voc_attributes.get('{}:{}'.format(key, value))] = 1

        return FeatureVectors(wordform_ids, wordshape, wordform_embeddings, analyzer_options, ngrams, tag_ids, pos_ids, attribute_ids)

    def dump(self, folder):
        """Store the vocabularies into `folder`."""
        self._vocabularies.dump(folder)

In [8]:
# NN layer helper functions

# A layer for a bidirectional RNN (currently LSTM cells)
def recurrent_layer(input_layer, sequence_lengths, dropout_keep_prob, name_scope='recurrent', rnn_hidden=100, num_layers=1):
    """Bidirectional recurrent layer (LSTM cells) over the words of one sentence.

    Args:
        input_layer: tensor shaped [num_words x data_width].
        sequence_lengths: passed to the dynamic RNN as the sequence lengths of the batch.
        dropout_keep_prob: keep probability for dropout on the cell outputs.
        name_scope: name scope of the layer; also used to derive the RNN variable scope.
        rnn_hidden: number of LSTM units per direction.
        num_layers: number of stacked cells per direction.

    Returns:
        Tensor shaped [num_words x 2*rnn_hidden]: concatenated forward and backward outputs.
    """
    def build_cell(direction_scope):
        # One direction of the RNN: an LSTM cell with output dropout, stacked if requested.
        with tf.name_scope(direction_scope):
            cell = tf.nn.rnn_cell.LSTMCell(rnn_hidden, state_is_tuple=True)
            # Dropout is applied for every num_layers value; previously the single-layer
            # case used the bare cell, so dropout_keep_prob was silently ignored
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout_keep_prob)
            if num_layers > 1:
                cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
        return cell

    with tf.name_scope(name_scope):
        # Transform from "num_words x data_width" to "batch_size x num_words x data_width"
        batched_input_layer = tf.expand_dims(input_layer, 0)
        multicell_fw = build_cell('fw')
        multicell_bw = build_cell('bw')
        rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(multicell_fw, multicell_bw, batched_input_layer,
                                                         sequence_lengths, scope=name_scope+"BiRNN", dtype=tf.float32)
        rnn_output = tf.concat(2, rnn_outputs)
        return tf.reshape(rnn_output, [-1, rnn_hidden*2]) # flatten to return to an unbatched representation
    
# standard fully connected ReLU layer
def fully_connected_layer(input_layer, dropout_keep_prob, hidden_units, name_scope='fully_connected'):
    """Fully connected ReLU layer followed by dropout.

    Maps a [rows x input_width] tensor to [rows x hidden_units].
    """
    in_width = input_layer.get_shape().as_list()[1]
    with tf.name_scope(name_scope):
        weights = tf.Variable(tf.truncated_normal([in_width, hidden_units], stddev=0.1), name='weights')
        bias = tf.Variable(tf.zeros([hidden_units]), name='bias')
        pre_activation = tf.matmul(input_layer, weights) + bias
        activation = tf.nn.relu(pre_activation, name=name_scope)
        return tf.nn.dropout(activation, dropout_keep_prob)
    
# one dimensional convolution over words in sentence; from a vector in shape [num_words x in_data_width], return [num_words x out_data_width]
# For each word's output, it will take input data from 'window' of surrounding words
def convolution_layer(input_layer, window = 3, hidden_units = 400, name_scope='convolution'):
    """Convolution along the words of a sentence.

    Takes a [num_words x in_data_width] tensor and returns [num_words x hidden_units];
    the filter spans `window` neighbouring words and uses 'SAME' padding.
    """
    in_width = input_layer.get_shape().as_list()[1]
    # [num_words x data_width] -> [1 x num_words x data_width] -> [1 x 1 x num_words x data_width],
    # i.e. "[batch x height x num_words x data_width]"
    with_batch = tf.expand_dims(input_layer, 0)
    with_height = tf.expand_dims(with_batch, 0)
    with tf.name_scope(name_scope):
        kernel = tf.Variable(tf.truncated_normal([1, window, in_width, hidden_units], stddev=0.1) , name='convolution_filter')
        convolved = tf.nn.conv2d(with_height, kernel, [1,1,1,1], padding='SAME')
        return tf.reshape(convolved, [-1, hidden_units])

In [9]:
# The actual tagger class - construction of the network graph, training and tagging
class Tagger(object):
    def __init__(self, featurefactory = None):
        """Remember the feature factory and build the network graph right away."""
        # The model structure depends on the vocabulary sizes reported by the feature factory
        self._featurefactory = featurefactory
        # Set before building the graph, so that __del__ finds the attribute in any case
        self.session = None
        self._prepare_graph()
        
    def __del__(self):
        if self.session:
            self.session.close()
        
    def _feed_dict(self, sentence, train=False, feed_output=True):
        """Build the placeholder -> value mapping for one vectorized sentence.

        Args:
            sentence: the feature vectors of one sentence.
            train: feed a dropout keep probability of 0.5 instead of 1.0.
            feed_output: also feed the gold output vectors.
        """
        feed = {
            self.sentence_length: [len(sentence.tag_ids)],
            self.dropout_keep_prob: 0.5 if train else 1.0,
        }

        # (feature switch, name shared by the placeholder attribute and the sentence field)
        input_bindings = (
            ('wordform_onehot', 'wordform_ids'),
            ('wordform_embeddings', 'wordform_embeddings'),
            ('analyzer_nhot', 'analyzer_options'),
            ('ngrams', 'ngrams'),
            ('wordshape', 'wordshape'),
        )
        output_bindings = (
            ('pos_onehot', 'pos_ids'),
            ('tag_onehot', 'tag_ids'),
            ('attribute_nhot', 'attribute_ids'),
        )

        # Placeholders are looked up only for enabled features
        for feature, name in input_bindings:
            if input_features.get(feature):
                feed[getattr(self, name)] = getattr(sentence, name)
        if not feed_output:
            return feed
        for feature, name in output_bindings:
            if output_features.get(feature):
                feed[getattr(self, name)] = getattr(sentence, name)
        return feed
            
    def _prepare_graph(self):
        """Build the whole network graph and open an initialized session.

        Resets the default graph, creates the placeholders for the enabled input and
        output features, stacks the layers (convolution -> bidirectional recurrent layer,
        concatenated with the uncompressed input), adds a single softmax over the
        concatenation of all enabled outputs, the training step and the evaluation ops.
        Finally creates the session and the saver and runs the variable initializer.
        """
        tf.reset_default_graph()
        
        self.dropout_keep_prob = tf.placeholder(tf.float32, name = 'dropout_keep_prob')
        self.sentence_length = tf.placeholder(tf.int64, name = 'sentence_length')
                
        with tf.name_scope('input'):
            # Two parallel views of the input: the raw feature vectors, and a version in which
            # the wordform one-hot and ngram vectors pass through a 400-unit fully connected layer
            input_vectors = []
            compressed_input_vectors = []
            if input_features.get('wordform_onehot'):
                wordform_vector_size = self._featurefactory.wordform_vector_size()
                self.wordform_ids = tf.placeholder(tf.int64, [None], name = 'wordform_ids')
                wordform_onehot = tf.one_hot(self.wordform_ids, wordform_vector_size, 1.0, 0.0, name = 'wordform_onehot')
                compressed_wordform = fully_connected_layer(wordform_onehot, self.dropout_keep_prob, 400, 'compressed_wordform')
                input_vectors.append(wordform_onehot)
                compressed_input_vectors.append(compressed_wordform)
            if input_features.get('wordshape'):
                wordshape_vector_size = self._featurefactory.wordshape_vector_size()
                self.wordshape = tf.placeholder(tf.float32, [None, wordshape_vector_size], name = 'wordshape')
                input_vectors.append(self.wordshape)
                compressed_input_vectors.append(self.wordshape)
            if input_features.get('wordform_embeddings'):
                embedding_vector_size = self._featurefactory.embedding_vector_size()
                self.wordform_embeddings = tf.placeholder(tf.float32, [None, embedding_vector_size], name = 'wordform_embeddings')
#                 normalized_embeddings = tf.sigmoid(self.wordform_embeddings)
                input_vectors.append(self.wordform_embeddings)
                compressed_input_vectors.append(self.wordform_embeddings)
            if input_features.get('analyzer_nhot'):
                tag_vector_size = self._featurefactory.tag_vector_size()
                self.analyzer_options = tf.placeholder(tf.float32, [None, tag_vector_size], name='analyzer_options_nhot')
                input_vectors.append(self.analyzer_options)
                compressed_input_vectors.append(self.analyzer_options)
            if input_features.get('ngrams'):
                ngram_vector_size = self._featurefactory.ngram_vector_size()
                self.ngrams = tf.placeholder(tf.float32, [None, ngram_vector_size], name='ngram_nhot')
                input_vectors.append(self.ngrams)
                compressed_ngrams = fully_connected_layer(self.ngrams, self.dropout_keep_prob, 400, 'compressed_ngrams')
                compressed_input_vectors.append(compressed_ngrams)
            input_vector = tf.concat(1, input_vectors)
            compressed_input_vector = tf.concat(1, compressed_input_vectors)
            del input_vectors
            del compressed_input_vectors

        with tf.name_scope('gold_output'):
            # The gold output is the concatenation of all enabled outputs, in the order pos, tag, attributes;
            # the 'evaluate' scope below slices the predicted vector in the same order
            gold_output_vectors = []
            if output_features.get('pos_onehot'):
                pos_vector_size = self._featurefactory.pos_vector_size()
                self.pos_ids = tf.placeholder(tf.int64, [None], name = 'pos_ids')
                gold_pos_onehot = tf.one_hot(self.pos_ids, pos_vector_size, 1.0, 0.0, name = 'gold_pos_onehot')
                gold_output_vectors.append(gold_pos_onehot)
            if output_features.get('tag_onehot'):
                tag_vector_size = self._featurefactory.tag_vector_size()
                self.tag_ids = tf.placeholder(tf.int64, [None], name = 'tag_ids')
                gold_tag_onehot = tf.one_hot(self.tag_ids, tag_vector_size, 1.0, 0.0, name = 'gold_tag_onehot')
                gold_output_vectors.append(gold_tag_onehot)
            if output_features.get('attribute_nhot'):
                attribute_vector_size = self._featurefactory.attribute_vector_size()
                self.attribute_ids = tf.placeholder(tf.float32, [None, attribute_vector_size], name = 'attribute_nhot')
                gold_output_vectors.append(self.attribute_ids)
            gold_output_vector = tf.concat(1,gold_output_vectors)
            del gold_output_vectors
            output_vector_size = gold_output_vector.get_shape().as_list()[1]

        # NB! the following lines define the main layout of the nework
#         layer = input_vector
        layer = compressed_input_vector
        layer = convolution_layer(layer, window=3, hidden_units=500, name_scope = 'convolution1')
#         layer = fully_connected_layer(layer, self.dropout_keep_prob, 400)
        layer = recurrent_layer(layer, self.sentence_length, self.dropout_keep_prob, 'recurrent1', rnn_hidden=400)
#         layer = recurrent_layer(layer, self.sentence_length, self.dropout_keep_prob, 'recurrent2', rnn_hidden=300)
#         layer = recurrent_layer(layer, self.sentence_length, self.dropout_keep_prob, 'recurrent3', rnn_hidden=300)         
        layer = tf.concat(1, [input_vector, layer]) # wide and deep
        final_layer = layer
        final_layer_size = final_layer.get_shape().as_list()[1]
                
        with tf.name_scope('softmax'):
            # Linear mapping from the final layer to the full concatenated output vector
            weights = tf.Variable(tf.truncated_normal([final_layer_size, output_vector_size], stddev=0.1), name='weights')
            bias = tf.Variable(tf.zeros([output_vector_size]), name='bias')
            # One softmax spans the whole concatenated output (all enabled outputs together)
            output_vector = tf.nn.softmax(tf.matmul(final_layer, weights) + bias, name='output_vector')
        
        regularization_coeff = 1e-7
#         regularization_coeff = 0
        with tf.name_scope('train'):
            # NOTE(review): tf.log is applied to the softmax output without any clipping - confirm
            # that a zero probability (and thus a NaN/inf loss) cannot occur in practice
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(gold_output_vector * tf.log(output_vector), reduction_indices=[1]), name='cross_entropy')                    
            if regularization_coeff > 0:
                # L2 regularization covers only the weights and bias of the output layer
                loss = cross_entropy + regularization_coeff * (tf.nn.l2_loss(weights) + tf.nn.l2_loss(bias))
            else:
                loss = cross_entropy
            cross_entropy_summary = tf.scalar_summary('cross entropy', cross_entropy)
            loss_summary = tf.scalar_summary('loss', loss)
            self.train_step = tf.train.AdamOptimizer(1e-4).minimize(loss, name='train_step')
        
        with tf.name_scope('evaluate'):
            # Cut the predicted output vector back into its parts; start_index tracks the current offset
            start_index = 0
            if output_features.get('pos_onehot'):
                pos_onehot = tf.slice(output_vector, [0, start_index], [-1, pos_vector_size])
                start_index = start_index + pos_vector_size
                self.pos_answers = tf.argmax(pos_onehot, 1, name='pos_answers')
                pos_accuracy = tf.equal(tf.argmax(gold_pos_onehot, 1), tf.argmax(pos_onehot, 1), name='pos_accuracy')
                self.pos_accuracy_measure = tf.reduce_mean(tf.cast(pos_accuracy, tf.float32))
                self.pos_accuracy_summary = tf.scalar_summary('pos_accuracy', self.pos_accuracy_measure)
            if output_features.get('tag_onehot'):                
                tag_onehot = tf.slice(output_vector, [0, start_index], [-1, tag_vector_size])
                start_index = start_index + tag_vector_size
                self.tag_answers = tf.argmax(tag_onehot, 1, name='tag_answers')
                tag_accuracy = tf.equal(tf.argmax(gold_tag_onehot, 1), tf.argmax(tag_onehot, 1), name='tag_accuracy')
                self.tag_accuracy_measure = tf.reduce_mean(tf.cast(tag_accuracy, tf.float32))
                self.tag_accuracy_summary = tf.scalar_summary('tag_accuracy', self.tag_accuracy_measure)
            if output_features.get('attribute_nhot'):
                # Attribute predictions are exposed as the raw slice of output values, not as an argmax
                attributes_nhot = tf.slice(output_vector, [0, start_index], [-1, attribute_vector_size])
                self.attribute_answers = attributes_nhot
                start_index = start_index + attribute_vector_size
            assert start_index == output_vector_size  # check that all output has been processed            
            
        init = tf.initialize_all_variables()
        self.merged = tf.merge_all_summaries()
        self.session = tf.Session()
        self.saver = tf.train.Saver()
        self.session.run(init)
        
    def train(self, train_data, epochs, test_data = None):
        """Train the network one sentence at a time and log the progress for TensorBoard.

        Args:
            train_data: vectorized training sentences.
            epochs: number of passes over `train_data`.
            test_data: optional vectorized sentences; when given, their accuracy is
                evaluated and logged after every 1000 training sentences.
        """
        sentence_count = 0
        self.run_id = datetime.datetime.now().strftime("%Y%m%d-%H%M")
        self.train_writer = tf.train.SummaryWriter(output_dir + '/tb/train' + self.run_id, self.session.graph)
        self.test_writer = tf.train.SummaryWriter(output_dir + '/tb/test' + self.run_id)

        # (output feature switch, summary tag, name of the attribute holding the accuracy op)
        # The ops exist only for enabled features, hence the lookup by name below
        test_metrics = (
            ('pos_onehot', 'pos_accuracy', 'pos_accuracy_measure'),
            ('tag_onehot', 'tag_accuracy', 'tag_accuracy_measure'),
        )

        for epoch in range(1, epochs+1):
            for sentence in train_data:
                sentence_count = sentence_count + 1
                feed = self._feed_dict(sentence, train=True)
                full_report = sentence_count % 1000 == 0

                # Training summaries are recorded on every 10th sentence and on every full report;
                # all other steps only run the training op
                if full_report or sentence_count % 10 == 9:
                    summary, _ = self.session.run([self.merged, self.train_step], feed_dict = feed)
                    self.train_writer.add_summary(summary, sentence_count)
                else:
                    self.session.run(self.train_step, feed_dict = feed)

                if full_report and test_data:
                    #TODO - calculate all results in a single parse
                    for feature, summary_tag, measure_name in test_metrics:
                        if output_features.get(feature):
                            acc = self.evaluate_accuracy(test_data, getattr(self, measure_name))
                            test_summary = tf.Summary(value=[tf.Summary.Value(tag=summary_tag, simple_value=acc)])
                            self.test_writer.add_summary(test_summary, sentence_count)
                    # TODO - a simple metric for attribute accuracy (output feature 'attribute_nhot')
            print('Epoch {} done'.format(epoch))
            # Make sure that the logged summaries of the finished epoch reach the disk
            self.train_writer.flush()
            self.test_writer.flush()
            
    def evaluate_accuracy(self, document_vectors, measure):
        """Return the token-weighted average of `measure` over a document.

        `measure` is evaluated once per sentence through `self.session.run`, and
        each result is weighted by the number of entries in `sentence.tag_ids`.

        Args:
            document_vectors: iterable of vectorized sentences; each must expose
                `tag_ids` and be accepted by `self._feed_dict`.
            measure: the fetch passed to `self.session.run` for each sentence.

        Returns:
            The weighted average, or 0.0 when the document contains no tokens
            (previously this case raised ZeroDivisionError).
        """
        weighted_sum = 0.0
        tokens = 0
        for sentence in document_vectors:
            sentence_tokens = len(sentence.tag_ids)
            tokens += sentence_tokens
            weighted_sum += sentence_tokens * self.session.run(measure, feed_dict = self._feed_dict(sentence))
        if tokens == 0:
            # Nothing to average over (empty document or only empty sentences)
            return 0.0
        return weighted_sum / tokens
                
    
    #TODO - calculate all outputs from a single run over testdata
    def _parse_sentences_tags(self, document_vectors, vocabularies):
        for sentence in document_vectors:
            silver_tag_ids = self.session.run(self.tag_answers, feed_dict = self._feed_dict(sentence, feed_output=False))
            yield [vocabularies.voc_tags.reverse(tag_id) for tag_id in silver_tag_ids]
    def _parse_sentences_pos(self, document_vectors, vocabularies):
        for sentence in document_vectors:
            silver_pos_ids = self.session.run(self.pos_answers, feed_dict = self._feed_dict(sentence, feed_output=False))
            yield [vocabularies.voc_pos.reverse(pos_id) for pos_id in silver_pos_ids]
    def _parse_sentences_attributes(self, document_vectors, vocabularies):
        for sentence in document_vectors:
            silver_attribute_data = self.session.run(self.attribute_answers, feed_dict = self._feed_dict(sentence, feed_output=False))
            result = []
            for token_silver_attributes in silver_attribute_data:
                result.append({vocabularies.voc_attributes.reverse(attribute_id) : value for attribute_id, value in enumerate(token_silver_attributes)})
            yield result        
        
    def tag(self, test_doc, test_data, vocabularies, filename):
        """Tag `test_data` and hand the predictions to `test_doc.output_tagged`.

        For each output enabled in `output_features`, a lazy per-sentence
        prediction generator is created; disabled outputs are passed as None.
        """
        pos_enabled = output_features.get('pos_onehot')
        silver_pos = self._parse_sentences_pos(test_data, vocabularies) if pos_enabled else None

        tags_enabled = output_features.get('tag_onehot')
        silver_tags = self._parse_sentences_tags(test_data, vocabularies) if tags_enabled else None

        attributes_enabled = output_features.get('attribute_nhot')
        silver_attributes = self._parse_sentences_attributes(test_data, vocabularies) if attributes_enabled else None

        test_doc.output_tagged(silver_tags, silver_pos, silver_attributes, filename, vocabularies = vocabularies)
        
    def dump(self, folder, filename = 'model.tf'):
        """Save the feature factory and the session state into `folder`.

        The feature factory is dumped first; the session is then saved through
        `self.saver` to `folder + '/' + filename`.
        """
        self._featurefactory.dump(folder)
        checkpoint_path = '/'.join((folder, filename))
        self.saver.save(self.session, checkpoint_path)
        
    def load_model(self, folder, filename = 'model.tf'):
        """Restore the session state from `folder + '/' + filename`.

        Args:
            folder: directory containing the saved model.
            filename: name of the saved model inside `folder`; the default matches
                the default used by `dump`, so models saved under a custom name
                can now be loaded as well.
        """
        self.saver.restore(self.session, folder + '/' + filename)

## The actual experiment - training and evaluation

In [11]:
print('Loading...')
# Read the training and the evaluation documents
train_doc = Document(train_data_filename)
eval_doc = Document(eval_data_filename)

# The embeddings are loaded only when the corresponding input feature is enabled
use_embeddings = input_features.get('wordform_embeddings')
embeddings = Word2Vec.load_word2vec_format(embeddings_filename, binary=True) if use_embeddings else None

# Vocabularies are built from the training document; both documents go through the same feature factory
vocabularies = Vocabularies(train_doc)
featurefactory = FeatureFactory(vocabularies, embeddings)
train_data = featurefactory.vectorize(train_doc)
eval_data = featurefactory.vectorize(eval_doc)

def train_stuff(tagger, epochs = default_epochs):
    # Train the model
    print('Training...')
    %time tagger.train(train_data, epochs, eval_data)
    tagger.dump(output_dir)
    print('Model saved')
    tagger.tag(eval_doc, eval_data, vocabularies, output_dir + '/eval.tagged.txt')
    return tagger
    
trained_tagger = train_stuff(Tagger(featurefactory))

Loading...


ResourceExhaustedError: OOM when allocating tensor with shape[12240,520]
	 [[Node: train/softmax/weights/Adam/Assign = Assign[T=DT_FLOAT, _class=["loc:@softmax/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](train/softmax/weights/Adam, train/zeros_18)]]
Caused by op 'train/softmax/weights/Adam/Assign', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 498, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2705, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2809, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2869, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-e1dfc88ab947>", line 27, in <module>
    trained_tagger = train_stuff(Tagger(featurefactory))
  File "<ipython-input-9-09f938c9896a>", line 6, in __init__
    self._prepare_graph()
  File "<ipython-input-9-09f938c9896a>", line 128, in _prepare_graph
    self.train_step = tf.train.AdamOptimizer(1e-4).minimize(loss, name='train_step')
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 198, in minimize
    name=name)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 300, in apply_gradients
    self._create_slots(var_list)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/training/adam.py", line 118, in _create_slots
    self._zeros_slot(v, "m", self._name)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 494, in _zeros_slot
    named_slots[var] = slot_creator.create_zeros_slot(var, op_name)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/training/slot_creator.py", line 108, in create_zeros_slot
    colocate_with_primary=colocate_with_primary)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/training/slot_creator.py", line 86, in create_slot
    return _create_slot_var(primary, val, scope)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/training/slot_creator.py", line 50, in _create_slot_var
    slot = variables.Variable(val, name=scope, trainable=False)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 211, in __init__
    dtype=dtype)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 313, in _init_from_args
    validate_shape=validate_shape).op
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/ops/gen_state_ops.py", line 45, in assign
    use_locking=use_locking, name=name)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 703, in apply_op
    op_def=op_def)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2317, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/peteris/tf_source0.10/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1239, in __init__
    self._traceback = _extract_stack()


In [None]:
# Continue training the existing tagger for five more epochs.
# This needs `trained_tagger` from the training cell, so that cell must have completed first.
trained_tagger = train_stuff(trained_tagger, epochs=5)

## Evaluation when reading a model from file

In [None]:
def tag_stuff(filename, tagged_filename = None):
    """Tag the document in `filename` with the model stored in `output_dir`.

    Args:
        filename: path of the document to tag; passed to `Document`.
        tagged_filename: path for the tagged output. Defaults to
            `output_dir + '/eval.tagged.txt'`; pass distinct names when tagging
            several documents, otherwise each call overwrites the previous output.
    """
    if tagged_filename is None:
        tagged_filename = output_dir + '/eval.tagged.txt'

    # Load embeddings only when that input feature is enabled, as the training cell does
    if input_features.get('wordform_embeddings'):
        embeddings = Word2Vec.load_word2vec_format(embeddings_filename, binary=True)
    else:
        embeddings = None

    # Load vocabularies from the files stored in the output folder
    vocabularies = Vocabularies(folder = output_dir)
    featurefactory = FeatureFactory(vocabularies, embeddings)

    # Load the document and vectorize it
    # NOTE(review): `small_eval_limit` is not defined in the visible part of the notebook - confirm it is set before this cell runs
    eval_doc = Document(filename, small_eval_limit)
    eval_data = featurefactory.vectorize(eval_doc)

    # Load the tagger model and write the tagged document
    tagger = Tagger(featurefactory)
    tagger.load_model(output_dir)
    tagger.tag(eval_doc, eval_data, vocabularies, tagged_filename)
    
# Tag the evaluation document with the saved model
tag_stuff(eval_data_filename)
# NOTE(review): `test_data_filename` is not defined in the parameter cell at the top of the notebook
# (only the train and eval filenames are) - confirm it exists, otherwise this call raises NameError.
# Both calls write to the same default output file, so the second one overwrites the first.
tag_stuff(test_data_filename)