# Part-of-speech tagging

## Reading data, vocabulary and pretrained embeddings
These parts are similar to the previous examples. Things to note though:
* Our data has already been tokenized and divided into sentences
* We _cannot_ skip tokens
* We are using a specific OOV (out-of-vocabulary) embedding for all words which are not present in our vocab
* We now have one label for each token, not for each document.

In [2]:
# Load our training data
import json
import random
import numpy
with open("data/pos_train.json") as f:
    data=json.load(f)
print(data[0])

# We need to gather the texts, into a list
texts=[one_example["text"] for one_example in data]
labels=[one_example["tags"] for one_example in data] # This is now a list of lists just like the texts variable
print(texts[:2])
print(labels[:2])

# Lets do the same thing for the validation data
# We use a separate validation set, since generally using sentences from the same documents as train/validation results in overly optimistic scores
with open("data/pos_devel.json") as f:
    validation_data=json.load(f)
validation_texts=[one_example["text"] for one_example in data]
validation_labels=[one_example["tags"] for one_example in data]

# Use gensim to read the embedding model

from gensim.models import KeyedVectors

vector_model=KeyedVectors.load_word2vec_format("data/wiki-news-300d-1M.vec", binary=False, limit=50000)

# sort based on the index to make sure they are in the correct order
words=[k for k,v in sorted(vector_model.vocab.items(), key=lambda x:x[1].index)]
print("Words from embedding model:",len(words))
print("First 50 words:",words[:50])

# Normalize the vectors

print("Before normalization:",vector_model.get_vector("in")[:10])
vector_model.init_sims(replace=True)
print("After normalization:",vector_model.get_vector("in")[:10])

# Build vocabulary mappings

vocabulary={"<SPECIAL>": 0, "<OOV>": 1} # zero has a special meaning in sequence models, prevent using it for a normal word
for word in words:
    vocabulary.setdefault(word, len(vocabulary))

print("Words in vocabulary:",len(vocabulary))
inversed_vocabulary={value:key for key, value in vocabulary.items()} # inverse the dictionary

# Label mappings
label_set = set([label for sentence_labels in labels for label in sentence_labels])
label_map = {label: index for index, label in enumerate(label_set)}
                
# Embedding matrix

def load_pretrained_embeddings(vocab, embedding_model):
    """ vocab: vocabulary from our data vectorizer, embedding_model: model loaded with gensim """
    pretrained_embeddings=numpy.random.uniform(low=-0.05, high=0.05, size=(len(vocab)-1,embedding_model.vectors.shape[1]))
    pretrained_embeddings = numpy.vstack((numpy.zeros(shape=(1,embedding_model.vectors.shape[1])), pretrained_embeddings))
    found=0
    for word,idx in vocab.items():
        if word in embedding_model.vocab:
            pretrained_embeddings[idx]=embedding_model.get_vector(word)
            found+=1
            
    print("Found pretrained vectors for {found} words.".format(found=found))
    return pretrained_embeddings

pretrained=load_pretrained_embeddings(vocabulary, vector_model)


{'tags': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT'], 'text': ['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.']}
[['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.'], ['[', 'This', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']']]
[['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN',

## Vectorizing data
If we want to consider the task as sequence labeling, we should feed the input data as word sequences and outputs as label sequences.

In [3]:
import numpy

def vectorizer(vocab, texts, label_map, labels):
    vectorized_data = [] # turn text into numbers based on our vocabulary mapping
    vectorized_labels = [] # same thing for the labels
    sentence_lengths = [] # Number of tokens in each sentence
    
    for i, one_example in enumerate(texts):
        vectorized_example = []
        vectorized_example_labels = []
        for word in one_example:
            vectorized_example.append(vocab.get(word, 1)) # 1 is our index for out-of-vocabulary tokens
        
        for label in labels[i]:
            vectorized_example_labels.append(label_map[label])

        vectorized_data.append(vectorized_example)
        vectorized_labels.append(vectorized_example_labels)
        
        sentence_lengths.append(len(one_example))
        
    vectorized_data = numpy.array(vectorized_data) # turn python list into numpy matrix
    vectorized_labels = numpy.array(vectorized_labels)
    
    return vectorized_data, vectorized_labels, sentence_lengths

vectorized_data, vectorized_labels, lengths=vectorizer(vocabulary, texts, label_map, labels)
validation_vectorized_data, validation_vectorized_labels, validation_lengths=vectorizer(vocabulary, validation_texts, label_map, validation_labels)

## Padding
We add padding to the label sequences as well.

In [4]:
import tensorflow as tf
### Only needed for me, not to block the whole GPU, you don't need this stuff
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))
### ---end of weird stuff

from keras.preprocessing.sequence import pad_sequences
print("Old shape:", vectorized_data.shape)
vectorized_data_padded=pad_sequences(vectorized_data, padding='post')
print("New shape:", vectorized_data_padded.shape)
print("First example:", vectorized_data_padded[0])
# Even with the sparse output format, the shape has to be similar to the one-hot encoding
vectorized_labels_padded=numpy.expand_dims(pad_sequences(vectorized_labels, padding='post'), -1)
print("Padded labels shape:", vectorized_labels_padded.shape)
print(label_map)
print("First example labels:", vectorized_labels_padded[0])

weights = numpy.copy(vectorized_data_padded)
weights[weights > 0] = 1
print("First weight vector:", weights[0])

# Same stuff for the validation data
validation_vectorized_data_padded=pad_sequences(validation_vectorized_data, padding='post')
validation_vectorized_labels_padded=numpy.expand_dims(pad_sequences(validation_vectorized_labels, padding='post'), -1)
validation_weights = numpy.copy(validation_vectorized_data_padded)
validation_weights[weights > 0] = 1

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Old shape: (12543,)
New shape: (12543, 159)
First example: [ 3424    37     1    11   285  1084   974 34462 10554  4733    37 43264
     2     3 16500    29     3  8683     8     3   754     6     1     2
   504     3  4761  1757     4     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     

## Evaluating POS tags
Keras does not use sample weighting in metrics (only for losses) (correct me if I'm wrong), so we have to create our own evaluation if we want to ignore padding in models which do not support masking (e.g. convolution).
Thus, to have evaluation that is identical for all models, we have to create our own script, which will ignore padded parts of the sequences.


In [42]:
import keras
from sklearn.metrics import accuracy_score

def accuracy(predictions, gold, lengths):
    pred_tags = numpy.concatenate([labels[:lengths[i]] for i, labels in enumerate(predictions)]).ravel()
    
    gold_tags = numpy.concatenate([labels[:lengths[i], 0] for i, labels in enumerate(gold)]).ravel()
    
    print('Accuracy:', accuracy_score(gold_tags, pred_tags))

class EvaluateTags(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        pred = numpy.argmax(self.model.predict(validation_vectorized_data_padded), axis=-1)
        accuracy(pred, validation_vectorized_labels_padded, validation_lengths) # FIXME: Using global variables here, not good!

## Independent classification
Time-distributed means that the same dense layer is applied to each time step. This means that we are now simply using a normal feedforward network to classify each word/token separately.

Why didn't we one-hot encode our labels? :S

It's because the sparse loss is doing it for us implicitly! Neat, right!

In [43]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Activation, Conv1D, TimeDistributed, LSTM, Bidirectional
from keras.optimizers import SGD, Adam

example_count, sequence_len = vectorized_data_padded.shape
class_count = len(label_set)

vector_size= pretrained.shape[1]

In [None]:
inp=Input(shape=(sequence_len,))
embeddings=Embedding(len(vocabulary), vector_size, mask_zero=True, trainable=False, weights=[pretrained])(inp)
hidden = TimeDistributed(Dense(100, activation="softmax"))(embeddings)
outp = TimeDistributed(Dense(class_count, activation="softmax"))(hidden)
model=Model(inputs=[inp], outputs=[outp])

optimizer=Adam(lr=0.001) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

print(model.summary())

# train
hist=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=100,verbose=1,epochs=100, callbacks=[EvaluateTags()])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 159)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 159, 300)          15000600  
_________________________________________________________________
time_distributed_13 (TimeDis (None, 159, 100)          30100     
_________________________________________________________________
time_distributed_14 (TimeDis (None, 159, 17)           1717      
Total params: 15,032,417
Trainable params: 31,817
Non-trainable params: 15,000,600
_________________________________________________________________
None
Epoch 1/100
Accuracy: 0.30482828055736116
Epoch 2/100
Accuracy: 0.3310297301656346
Epoch 3/100
Accuracy: 0.42641258607965515
Epoch 4/100
Accuracy: 0.4943623629690089
Epoch 5/100


# Adding context with convolution

In [11]:
inp=Input(shape=(sequence_len,))
embeddings=Embedding(len(vocabulary), vector_size, mask_zero=False, trainable=False, weights=[pretrained])(inp)
cnn = Conv1D(100,3, activation='relu', padding='same')(embeddings)
outp=TimeDistributed(Dense(class_count, activation="softmax"))(cnn)
model=Model(inputs=[inp], outputs=[outp])

optimizer=Adam(lr=0.001) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

print(model.summary())

# train
hist=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=100,verbose=1,epochs=100, callbacks=[EvaluateEntities()])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 113)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 113, 300)          15000600  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 113, 100)          90100     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 113, 4)            404       
Total params: 15,091,104
Trainable params: 90,504
Non-trainable params: 15,000,600
_________________________________________________________________
None
Epoch 1/100
Precision/Recall/F-score: 0.6337763075343638 / 0.5548786577449316 / 0.591709044436753
Epoch 2/100
Precision/Recall/F-score: 0.7273012552301256 / 0.6943972835314092 / 0.7104685025289941
Epoch 3/100
Precision/Recall/F-score: 0.746764675432239

Epoch 31/100
Precision/Recall/F-score: 0.8357740080228119 / 0.8635274143613303 / 0.8494240734828205
Epoch 32/100
Precision/Recall/F-score: 0.8401530192242507 / 0.8663737141715769 / 0.8530619268874302
Epoch 33/100
Precision/Recall/F-score: 0.829796325309785 / 0.8727654049735344 / 0.8507386405120592
Epoch 34/100
Precision/Recall/F-score: 0.8773286285091588 / 0.8442524717866773 / 0.8604728096292338
Epoch 35/100
Precision/Recall/F-score: 0.8282866277976303 / 0.8796564466193948 / 0.853199011962997
Epoch 36/100
Precision/Recall/F-score: 0.8709660947712419 / 0.8517427344452212 / 0.8612471598081293
Epoch 37/100
Precision/Recall/F-score: 0.8760606839804577 / 0.8506441625886347 / 0.8631653619112766
Epoch 38/100
Precision/Recall/F-score: 0.8406619610835558 / 0.880205732547688 / 0.8599795091964678
Epoch 39/100
Precision/Recall/F-score: 0.8481731282449415 / 0.872865275142315 / 0.8603420696443952
Epoch 40/100
Precision/Recall/F-score: 0.8333802552552553 / 0.8868970338559872 / 0.8593062073636848
Epoc

Epoch 98/100
Precision/Recall/F-score: 0.8865501074428599 / 0.9064715869369819 / 0.896400177768999
Epoch 99/100
Precision/Recall/F-score: 0.8894181193223668 / 0.9044741835613702 / 0.8968829689782378
Epoch 100/100
Precision/Recall/F-score: 0.9203599317370843 / 0.8886946968940378 / 0.9042501841831162


## LSTMs

In [12]:
inp=Input(shape=(sequence_len,))
embeddings=Embedding(len(vocabulary), vector_size, mask_zero=True, trainable=False, weights=[pretrained])(inp)
rnn = LSTM(100, activation='tanh', return_sequences=True)(embeddings)
outp=TimeDistributed(Dense(class_count, activation="softmax"))(rnn)
model=Model(inputs=[inp], outputs=[outp])

optimizer=Adam(lr=0.001) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

print(model.summary())

# train
hist=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=100,verbose=1,epochs=100, callbacks=[EvaluateEntities()])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 113)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 113, 300)          15000600  
_________________________________________________________________
lstm_3 (LSTM)                (None, 113, 100)          160400    
_________________________________________________________________
time_distributed_8 (TimeDist (None, 113, 4)            404       
Total params: 15,161,404
Trainable params: 160,804
Non-trainable params: 15,000,600
_________________________________________________________________
None
Epoch 1/100
Precision/Recall/F-score: 0.3169291338582677 / 0.03215819434734845 / 0.058391513283162576
Epoch 2/100
Precision/Recall/F-score: 0.6756756756756757 / 0.49435733546389693 / 0.5709671838052945
Epoch 3/100
Precision/Recall/F-score: 0.708886920

Epoch 31/100
Precision/Recall/F-score: 0.8812849921182802 / 0.8095975232198143 / 0.8439216094526715
Epoch 32/100
Precision/Recall/F-score: 0.8880721683929168 / 0.7963647258563867 / 0.8397219882055602
Epoch 33/100
Precision/Recall/F-score: 0.9040313549832026 / 0.8062518725656647 / 0.8523465132238821
Epoch 34/100
Precision/Recall/F-score: 0.8840437158469945 / 0.807849795266154 / 0.8442310702917079
Epoch 35/100
Precision/Recall/F-score: 0.903692614770459 / 0.8138919404773794 / 0.8564447480426672
Epoch 36/100
Precision/Recall/F-score: 0.9086206896551724 / 0.8157894736842105 / 0.8597063621533442
Epoch 37/100
Precision/Recall/F-score: 0.9129078962022497 / 0.8186357734944572 / 0.8632055602358888
Epoch 38/100
Precision/Recall/F-score: 0.9172647860679909 / 0.8232298012583641 / 0.8677070449222347
Epoch 39/100
Precision/Recall/F-score: 0.9182689643708095 / 0.8275242185159293 / 0.8705381766606256
Epoch 40/100
Precision/Recall/F-score: 0.9251327928431646 / 0.8262259063217817 / 0.8728864973226768
Ep

Precision/Recall/F-score: 0.9560114023810855 / 0.854089683411565 / 0.9021810797267718
Epoch 98/100
Precision/Recall/F-score: 0.9419622810956444 / 0.8380105862378907 / 0.8869510068178215
Epoch 99/100
Precision/Recall/F-score: 0.9579342407438526 / 0.8539898132427843 / 0.9029805433089575
Epoch 100/100
Precision/Recall/F-score: 0.9592362394310991 / 0.8554379306901029 / 0.9043684835686946


## Bidirectional LSTM

In [14]:
inp=Input(shape=(sequence_len,))
embeddings=Embedding(len(vocabulary), vector_size, mask_zero=True, trainable=False, weights=[pretrained])(inp)
rnn = Bidirectional(LSTM(100, activation='tanh', return_sequences=True))(embeddings)
outp=TimeDistributed(Dense(class_count, activation="softmax"))(rnn)
model=Model(inputs=[inp], outputs=[outp])

optimizer=Adam(lr=0.001) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

print(model.summary())

# train
hist=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=100,verbose=1,epochs=10, callbacks=[EvaluateEntities()])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 113)               0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 113, 300)          15000600  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 113, 200)          320800    
_________________________________________________________________
time_distributed_10 (TimeDis (None, 113, 4)            804       
Total params: 15,322,204
Trainable params: 321,604
Non-trainable params: 15,000,600
_________________________________________________________________
None
Epoch 1/10
Precision/Recall/F-score: 0.553005284015852 / 0.16723259762308998 / 0.2568054597040104
Epoch 2/10
Precision/Recall/F-score: 0.7410719371190978 / 0.6497053830020972 / 0.6923875156320677
Epoch 3/10
Precision/Recall/F-score: 0.7721611339830111