# Part-of-speech tagging

## Reading data, vocabulary and pretrained embeddings

Use the [pos reader](read_pos.ipynb) to convert the data to json format.

These parts are similar to the previous examples. Things to note though:
* Our data has already been tokenized and divided into sentences
* We _cannot_ skip tokens
* We are using a specific OOV (out-of-vocabulary) embedding for all words which are not present in our vocab
* We now have one label for each token, not for each document.

In [1]:
# Load our training data
import json
import random
import numpy
with open("data/pos_train.json") as f:
    data=json.load(f)
print(data[0])

# We need to gather the texts, into a list
texts=[one_example["text"] for one_example in data]
labels=[one_example["tags"] for one_example in data] # This is now a list of lists just like the texts variable
print(texts[:2])
print(labels[:2])

# Lets do the same thing for the validation data
# We use a separate validation set, since generally using sentences from the same documents as train/validation results in overly optimistic scores
with open("data/pos_devel.json") as f:
    validation_data=json.load(f)
validation_texts=[one_example["text"] for one_example in validation_data]
validation_labels=[one_example["tags"] for one_example in validation_data]

# Use gensim to read the embedding model

from gensim.models import KeyedVectors

vector_model=KeyedVectors.load_word2vec_format("data/wiki-news-300d-1M.vec", binary=False, limit=50000)

# sort based on the index to make sure they are in the correct order
words=[k for k,v in sorted(vector_model.vocab.items(), key=lambda x:x[1].index)]
print("Words from embedding model:",len(words))
print("First 50 words:",words[:50])

# Normalize the vectors

print("Before normalization:",vector_model.get_vector("in")[:10])
vector_model.init_sims(replace=True)
print("After normalization:",vector_model.get_vector("in")[:10])

# Build vocabulary mappings

vocabulary={"<SPECIAL>": 0, "<OOV>": 1} # zero has a special meaning in sequence models, prevent using it for a normal word
for word in words:
    vocabulary.setdefault(word, len(vocabulary))

print("Words in vocabulary:",len(vocabulary))
inversed_vocabulary={value:key for key, value in vocabulary.items()} # inverse the dictionary

# Label mappings
label_set = set([label for sentence_labels in labels for label in sentence_labels])
label_map = {label: index for index, label in enumerate(label_set)}
                
# Embedding matrix

def load_pretrained_embeddings(vocab, embedding_model):
    """ vocab: vocabulary from our data vectorizer, embedding_model: model loaded with gensim """
    pretrained_embeddings=numpy.random.uniform(low=-0.05, high=0.05, size=(len(vocab)-1,embedding_model.vectors.shape[1]))
    pretrained_embeddings = numpy.vstack((numpy.zeros(shape=(1,embedding_model.vectors.shape[1])), pretrained_embeddings))
    found=0
    for word,idx in vocab.items():
        if word in embedding_model.vocab:
            pretrained_embeddings[idx]=embedding_model.get_vector(word)
            found+=1
            
    print("Found pretrained vectors for {found} words.".format(found=found))
    return pretrained_embeddings

pretrained=load_pretrained_embeddings(vocabulary, vector_model)


{'text': ['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.'], 'tags': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']}
[['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.'], ['[', 'This', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']']]
[['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN',

## Vectorizing data
If we want to consider the task as sequence labeling, we should feed the input data as word sequences and outputs as label sequences.

In [2]:
import numpy

def vectorizer(vocab, texts, label_map, labels=None):
    vectorized_data = [] # turn text into numbers based on our vocabulary mapping
    vectorized_labels = [] # same thing for the labels
    sentence_lengths = [] # Number of tokens in each sentence
    
    for i, one_example in enumerate(texts):
        vectorized_example = []
        vectorized_example_labels = []
        for word in one_example:
            vectorized_example.append(vocab.get(word, 1)) # 1 is our index for out-of-vocabulary tokens
        
        if labels:
            for label in labels[i]:
                vectorized_example_labels.append(label_map[label])

        vectorized_data.append(vectorized_example)
        vectorized_labels.append(vectorized_example_labels)
        
        sentence_lengths.append(len(one_example))
        
    vectorized_data = numpy.array(vectorized_data) # turn python list into numpy matrix
    vectorized_labels = numpy.array(vectorized_labels)
    
    return vectorized_data, vectorized_labels, sentence_lengths

vectorized_data, vectorized_labels, lengths=vectorizer(vocabulary, texts, label_map, labels)
validation_vectorized_data, validation_vectorized_labels, validation_lengths=vectorizer(vocabulary, validation_texts, label_map, validation_labels)

## Padding
We add padding to the label sequences as well.

In [3]:
import tensorflow as tf
### Only needed for me, not to block the whole GPU, you don't need this stuff
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))
### ---end of weird stuff

from keras.preprocessing.sequence import pad_sequences
print("Old shape:", vectorized_data.shape)
vectorized_data_padded=pad_sequences(vectorized_data, padding='post', maxlen=max(lengths))
print("New shape:", vectorized_data_padded.shape)
print("First example:", vectorized_data_padded[0])
# Even with the sparse output format, the shape has to be similar to the one-hot encoding
vectorized_labels_padded=numpy.expand_dims(pad_sequences(vectorized_labels, padding='post', maxlen=max(lengths)), -1)
print("Padded labels shape:", vectorized_labels_padded.shape)
print(label_map)
print("First example labels:", vectorized_labels_padded[0])

weights = numpy.copy(vectorized_data_padded)
weights[weights > 0] = 1
print("First weight vector:", weights[0])

# Same stuff for the validation data
validation_vectorized_data_padded=pad_sequences(validation_vectorized_data, padding='post', maxlen=max(lengths))
validation_vectorized_labels_padded=numpy.expand_dims(pad_sequences(validation_vectorized_labels, padding='post',maxlen=max(lengths)), -1)
validation_weights = numpy.copy(validation_vectorized_data_padded)
validation_weights[validation_weights > 0] = 1

Using TensorFlow backend.


Old shape: (12543,)
New shape: (12543, 159)
First example: [ 3424    37     1    11   285  1084   974 34462 10554  4733    37 43264
     2     3 16500    29     3  8683     8     3   754     6     1     2
   504     3  4761  1757     4     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     

## Evaluating POS tags
Keras does not use sample weighting in metrics (only for losses) (correct me if I'm wrong), so we have to create our own evaluation if we want to ignore padding in models which do not support masking (e.g. convolution).
Thus, to have evaluation that is identical for all models, we have to create our own script, which will ignore padded parts of the sequences.


In [4]:
import keras
from sklearn.metrics import accuracy_score

def accuracy(predictions, gold, lengths):
    pred_tags = numpy.concatenate([labels[:lengths[i]] for i, labels in enumerate(predictions)]).ravel()
    
    gold_tags = numpy.concatenate([labels[:lengths[i], 0] for i, labels in enumerate(gold)]).ravel()
    
    print('Accuracy:', accuracy_score(gold_tags, pred_tags))

class EvaluateTags(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        pred = numpy.argmax(self.model.predict(validation_vectorized_data_padded), axis=-1)
        accuracy(pred, validation_vectorized_labels_padded, validation_lengths) # FIXME: Using global variables here, not good!

## Independent classification
Time-distributed means that the same dense layer is applied to each time step. This means that we are now simply using a normal feedforward network to classify each word/token separately.

Why didn't we one-hot encode our labels? :S

It's because the sparse loss is doing it for us implicitly! Neat, right!

__Also, word embeddings are frozen!__

In [5]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Activation, Conv1D, TimeDistributed, LSTM, Bidirectional
from keras.optimizers import SGD, Adam

example_count, sequence_len = vectorized_data_padded.shape
class_count = len(label_set)

vector_size= pretrained.shape[1]

In [6]:
inp=Input(shape=(sequence_len,))
embeddings=Embedding(len(vocabulary), vector_size, mask_zero=True, trainable=False, weights=[pretrained])(inp)
hidden = TimeDistributed(Dense(100, activation="softmax"))(embeddings)
outp = TimeDistributed(Dense(class_count, activation="softmax"))(hidden)
model=Model(inputs=[inp], outputs=[outp])

optimizer=Adam(lr=0.001) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

print(model.summary())

# train
hist=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=100,verbose=2,epochs=10, callbacks=[EvaluateTags()])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 159)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 159, 300)          15000600  
_________________________________________________________________
time_distributed_1 (TimeDist (None, 159, 100)          30100     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 159, 17)           1717      
Total params: 15,032,417
Trainable params: 31,817
Non-trainable params: 15,000,600
_________________________________________________________________
None
Epoch 1/10
Accuracy: 0.2624254473161034
 - 2s - loss: 27.0499
Epoch 2/10
Accuracy: 0.3205168986083499
 - 1s - loss: 25.7245
Epoch 3/10
Accuracy: 0.4422266401590457
 - 1s - loss: 2

# Adding context with convolution

In [7]:
inp=Input(shape=(sequence_len,))
embeddings=Embedding(len(vocabulary), vector_size, mask_zero=False, trainable=False, weights=[pretrained])(inp)
cnn = Conv1D(100,3, activation='relu', padding='same')(embeddings)
outp=TimeDistributed(Dense(class_count, activation="softmax"))(cnn)
model=Model(inputs=[inp], outputs=[outp])

optimizer=Adam(lr=0.001) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

print(model.summary())

# train
hist=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=100,verbose=2,epochs=10, callbacks=[EvaluateTags()])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 159)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 159, 300)          15000600  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 159, 100)          90100     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 159, 17)           1717      
Total params: 15,092,417
Trainable params: 91,817
Non-trainable params: 15,000,600
_________________________________________________________________
None
Epoch 1/10
Accuracy: 0.8067196819085487
 - 3s - loss: 1.5569
Epoch 2/10
Accuracy: 0.875427435387674
 - 1s - loss: 0.5405
Epoch 3/10
Accuracy: 0.894831013916501
 - 1s - loss: 0.3591
Epoch 4/10
Accuracy: 0.905248508946322
 - 1s - loss: 0.2945
Epoch 5/10


## Bidirectional LSTM

If you want to see the training progress, use verbose=1 instead of verbose=2.

Warning: training this on on class room computers on the full data set will take for a while...

In [10]:
inp=Input(shape=(sequence_len,))
embeddings=Embedding(len(vocabulary), vector_size, mask_zero=True, trainable=False, weights=[pretrained])(inp)
rnn = Bidirectional(LSTM(100, activation='tanh', return_sequences=True))(embeddings)
outp=TimeDistributed(Dense(class_count, activation="softmax"))(rnn)
model=Model(inputs=[inp], outputs=[outp])

optimizer=Adam(lr=0.001) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

print(model.summary())

# train
hist=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=100,verbose=2,epochs=10, callbacks=[EvaluateTags()])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 159)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 159, 300)          15000600  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 159, 200)          320800    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 159, 17)           3417      
Total params: 15,324,817
Trainable params: 324,217
Non-trainable params: 15,000,600
_________________________________________________________________
None
Epoch 1/10
Accuracy: 0.7654075546719682
 - 253s - loss: 18.2768
Epoch 2/10
Accuracy: 0.8652882703777336
 - 240s - loss: 5.8403
Epoch 3/10
Accuracy: 0.8924055666003976
 - 242s - loss: 3.6612
Epoch 4/10
Accuracy: 0.9033001988071571
 - 244s - loss: 2.933

Skeleton for tagging a sentence.

In [11]:
inverse_label_map = {value: key for key, value in label_map.items()}

def tag_sentence(sentence):
    tokens = sentence.split() # Stupid whitespace tokenization
    vectorized_sentence, _, sentence_length=vectorizer(vocabulary, [tokens], label_map) # Using our global variables again...
    vectorized_sentence_padded = pad_sequences(vectorized_sentence, padding='post', maxlen=max(lengths)) # Pad the sequence

    predictions = model.predict(vectorized_sentence_padded)[0] # Everything so far has been a 'list' of sentences with a single sentence, so we only take index 0
    predictions = numpy.argmax(predictions, axis=-1) # Take the tag index with the highest value for each token
    
    tags = [inverse_label_map[label_index] for label_index in predictions[:len(tokens)]] # Ignore padded region
    return tags, tokens
    
while True:
    sentence=input("sentence> ")
    if sentence=="end":
        break
    tags, tokens = tag_sentence(sentence)
    for token, tag in zip(tokens, tags):
        print(token, tag)

sentence> What a great sentence this is !
What PRON
a DET
great ADJ
sentence NOUN
this PRON
is AUX
! PUNCT
sentence> Another one could be even better , but it has too many words .
Another DET
one NOUN
could AUX
be AUX
even ADV
better ADJ
, PUNCT
but CCONJ
it PRON
has VERB
too ADV
many ADJ
words NOUN
. PUNCT
sentence> end
