Импортируем библиотеки и подготавливаем данные

In [1]:
import nltk
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import keras
import keras.layers as L
import warnings
warnings.filterwarnings('ignore')

# Fetch the Brown corpus and the universal tagset mapping, then load the tagged sentences.
nltk.download('brown')
nltk.download('universal_tagset')
data = nltk.corpus.brown.tagged_sents(tagset='universal')
# Index 0 ('#EOS#') doubles as the padding id, index 1 ('#UNK#') is the unknown token.
all_tags = ['#EOS#','#UNK#','ADV', 'NOUN', 'ADP', 'PRON', 'DET', '.', 'PRT', 'VERB', 'X', 'NUM', 'CONJ', 'ADJ']

# Lowercase every word. Sentences have different lengths, so they are stored in a
# 1-D object array that is filled element by element: calling np.array() directly
# on a ragged nested list raises ValueError on NumPy >= 1.24.
lowercased = [[(word.lower(), tag) for word, tag in sentence] for sentence in data]
data = np.empty(len(lowercased), dtype=object)
for i, sentence in enumerate(lowercased):
    data[i] = sentence

train_data, test_data = train_test_split(data,test_size=0.25,random_state=42)

# Word frequencies are counted over the whole corpus (train and test together).
from collections import Counter
word_counts = Counter()
for sentence in data:
    word_counts.update(word for word, _ in sentence)

# Vocabulary: the two special tokens followed by the 10000 most frequent words.
all_words = ['#EOS#','#UNK#'] + [word for word, _ in word_counts.most_common(10000)]

# Words outside the vocabulary map to id 1, the position of '#UNK#' in all_words.
from collections import defaultdict
word_to_id = defaultdict(lambda:1, { word: i for i, word in enumerate(all_words) })
tag_to_id = { tag: i for i, tag in enumerate(all_tags)}


def to_matrix(lines, token_to_id, max_len=None, pad=0, dtype='int32', time_major=False):
    """Convert token sequences into a padded matrix of token ids.

    Args:
        lines: sequence of token sequences (one per row of the result).
        token_to_id: mapping from token to integer id, indexed with ``[]``.
        max_len: number of columns; longer sequences are truncated to it.
            When falsy (``None`` or ``0``) the longest sequence length is used.
        pad: value written after the end of each sequence.
        dtype: dtype of the returned array.
        time_major: if true, return the transposed ``[max_len, len(lines)]`` matrix.

    Returns:
        A ``[len(lines), max_len]`` array (transposed when ``time_major``).
        An empty ``lines`` yields an empty matrix instead of raising.
    """
    # default=0 keeps an empty batch from crashing max() on an empty sequence.
    max_len = max_len or max(map(len, lines), default=0)
    matrix = np.full([len(lines), max_len], pad, dtype=dtype)

    for row, line in enumerate(lines):
        # Look up every token first, then truncate to the matrix width.
        token_ids = [token_to_id[token] for token in line][:max_len]
        matrix[row, :len(token_ids)] = token_ids

    return matrix.T if time_major else matrix

BATCH_SIZE=32
def generate_batches(sentences,batch_size=BATCH_SIZE,max_len=None,pad=0):
    """Endlessly yield shuffled training batches.

    Args:
        sentences: numpy array whose elements are sequences of (word, tag) pairs.
        batch_size: maximum number of sentences per batch.
        max_len: passed to to_matrix as the number of time steps (None = longest in batch).
        pad: id used to pad both the word matrix and the tag matrix.

    Yields:
        (batch_words, batch_tags_1hot): the word-id matrix produced by to_matrix and the
        one-hot tag tensor of shape ``batch_tags.shape + (len(all_tags),)``.

    Raises:
        ValueError: if ``sentences`` is empty (the loop below would otherwise spin
            forever without yielding anything).
    """
    assert isinstance(sentences,np.ndarray),"Make sure sentences is q numpy array"
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one sentence")

    while True:
        # A fresh permutation per pass over the data.
        indices = np.random.permutation(np.arange(len(sentences)))
        # Step up to len(indices), not len(indices)-1: the old bound dropped the final
        # sample whenever len(sentences) % batch_size == 1.
        for start in range(0, len(indices), batch_size):
            batch_indices = indices[start:start+batch_size]
            batch_sentences = sentences[batch_indices]
            batch_words = [[word for word, _ in sent] for sent in batch_sentences]
            batch_tags = [[tag for _, tag in sent] for sent in batch_sentences]

            batch_words = to_matrix(batch_words,word_to_id,max_len,pad)
            batch_tags = to_matrix(batch_tags,tag_to_id,max_len,pad)

            # One-hot encode the tag ids, keeping the [batch, time] layout in front.
            batch_tags_1hot = to_categorical(batch_tags,len(all_tags)).reshape(batch_tags.shape+(-1,))
            yield batch_words,batch_tags_1hot
        
        
        
def compute_test_accuracy(model):
    """Return the model's per-token tag accuracy on test_data, ignoring padding.

    Every sentence of test_data is converted to word-id and tag-id matrices with
    to_matrix; positions whose word id is 0 are excluded from the score.
    """
    # Split each sentence into (words, tags), then regroup into two parallel tuples.
    per_sentence = [zip(*sentence) for sentence in test_data]
    word_seqs, tag_seqs = zip(*per_sentence)
    word_ids = to_matrix(word_seqs, word_to_id)
    tag_ids = to_matrix(tag_seqs, tag_to_id)

    # Tag probabilities of shape [batch, time, n_tags]; take the most likely tag per token.
    tag_probabilities = model.predict(word_ids, verbose=1)
    best_tags = tag_probabilities.argmax(axis=-1)

    # Score only the positions that hold a real token (word id != 0).
    is_token = word_ids != 0
    correct = np.sum(np.logical_and(best_tags == tag_ids, is_token))
    total = np.sum(is_token)
    return float(correct) / total


class EvaluateAccuracy(keras.callbacks.Callback):
    """Keras callback that prints the tagging accuracy after every epoch."""

    def on_epoch_end(self,epoch,logs=None):
        """Print the accuracy returned by compute_test_accuracy for the current model."""
        # Flush before printing, presumably to keep the report apart from buffered output.
        sys.stdout.flush()
        print("\nMeasuring validation accuracy...")
        accuracy = compute_test_accuracy(self.model)
        print("\nValidation accuracy: %.5f\n"%accuracy, flush=True)
        
# Sample batch: unzip the last three corpus sentences into parallel word and tag tuples.
batch_words, batch_tags = zip(*(zip(*tagged) for tagged in data[-3:]))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\vladislav.sterkhov\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\vladislav.sterkhov\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package universal_tagset is already up-to-date!


Создадим модель: слой эмбеддингов, GRU и двунаправленный LSTM

In [2]:
model = keras.Sequential()
# Map each word id to a 64-dimensional embedding vector.
model.add(L.Embedding(input_dim=len(all_words), output_dim=64))
# The GRU returns the whole sequence: a 3D tensor of shape (batch_size, timesteps, 100).
# No input_shape here: it only applies to the first layer of a Sequential model.
model.add(L.GRU(100, return_sequences=True))
# The bidirectional LSTM outputs both directions side by side: (batch_size, timesteps, 2 * 14 = 28).
model.add(L.Bidirectional(L.LSTM(14, return_sequences=True)))
# Per-timestep softmax over the tag set.
model.add(L.Dense(len(all_tags),activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          640128    
                                                                 
 gru (GRU)                   (None, None, 100)         49800     
                                                                 
 bidirectional (Bidirectiona  (None, None, 28)         12880     
 l)                                                              
                                                                 
 dense (Dense)               (None, None, 14)          406       
                                                                 
Total params: 703,214
Trainable params: 703,214
Non-trainable params: 0
_________________________________________________________________


Обучим модель и вычислим точность

In [3]:
model.compile('adam','categorical_crossentropy')

# steps_per_epoch must be a whole number of batches; round up so a trailing
# partial batch still counts as a step.
steps_per_epoch = (len(train_data) + BATCH_SIZE - 1) // BATCH_SIZE

# Model.fit accepts Python generators directly; fit_generator is deprecated.
model.fit(generate_batches(train_data), steps_per_epoch=steps_per_epoch,
          callbacks=[EvaluateAccuracy()], epochs=5)

acc = compute_test_accuracy(model)
print("\nFinal accuracy: %.5f"%acc)

# Notebook self-check: the bidirectional model is expected to exceed 96% accuracy.
assert acc>0.96, "Bidirectional RNNs are better than this!"
print("Well done!")

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.94719

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.95590

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.95953

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.96199

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.96338


Final accuracy: 0.96338
Well done!


Модель успешно проходит проверку: точность на тестовой выборке превышает 96%.