In [1]:
from gensim.models.wrappers import FastText

import numpy as np
import random
import time
import io

import tensorflow as tf
from keras import backend as K

from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional, Input, concatenate, add, multiply
from keras.layers import Conv1D, MaxPooling1D, Flatten, Reshape, GlobalMaxPooling1D, Highway, Permute, Lambda
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.models import Model
from keras.optimizers import Adam

import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

session = tf.Session(config = tf.ConfigProto(inter_op_parallelism_threads = 0,
                                             intra_op_parallelism_threads = 0,
                                             log_device_placement = True))

K.set_session(session)

Using TensorFlow backend.


In [2]:
# Loading FASTTEXT english model bin
word_embeddings_path = 'embeddings\cc.en.300.bin'
lang_model = FastText.load_fasttext_format(word_embeddings_path)

In [3]:
MAX_COLUMNS = 2
WORD_COL_NUM = 0
LABEL_COL_NUM = 3

Vocabulary = set()

def read_file(file_path, vocab):
       
    corpus_sentences = []
    input_sentence = []
    
    with open(file_path, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            line = line.strip()

            if len(line) == 0 or line[0] == '#':
                if len(input_sentence) > 0:
                    corpus_sentences.append(input_sentence)
                    input_sentence = []
                continue
            
            line_splt = line.split(' ')
            if len(line_splt) < MAX_COLUMNS:
                continue
            
            vocab.add(line_splt[0])
            input_sentence.append(line_splt)

    if len(input_sentence) > 0:
        corpus_sentences.append(input_sentence)

    print(file_path, len(corpus_sentences), "sentences")
    return corpus_sentences

#Пропишите путь к частям корпуса CoNLL-2003
train_path = 'conll2003\\eng.train.txt'
train_sentences = read_file(train_path, Vocabulary)

dev_path = 'conll2003\\eng.testa.txt'
dev_sentences = read_file(dev_path, Vocabulary)

test_path = 'conll2003\\eng.testb.txt'
test_sentences = read_file(test_path, Vocabulary)

conll2003\eng.train.txt 14987 sentences
conll2003\eng.testa.txt 3466 sentences
conll2003\eng.testb.txt 3684 sentences


In [4]:
word2idx = {}
word_embeddings = []
embedding_size = len(lang_model['size'])

word2idx["PADDING_TOKEN"] = len(word2idx)
word_embeddings.append(np.zeros(embedding_size))

word2idx["UNKNOWN_TOKEN"] = len(word2idx)
word_embeddings.append(np.random.uniform(-0.25, 0.25, embedding_size))

for token in Vocabulary:
    if token not in word2idx:
        try:
            word_embeddings.append(lang_model[token])
            word2idx[token] = len(word2idx)
        except:
            pass

word_embeddings = np.array(word_embeddings, dtype='float32')

In [5]:
char2idx = {}
char2idx["PADDING_TOKEN"] = 0

for token in Vocabulary:
    for char in list(token):
        if char not in char2idx:
            char2idx[char] = len(char2idx)

In [6]:
label_set = set()
label_set.add('PADDING_LABEL')
for dataset in [train_sentences, dev_sentences, test_sentences]:
    for sentence in dataset:
        for token in sentence:
            label = token[LABEL_COL_NUM]
            label_set.add(label)    

label2idx = {}
idx2label = {}
for label in label_set:
    label2idx[label] = len(label2idx)
    
print(label2idx)

{'I-MISC': 0, 'I-ORG': 1, 'I-PER': 2, 'B-ORG': 3, 'PADDING_LABEL': 4, 'O': 5, 'B-LOC': 6, 'I-LOC': 7, 'B-MISC': 8}


In [7]:
cnn_len = 16

def create_matrices(sentences, word2idx, label2idx):   
    
    unknown_idx = word2idx['UNKNOWN_TOKEN']
    padding_idx = word2idx['PADDING_TOKEN'] 
    
    padding_label = label2idx['PADDING_LABEL']  
    
    dataset = []
    total_tokens = 0
    unknown_tokens = 0
    
    for sentence in sentences:
        
        proper_sentence_start = 1

        word_indices = np.array([padding_idx] * (len(sentence) + 2))
        label_indices = np.array([padding_label] * (len(sentence) + 2))
        
        char_codes = [[0] * cnn_len]
        
        for pos_in_sentence, word in enumerate(sentence):

            token_unknown, word_idx = get_token_indices(word, word2idx, unknown_idx)

            pos_in_padded_sentence = pos_in_sentence + proper_sentence_start
            
            word_indices[pos_in_padded_sentence] = word_idx
            label_indices[pos_in_padded_sentence] = label2idx[word[LABEL_COL_NUM]]

            total_tokens += 1
            if token_unknown:
                unknown_tokens += 1
            
            st = word[WORD_COL_NUM][:cnn_len]
            char_codes.append([char2idx[char] for char in list(st)] + [char2idx["PADDING_TOKEN"]] * (cnn_len - len(st)))
            
        char_codes.append([0] * cnn_len)
        char_records = np.array(char_codes)
        
        dataset.append([word_indices, label_indices, char_records])
        
    percent = 0.0
    if total_tokens != 0:
        percent = float(unknown_tokens) / total_tokens * 100
    print("{} tokens, {} unknown, {:.3}%".format(total_tokens, unknown_tokens, percent ))
    return dataset

def get_token_indices(token, word2idx, unknown_idx):

    token_unknown = False
    
    word = token[WORD_COL_NUM]
    
    if word2idx.get(word) is not None:
        word_idx = word2idx[word]
    else:
        word_idx = unknown_idx
        token_unknown = True

    return token_unknown, word_idx

train_data = create_matrices(train_sentences, word2idx, label2idx)
dev_data = create_matrices(dev_sentences, word2idx, label2idx)
test_data = create_matrices(test_sentences, word2idx, label2idx)

for sentence in train_data[:4]:
    print()
    print(sentence[:2])

204567 tokens, 605 unknown, 0.296%
51578 tokens, 93 unknown, 0.18%
46666 tokens, 155 unknown, 0.332%

[array([    0, 20165,     0]), array([4, 5, 4])]

[array([    0,  9038,  6764,  4327, 26582,  4854, 21114, 24727, 26393,
       17773,     0]), array([4, 1, 5, 0, 5, 5, 5, 0, 5, 5, 4])]

[array([   0, 7385, 5006,    0]), array([4, 2, 2, 4])]

[array([    0, 18148, 20407,     0]), array([4, 7, 5, 4])]


In [8]:
SENTENCE_LSTM_DIM = 100
CNN_FILTERS = 200
CNN_WIN = 5
dim_char = 50

n_out = len(label2idx)

token_input = Input(dtype='int32', shape=(None,), name='token_input')
token_embedding_layer = Embedding(input_dim=word_embeddings.shape[0], 
                                   output_dim=word_embeddings.shape[1],
                                   weights=[word_embeddings], trainable=False, 
                                   name='token_embeddings')
tokens = token_embedding_layer(token_input)

char_input = Input(dtype='int32', shape=(None, cnn_len), name='char_input')
char_embedding_layer = Embedding(input_dim=len(char2idx), output_dim=dim_char, name='char_embedding_layer')
char_embeddings = char_embedding_layer(char_input)

char_cnn = TimeDistributed(Conv1D(filters=CNN_FILTERS, kernel_size=CNN_WIN), name='char_cnn')(char_embeddings)
char_activation = TimeDistributed(PReLU(), name='char_activation')(char_cnn)
char_pooling = TimeDistributed(GlobalMaxPooling1D(), name='char_pooling')(char_activation)
char_highway = TimeDistributed(Highway(), name='char_highway')(char_pooling)
chars = TimeDistributed(Dropout(0.3), name = "chars")(char_highway)

merged_embeddings = concatenate([tokens, chars], name='merged_embeddings')

blstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(SENTENCE_LSTM_DIM, return_sequences=True, return_state=True, implementation=2), name='blstm')(merged_embeddings)

state_h_concat = concatenate([forward_h, backward_h], name = 'state_h_concat')
state_h = Lambda(lambda x: tf.expand_dims(x, axis = 1), name = 'state_h')(state_h_concat)

attention_W1 = TimeDistributed(Dense(SENTENCE_LSTM_DIM), name = 'attention_W1')(blstm)
attention_W2 = TimeDistributed(Dense(SENTENCE_LSTM_DIM), name = 'attention_W2')(state_h)
attention_W = add([attention_W1, attention_W2], name = 'attention_W')

attention_scores = Lambda(lambda x: tf.nn.tanh(x), name = 'attention_scores')(attention_W)
attention_V = TimeDistributed(Dense(1), name = 'attention_V')(attention_scores)
attention_weights = Lambda(lambda x: tf.nn.softmax(x, axis = 1), name = 'attention_weights')(attention_V)

context_agg = concatenate([blstm, attention_weights], name = "context_agg")
context = TimeDistributed(Dropout(0.3), name = "context")(context_agg)

dense = TimeDistributed(Dense(n_out), name='dense')(context)
activation = TimeDistributed(PReLU(), name='activation')(dense)

result = TimeDistributed(Dense(n_out, activation='softmax'), name='result')(activation)

model = Model(inputs=[token_input, char_input], outputs=result)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 16)     0                                            
__________________________________________________________________________________________________
char_embedding_layer (Embedding (None, None, 16, 50) 4300        char_input[0][0]                 
__________________________________________________________________________________________________
char_cnn (TimeDistributed)      (None, None, 12, 200 50200       char_embedding_layer[0][0]       
__________________________________________________________________________________________________
char_activation (TimeDistribute (None, None, 12, 200 2400        char_cnn[0][0]                   
__________________________________________________________________________________________________
char_pooli

In [9]:
number_of_epochs = 10
print("%d epochs" % number_of_epochs)
print()

def iterate_minibatches(dataset):   
    for sentence in dataset:
        tokens, labels, chars = sentence     
        labels = np.expand_dims(labels, -1) 
        yield np.asarray([tokens]), np.asarray([labels]), np.asarray([chars])
        
def tag_dataset(dataset):
    predicted_labels = []
    correct_labels = []
    for tokens, labels, chars in dataset:
        pred = model.predict_on_batch([np.asarray([tokens]), np.asarray([chars])])[0]
        pred_labels = [el.tolist().index(max(el)) for el in pred]
        predicted_labels.append(pred_labels)
        correct_labels.append(labels)
    return predicted_labels, correct_labels
   
def compute_f1(predictions, correct, padding_label, no_entity_label):
    total_tokens = 0
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    for guessed_sentence, correct_sentence in zip(predictions, correct):
        assert (len(guessed_sentence) == len(correct_sentence)), "Guessed and correct sentences do not match"
        
        local_cnt, local_tps = 0, 0
        for j in range(len(guessed_sentence)):
            if (correct_sentence[j] != padding_label):
                total_tokens += 1
                
                if (guessed_sentence[j] == no_entity_label):
                    if (guessed_sentence[j] == correct_sentence[j]):
                        pass
                    else:
                        false_negatives += 1
                        
                else:
                    local_cnt += 1
                    if (guessed_sentence[j] == correct_sentence[j]):
                        local_tps += 1
                        
                    if (guessed_sentence[j] != guessed_sentence[j+1]):
                        if (local_cnt == local_tps):
                            true_positives += 1
                        else:
                            false_positives += 1
                                    
                        local_cnt, local_tps = 0, 0

    if total_tokens == 0:
        return float(0)
    else:        
        precision = true_positives / (true_positives + false_positives)
        recall = true_positives / (true_positives + false_negatives)
        f1 = 2 * precision * recall / (precision + recall)
        return f1

print("%d train sentences" % len(train_data))
print("%d dev sentences" % len(dev_data))
print("%d test sentences" % len(test_data))

padding_label = label2idx['PADDING_LABEL']
no_entity_label = label2idx['O']

for epoch in range(number_of_epochs):    
    print()
    print("--------- Epoch %d -----------" % epoch)
    random.shuffle(train_data)
    
    start_time = time.time()    
    for batch in iterate_minibatches(train_data):
        tokens, labels, chars = batch       
        model.train_on_batch([tokens, chars], labels)   
    print("%.2f sec for training" % (time.time() - start_time))
    print()
    
    #Train Dataset       
    start_time = time.time()  
    print("================================== Train Data ==================================")
    predicted_labels, correct_labels = tag_dataset(train_data)        
    accuracy = compute_f1(predicted_labels, correct_labels, padding_label, no_entity_label)
    print("f1 = ", accuracy)

    #Dev Dataset 
    print("================================== Dev Data: ==================================")
    predicted_labels, correct_labels = tag_dataset(dev_data)  
    accuracy = compute_f1(predicted_labels, correct_labels, padding_label, no_entity_label)
    print("f1 = ", accuracy)

    #Test Dataset 
    print("================================== Test Data: ==================================")
    predicted_labels, correct_labels = tag_dataset(test_data)  
    accuracy = compute_f1(predicted_labels, correct_labels, padding_label, no_entity_label)
    print("f1 = ", accuracy)
    print()
    print("%.2f sec for evaluation" % (time.time() - start_time))

10 epochs

14987 train sentences
3466 dev sentences
3684 test sentences

--------- Epoch 0 -----------
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
216.50 sec for training

f1 =  0.9279234843773455
f1 =  0.9196760040499494
f1 =  0.8931863365262968

54.38 sec for evaluation

--------- Epoch 1 -----------
203.63 sec for training

f1 =  0.9577313842328633
f1 =  0.9395395901846699
f1 =  0.9192220714608774

49.44 sec for evaluation

--------- Epoch 2 -----------
185.05 sec for training

f1 =  0.9635309278350517
f1 =  0.945062409781778
f1 =  0.9118634837069982

49.89 sec for evaluation

--------- Epoch 3 -----------
189.35 sec for training

f1 =  0.9734842015371479
f1 =  0.9474482061647297
f1 =  0.9154739400486093

49.77 sec for evaluation

--------- Epoch 4 -----------
197.48 sec for training

f1 =  0.977962257691814
f1 =  0.9505318250886375
f1 =  0.9161940768746062

59.66 sec for evaluation

--------- Epoch 5 