In [1]:
!pip install keras

import tensorflow as tf
import keras as K
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
import io
from os import path
from keras.layers import Bidirectional, LSTM

[33mYou are using pip version 18.0, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [17]:
PAD = "<PAD>"
UNK = "<UNK>"
MAX_SIZE = 24
HIDDEN_SIZE = 256
TRAIN_INPUT_PATH = "../resources/train/input/msr.utf8"
DEV_INPUT_PATH = "../resources/dev/input/msr.utf8"
TRAIN_LABELS_PATH = "../resources/train/labels/msr.utf8"
DEV_LABELS_PATH = "../resources/dev/labels/msr.utf8"
MODEL_WEIGHTS_PATH = "my_model_weights.h5"

In [10]:
def build_vocab(data):
    word_to_id_uni = dict()
    word_to_id_bi = dict()
    word_to_id_uni['<PAD>'] = 0
    word_to_id_uni['<UNK>'] = 1
    word_to_id_bi['<PAD>'] = 0
    word_to_id_bi['<UNK>'] = 1
    
    uni_index = 2
    bi_index = 2
    
    if isinstance(data, io.TextIOWrapper):
        for line in data:
            line = line.strip()
            for i in range(len(line)):
                if line[i] not in word_to_id_uni:
                    word_to_id_uni[line[i]] = uni_index
                    uni_index += 1
                if i < len(line) - 2:
                    if line[i:i + 2] not in word_to_id_bi:
                        word_to_id_bi[line[i:i + 2]] = bi_index
                        bi_index += 1


    id_to_word_uni = {v: k for k, v in word_to_id_uni.items()}
    id_to_word_bi = {v: k for k, v in word_to_id_bi.items()}

    return word_to_id_uni, id_to_word_uni, word_to_id_bi, id_to_word_bi

In [11]:
with open(
        TRAIN_INPUT_PATH, 'r', encoding='utf-8') as f:
    word_to_id_as_uni, id_to_word_as_uni, word_to_id_as_bi, id_to_word_as_bi = build_vocab(f)

VOCAB_SIZE_UNI = len(word_to_id_as_uni)
VOCAB_SIZE_BI = len(word_to_id_as_bi)

print("Vocab size uni: " + str(VOCAB_SIZE_UNI))
print("Vocab size bi: " + str(VOCAB_SIZE_BI))

Vocab size uni: 5169
Vocab size bi: 424026


In [12]:
def create_input_dataset(file, word_to_id_uni, word_to_id_bi):
    x_uni = []
    x_bi = []
    for line in file:
        line = line.strip()
        feature_vector_uni = []
        feature_vector_bi = []
        # Build feature vector
        for i in range(len(line)):
            unigram = line[i]
            if unigram in word_to_id_uni:
                feature_vector_uni.append(word_to_id_uni[unigram])
            else:
                feature_vector_uni.append(word_to_id_uni[UNK])

            if i < len(line) - 2:
                bigram = line[i:i + 2]
                if bigram in word_to_id_bi:
                    feature_vector_bi.append(word_to_id_bi[bigram])
                else:
                    feature_vector_bi.append(word_to_id_bi[UNK])

        x_uni.append(np.array(feature_vector_uni))
        x_bi.append(np.array(feature_vector_bi))
    return np.array(x_uni), np.array(x_bi)


In [13]:
def BIES_to_numerical(file_path):
    BIES_to_number = {'B': 0, 'I': 1, 'E': 2, 'S': 3}
    y = []
    with open(file_path, 'r', encoding='utf-8') as f:

        for line in f:
            line = line.strip()
            new_line = []
            for ch in line:
                new_line.append(str(BIES_to_number[ch]))
            y.append(new_line)
    return np.array(y)

In [14]:
with open(
        TRAIN_INPUT_PATH, 'r', encoding='utf8') as f:
    train_x_as_uni, train_x_as_bi = create_input_dataset(f, word_to_id_as_uni, word_to_id_as_bi)

with open(
        DEV_INPUT_PATH, 'r', encoding='utf8') as f:
    dev_x_as_uni, dev_x_as_bi = create_input_dataset(f, word_to_id_as_uni, word_to_id_as_bi)

In [18]:
train_y_as = BIES_to_numerical(TRAIN_LABELS_PATH)
dev_y_as = BIES_to_numerical(DEV_LABELS_PATH)

In [19]:
train_x_as_uni = pad_sequences(train_x_as_uni, truncating='pre', padding='post', maxlen=MAX_SIZE)
train_x_as_bi = pad_sequences(train_x_as_bi, truncating='pre', padding='post', maxlen=MAX_SIZE)
dev_x_as_uni = pad_sequences(dev_x_as_uni, truncating='pre', padding='post', maxlen=MAX_SIZE)
dev_x_as_bi = pad_sequences(dev_x_as_bi, truncating='pre', padding='post', maxlen=MAX_SIZE)
train_y_as = pad_sequences(train_y_as, truncating='pre', padding='post', maxlen=MAX_SIZE)
dev_y_as = pad_sequences(dev_y_as, truncating='pre', padding='post', maxlen=MAX_SIZE)

In [20]:
print([id_to_word_as_uni[i] for i in train_x_as_uni[0]])
print()
print([id_to_word_as_bi[i] for i in train_x_as_bi[0]])
print(train_y_as[0])
print([id_to_word_as_uni[i] for i in dev_x_as_uni[0]])
print([id_to_word_as_bi[i] for i in dev_x_as_bi[0]])
print(dev_y_as[0])

['可', '多', '得', '的', '教', '科', '书', '，', '她', '确', '实', '是', '名', '副', '其', '实', '的', '‘', '我', '的', '大', '学', '’', '。']

['是不', '不可', '可多', '多得', '得的', '的教', '教科', '科书', '书，', '，她', '她确', '确实', '实是', '是名', '名副', '副其', '其实', '实的', '的‘', '‘我', '我的', '的大', '大学', '学’']
[1 1 2 3 0 1 2 3 3 0 2 3 0 1 1 2 3 3 3 3 0 2 3 3]
['扬', '帆', '远', '东', '做', '与', '中', '国', '合', '作', '的', '先', '行', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['扬帆', '帆远', '远东', '<UNK>', '做与', '与中', '中国', '国合', '合作', '作的', '的先', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
[0 2 0 2 3 3 0 2 0 2 3 0 2 0 0 0 0 0 0 0 0 0 0 0]


In [21]:
train_y_as = K.utils.to_categorical(train_y_as, 4, dtype='int')
dev_y_as = K.utils.to_categorical(dev_y_as, 4, dtype='int')

In [22]:
print(train_x_as_uni.shape)
print(train_x_as_bi.shape)
print(train_y_as.shape)
print(dev_x_as_uni.shape)
print(dev_x_as_bi.shape)
print(dev_y_as.shape)

(86924, 24)
(86924, 24)
(86924, 24, 4)
(3985, 24)
(3985, 24)
(3985, 24, 4)


In [23]:
print([id_to_word_as_uni[i] for i in train_x_as_uni[0]])
print()
print([id_to_word_as_bi[i] for i in train_x_as_bi[0]])
print(train_y_as[0])
print(dev_x_as_uni[0])
print(dev_x_as_bi[0])
print(dev_y_as[0])

['可', '多', '得', '的', '教', '科', '书', '，', '她', '确', '实', '是', '名', '副', '其', '实', '的', '‘', '我', '的', '大', '学', '’', '。']

['是不', '不可', '可多', '多得', '得的', '的教', '教科', '科书', '书，', '，她', '她确', '确实', '实是', '是名', '名副', '副其', '其实', '实的', '的‘', '‘我', '我的', '的大', '大学', '学’']
[[0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 1]]
[ 567 2444  560   55  162   18  504  327  679  183   20  113  344    0
    0    0    0    0    0    0    0    0    0    0]
[198681 175475 114528      1 269153  51534  14699 114417   4418    979
   1505      0      0      0      0      0      0      0      0      0
      0      0      0      0]
[[1 0 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 

In [24]:
def create_keras_model(vocab_size_uni, vocab_size_bi, embedding_size, hidden_size, dropout, recurrent_dropout):
    print("Creating KERAS model")

    # define LSTM
    uni_input_layer = K.layers.Input((MAX_SIZE,))
    bi_input_layer = K.layers.Input((MAX_SIZE,))
    uni_embedding = K.layers.Embedding(vocab_size_uni, embedding_size, mask_zero=True)(uni_input_layer)
    bi_embedding = K.layers.Embedding(vocab_size_bi, embedding_size, mask_zero=True)(bi_input_layer)
    concatenated_layer = K.layers.concatenate([uni_embedding, bi_embedding], axis=-1)
    bidirectional_layer = Bidirectional(LSTM(hidden_size, dropout=dropout, recurrent_dropout=recurrent_dropout, return_sequences=True))(concatenated_layer)
    output = K.layers.TimeDistributed(K.layers.Dense(4, activation='softmax'))(bidirectional_layer)
    
    model = K.models.Model(inputs=[uni_input_layer, bi_input_layer], outputs=[output])

    # we are going to use the Adam optimizer which is a really powerful optimizer.
    optimizer = K.optimizers.Adam(lr=0.03)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])

    return model

In [None]:
batch_size = 32
epochs = 50
EMBEDDING_SIZE = 32
model = create_keras_model(VOCAB_SIZE_UNI, VOCAB_SIZE_BI, EMBEDDING_SIZE, HIDDEN_SIZE, 0.2, 0.2)
# Let's print a summary of the model
model.save_weights(MODEL_WEIGHTS_PATH)
model.load_weights(MODEL_WEIGHTS_PATH)
model.summary()

cbk = K.callbacks.TensorBoard("logging/keras_model")
print("\nStarting training...")
model.fit([train_x_as_uni, train_x_as_bi], train_y_as, epochs=epochs, batch_size=batch_size,
          shuffle=True, validation_data=([dev_x_as_uni, dev_x_as_bi], dev_y_as), callbacks=[cbk])
print("Training complete.\n")

#print("\nEvaluating test...")
#loss_acc = model.evaluate(test_x, test_y, verbose=0)
#print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))

Creating KERAS model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 24)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 24)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 24, 32)       165408      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 24, 32)       13568832    input_2[0][0]                    
________________________________________________________________________________________