# train the keras model

In [6]:
import numpy as np
from dataset import index_sents
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers.wrappers import Bidirectional
from keras.layers import Activation, concatenate, Dense, Input, LSTM, Dropout, Embedding
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from keras_tqdm import TQDMNotebookCallback
from gensim.models import Word2Vec
from mlxtend.preprocessing import one_hot
from embedding import load_vocab
from attention import Attention

In [2]:
# load data from npys (see preprocessing.ipynb)
word2idx = np.load('../00_data/encoded/word2idx.npy').item()
idx2word = np.load('../00_data/encoded/idx2word.npy').item()
pos2idx = np.load('../00_data/encoded/pos2idx.npy').item()
idx2pos = np.load('../00_data/encoded/idx2pos.npy').item()
ner2idx = np.load('../00_data/encoded/ner2idx.npy').item()
idx2ner = np.load('../00_data/encoded/idx2ner.npy').item()

X_tokens = list(np.load('../00_data/encoded/add_tokens.npy'))
X_postags = list(np.load('../00_data/encoded/add_postags.npy'))
y_nertags = list(np.load('../00_data/encoded/add_nertags.npy'))

In [3]:
# load embedding data
w2v_vocab, _ = load_vocab('../00_data/embeddings/text_mapping.json')
w2v_model = Word2Vec.load('../00_data/embeddings/text_embeddings.gensimmodel')
w2v_pvocab, _ = load_vocab('../00_data/embeddings/postag_mapping.json')
w2v_pmodel = Word2Vec.load('../00_data/embeddings/postag_embeddings.gensimmodel')

In [4]:
# change category label here
y_speechacts = np.load('../00_data/encoded/add_speechacts.npy')
y_topics = np.load('../00_data/encoded/add_topics.npy')

In [5]:
# create speechact dictionary
def listfromdicts(lst):
    setlst = list(set(lst))
    x2i = dict(list(zip(setlst, [i for i in range(len(setlst))])))
    i2x = dict(list(zip([i for i in range(len(setlst))], setlst)))
    return x2i, i2x

In [7]:
# create and save dictionaries
sa2idx, idx2sa = listfromdicts(y_speechacts)
np.save('../00_data/encoded/sa2idx.npy', sa2idx)
np.save('../00_data/encoded/idx2sa.npy', idx2sa)
top2idx, idx2top = listfromdicts(y_topics)
np.save('../00_data/encoded/top2idx.npy', top2idx)
np.save('../00_data/encoded/idx2top.npy', idx2top)
print(len(idx2sa.keys()), len(top2idx.keys()))
# integer-index data
X_tokens = index_sents(X_tokens, word2idx)
X_postags = index_sents(X_postags, pos2idx)
y_nertags = index_sents(y_nertags, ner2idx)

58 39


In [8]:
# integer-index and one-hot speech-acts
INT_VOCAB = len(list(idx2sa.keys()))
y_speechacts = one_hot([sa2idx[sa] for sa in y_speechacts], dtype='int', num_labels=INT_VOCAB)

TOP_VOCAB = len(list(idx2top.keys()))
y_topics = one_hot([top2idx[t] for t in y_topics], dtype='int', num_labels=TOP_VOCAB)

In [9]:
# split data
split_idx = int(len(X_tokens)*0.9)
X_train_sents = X_tokens[:split_idx]
X_train_pos = X_postags[:split_idx]
y_train_ner = y_nertags[:split_idx]
X_test_sents = X_tokens[split_idx:]
X_test_pos = X_postags[split_idx:]
y_test_ner = y_nertags[split_idx:]

y_train_sacts = y_speechacts[:split_idx]
y_test_sacts = y_speechacts[split_idx:]
y_train_tops = y_topics[:split_idx]
y_test_tops = y_topics[split_idx:]

In [10]:
# network hyperparameters
MAX_LENGTH = 20
MAX_VOCAB = len(word2idx.keys())     # see preprocessing
EMBEDDING_SIZE = 160 # preprocessing
POSBEDDING_SIZE = 32
HIDDEN_SIZE = 192    # LSTM Nodes/Features/Dimension
BATCH_SIZE = 32
DROPOUTRATE = 0.45
MAX_EPOCHS = 15      # max iterations, early stop condition below

In [11]:
# get the size of pos-tags, ner tags
TAG_VOCAB = len(list(idx2pos.keys()))
NER_VOCAB = len(list(idx2ner.keys()))

In [14]:
# zero-pad the sequences to max length
X_train_sents = sequence.pad_sequences(X_train_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_sents = sequence.pad_sequences(X_test_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_train_pos = sequence.pad_sequences(X_train_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_pos = sequence.pad_sequences(X_test_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_train_ner = sequence.pad_sequences(y_train_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_test_ner = sequence.pad_sequences(y_test_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')

In [15]:
# reshape data for CRF
y_train_ner = y_train_ner[:, :, np.newaxis]
y_test_ner = y_test_ner[:, :, np.newaxis]

In [16]:
# create embedding matrices from custom pretrained word2vec embeddings
word_embedding_matrix = np.zeros((MAX_VOCAB, EMBEDDING_SIZE))

for word in word2idx.keys():
    # get the word vector from the embedding model
    # if it's there (check against vocab list)
    if word in w2v_vocab:
        # get the word vector
        word_vector = w2v_model[word]
        # slot it in at the proper index
        word_embedding_matrix[word2idx[word]] = word_vector

pos_embedding_matrix = np.zeros((TAG_VOCAB, POSBEDDING_SIZE))

for word in pos2idx.keys():
    # get the word vector from the embedding model
    # if it's there (check against vocab list)
    if word in w2v_pvocab:
        # get the word vector
        word_vector = w2v_pmodel[word]
        # slot it in at the proper index
        pos_embedding_matrix[pos2idx[word]] = word_vector

In [17]:
# define model

# text layers : dense embedding > dropout > bi-LSTM
txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')
txt_embed = Embedding(MAX_VOCAB, EMBEDDING_SIZE, input_length=MAX_LENGTH,
                      weights=[word_embedding_matrix],
                      name='txt_embedding', trainable=False, mask_zero=True)(txt_input)
txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)
txt_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='txt_bidirectional')(txt_drpot)

# pos layers : dense embedding > dropout > bi-LSTM
pos_input = Input(shape=(MAX_LENGTH,), name='pos_input')
pos_embed = Embedding(TAG_VOCAB, POSBEDDING_SIZE, input_length=MAX_LENGTH,
                      weights=[pos_embedding_matrix],
                      name='pos_embedding', trainable=True, mask_zero=True)(pos_input)
pos_drpot = Dropout(DROPOUTRATE, name='pos_dropout')(pos_embed)

# merged layers : merge (concat, average...) word and pos > bi-LSTM > bi-LSTM
mrg_cncat = concatenate([txt_lstml, pos_drpot], axis=2)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_1')(mrg_cncat)

# final NER linear chain CRF layer
crf = CRF(NER_VOCAB, sparse_target=True)
out_ner = crf(mrg_lstml)

# intent network
rnn_intent = Attention(name='int_attention')(mrg_lstml)

# intent
dns_intent = Dense(INT_VOCAB, activation='relu', name='int_dense_1')(rnn_intent)
dns_intent = Dense(INT_VOCAB, name='int_dense_2')(dns_intent)
out_intent = Activation('softmax', name='int_output')(dns_intent)

# topic
dns_top = Dense(TOP_VOCAB, activation='relu', name='top_dense_1')(rnn_intent)
dns_intent = Dense(INT_VOCAB, name='int_dense_2')(dns_top)
out_top = Activation('softmax', name='top_output')(dns_top)

model = Model(inputs=[txt_input, pos_input], outputs=[out_ner, out_intent, out_top])

In [19]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
txt_input (InputLayer)           (None, 20)            0                                            
____________________________________________________________________________________________________
txt_embedding (Embedding)        (None, 20, 160)       305280      txt_input[0][0]                  
____________________________________________________________________________________________________
pos_input (InputLayer)           (None, 20)            0                                            
____________________________________________________________________________________________________
txt_dropout (Dropout)            (None, 20, 160)       0           txt_embedding[0][0]              
___________________________________________________________________________________________

In [18]:
model.compile(optimizer='adam',
              loss={'crf_1': crf.loss_function, 'int_output': 'categorical_crossentropy',  'top_output': 'categorical_crossentropy'},
              loss_weights={'crf_1': 0.25, 'int_output': 0.4, 'top_output': 0.65},
              )

In [20]:
history = model.fit([X_train_sents, X_train_pos], [y_train_ner, y_train_sacts, y_train_tops],
                    batch_size=BATCH_SIZE,
                    epochs=MAX_EPOCHS,
                    callbacks=[TQDMNotebookCallback()],
                    verbose=0)

hist_dict = history.history




In [21]:
# save the model
# because we are using keras-contrib, we must save weights like this, and load into network
save_load_utils.save_all_weights(model, '../00_data/model/alt_combo_model.h5')
np.save('../00_data/model/alt_combo_dict.npy', hist_dict)

In [22]:
scores = model.evaluate([X_test_sents, X_test_pos], [y_test_ner, y_test_sacts, y_test_tops])
print('')
print('Eval model...')
print(scores)


Eval model...
[1.961306967080328, 5.3293905637689667, 0.44913331994043237, 0.69124002129183804]
