# train the keras model

In [1]:
import numpy as np
import pickle
from keras.models import Model
from keras.layers.wrappers import Bidirectional
from keras.layers import Activation, concatenate, Dense, Input, LSTM, Dropout, Embedding
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from gensim.models import Word2Vec
from mlxtend.preprocessing import one_hot
from embedding import load_vocab
from attention.attention import Attention
from mltools.preprocessing import Tokenizer, Indexer, Pipeline, LabelIndexer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Limit which GPU this process may use by setting the CUDA device mask to "1".
# NOTE(review): keras (TensorFlow backend) is already imported in the cell
# above -- confirm the mask is still picked up when set at this point.
import os
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

## load the encoded data

In [3]:
# Encoded SNIPS arrays written by the preprocessing step, loaded as
# (train, test) pairs: model inputs (x), tag targets (yt), intent targets (yi).
x_train, x_test = (
    np.load('../00_data/encoded/snips_x_train.npy'),
    np.load('../00_data/encoded/snips_x_test.npy'),
)
yt_train, yt_test = (
    np.load('../00_data/encoded/snips_y_tags_train.npy'),
    np.load('../00_data/encoded/snips_y_tags_test.npy'),
)
yi_train, yi_test = (
    np.load('../00_data/encoded/snips_y_int_train.npy'),
    np.load('../00_data/encoded/snips_y_int_test.npy'),
)

In [4]:
# Restore the pickled intent indexer, label indexer and sentence-indexing
# pipeline from the encoded-data directory.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object is loaded, instead of being left for the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts that were produced by this project.
with open("../00_data/encoded/snips_intent_indexer.pkl", "rb") as fh:
    intent_indexer = pickle.load(fh)
with open("../00_data/encoded/snips_label_indexer.pkl", "rb") as fh:
    label_indexer = pickle.load(fh)
with open("../00_data/encoded/snips_sent_indexer.pkl", "rb") as fh:
    word_idxpipe = pickle.load(fh)

In [5]:
# Pretrained embedding artifacts: the gensim word2vec model and the vocabulary
# stored next to it (load_vocab returns two values; only the first is kept).
w2v_model = Word2Vec.load(
    '../00_data/embeddings/snips_embeddings.gensimmodel'
)
w2v_vocab, _ = load_vocab(
    '../00_data/embeddings/snips_mapping.json'
)

In [6]:
# Shuffle the training examples, applying the SAME permutation to the inputs
# and to both target arrays so that rows stay aligned.
# The permutation comes from a dedicated, seeded RandomState: the order is
# reproducible after a kernel restart and NumPy's global random state is left
# untouched.
# NOTE: re-running this cell without reloading the data permutes the already
# shuffled arrays again.
SHUFFLE_SEED = 42

assert x_train.shape[0] == yt_train.shape[0] == yi_train.shape[0], \
    "inputs and targets must contain the same number of examples"

shuffle_idx = np.random.RandomState(SHUFFLE_SEED).permutation(x_train.shape[0])

x_train = x_train[shuffle_idx]
yt_train = yt_train[shuffle_idx]
yi_train = yi_train[shuffle_idx]

## set the hyperparameters

In [7]:
# Network hyperparameters.
# The first three are dictated by the preprocessing step and must match it.
MAX_LENGTH = 15        # sequence length of the model input
MAX_VOCAB = 10000      # number of rows in the word embedding table
EMBEDDING_SIZE = 300   # width of one word vector
# The remaining values are free architecture / training settings.
HIDDEN_SIZE = 300      # units of each LSTM wrapped by the bidirectional layers
DROPOUTRATE = 0.5      # rate shared by every Dropout layer
BATCH_SIZE = 128
MAX_EPOCHS = 25

In [8]:
# Sizes of the two output vocabularies: the number of entries in the
# index -> tag mapping of the intent indexer and of the label indexer.
# len() counts the mapping's keys directly, so no intermediate list is built.
# assumes idx2tag is a dict-like mapping -- TODO confirm
INT_VOCAB = len(intent_indexer.idx2tag)
TAG_VOCAB = len(label_indexer.idx2tag)

## load embeddings from the trained word2vec model

In [9]:
# Build the initial embedding weights from the custom pretrained word2vec model.
# Row i receives the vector of the word whose index is i; rows of words that
# have no pretrained vector stay all-zero.
word_embedding_matrix = np.zeros((MAX_VOCAB, EMBEDDING_SIZE))

# word -> row index mapping held by the second step of the sentence pipeline
word2idx = word_idxpipe.steps[1][1].word2idx
# a set gives constant-time membership tests regardless of the container
# returned by load_vocab
known_words = set(w2v_vocab)

n_added = 0
for word, row in word2idx.items():
    # only words present in the word2vec vocabulary have a vector to copy
    if word in known_words:
        # go through .wv: indexing the Word2Vec model object directly is
        # deprecated in gensim (and no longer available in gensim 4)
        word_embedding_matrix[row] = w2v_model.wv[word]
        n_added += 1
print('added', n_added, 'embeddings')

added 9998 embeddings


  # Remove the CWD from sys.path while we load stuff.


## define the model

In [10]:
# Define the joint model: one shared text encoder feeding two output heads,
# a per-token tag head (CRF) and a per-utterance intent head (softmax).

# word input layers : dense embedding > dropout
# The input is a sequence of MAX_LENGTH word indices.
txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')
# The embedding table is initialised from word_embedding_matrix and stays
# trainable; mask_zero=True makes index 0 a masked value for later layers.
txt_embed = Embedding(MAX_VOCAB, EMBEDDING_SIZE, input_length=MAX_LENGTH,
                      weights=[word_embedding_matrix],
                      name='txt_embedding', trainable=True, mask_zero=True)(txt_input)
txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)

# recurrent layers : bi-LSTM
# Two stacked bidirectional LSTMs with dropout in between; both return the
# full sequence, so there is one output vector per time step.
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='bidirectional_1')(txt_drpot)
mrg_lstml = Dropout(DROPOUTRATE, name='bidirectional_drop')(mrg_lstml)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='bidirectional_2')(mrg_lstml)

# final NER linear chain CRF layer
# The layer object is kept in `crf` because compile() needs crf.loss_function.
# NOTE(review): sparse_target=True presumably means the tag targets are integer
# indices rather than one-hot vectors -- confirm against keras-contrib.
crf = CRF(TAG_VOCAB, sparse_target=True, name='crf_1')
out_ner = crf(mrg_lstml)

# intent network
# Reads the same bi-LSTM sequence output as the CRF head.
# NOTE(review): Attention is a project-local layer; presumably it pools the
# sequence into a single vector per example -- confirm.
rnn_intent = Attention(name='int_attention')(mrg_lstml)

# intent
# Two dense layers of INT_VOCAB units each (the first with ReLU), followed by
# a softmax activation.
dns_intent = Dense(INT_VOCAB, activation='relu', name='int_dense_1')(rnn_intent)
dns_intent = Dense(INT_VOCAB, name='int_dense_2')(dns_intent)
out_intent = Activation('softmax', name='int_output')(dns_intent)

# Output order is [tags, intents]; the output layer names 'crf_1' and
# 'int_output' are the keys used for the losses in compile().
model = Model(inputs=txt_input, outputs=[out_ner, out_intent])

In [11]:
# print the per-layer overview: output shape, parameter count and connections
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
txt_input (InputLayer)          (None, 15)           0                                            
__________________________________________________________________________________________________
txt_embedding (Embedding)       (None, 15, 300)      3000000     txt_input[0][0]                  
__________________________________________________________________________________________________
txt_dropout (Dropout)           (None, 15, 300)      0           txt_embedding[0][0]              
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 15, 600)      1442400     txt_dropout[0][0]                
__________________________________________________________________________________________________
bidirectio

In [12]:
# Joint training objective, keyed by the names of the two output layers:
# the CRF layer's own loss for the tag head and sparse categorical
# cross-entropy for the intent head, each weighted 0.5.
head_losses = {
    'crf_1': crf.loss_function,
    'int_output': 'sparse_categorical_crossentropy',
}
head_weights = {'crf_1': 0.5, 'int_output': 0.5}
model.compile(optimizer='sgd', loss=head_losses, loss_weights=head_weights)

In [13]:
# sanity check: inputs and both target arrays must share the same first dimension
x_train.shape, yt_train.shape, yi_train.shape

((13784, 15), (13784, 15, 1), (13784, 1))

In [14]:
# sanity check: no NaN in any training array (each result should contain only False)
np.unique(np.isnan(x_train)), np.unique(np.isnan(yt_train)), np.unique(np.isnan(yi_train))

(array([False]), array([False]), array([False]))

In [15]:
# sanity check: the largest word index must be below MAX_VOCAB (rows of the embedding table)
MAX_VOCAB, np.max(x_train), np.max(x_test)

(10000, 9999, 9999)

In [16]:
# sanity check: the largest tag index must be below TAG_VOCAB (size of the CRF output)
TAG_VOCAB, np.max(yt_train), np.max(yt_test)

(41, 40, 40)

In [17]:
# sanity check: the largest intent index must be below INT_VOCAB (size of the intent output)
INT_VOCAB, np.max(yi_train), np.max(yi_test)

(7, 6, 6)

In [18]:
# Train both heads jointly on the training split.
# Targets are passed in the same order as the model outputs (tags, intents);
# verbose=2 logs one line per epoch.
history = model.fit(
    x=[x_train],
    y=[yt_train, yi_train],
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    verbose=2,
)

# per-epoch loss values recorded during training, kept for saving later
hist_dict = history.history

Epoch 1/25
 - 14s - loss: 4.3279 - crf_1_loss: 6.7404 - int_output_loss: 1.9155
Epoch 2/25
 - 11s - loss: 3.9473 - crf_1_loss: 6.0552 - int_output_loss: 1.8394
Epoch 3/25
 - 11s - loss: 3.8271 - crf_1_loss: 5.8913 - int_output_loss: 1.7629
Epoch 4/25
 - 11s - loss: 3.7453 - crf_1_loss: 5.8102 - int_output_loss: 1.6804
Epoch 5/25
 - 11s - loss: 3.6686 - crf_1_loss: 5.7465 - int_output_loss: 1.5906
Epoch 6/25
 - 11s - loss: 3.5951 - crf_1_loss: 5.6956 - int_output_loss: 1.4947
Epoch 7/25
 - 11s - loss: 3.5229 - crf_1_loss: 5.6505 - int_output_loss: 1.3954
Epoch 8/25
 - 11s - loss: 3.4535 - crf_1_loss: 5.6098 - int_output_loss: 1.2973
Epoch 9/25
 - 11s - loss: 3.3853 - crf_1_loss: 5.5713 - int_output_loss: 1.1992
Epoch 10/25
 - 11s - loss: 3.3173 - crf_1_loss: 5.5345 - int_output_loss: 1.1000
Epoch 11/25
 - 11s - loss: 3.2476 - crf_1_loss: 5.4991 - int_output_loss: 0.9960
Epoch 12/25
 - 11s - loss: 3.1772 - crf_1_loss: 5.4646 - int_output_loss: 0.8898
Epoch 13/25
 - 11s - loss: 3.1102 - c

In [19]:
# Persist the trained weights and the training history.
# The network contains a keras-contrib layer, so the weights are written with
# keras-contrib's save_all_weights; to restore them, rebuild the same network
# and load the weights into it.
save_load_utils.save_all_weights(model, '../00_data/model/snips_combo_model.h5')
# hist_dict is history.history from the training cell.
# NOTE(review): if it is a plain dict, np.save stores it as a pickled object
# array and reading it back needs np.load(..., allow_pickle=True) -- confirm.
np.save('../00_data/model/snips_combo_dict.npy', hist_dict)

In [20]:
# Evaluate on the held-out test split and print the resulting loss values.
# NOTE(review): presumably ordered like the training log
# (total, crf_1, int_output) -- confirm.
scores = model.evaluate(x_test, [yt_test, yi_test])
print('', 'Eval model...', scores, sep='\n')


Eval model...
[2.4918250928606307, 4.775258215495518, 0.20839196509548596]


In [21]:
# Run the model on the test inputs. It returns one array per output, in the
# order given to Model(outputs=...): tag predictions first, intent predictions second.
test_tags, test_intents = model.predict(x_test)
# Write both prediction arrays to disk.
np.save('../00_data/model/snips_pred_tags.npy', test_tags)
np.save('../00_data/model/snips_pred_ints.npy', test_intents)