## decoding demo

In [1]:
from collections import Counter
from preprocessing import CharacterIndexer, SlotIndexer, IntentIndexer
from gensim.models import Word2Vec
import json
import numpy as np
import pandas as pd
import pickle

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
import h5py
import math
from keras.models import Model
from keras.layers import Activation, Concatenate, concatenate, Dense, Dropout, Embedding, Input, TimeDistributed
from keras.layers import LSTM, CuDNNLSTM, LeakyReLU, Masking, Lambda, Dot, BatchNormalization, Activation
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten
from keras.layers.wrappers import Bidirectional
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, TerminateOnNaN, ModelCheckpoint
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from kutilities.layers import AttentionWithContext
from attention import TDAttention
from keras.optimizers import Adam, SGD
import keras.backend as K
from keras.layers import Dense, Activation, Multiply, Add, Lambda
import keras.initializers
from keras.regularizers import l1, l2

Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only point this at indexer files produced by this project's own preprocessing.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


# fitted indexers for sentences (words + characters), slot tags and intent labels
sentindexer = _load_pickle('encoded/atis_sentindexer.pkl')
slotindexer = _load_pickle('encoded/atis_slotindexer.pkl')
intindexer  = _load_pickle('encoded/atis_intindexer.pkl')

## model loading

Because of the `keras-contrib` CRF layer and the custom attention layers, the easiest way to load the model is to rebuild the same architecture in code and then load the saved weights into it.

In [4]:
# preprocessing-dependent parameters
# we can use the indexer attributes instead of hard-coding them, so the rebuilt
# network's input/output sizes follow whatever the pickled indexers contain
TXT_VOCAB  = sentindexer.max_word_vocab  # input dimension of the word Embedding
TXT_MAXLEN = sentindexer.max_sent_len    # sequence length of both Input layers
CHR_MAXLEN = sentindexer.max_word_len    # characters per word in the char-level Input
CHR_VOCAB  = sentindexer.max_char_vocab  # input dimension of the character Embedding
SLOT_NUM   = slotindexer.labelsize       # number of units of the CRF slot output
LABEL_NUM  = intindexer.labelsize        # number of units of the intent softmax
# only the word-level and label sizes are printed; CHR_MAXLEN / CHR_VOCAB are not shown
print(TXT_VOCAB, TXT_MAXLEN, SLOT_NUM, LABEL_NUM)

728 22 121 22


In [5]:
# self-defined network hyperparameters
# NOTE(review): the layer sizes presumably have to equal the ones used when the
# saved weights were trained, otherwise load_weights() cannot restore them — confirm
WEMBED_SIZE   = 200   # word embedding size. must match w2v size
CEMBED_SIZE   = 200   # character embedding size. free param
WDROP_RATE    = 0.50  # word-level input dropout
DROP_RATE     = 0.33  # dropout for other layers
RNN_DROP_RATE = 0.0   # recurrent dropout (not implemented; no layer in the visible cells reads it)
HIDDEN_SIZE   = 300   # LSTM block hidden size
BATCH_SIZE    = 32    # training setting; not referenced by the decoding cells visible here
MAX_EPOCHS    = 50    # training setting; not referenced by the decoding cells visible here
# Adadelta with clipnorm=1 (gradient-norm clipping); passed to model.compile() further down
OPTIMIZER     = keras.optimizers.Adadelta(clipnorm=1)

In [6]:
def highway(inputs, activation="tanh", gate_bias=-2):
    """Build one highway transformation on top of ``inputs``.

    The result is ``T * H + (1 - T) * inputs`` where ``T`` is a sigmoid gate and
    ``H`` is a Dense projection followed by ``activation``. Both projections keep
    the size of the last axis, so the output has the same feature width as the input.

    Args:
        inputs: Keras tensor whose last axis is the feature axis.
        activation: activation applied to the transformed branch ``H``.
        gate_bias: constant used to initialise the bias of the transform gate.

    Returns:
        Keras tensor holding the gated sum of the transformed and carried branches.
    """
    width = K.int_shape(inputs)[-1]

    # transform gate T (sigmoid), its bias starts at the constant `gate_bias`
    bias_start = keras.initializers.Constant(gate_bias)
    gate_t = Dense(units=width, bias_initializer=bias_start, activation='sigmoid')(inputs)
    # carry gate is the complement of the transform gate
    gate_c = Lambda(lambda x: 1.0 - x, output_shape=(width,))(gate_t)

    # candidate branch H: linear projection followed by the chosen non-linearity
    candidate = Dense(units=width)(inputs)
    candidate = Activation(activation)(candidate)

    # mix the candidate with the untouched input according to the two gates
    gated_candidate = Multiply()([gate_t, candidate])
    gated_input = Multiply()([gate_c, inputs])
    mixed = Add()([gated_candidate, gated_input])
    return mixed

In [7]:
########################################
# Kim; Ma & Hovy char-CNN + word input
########################################
# Builds the per-word input representation: a trainable word embedding
# concatenated with a character-level CNN summary of the same word.

# word-level input with word embedding matrix (with word2vec)
txt_input = Input(shape=(TXT_MAXLEN,), name='word_input')

# mask_zero=True: index 0 is treated as padding by the word embedding
txt_embed = Embedding(TXT_VOCAB, WEMBED_SIZE, input_length=TXT_MAXLEN,
                      name='word_embedding', trainable=True, mask_zero=True)(txt_input)

txt_drpot = Dropout(WDROP_RATE, name='word_dropout')(txt_embed)

# character-level input with randomized initializations
# one row of CHR_MAXLEN character indices per word position
cnn_input = Input(shape=(TXT_MAXLEN, CHR_MAXLEN), name='cnn_input')

# the same character embedding is applied to every word position (no masking here)
cnn_embed = TimeDistributed(Embedding(CHR_VOCAB, CEMBED_SIZE, input_length=CHR_MAXLEN,
                            name='cnn_embedding', trainable=True, mask_zero=False))(cnn_input)

# Four parallel branches read the same character embeddings with kernel sizes
# 1-4 and 10/20/30/40 filters; GlobalMaxPooling1D reduces each word's character
# axis to a single vector per branch.

# 1-size window CNN with batch-norm & tanh activation (Kim 2015)
cnns1 = TimeDistributed(Conv1D(filters=10, kernel_size=1, padding="same", strides=1), name='cnn1_cnn')(cnn_embed)
cnns1 = TimeDistributed(BatchNormalization(), name='cnn1_bnorm')(cnns1)
cnns1 = TimeDistributed(Activation('tanh'), name='cnn1_act')(cnns1)
cnns1 = TimeDistributed(GlobalMaxPooling1D(), name='cnn1_gmp')(cnns1)

# 2-size window CNN with batch-norm & tanh activation (Kim 2015)
cnns2 = TimeDistributed(Conv1D(filters=20, kernel_size=2, padding="same", strides=1), name='cnn2_cnn')(cnn_embed)
cnns2 = TimeDistributed(BatchNormalization(), name='cnn2_bnorm')(cnns2)
cnns2 = TimeDistributed(Activation('tanh'), name='cnn2_act')(cnns2)
cnns2 = TimeDistributed(GlobalMaxPooling1D(), name='cnn2_gmp')(cnns2)

# 3-size window CNN with batch-norm & tanh activation (Kim 2015)
cnns3 = TimeDistributed(Conv1D(filters=30, kernel_size=3, padding="same", strides=1), name='cnn3_cnn')(cnn_embed)
cnns3 = TimeDistributed(BatchNormalization(), name='cnn3_bnorm')(cnns3)
cnns3 = TimeDistributed(Activation('tanh'), name='cnn3_act')(cnns3)
cnns3 = TimeDistributed(GlobalMaxPooling1D(), name='cnn3_gmp')(cnns3)

# 4-size window CNN with batch-norm & tanh activation (Kim 2015)
cnns4 = TimeDistributed(Conv1D(filters=40, kernel_size=4, padding="same", strides=1), name='cnn4_cnn')(cnn_embed)
cnns4 = TimeDistributed(BatchNormalization(), name='cnn4_bnorm')(cnns4)
cnns4 = TimeDistributed(Activation('tanh'), name='cnn4_act')(cnns4)
cnns4 = TimeDistributed(GlobalMaxPooling1D(), name='cnn4_gmp')(cnns4)

# time-distributed highway layer (Kim 2015)
# the concat has 10+20+30+40 = 100 character-CNN features per word
cnns  = concatenate([cnns1, cnns2, cnns3, cnns4], axis=-1, name='cnn_concat')
# NOTE(review): highway() creates Dense layers inside a Lambda. Lambda is meant
# for stateless functions, so those Dense weights are presumably not tracked by
# the Model (not trained, saved, or restored by load_weights) — confirm. Do not
# restructure this without re-training: it would change the layer list that the
# saved weight file is matched against.
cnns  = TimeDistributed(Lambda(highway), name='cnn_highway')(cnns)

# final concat of convolutional subword embeddings and word vectors
word_vects  = concatenate([cnns, txt_drpot], axis=-1, name='concat_word_vectors')

In [8]:
########################################
# main recurrent sentence block
########################################

# 'encoder' layer with returned states following (Liu, Lane)
# return_state=True additionally yields the final hidden/cell states of the
# forward (fh, fc) and backward (bh, bc) LSTMs
lstm_enc, fh, fc, bh, bc  = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True),
                                          name='bidirectional_enc')(word_vects)
lstm_enc = Dropout(DROP_RATE, name='bidirectional_dropout_enc')(lstm_enc)

# "aligned seq2seq" lstm
# load forward LSTM with reverse states following Liu, Lane 2016 (and do reverse)
# i.e. the states are passed crossed over: backward encoder states first, forward
# encoder states second
lstm_dec = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                         name='bidirectional_dec')(lstm_enc, initial_state=[bh, bc, fh, fc])

# per-token features shared by the slot CRF and the intent attention below
lstm_states = Dropout(DROP_RATE, name='bidirectional_dropout_dec')(lstm_dec)

In [9]:
########################################
# Huang et al; Ma & Hovy CRF slot clf
########################################

# final slot linear chain CRF layer
# the layer object is kept in `lyr_crf` because its loss_function is needed by
# model.compile(); sparse_target=True means integer (not one-hot) slot targets.
# NOTE(review): with the 'marginal' modes the layer is expected to emit per-token
# label probabilities, which predict() later reduces with argmax — confirm
# against the keras-contrib CRF documentation.
lyr_crf   = CRF(SLOT_NUM, sparse_target=True, name='out_slot', learn_mode='marginal', test_mode='marginal')
out_slot  = lyr_crf(lstm_states)

# alternative is using greedy predictions
# (applied to the same decoder features as the CRF above)
# out_slot  = TimeDistributed(Dense(SLOT_NUM, activation='softmax'), name='out_slot')(lstm_states)

In [10]:
########################################
# attentional intent clf block
########################################

# combine lstm with CRF for attention (see Liu & Lane)
# the per-token slot output is appended to the decoder features on the last axis
seq_concat = concatenate([lstm_states, out_slot], axis=2, name='lstm_concat')
seq_concat = Dropout(DROP_RATE, name='bidirectional_dropout_3')(seq_concat)

# layer: intent attention w/context (Liu & Lane)
att_int = AttentionWithContext(name='intent_attention')(seq_concat)

# layer: dense + LeakyReLU with dropout
# the width equals the attention output width; L2 penalty on the kernel
out_int = Dense(K.int_shape(att_int)[-1],
                kernel_regularizer=l2(0.005),
                name='intent_dense_1')(att_int)
out_int = LeakyReLU(name='intent_act_1')(out_int)
out_int = Dropout(DROP_RATE, name='intent_dropout_1')(out_int)

# layer: dense + LeakyReLU (no dropout after this one), weaker L2 penalty
out_int = Dense(K.int_shape(att_int)[-1],
                kernel_regularizer=l2(0.0025),
                name='intent_dense_2')(out_int)
out_int = LeakyReLU(name='intent_act_2')(out_int)

# layer: final dense + softmax
# one probability per intent label (LABEL_NUM classes)
out_int = Dense(LABEL_NUM, activation='softmax', name='out_intent')(out_int)

In [11]:
# two inputs (word indices, per-word character indices) and two outputs
# (per-token slot output, sentence-level intent distribution)
model = Model(inputs=[txt_input, cnn_input], outputs=[out_slot, out_int])

In [12]:
# base name of the weight file; it is read from model/<modelname>.h5
modelname = 'test_model'

In [13]:
# load test
# restores the trained weights into the graph rebuilt above; the rebuilt layers
# must line up with the ones stored in the file for this to succeed
model.load_weights('model/'+modelname+'.h5')

In [14]:
# the slot output uses the CRF layer's own loss, the intent output uses sparse
# categorical cross-entropy; both losses are weighted equally
model.compile(optimizer=OPTIMIZER,
              loss={'out_slot': lyr_crf.loss_function, 'out_intent': 'sparse_categorical_crossentropy'},
              # alternative losses if out_slot is a softmax layer instead of the CRF:
              # loss={'out_slot': 'sparse_categorical_crossentropy', 'out_intent': 'sparse_categorical_crossentropy'},
              loss_weights={'out_slot': 0.5, 'out_intent': 0.5},
              )

### decoding functions

Format the input string (lower-case it, strip punctuation, split it into tokens, add the `BOS` and `EOS` tags, and index it), run the model's `predict`, use `argmax` to pick the most likely labels, then call `inverse_transform()` to map them back into human-readable labels.

a more convenient way to do this would be to encapsulate all the above and below into a class.

In [15]:
import re
def preprocess(snt):
    """Turn one raw query string into model-ready inputs.

    The text is lower-cased, every character that is not a digit, a lower-case
    ASCII letter or whitespace is removed, the result is split on whitespace and
    wrapped in ``BOS`` / ``EOS`` markers. The single sentence is then passed to
    ``sentindexer.transform`` as a batch of one.

    Returns:
        A 3-tuple: the one-sentence batch of tokens (list of lists) followed by
        the first and second elements of the ``sentindexer.transform`` result.
    """
    cleaned = re.sub(r'[^0-9a-z\s]', '', snt.lower())
    tokens = ['BOS'] + cleaned.split() + ['EOS']
    batch = [tokens]  # the indexer is given a batch containing just this sentence
    encoded = sentindexer.transform(batch)
    return batch, encoded[0], encoded[1]

In [16]:
def predict(s):
    """Decode one query with the joint model, print the result and return it.

    Args:
        s: raw query string.

    Returns:
        A 2-tuple ``(spo, ip)``: ``spo`` maps each predicted slot name to the
        space-joined tokens tagged with it, ``ip`` is the decoded intent label.
    """
    tk, wt, ct = preprocess(s)
    tk = tk[0]  # preprocess() returns a batch of one sentence
    sp, ip = model.predict([wt, ct])
    sp = np.argmax(sp, axis=-1)
    ip = np.argmax(ip, axis=-1)
    sp = slotindexer.inverse_transform(np.expand_dims(sp, axis=-1))[0]
    # keep only the part after the last '-' of each tag (drops a prefix such as "B-")
    sp = [x.split('-')[-1] for x in sp]

    # Group tokens by slot name. zip() stops at the shorter sequence, so a label
    # decoded for a position beyond the last token (the model works on sequences
    # of fixed length) is ignored instead of raising IndexError on tk[i].
    spd = {}
    for token, label in zip(tk, sp):
        if label != 'O':
            spd.setdefault(label, []).append(token)

    spo = {k: ' '.join(v) for k, v in spd.items()}

    # NOTE(review): an extra [0] row is passed along with the prediction and only
    # the first decoded label is kept; presumably this satisfies the input shape
    # IntentIndexer.inverse_transform expects — confirm against its implementation.
    ip = intindexer.inverse_transform([ip]+[[0]])[0]

    print('query:', s)
    print('slots:')
    print(spo)
    print('intent:', ip)

    return spo, ip
    

### test

In [17]:
# mixed-case input: preprocess() lower-cases the query, so the slot values in
# the printed result come back in lower case
inpt = "looking for direct flights from Chicago to LAX"
# a receives the slot dictionary, b the intent label
a, b = predict(inpt)

query: looking for direct flights from Chicago to LAX
slots:
{'connect': 'direct', 'fromloc.city_name': 'chicago', 'toloc.city_name': 'lax'}
intent: atis_flight


In [18]:
# a query that asks for two things (flights and fares) and has a two-word city name
inpt = "give me flights and fares from New York to Dallas"
# a receives the slot dictionary, b the intent label
a, b = predict(inpt)

query: give me flights and fares from New York to Dallas
slots:
{'fromloc.city_name': 'new york', 'toloc.city_name': 'dallas'}
intent: atis_flight#atis_airfare
