# decode some test sentences

Save the output as a CSV file for reading.

In [1]:
import numpy as np
from dataset import index_sents
from nltk import word_tokenize, pos_tag
from keras.preprocessing import sequence
from keras.models import Model
from keras.models import load_model
from keras.layers.wrappers import Bidirectional
from keras.layers import Activation, concatenate, Dense, Input, LSTM, Dropout, Embedding
from attention import Attention
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Paths used by NameEntityRecognizer: the directory holding the encoded
# lookup dictionaries, the model directory, and the weights file name.
# Both directories keep their trailing slash because they are joined to file
# names by plain string concatenation.
config = dict(
    data_dir='../00_data/encoded/',
    model_dir='../00_data/model/',
    model_name='alt_combo_model.h5',
)

In [3]:
# main (picklable) class
class NameEntityRecognizer():
    """Decode a sentence into named entities, an intent and a topic.

    On construction the lookup dictionaries are read from
    ``config['data_dir']`` and the network is rebuilt and filled with the
    weights stored at ``config['model_dir'] + config['model_name']``.
    ``predict`` then runs a single raw sentence through the model.
    """

    # Fixed sequence length of the network inputs; longer sentences are
    # truncated and shorter ones padded (both at the end) in _index_sents.
    MAX_LENGTH = 20

    # Characters removed from the raw sentence before tokenizing.
    _PUNCT_TABLE = str.maketrans('', '', '.,!?-"\'')
    # ASCII digits are masked with '#' before the vocabulary lookup.
    _DIGIT_TABLE = str.maketrans('0123456789', '#' * 10)

    def __init__(self, config):
        """Load dictionaries and model.

        Args:
            config: mapping with the keys 'data_dir', 'model_dir' and
                'model_name'. The two directories are concatenated with file
                names as plain strings, so they must end with a path
                separator.
        """
        self._load_data(config['data_dir'])
        self.model = self._load_model(config['model_dir'] + config['model_name'])
        self.nertagset = list(self.ner2idx)

    # load data files
    def _load_data(self, datadir):
        """Load the symbol <-> index dictionaries stored as .npy files.

        Args:
            datadir: directory prefix (including trailing separator) that
                contains the ten ``*2idx.npy`` / ``idx2*.npy`` files.
        """
        def load_dict(filename):
            # Each file wraps one dict in a 0-d object array, which NumPy
            # only unpickles when allow_pickle=True (the default has been
            # False since NumPy 1.16.3). Only load files you trust.
            return np.load(datadir + filename, allow_pickle=True).item()

        self.word2idx = load_dict('word2idx.npy')
        self.idx2word = load_dict('idx2word.npy')
        self.pos2idx = load_dict('pos2idx.npy')
        self.idx2pos = load_dict('idx2pos.npy')
        self.ner2idx = load_dict('ner2idx.npy')
        self.idx2ner = load_dict('idx2ner.npy')
        self.sa2idx = load_dict('sa2idx.npy')
        self.idx2sa = load_dict('idx2sa.npy')
        self.top2idx = load_dict('top2idx.npy')
        self.idx2top = load_dict('idx2top.npy')

    # load Keras NER-CRF model
    def _load_model(self, modelpath):
        """Rebuild the network graph and load its weights from ``modelpath``.

        Must be called after ``_load_data`` because the layer sizes are taken
        from the loaded dictionaries.

        Returns:
            A Keras ``Model`` with inputs ``[txt_input, pos_input]`` and
            outputs ``[ner, intent, topic]``.
        """
        # network hyperparameters
        MAX_LENGTH = self.MAX_LENGTH
        EMBEDDING_SIZE = 160
        POSBEDDING_SIZE = 32
        HIDDEN_SIZE = 192
        DROPOUTRATE = 0.33

        # dict-dependent hyperparameters
        MAX_VOCAB = len(self.word2idx)
        TAG_VOCAB = len(self.idx2pos)
        NER_VOCAB = len(self.idx2ner)
        INT_VOCAB = len(self.idx2sa)
        TOP_VOCAB = len(self.idx2top)

        # text layers : frozen embedding > dropout > bi-LSTM
        txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')
        txt_embed = Embedding(MAX_VOCAB, EMBEDDING_SIZE, input_length=MAX_LENGTH,
                              name='txt_embedding', trainable=False, mask_zero=True)(txt_input)
        txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)
        txt_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                                  name='txt_bidirectional')(txt_drpot)

        # pos layers : trainable embedding > dropout (no LSTM of their own)
        pos_input = Input(shape=(MAX_LENGTH,), name='pos_input')
        pos_embed = Embedding(TAG_VOCAB, POSBEDDING_SIZE, input_length=MAX_LENGTH,
                              name='pos_embedding', trainable=True, mask_zero=True)(pos_input)
        pos_drpot = Dropout(DROPOUTRATE, name='pos_dropout')(pos_embed)

        # merged layers : concat text bi-LSTM output with pos embedding > bi-LSTM
        mrg_cncat = concatenate([txt_lstml, pos_drpot], axis=2)
        mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                                  name='mrg_bidirectional_1')(mrg_cncat)

        # final NER linear chain CRF layer
        crf = CRF(NER_VOCAB, sparse_target=True)
        out_ner = crf(mrg_lstml)

        # attention summary of the merged sequence, shared by both heads below
        rnn_intent = Attention(name='int_attention')(mrg_lstml)

        # intent head : dense(relu) > dense > softmax
        dns_intent = Dense(INT_VOCAB, activation='relu', name='int_dense_1')(rnn_intent)
        dns_intent = Dense(INT_VOCAB, name='int_dense_2')(dns_intent)
        out_intent = Activation('softmax', name='int_output')(dns_intent)

        # topic head : dense(relu) > softmax
        # NOTE(review): an extra Dense named 'int_dense_2' used to be created
        # here but was never connected to any model output, so it has been
        # dropped. Adding a real second dense layer would change the layer
        # list and presumably break loading the existing weights; confirm
        # against the training script before changing this head.
        dns_top = Dense(TOP_VOCAB, activation='relu', name='top_dense_1')(rnn_intent)
        out_top = Activation('softmax', name='top_output')(dns_top)

        model = Model(inputs=[txt_input, pos_input], outputs=[out_ner, out_intent, out_top])

        # load model weights
        save_load_utils.load_all_weights(model, modelpath)

        return model

    # tokenize and pos-tag with NLTK
    # preprocess: strip punct, lowercase
    def _tokenize(self, s):
        """Strip punctuation, lowercase, tokenize and POS-tag a sentence.

        Returns:
            ``(tokens, tags)``: two parallel lists of strings.
        """
        cleaned = s.translate(self._PUNCT_TABLE)

        tok_tags = pos_tag(word_tokenize(cleaned.lower()))
        s_toks = [pair[0] for pair in tok_tags]
        s_tags = [pair[1] for pair in tok_tags]

        return s_toks, s_tags

    # integer-index and pad sent and tag sequences
    def _index_sents(self, s_toks, s_tags):
        """Convert one token list and one tag list into padded index arrays.

        Both sequences are padded/truncated at the end to ``MAX_LENGTH``.

        Returns:
            ``(X_toks, X_tags)`` as returned by ``pad_sequences`` for a
            batch containing this single sentence.
        """
        X_toks = index_sents([s_toks], self.word2idx)
        X_tags = index_sents([s_tags], self.pos2idx)
        X_toks = sequence.pad_sequences(X_toks, maxlen=self.MAX_LENGTH,
                                        truncating='post', padding='post')
        X_tags = sequence.pad_sequences(X_tags, maxlen=self.MAX_LENGTH,
                                        truncating='post', padding='post')

        return X_toks, X_tags

    # group tagged words into a {tag: text} map
    def _ner_dict(self, toks, ners):
        """Collect the words of every non-'O' tag into one string per tag.

        Words sharing a tag are joined with single spaces in sentence order,
        whether or not they are adjacent.

        Args:
            toks: list of words.
            ners: list of NER tags, parallel to ``toks``.

        Returns:
            dict mapping each tag that occurs (except 'O') to its text.
        """
        dct = {}
        for word, tag in zip(toks, ners):
            if tag == 'O':
                continue
            if tag in dct:
                dct[tag] += ' ' + word
            else:
                dct[tag] = word
        return dct

    # predict on sentences
    def predict(self, s, debug=False):
        """Run the model on one raw sentence.

        Args:
            s: the sentence as a string.
            debug: when true, print the decoded words and their NER tags.

        Returns:
            ``(entities, intent, topic)`` where ``entities`` is the dict
            built by ``_ner_dict`` and the other two are the labels looked
            up in ``idx2sa`` and ``idx2top``.
        """
        s_toks, s_tags = self._tokenize(s)
        # keep the unmasked tokens so the returned entities show real digits
        f_toks = s_toks[:]

        # the vocabulary lookup sees digits masked as '#'
        s_toks = [w.translate(self._DIGIT_TABLE) for w in s_toks]

        X_toks, X_tags = self._index_sents(s_toks, s_tags)

        this_pred = self.model.predict([X_toks, X_tags])

        # batch of one: take the first (only) sample of every output
        this_nerpred = [np.argmax(p) for p in this_pred[0][0]]
        this_intpred = np.argmax(this_pred[1][0])
        this_toppred = np.argmax(this_pred[2][0])

        word, prd = [], []

        # decode ner-sequence; the slice never exceeds MAX_LENGTH because
        # X_toks has already been truncated
        for idx, wordid in enumerate(X_toks[0][:len(s_toks)]):
            if self.idx2word[wordid] == 'PAD' or self.idx2pos[X_tags[0][idx]] == 'PAD':
                continue
            # decode word (from TRUE sequence)
            word.append(f_toks[idx])
            # decode prediction
            prd.append(self.idx2ner[this_nerpred[idx]])

        intent = self.idx2sa[this_intpred]
        topic = self.idx2top[this_toppred]

        if debug:
            print(word)
            print(prd)

        return self._ner_dict(word, prd), intent, topic

In [4]:
# Build the recognizer: reads the lookup dictionaries from config['data_dir']
# and loads the model weights from config['model_dir'] + config['model_name'].
ner = NameEntityRecognizer(config)

In [10]:
# Sample query; debug=True also prints the decoded tokens and their NER tags.
ner.predict("do you have any flights from Seoul leaving tomorrow?", debug=True)

['do', 'you', 'have', 'any', 'flights', 'from', 'seoul', 'leaving', 'tomorrow']
['O', 'O', 'O', 'O', 'O', 'O', 'GEO', 'O', 'DAT']


({'DAT': 'tomorrow', 'GEO': 'seoul'}, 'reqInfo', 'day')

In [9]:
# Sample query containing a clock time; digits are masked with '#' for the
# vocabulary lookup but the returned entity keeps the original text.
ner.predict("are there any flights to London at 5:30?", debug=True)

['are', 'there', 'any', 'flights', 'to', 'london', 'at', '5:30']
['O', 'O', 'O', 'O', 'O', 'GEO', 'O', 'TIM']


({'GEO': 'london', 'TIM': '5:30'}, 'reqInfo', 'time')

In [11]:
# Sample query with an apostrophe (stripped before tokenizing) and a
# two-token time expression that ends up under a single tag.
ner.predict("let's do the 11:20 am flight to Tokyo", debug=True)

['lets', 'do', 'the', '11:20', 'am', 'flight', 'to', 'tokyo']
['O', 'O', 'O', 'TIM', 'TIM', 'O', 'O', 'GEO']


({'GEO': 'tokyo', 'TIM': '11:20 am'}, 'state', 'enum')