# decode some test sentences

save as a csv file for reading

In [1]:
%matplotlib inline
from dataset import index_sents
import matplotlib.pyplot as plt
from mlxtend.preprocessing import one_hot
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.models import Model, Sequential, model_from_json
from keras.models import load_model
from keras.layers.wrappers import Bidirectional
from keras.layers import Activation, concatenate, Dense, Input, LSTM, Dropout, Embedding
from keras.models import save_model, load_model
from attention import Attention
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# load data
word2idx = np.load('../00_data/encoded/word2idx.npy').item()
idx2word = np.load('../00_data/encoded/idx2word.npy').item()
pos2idx = np.load('../00_data/encoded/pos2idx.npy').item()
idx2pos = np.load('../00_data/encoded/idx2pos.npy').item()
ner2idx = np.load('../00_data/encoded/ner2idx.npy').item()
idx2ner = np.load('../00_data/encoded/idx2ner.npy').item()

sa2idx = np.load('../00_data/encoded/sa2idx.npy').item()
idx2sa = np.load('../00_data/encoded/idx2sa.npy').item()
top2idx = np.load('../00_data/encoded/top2idx.npy').item()
idx2top = np.load('../00_data/encoded/idx2top.npy').item()

X_tokens = list(np.load('../00_data/encoded/add_tokens.npy'))
X_postags = list(np.load('../00_data/encoded/add_postags.npy'))
y_nertags = list(np.load('../00_data/encoded/add_nertags.npy'))

# change category label here
y_speechacts = np.load('../00_data/encoded/add_speechacts.npy')
y_topics = np.load('../00_data/encoded/add_topics.npy')
print("...data loaded!")

...data loaded!


In [3]:
len(top2idx.keys())

39

In [4]:
# integer-index data
X_tokens = index_sents(X_tokens, word2idx)
X_postags = index_sents(X_postags, pos2idx)
y_nertags = index_sents(y_nertags, ner2idx)

# integer-index and one-hot speech-acts
INT_VOCAB = len(list(idx2sa.keys()))
y_ints = y_speechacts[:]
y_speechacts = one_hot([sa2idx[sa] for sa in y_speechacts], dtype='int', num_labels=INT_VOCAB)

TOP_VOCAB = len(list(idx2top.keys()))
y_tops = y_topics[:]
y_topics = one_hot([top2idx[t] for t in y_topics], dtype='int', num_labels=TOP_VOCAB)

# split data
split_idx = int(len(X_tokens)*0.9)
X_train_sents = X_tokens[:split_idx]
X_train_pos = X_postags[:split_idx]
y_train_ner = y_nertags[:split_idx]
X_test_sents = X_tokens[split_idx:]
X_test_pos = X_postags[split_idx:]
y_test_ner = y_nertags[split_idx:]

y_decode_ints = y_ints[split_idx:]
y_train_sacts = y_speechacts[:split_idx]
y_test_sacts = y_speechacts[split_idx:]

y_decode_tops = y_tops[split_idx:]
y_train_tops = y_topics[:split_idx]
y_test_tops = y_topics[split_idx:]

In [5]:
# network hyperparameters
MAX_LENGTH = 20
MAX_VOCAB = len(word2idx.keys())
EMBEDDING_SIZE = 160 # preprocessing.ipynb
POSBEDDING_SIZE = 32
HIDDEN_SIZE = 192    # LSTM Nodes/Features/Dimension
BATCH_SIZE = 16
DROPOUTRATE = 0.33
MAX_EPOCHS = 12      # max iterations, early stop condition below

In [6]:
print("zero-padding sequences...\n")
X_train_sents = sequence.pad_sequences(X_train_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_sents = sequence.pad_sequences(X_test_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_train_pos = sequence.pad_sequences(X_train_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_pos = sequence.pad_sequences(X_test_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_train_ner = sequence.pad_sequences(y_train_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_test_ner = sequence.pad_sequences(y_test_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')

# get the size of pos-tags, ner tags
TAG_VOCAB = len(list(idx2pos.keys()))
NER_VOCAB = len(list(idx2ner.keys()))

# reshape data for CRF
# y_train_ner = y_train_ner[:, :, np.newaxis]
# y_test_ner = y_test_ner[:, :, np.newaxis]

zero-padding sequences...



## load weights

load it this way because of `keras-contrib` CRF layer

In [7]:
print('Building model...\n')

# text layers : dense embedding > dropout > bi-LSTM
txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')
txt_embed = Embedding(MAX_VOCAB, EMBEDDING_SIZE, input_length=MAX_LENGTH,
                      name='txt_embedding', trainable=False, mask_zero=True)(txt_input)
txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)
txt_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='txt_bidirectional')(txt_drpot)

# pos layers : dense embedding > dropout > bi-LSTM
pos_input = Input(shape=(MAX_LENGTH,), name='pos_input')
pos_embed = Embedding(TAG_VOCAB, POSBEDDING_SIZE, input_length=MAX_LENGTH,
                      name='pos_embedding', trainable=True, mask_zero=True)(pos_input)
pos_drpot = Dropout(DROPOUTRATE, name='pos_dropout')(pos_embed)

# merged layers : merge (concat, average...) word and pos > bi-LSTM > bi-LSTM
mrg_cncat = concatenate([txt_lstml, pos_drpot], axis=2)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_1')(mrg_cncat)

# final NER linear chain CRF layer
crf = CRF(NER_VOCAB, sparse_target=True)
out_ner = crf(mrg_lstml)

# intent network
rnn_intent = Attention(name='int_attention')(mrg_lstml)

# intent
dns_intent = Dense(INT_VOCAB, activation='relu', name='int_dense_1')(rnn_intent)
dns_intent = Dense(INT_VOCAB, name='int_dense_2')(dns_intent)
out_intent = Activation('softmax', name='int_output')(dns_intent)

# topic
dns_top = Dense(TOP_VOCAB, activation='relu', name='top_dense_1')(rnn_intent)
dns_intent = Dense(INT_VOCAB, name='int_dense_2')(dns_top)
out_top = Activation('softmax', name='top_output')(dns_top)

model = Model(inputs=[txt_input, pos_input], outputs=[out_ner, out_intent, out_top])

Building model...



In [8]:
# load model
save_load_utils.load_all_weights(model,'../00_data/model/alt_combo_model.h5')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
txt_input (InputLayer)           (None, 20)            0                                            
____________________________________________________________________________________________________
txt_embedding (Embedding)        (None, 20, 160)       305280      txt_input[0][0]                  
____________________________________________________________________________________________________
pos_input (InputLayer)           (None, 20)            0                                            
____________________________________________________________________________________________________
txt_dropout (Dropout)            (None, 20, 160)       0           txt_embedding[0][0]              
___________________________________________________________________________________________

In [9]:
decoded = []

ner_accs = []
int_accs = []
top_accs = []

int_prds = []
int_trus = []

top_prds = []
top_trus = []

for sent_idx in range(len(X_test_sents)):
    
    this_txt = sequence.pad_sequences([X_test_sents[sent_idx]], maxlen=MAX_LENGTH, truncating='post', padding='post')
    this_pos = sequence.pad_sequences([X_test_pos[sent_idx]], maxlen=MAX_LENGTH, truncating='post', padding='post')
    this_pred = model.predict([this_txt, this_pos])
    
    this_nerpred = list(this_pred[0])
    this_intpred = this_pred[1]
    this_toppred = this_pred[2]
    
    this_nerpred = [np.argmax(p) for p in this_nerpred[0]]
    this_intpred = np.argmax(this_intpred[0])
    this_toppred = np.argmax(this_toppred[0])
    
    
    # print(np.shape(this_nerpred), '\n', this_nerpred, '\n', np.shape(this_intpred), '\n', this_intpred)
    # print(this_nerpred, '\n', this_intpred)
    
    word, pos, tru, prd = [], [], [], []

    # for each word in the sentence...
    for idx, wordid in enumerate(X_test_sents[sent_idx][:len(this_nerpred)]):

        # NER ACCURACY CALC
        # disregard padding
        if this_nerpred[idx] != 0 and y_test_ner[sent_idx][idx] != 0:
            if this_nerpred[idx] == y_test_ner[sent_idx][idx]:
                ner_accs.append(1.0)
            else:
                ner_accs.append(0.0)
        
        # decode word
        word.append(idx2word[wordid])
        # decode pos
        pos.append(idx2pos[X_test_pos[sent_idx][idx]])
        # decode true NER tag
        tru.append(idx2ner[y_test_ner[sent_idx][idx]])
        # decode prediction
        prd.append(idx2ner[this_nerpred[idx]])

    answ = pd.DataFrame(
    {
        'word': word,
        'pos': pos,
        'true': tru,
        'pred': prd,
        'skip' : [' ' for s in word]
    })
    answ = answ[['word', 'pos', 'true', 'pred', 'skip']]
    answ = answ.T
    decoded.append(answ)
    
    if this_intpred not in idx2sa.keys():
        this_intpred = 0
    int_trus.append(y_decode_ints[sent_idx])
    int_prds.append(idx2sa[this_intpred])
    if idx2sa[this_intpred] == y_decode_ints[sent_idx]:
        int_accs.append(1.0)
    else:
        int_accs.append(0.0)

    if this_toppred not in idx2top.keys():
        this_toppred = 0
    top_trus.append(y_decode_tops[sent_idx])
    top_prds.append(idx2top[this_toppred])
    if idx2top[this_toppred] == y_decode_tops[sent_idx]:
        top_accs.append(1.0)
    else:
        top_accs.append(0.0)
    

In [10]:
y_decode_tops[0]

'airline'

In [11]:
sum(ner_accs)/len(ner_accs)

0.9968102073365231

In [12]:
sum(int_accs)/len(int_accs)

0.9059278350515464

In [13]:
sum(top_accs)/len(top_accs)

0.9072164948453608

In [14]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(int_trus, int_prds))

                      precision    recall  f1-score   support

             abandon       0.74      0.67      0.70        39
               agree       1.00      0.67      0.80         6
           apologise       1.00      1.00      1.00         2
             approve       0.62      0.83      0.71         6
             confirm       0.50      0.50      0.50         2
             correct       1.00      1.00      1.00         6
         correctSelf       0.62      0.45      0.53        11
              direct       1.00      0.92      0.96        13
                echo       1.00      1.00      1.00         2
                elab       0.67      1.00      0.80         2
         enumeration       1.00      1.00      1.00         3
    expressAwareness       1.00      0.50      0.67         2
expressImPossibility       1.00      1.00      1.00         4
 expressNonAwareness       1.00      1.00      1.00         3
      expressOpinion       0.83      0.83      0.83        24
  expre

  'precision', 'predicted', average, warn_for)


In [15]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(top_trus, top_prds))

              precision    recall  f1-score   support

     address       0.00      0.00      0.00        10
     airline       0.97      1.00      0.99        78
     airport       0.00      0.00      0.00         6
     arrival       0.99      0.97      0.98        87
availability       0.57      1.00      0.73         4
     booking       0.90      1.00      0.95        18
      cancel       0.00      0.00      0.00         9
         car       0.00      0.00      0.00         3
     confirm       1.00      0.33      0.50         6
  creditcard       0.85      0.89      0.87        19
        date       0.99      0.99      0.99        67
         day       0.99      0.99      0.99       226
   departure       0.92      0.92      0.92        38
    district       0.00      0.00      0.00        21
        enum       0.91      0.96      0.94       181
        fare       0.88      0.88      0.88        24
        from       0.98      0.98      0.98        63
       hotel       0.00    

  'precision', 'predicted', average, warn_for)


In [16]:
result = pd.concat(decoded)

In [17]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
word,and,it,i,they,also,included,here,a,spirit,file,code,number,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
pos,CC,PRP,VBZ,PRP,RB,VBD,RB,DT,NNP,NN,NN,NN,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
true,O,O,O,O,O,O,O,O,COM,O,O,O,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
pred,O,O,O,O,O,O,O,O,COM,O,O,O,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
skip,,,,,,,,,,,,,,,,,,,,


In [18]:
# result.to_csv('../sample_result.csv')