In [1]:
import numpy as np
import pandas as pd
from data.preprocessing import get_vocab
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers.wrappers import Bidirectional
from keras.layers import Activation, concatenate, Dense, Input, LSTM, Dropout, Embedding
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# load data
vocab = list(np.load("dbases/w2v_word_tokens.npy"))
sentences = list(np.load("dbases/w2v_sent_tokens.npy"))
speechacts = list(np.load("dbases/speech_acts.npy"))
topics = list(np.load("dbases/topics.npy"))

In [4]:
# word idx
vocix = [i+1 for i in range(len(vocab))]
wrd2idx = dict(list(zip(vocab, vocix)))
wrd2idx['PAD'] = 0
wrd2idx['UNK'] = len(vocab)+1
idx2wrd = dict(list(zip(vocix, vocab)))
idx2wrd[0] = 'PAD'
idx2wrd[len(vocab)+1] = 'UNK'

In [5]:
# remove the speech-act prefixes
speechacts = [sa.split('-')[-1] for sa in speechacts]

# remove the topic prefixes, suffixes
trunctop = [t.split('-')[-1] for t in topics]
trunctop = [t.split('_')[0] for t in trunctop]
topics = trunctop

In [6]:
# other idxes
sctlst = list(set(speechacts))
sctidx = [i for i in range(len(sctlst))]
sct2idx = dict(list(zip(sctlst, sctidx)))
enc_speechacts = np.asarray([sct2idx[s] for s in speechacts])

toplst = list(set(topics))
topidx = [i for i in range(len(toplst))]
top2idx = dict(list(zip(toplst, topidx)))
enc_topics = np.asarray([top2idx[s] for s in topics])

In [7]:
def stripit(s):
    s = s.lower()
    for p in ['.', ',', '?', '-']:
        s = s.replace(p, '')
    for n in ['1','2','3','4','5','6','7','8','9','0']:
        s = s.replace(n, '#')
    return s

In [8]:
sentidx = []
for sent in sentences:
    idxes = []
    toks = stripit(sent)
    for t in toks:
        if t in wrd2idx:
            idxes.append(wrd2idx[t])
        else:
            idxes.append(wrd2idx['UNK'])
    sentidx.append(idxes)

In [9]:
# max, avg sent lengs
lens = [len(s) for s in sentidx]
max(lens), sum(lens)/len(lens)

(818, 42.54792703150912)

In [10]:
# pad to 100-length
padsents = sequence.pad_sequences(sentidx, maxlen=100, truncating='post', padding='pre')

In [31]:
# model and sub-model definitions

# input layer
emb_input = Input(shape=(100,), name='input')

# embedding layer
embedding = Embedding(len(wrd2idx.keys()), 200, input_length=100, 
                      trainable=True, mask_zero=True, name='embedding')(emb_input)

# lstm layer
emb_out = LSTM(200, name='lstm')(embedding)

# word-embedding submodel
embmodel = Model(emb_input, emb_out)

# input layer
txt_input = Input(shape=(100,), name='txt_input')

recurrent = embmodel(txt_input)

# intent output
dns_intent = Dense(len(set(speechacts)), activation='softmax', name='int_dense')(recurrent)
out_intent = Activation('softmax', name='int_output')(dns_intent)

# topic output
dns_top = Dense(len(set(topics)), activation='softmax', name='top_dense')(recurrent)
out_top = Activation('softmax', name='top_output')(dns_top)

model = Model(inputs=[txt_input], outputs=[out_intent, out_top])

In [32]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
txt_input (InputLayer)           (None, 100)           0                                            
____________________________________________________________________________________________________
model_15 (Model)                 (None, 200)           740200      txt_input[0][0]                  
____________________________________________________________________________________________________
int_dense (Dense)                (None, 53)            10653       model_15[1][0]                   
____________________________________________________________________________________________________
top_dense (Dense)                (None, 39)            7839        model_15[1][0]                   
___________________________________________________________________________________________

In [33]:
model.compile(optimizer='sgd',
              loss={'int_output': 'sparse_categorical_crossentropy',  'top_output': 'sparse_categorical_crossentropy'},
              loss_weights={'int_output': 0.5, 'top_output': 0.5},
              )

In [34]:
history = model.fit([padsents], [enc_speechacts, enc_topics],
                    batch_size=16,
                    epochs=16,
                    callbacks=[TQDMNotebookCallback()],
                    verbose=0)

hist_dict = history.history




In [36]:
model.save("model/lstm_model.h5")
embmodel.save("model/sent_emb_model.h5")

In [37]:
from attention import Attention

In [41]:
# model and sub-model definitions

# input layer
emb_input = Input(shape=(100,), name='input')

# embedding layer
embedding = Embedding(len(wrd2idx.keys()), 200, input_length=100, 
                      trainable=True, mask_zero=True, name='embedding')(emb_input)

# lstm layer
emb_rnn = LSTM(200, name='lstm', return_sequences=True)(embedding)

# attention layer
emb_out = Attention()(emb_rnn)

# word-embedding submodel
attembmodel = Model(emb_input, emb_out)

# input layer
txt_input = Input(shape=(100,), name='txt_input')

recurrent = attembmodel(txt_input)

# intent output
dns_intent = Dense(len(set(speechacts)), activation='softmax', name='int_dense')(recurrent)
out_intent = Activation('softmax', name='int_output')(dns_intent)

# topic output
dns_top = Dense(len(set(topics)), activation='softmax', name='top_dense')(recurrent)
out_top = Activation('softmax', name='top_output')(dns_top)

attmodel = Model(inputs=[txt_input], outputs=[out_intent, out_top])

In [42]:
attmodel.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
txt_input (InputLayer)           (None, 100)           0                                            
____________________________________________________________________________________________________
model_17 (Model)                 (None, 200)           780600      txt_input[0][0]                  
____________________________________________________________________________________________________
int_dense (Dense)                (None, 53)            10653       model_17[1][0]                   
____________________________________________________________________________________________________
top_dense (Dense)                (None, 39)            7839        model_17[1][0]                   
___________________________________________________________________________________________

In [43]:
attmodel.compile(optimizer='sgd',
                 loss={'int_output': 'sparse_categorical_crossentropy',  'top_output': 'sparse_categorical_crossentropy'},
                 loss_weights={'int_output': 0.5, 'top_output': 0.5},
                 )

In [44]:
atthistory = attmodel.fit([padsents], [enc_speechacts, enc_topics],
                          batch_size=16,
                          epochs=16,
                          callbacks=[TQDMNotebookCallback()],
                          verbose=0)




In [45]:
attmodel.save("model/att_lstm_model.h5")
attembmodel.save("model/sent_att_emb_model.h5")