In [36]:
import pickle
import numpy as np

In [37]:
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [39]:
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [40]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [41]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [42]:
train_data[0][2]

'no'

In [43]:
all_data = test_data + train_data

In [44]:
vocab = set()

for story, question , answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [45]:
vocab.add('no')

In [46]:
vocab.add('yes')

In [47]:
vocab_len = len(vocab) + 1

In [9]:
#Longest Story
all_story_lens = [len(data[0]) for data in all_data]

In [48]:
max_story_len = max([len(data[0]) for data in all_data])

In [49]:
max_question_len = max([len(data[1]) for data in all_data])

In [50]:
vocab_size = len(vocab) + 1

In [51]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [52]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [53]:
tokenizer.word_index

{'no': 1,
 'picked': 2,
 'travelled': 3,
 'bathroom': 4,
 'bedroom': 5,
 'there': 6,
 'left': 7,
 'football': 8,
 'down': 9,
 'the': 10,
 'sandra': 11,
 '?': 12,
 'john': 13,
 'daniel': 14,
 'put': 15,
 'yes': 16,
 'mary': 17,
 'went': 18,
 'garden': 19,
 'up': 20,
 'back': 21,
 'discarded': 22,
 'grabbed': 23,
 'to': 24,
 'milk': 25,
 'office': 26,
 'journeyed': 27,
 'moved': 28,
 'apple': 29,
 'hallway': 30,
 'is': 31,
 'got': 32,
 'kitchen': 33,
 '.': 34,
 'took': 35,
 'dropped': 36,
 'in': 37}

In [54]:
train_story_text = []
train_question_text = []
train_answers = []

for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)

In [55]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [56]:
def vectorize_stories(data, word_index=tokenizer.word_index,max_story_len=max_story_len,max_question_len=max_question_len):
    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []
    
    
    for story, query, answer in data:
        
        # Grab the word index for every word in story
        x = [word_index[word.lower()] for word in story]
        # Grab the word index for every word in query
        xq = [word_index[word.lower()] for word in query]
        
        # Grab the Answers (either Yes/No so we don't need to use list comprehension here)
        # Index 0 is reserved so we're going to use + 1
        y = np.zeros(len(word_index) + 1)
        
        # Now that y is all zeros and we know its just Yes/No , we can use numpy logic to create this assignment
        #
        y[word_index[answer]] = 1
        
        # Append each set of story,query, and answer to their respective holding lists
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    # Finally, pad the sequences based on their max length so the RNN can be trained on uniformly long sequences.
        
    # RETURN TUPLE FOR UNPACKING
    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [57]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [58]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [59]:
from keras.models import Sequential,Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot,concatenate, LSTM

In [60]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [61]:
vocab_size = len(vocab) + 1

In [62]:
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))


Instructions for updating:
Colocations handled automatically by placer.


In [63]:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

In [64]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

In [65]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [66]:
match = dot([input_encoded_m,question_encoded],axes=(2,2))
match = Activation('softmax')(match)

In [67]:
response = add([match,input_encoded_c])
response = Permute((2,1))(response)

In [68]:
answer = concatenate([response,question_encoded])

In [69]:
answer

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 6, 220) dtype=float32>

In [70]:
answer = LSTM(32)(answer)

In [71]:
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)

In [72]:
answer = Activation('softmax')(answer)

In [73]:
model = Model([input_sequence,question],answer)

In [75]:
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])

In [76]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
____________________________________________________________________________________________