In [91]:
import pickle
import numpy as np

In [92]:
with open("train_qa.txt", "rb") as fp:   

In [93]:
with open("test_qa.txt", "rb") as fp:  
    test_data =  pickle.load(fp)

In [94]:
vocab = set()

In [95]:
all_data = test_data + train_data

In [96]:
for story, question , answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [97]:
vocab.add('no')
vocab.add('yes')

In [98]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [99]:
vocab_len = len(vocab) + 1 

In [100]:
max_story_len = max([len(data[0]) for data in all_data])

In [101]:
max_story_len

156

In [102]:
max_question_len = max([len(data[1]) for data in all_data])

In [103]:
vocab_size = len(vocab) + 1

In [104]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [105]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [106]:
tokenizer.word_index

{'grabbed': 1,
 'office': 2,
 'put': 3,
 'bathroom': 4,
 'yes': 5,
 'took': 6,
 'journeyed': 7,
 'back': 8,
 'kitchen': 9,
 'there': 10,
 '?': 11,
 'dropped': 12,
 'milk': 13,
 'is': 14,
 'john': 15,
 'in': 16,
 'down': 17,
 'went': 18,
 'mary': 19,
 'hallway': 20,
 'left': 21,
 'sandra': 22,
 'the': 23,
 'moved': 24,
 'no': 25,
 'apple': 26,
 'to': 27,
 'discarded': 28,
 'up': 29,
 'got': 30,
 'picked': 31,
 'travelled': 32,
 'daniel': 33,
 'garden': 34,
 'bedroom': 35,
 '.': 36,
 'football': 37}

In [107]:
train_story_text = []
train_question_text = []
train_answers = []

for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)

In [108]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)


In [109]:
len(train_story_seq)

10000

In [110]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        
       
        X.append(x)
        Xq.append(xq)
        Y.append(y)
  
    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [111]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [112]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [113]:
inputs_test

array([[ 0,  0,  0, ..., 23, 35, 36],
       [ 0,  0,  0, ..., 23, 34, 36],
       [ 0,  0,  0, ..., 23, 34, 36],
       ...,
       [ 0,  0,  0, ..., 23, 26, 36],
       [ 0,  0,  0, ..., 23, 34, 36],
       [ 0,  0,  0, ..., 26, 10, 36]])

In [114]:
queries_test

array([[14, 15, 16, 23,  9, 11],
       [14, 15, 16, 23,  9, 11],
       [14, 15, 16, 23, 34, 11],
       ...,
       [14, 19, 16, 23, 35, 11],
       [14, 22, 16, 23, 34, 11],
       [14, 19, 16, 23, 34, 11]])

In [115]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [116]:
tokenizer.word_index['yes']

5

In [117]:
tokenizer.word_index['no']

25

In [118]:
from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

In [119]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [120]:
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

In [121]:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

In [122]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=max_question_len))
question_encoder.add(Dropout(0.3))

In [123]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [124]:
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [125]:
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  

In [126]:
answer = concatenate([response, question_encoded])

In [127]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate_1')>

In [128]:
answer = LSTM(32)(answer)

In [129]:
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)

In [130]:
answer = Activation('softmax')(answer)

model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [131]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 156)]                0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 sequential_3 (Sequential)   (None, None, 64)             2432      ['input_3[0][0]']             
                                                                                                  
 sequential_5 (Sequential)   (None, 6, 64)                2432      ['input_4[0][0]']             
                                                                                              

In [132]:
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=100,validation_data=([inputs_test, queries_test], answers_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78