In [1]:
import pandas as pd
import numpy as np
import pickle

In [5]:
with open ('train_qa.txt', 'rb') as f:
    train_data=pickle.load(f)

In [6]:
with open ('test_qa.txt', 'rb') as f:
    test_data=pickle.load(f)

In [7]:
len(train_data)

10000

In [8]:
len(test_data)

1000

In [9]:
train_data

[(['Mary',
   'moved',
   'to',
   'the',
   'bathroom',
   '.',
   'Sandra',
   'journeyed',
   'to',
   'the',
   'bedroom',
   '.'],
  ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
  'no'),
 (['Mary',
   'moved',
   'to',
   'the',
   'bathroom',
   '.',
   'Sandra',
   'journeyed',
   'to',
   'the',
   'bedroom',
   '.',
   'Mary',
   'went',
   'back',
   'to',
   'the',
   'bedroom',
   '.',
   'Daniel',
   'went',
   'back',
   'to',
   'the',
   'hallway',
   '.'],
  ['Is', 'Daniel', 'in', 'the', 'bathroom', '?'],
  'no'),
 (['Mary',
   'moved',
   'to',
   'the',
   'bathroom',
   '.',
   'Sandra',
   'journeyed',
   'to',
   'the',
   'bedroom',
   '.',
   'Mary',
   'went',
   'back',
   'to',
   'the',
   'bedroom',
   '.',
   'Daniel',
   'went',
   'back',
   'to',
   'the',
   'hallway',
   '.',
   'Sandra',
   'went',
   'to',
   'the',
   'kitchen',
   '.',
   'Daniel',
   'went',
   'back',
   'to',
   'the',
   'bathroom',
   '.'],
  ['Is', 'Daniel', 'in', 'the', '

In [10]:
all_data = test_data + train_data

In [11]:
len(all_data)

11000

In [12]:
vocab = set()
for story,question,answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [13]:
vocab.add('yes')


In [14]:
vocab.add('no')

In [15]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [16]:
vocab_len = len(vocab)+1

In [17]:
all_story_len = [len(data[0]) for data in all_data]

In [18]:
max(all_story_len)

156

In [19]:
max_story_len = max(all_story_len)

In [20]:
max_question_len = max([len(data[1]) for data in all_data])

In [21]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [22]:
tokenizer=Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [23]:
tokenizer.word_index

{'picked': 1,
 'sandra': 2,
 'discarded': 3,
 'bedroom': 4,
 'the': 5,
 'journeyed': 6,
 'hallway': 7,
 'back': 8,
 '?': 9,
 'went': 10,
 'office': 11,
 'travelled': 12,
 'left': 13,
 '.': 14,
 'no': 15,
 'moved': 16,
 'to': 17,
 'bathroom': 18,
 'garden': 19,
 'dropped': 20,
 'down': 21,
 'in': 22,
 'kitchen': 23,
 'grabbed': 24,
 'put': 25,
 'is': 26,
 'mary': 27,
 'yes': 28,
 'got': 29,
 'took': 30,
 'up': 31,
 'john': 32,
 'apple': 33,
 'daniel': 34,
 'football': 35,
 'milk': 36,
 'there': 37}

In [24]:
train_story_text = []
train_question_text = []
train_answer = []

In [25]:
for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answer.append(answer)

In [26]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [27]:
def vectorize_stories(data,word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    X = []
    Xq = []
    Y = []

    for story,query,answer in data:
       x = [word_index[word.lower()]for word in story ]
       xq = [word_index[word.lower()]for word in query ]

       y = np.zeros(len(word_index)+1)
       y[word_index[answer]] = 1

       X.append(x)
       Xq.append(xq)
       Y.append(y)

    return (pad_sequences(X,maxlen=max_story_len), pad_sequences(Xq,maxlen=max_question_len), np.array(Y))   


In [32]:
inputs_train,queries_train,answer_train = vectorize_stories(train_data)

In [33]:
inputs_test,queries_test,answer_test = vectorize_stories(test_data)

In [34]:
inputs_test

array([[ 0,  0,  0, ...,  5,  4, 14],
       [ 0,  0,  0, ...,  5, 19, 14],
       [ 0,  0,  0, ...,  5, 19, 14],
       ...,
       [ 0,  0,  0, ...,  5, 33, 14],
       [ 0,  0,  0, ...,  5, 19, 14],
       [ 0,  0,  0, ..., 33, 37, 14]])

In [35]:
from keras.models import Sequential, Model


In [37]:
from keras.layers import Embedding

In [38]:
from keras.layers import Input,Activation,Dense,Permute,Dropout,add,dot,concatenate,LSTM

In [39]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [40]:
vocab_size = len(vocab) + 1

In [41]:
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

In [42]:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

In [44]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

In [45]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [46]:
match = dot([input_encoded_m,question_encoded],axes=(2,2))
match = Activation('softmax')(match)

In [47]:
response = add([match,input_encoded_c])
response = Permute((2,1))(response)

In [48]:
answer = concatenate([response,question_encoded])

In [49]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [50]:
answer = LSTM(32)(answer)

In [51]:
answer = Dropout(0.5)(answer)
amswer = Dense(vocab_size)(answer)

In [52]:
answer = Activation('softmax')(answer)

In [53]:
model = Model([input_sequence,question],answer)

In [55]:
model.compile(optimizer='rmsprop',loss = 'categorical_crossentropy', metrics=['accuracy'])

In [57]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 156)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, None, 64)             2432      ['input_1[0][0]']             
                                                                                                  
 sequential_3 (Sequential)   (None, 6, 64)                2432      ['input_2[0][0]']             
                                                                                              