In [1]:
import pickle
import numpy as np

In [2]:
with open("train_qa.txt", "rb") as f:
    train_data = pickle.load(f)

In [3]:
with open("test_qa.txt", "rb") as f:
    test_data = pickle.load(f)

In [4]:
type(train_data)

list

In [5]:
len(train_data)

10000

In [6]:
len(test_data)

1000

In [8]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [9]:
all_data = test_data + train_data

In [10]:
len(all_data)

11000

In [11]:
vocab = set()
for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [12]:
vocab.add("no")
vocab.add("yes")

In [13]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [14]:
vocab_len = len(vocab) + 1

In [16]:
vocab_len

38

In [20]:
all_story_len = [len(data[0]) for data in all_data]

In [22]:
max_story_len = max(all_story_len)

In [23]:
max_question_len = max([len(data[1]) for data in all_data])

In [24]:
max_question_len

6

In [25]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [26]:
tokenizer = Tokenizer(filters = [])

In [27]:
tokenizer.fit_on_texts(vocab)

In [28]:
tokenizer.word_index

{'put': 1,
 'left': 2,
 'bathroom': 3,
 'took': 4,
 'is': 5,
 'back': 6,
 'bedroom': 7,
 'journeyed': 8,
 'discarded': 9,
 'john': 10,
 'hallway': 11,
 'down': 12,
 'travelled': 13,
 'there': 14,
 'daniel': 15,
 'kitchen': 16,
 'dropped': 17,
 'apple': 18,
 'got': 19,
 '?': 20,
 'picked': 21,
 'the': 22,
 'yes': 23,
 'football': 24,
 'to': 25,
 '.': 26,
 'up': 27,
 'moved': 28,
 'went': 29,
 'grabbed': 30,
 'in': 31,
 'milk': 32,
 'sandra': 33,
 'office': 34,
 'no': 35,
 'garden': 36,
 'mary': 37}

In [29]:
train_story_text = []
train_question_text = []
train_answers = []

In [30]:
for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [31]:
train_story_sequence = tokenizer.texts_to_sequences(train_story_text)
# train_question_sequence = tokenizer.texts_to_sequences(train_question_text)

In [34]:
def vectorize_stories(data, word_index = tokenizer.word_index, max_story_len = max_story_len, max_question_len = max_question_len):
    x = []
    xq = []
    y = []
    for story, question, answer in data:
        x_temp = [word_index[word.lower()] for word in story]
        xq_temp = [word_index[word.lower()] for word in question]        
        y_temp = np.zeros(len(word_index)+1)
        y_temp[word_index[answer]] = 1
        x.append(x_temp)
        xq.append(xq_temp)        
        y.append(y_temp)
    return (pad_sequences(x,maxlen = max_story_len),pad_sequences(xq,maxlen = max_question_len), np.array(y))

In [35]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [36]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [37]:
from keras.models import Sequential, Model

In [38]:
from keras.layers.embeddings import Embedding

In [39]:
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [40]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))