In [0]:

#Library Imports
import pickle
import numpy as np

In [0]:
#retrieve training data
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)


In [0]:
#retrieve test data
with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [5]:
#Number of training instances
len(train_data)

10000

In [6]:
#Number of test instances
len(test_data)

1000

In [7]:

#Example of one of the instances
train_data[10]

(['Sandra',
  'went',
  'back',
  'to',
  'the',
  'hallway',
  '.',
  'Sandra',
  'moved',
  'to',
  'the',
  'office',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'office', '?'],
 'yes')

In [8]:
' '.join(train_data[10][0])


'Sandra went back to the hallway . Sandra moved to the office .'

In [9]:
' '.join(train_data[10][1])


'Is Sandra in the office ?'

In [10]:
train_data[10][2]


'yes'

In [0]:
#First we need to create a vocabulary with our data
#For this we will use the training data only to - On the video it uses both
#train and test 
#Might have to use training and test later, as the dataset has very
#few words
#First we will build a set of all the words in the dataset:
vocab = set()
for story, question, answer in train_data:
    vocab = vocab.union(set(story)) #Set returns unique words in the sentence
                                    #Union returns the unique common elements from a two sets
    vocab = vocab.union(set(question))

In [0]:
vocab.add('no')
vocab.add('yes')

In [13]:
vocab


{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [0]:
#Calculate len and add 1 for Keras placeholder - Placeholders are used to feed in the data to the network. 
#They need a data type, and have optional shape arguements.
#They will be empty at first, and then the data will get fed into the placeholder
vocab_len = len(vocab) + 1

In [15]:
vocab_len


38

In [0]:

#Now we are going to calculate the longest story and the longest question
#We need this for the Keras pad sequences. 
#Keras training layers expect all of the input to have the same length, so 
#we need to pad 
all_data = test_data + train_data

In [0]:
all_story_lens = [len(data[0]) for data in all_data]
max_story_len = (max(all_story_lens))
max_question_len = max([len(data[1]) for data in all_data])


In [20]:
#vectorization
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [0]:
#Create an instance of the tokenizer object:
tokenizer = Tokenizer(filters = [])
tokenizer.fit_on_texts(vocab)

In [22]:
tokenizer

<keras_preprocessing.text.Tokenizer at 0x7fbaccd39cc0>

In [23]:
#Dictionary that maps every word in our vocab to an index
# It has been automatically lowercased
#This tokenizer can give different indexes for different words depending on when we run it
tokenizer.word_index

{'.': 1,
 '?': 27,
 'apple': 36,
 'back': 16,
 'bathroom': 8,
 'bedroom': 2,
 'daniel': 20,
 'discarded': 12,
 'down': 23,
 'dropped': 14,
 'football': 4,
 'garden': 30,
 'got': 26,
 'grabbed': 19,
 'hallway': 34,
 'in': 9,
 'is': 15,
 'john': 17,
 'journeyed': 18,
 'kitchen': 21,
 'left': 24,
 'mary': 7,
 'milk': 25,
 'moved': 29,
 'no': 37,
 'office': 22,
 'picked': 32,
 'put': 11,
 'sandra': 6,
 'the': 3,
 'there': 35,
 'to': 33,
 'took': 5,
 'travelled': 13,
 'up': 10,
 'went': 31,
 'yes': 28}