# QA Bot

In this notebook we have an implementation of the ChatBot using the End to End memory networks implementation.

First, let's load the data.

In [3]:
import pickle
import numpy

with open('../../datasets/train_qa.txt', 'rb') as file:
    train_data = pickle.load(file)
    
with open('../../datasets/test_qa.txt', 'rb') as file:
    test_data = pickle.load(file)


In [4]:
train_count = len(train_data)
test_count = len(test_data)
print(f"Train data count: {train_count}")
print(f"Test data count: {test_count}")

Train data count: 10000
Test data count: 1000


We can take a look at the kind of data we can find in there.

In [25]:
import random

def print_record(data, index=None):
    if not index:
        index = random.randint(0, len(data) - 1)
    record = data[index]
    story = " ".join(record[0])
    story = [s.strip() for s in story.split(".")]
    story = "\n".join(story)
    question = " ".join(record[1])
    answer = record[2]
    print(f"Story:\n{story}")
    print(f"Question: {question}\n")
    print(f"Answer: {answer}")

In [26]:
print_record(train_data, 0)

Story:
Daniel went to the kitchen
John went back to the hallway
Sandra travelled to the office
John went back to the kitchen

Question: Is Sandra in the office ?

Answer: yes


In [28]:
print_record(train_data)

Story:
Mary travelled to the office
John went to the bathroom
John got the football there
Mary went back to the hallway

Question: Is Mary in the bedroom ?

Answer: no


We need to ensure both the train and test data are used to create a vocabulary, this is to ensure the dictionary contains the elements used in both

In [30]:
all_data = train_data + test_data

# find all distinct elements between all the words
vocab = set()
for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))
    
vocab.add("yes")
vocab.add("no")

In [31]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [33]:
vocab_len = len(vocab) + 1
vocab_len

38

Calculate the longest story and question

In [34]:
all_story_len = [len(data[0]) for data in all_data]
max_story_len = max(all_story_len)
all_question_len = [len(data[2]) for data in all_data]
max_question_len = max(all_question_len)

Now let's vectorize the data

In [35]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)
tokenizer.index_word

{1: 'the',
 2: 'put',
 3: 'no',
 4: 'moved',
 5: 'journeyed',
 6: 'travelled',
 7: 'football',
 8: '.',
 9: 'yes',
 10: 'discarded',
 11: 'picked',
 12: 'is',
 13: 'garden',
 14: 'up',
 15: 'daniel',
 16: 'got',
 17: 'down',
 18: 'in',
 19: 'grabbed',
 20: 'took',
 21: 'john',
 22: 'left',
 23: 'mary',
 24: 'bathroom',
 25: 'dropped',
 26: 'went',
 27: 'milk',
 28: 'to',
 29: 'apple',
 30: 'sandra',
 31: 'kitchen',
 32: 'there',
 33: 'bedroom',
 34: '?',
 35: 'hallway',
 36: 'back',
 37: 'office'}

In [36]:
train_story_text = []
train_question_text = []
train_answers = []

for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)


In [42]:
import numpy as np

def vectorize_stories(data, word_index, max_story_length, max_question_length):
    # stories
    X = []
    # questions
    Xq = []
    # answers
    Y = []
    for story, question, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in question]
        
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    return (pad_sequences(X, maxlen=max_story_length), pad_sequences(Xq, maxlen=max_question_length), np.array(y))

In [43]:
X, Xq, y = vectorize_stories(train_data, tokenizer.word_index, max_story_len, max_question_len)

In [44]:
X

array([[ 0,  0,  0, ...,  1, 33,  8],
       [ 0,  0,  0, ...,  1, 35,  8],
       [ 0,  0,  0, ...,  1, 24,  8],
       ...,
       [ 0,  0,  0, ...,  1, 33,  8],
       [ 0,  0,  0, ..., 27, 32,  8],
       [ 0,  0,  0, ..., 29, 32,  8]], dtype=int32)