In [23]:
import pickle
import numpy as np

In [24]:
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [25]:
with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [26]:
type(test_data)

list

In [27]:
len(test_data), len(train_data)

(1000, 10000)

In [28]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [29]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [30]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [31]:
''.join(train_data[0][2])

'no'

In [32]:
all_data = train_data + test_data

In [33]:
vocab = set()

for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))
    
vocab.add('no')
vocab.add('yes')    

In [34]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [35]:
vocab_len = len(vocab) + 1
vocab_len

38

In [36]:
all_story_len = [len(data[0])for data in all_data]
max_story_len = max(all_story_len)
max_story_len

156

In [37]:
max_question_len = max([len(data[1]) for data in all_data])
max_question_len

6

In [38]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [39]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [40]:
tokenizer.word_index

{'left': 1,
 '?': 2,
 'garden': 3,
 'in': 4,
 'john': 5,
 'hallway': 6,
 'picked': 7,
 'back': 8,
 'kitchen': 9,
 'got': 10,
 'to': 11,
 'sandra': 12,
 'travelled': 13,
 'mary': 14,
 'apple': 15,
 'bathroom': 16,
 'the': 17,
 'put': 18,
 'moved': 19,
 'milk': 20,
 'down': 21,
 'went': 22,
 '.': 23,
 'there': 24,
 'grabbed': 25,
 'office': 26,
 'took': 27,
 'is': 28,
 'dropped': 29,
 'bedroom': 30,
 'football': 31,
 'journeyed': 32,
 'up': 33,
 'daniel': 34,
 'discarded': 35,
 'yes': 36,
 'no': 37}

In [41]:
train_story_text = []
train_question_text = []
train_answers = []

In [42]:
for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [43]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [44]:
len(train_story_seq)

10000

In [45]:
def vectorize_stories(data,word_index=tokenizer.word_index,max_story_len=max_story_len,max_question_len=max_question_len):
    X = []
    Xq = []
    Y = []
    
    for story, query, answer in data:
        
        x = [word_index[word.lower()] for word in story] 
        xq = [word_index[word.lower()] for word in query] 
        
        y = np.zeros(len(word_index)+1)
        
        #Because answers are just yes/no
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    return (pad_sequences(X,maxlen=max_story_len), pad_sequences(Xq,maxlen=max_question_len), np.array(Y))    

In [46]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [47]:
inputs_train.shape , type(inputs_train)

((10000, 156), numpy.ndarray)

In [48]:
queries_train.shape, type(queries_train)

((10000, 6), numpy.ndarray)

In [49]:
answers_train.shape, type(answers_train)

((10000, 38), numpy.ndarray)

In [50]:
tokenizer.word_index['yes'], tokenizer.word_index['no']

(36, 37)

In [51]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0., 497., 503.])

In [32]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input, Permute, Dropout, add, dot,concatenate, Activation, Embedding

In [33]:
# PLACEHOLDER shape=(max_story_len, batch_size)
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [34]:
vocab_size = len(vocab) + 1

In [35]:
#INPUT ENCODER M

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim = vocab_size, output_dim = 64))
input_encoder_m.add(Dropout(0.3))

# (samples, story_maxlen, embedding_dim)

In [36]:
#INPUT ENCODER C

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim = vocab_size, output_dim = max_question_len))
input_encoder_c.add(Dropout(0.3))

# (samples, story_maxlen, embedding_dim)

In [37]:
#QUESTION ENCODER

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim = vocab_size, output_dim = 64,input_length=max_question_len))
question_encoder.add(Dropout(0.3))

# (samples, query_maxlen, embedding_dim)

In [38]:
# ENCODED <----- ENCODER(INPUT)

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [39]:
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)

In [40]:
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [41]:
answer = concatenate([response, question_encoded])
answer

<tf.Tensor 'concatenate/Identity:0' shape=(None, 6, 220) dtype=float32>

In [42]:
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)

In [42]:
model = Model([input_sequence,question], answer)
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics =['accuracy'])

In [43]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 156)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
______________________________________________________________________________________________

In [45]:
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=100, validation_data = ([inputs_test, queries_test], answers_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

In [33]:
#model.save('mybrandnewmodel.h5')
#model.load_weights('chatbot.h5')

In [46]:
pred_results = model.predict(([inputs_test, queries_test]))

In [47]:
test_data[0][0], test_data[0][1], test_data[0][2]

(['Mary',
  'got',
  'the',
  'milk',
  'there',
  '.',
  'John',
  'moved',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'John', 'in', 'the', 'kitchen', '?'],
 'no')

In [48]:
pred_results[0]

array([1.7440501e-13, 1.5879575e-13, 2.0078793e-13, 1.7514103e-13,
       1.7624487e-13, 1.7536633e-13, 2.0295182e-13, 1.4283312e-13,
       2.1536554e-13, 1.8831365e-13, 1.5099196e-13, 1.6386834e-13,
       1.9013846e-13, 1.4322433e-13, 2.0888201e-13, 1.6815801e-13,
       2.0135093e-13, 1.9109464e-13, 1.6928691e-13, 1.8110086e-13,
       1.8188805e-13, 1.5520662e-13, 2.0852535e-13, 1.7420287e-13,
       1.8919065e-13, 1.6282151e-13, 1.7244283e-13, 2.2182763e-13,
       1.7986571e-13, 1.7942096e-13, 1.7196788e-13, 2.0970195e-13,
       1.5328025e-13, 8.2047850e-02, 1.5013846e-13, 1.7476331e-13,
       9.1795212e-01, 1.7609904e-13], dtype=float32)

In [49]:
val_max = np.argmax(pred_results[0])

In [50]:
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

In [51]:
k, pred_results[0][val_max]

('no', 0.9179521)

In [70]:
train_data

[(['Mary',
   'moved',
   'to',
   'the',
   'bathroom',
   '.',
   'Sandra',
   'journeyed',
   'to',
   'the',
   'bedroom',
   '.'],
  ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
  'no'),
 (['Mary',
   'moved',
   'to',
   'the',
   'bathroom',
   '.',
   'Sandra',
   'journeyed',
   'to',
   'the',
   'bedroom',
   '.',
   'Mary',
   'went',
   'back',
   'to',
   'the',
   'bedroom',
   '.',
   'Daniel',
   'went',
   'back',
   'to',
   'the',
   'hallway',
   '.'],
  ['Is', 'Daniel', 'in', 'the', 'bathroom', '?'],
  'no'),
 (['Mary',
   'moved',
   'to',
   'the',
   'bathroom',
   '.',
   'Sandra',
   'journeyed',
   'to',
   'the',
   'bedroom',
   '.',
   'Mary',
   'went',
   'back',
   'to',
   'the',
   'bedroom',
   '.',
   'Daniel',
   'went',
   'back',
   'to',
   'the',
   'hallway',
   '.',
   'Sandra',
   'went',
   'to',
   'the',
   'kitchen',
   '.',
   'Daniel',
   'went',
   'back',
   'to',
   'the',
   'bathroom',
   '.'],
  ['Is', 'Daniel', 'in', 'the', '

In [52]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [53]:
my_story = "John left the kitchen . Sandra dropped the football in the garden ."

In [54]:
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'the',
 'football',
 'in',
 'the',
 'garden',
 '.']

In [55]:
my_question = 'Is the football in the garden ?'

In [56]:
my_question.split()

['Is', 'the', 'football', 'in', 'the', 'garden', '?']

In [57]:
mydata = [(my_story.split(), my_question.split(), 'yes')]

In [58]:
mydata

[(['John',
   'left',
   'the',
   'kitchen',
   '.',
   'Sandra',
   'dropped',
   'the',
   'football',
   'in',
   'the',
   'garden',
   '.'],
  ['Is', 'the', 'football', 'in', 'the', 'garden', '?'],
  'yes')]

In [59]:
my_story, my_ques, my_ans = vectorize_stories(mydata)

In [60]:
my_ans

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0.]])

In [62]:
pred_results = model.predict(([my_story,my_ques]))

In [63]:
val_max = np.argmax(pred_results[0])

In [64]:
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

In [65]:
k

'yes'

In [68]:
pred_results[0][val_max]

0.9921186

In [79]:
problems = ['Why all_data is created? ----> To create the vocab', 
            'pad_sequences?', 
            'Tokenizer?', 
            'What does a placeholder mean?', 
            'vectorization methods']

In [78]:
problems

['Why all_data is created? ----> To create the vocab',
 'pad_sequences',
 'Tokenizer',
 'What does a placeholder mean?',
 '']