In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
with open('train_qa.txt', 'rb') as file:
    data = pickle.load(file)
    file.close()
max_story_len = max([len(datas[0]) for datas in data])
max_question_len = max([len(datas[1]) for datas in data])

# Make Vocabulary

In [64]:
def load_context(file_path):
    with open(file_path,'rb') as file:
        data = pickle.load(file)
        q = [" ".join(line[0]) for line in data]
    return q

def load_question(file_path):
    with open(file_path,'rb') as file:
        data = pickle.load(file)
        q = [" ".join(line[1]) for line in data]
    return q

def load_answer(file_path):
    with open(file_path,'rb') as file:
        data = pickle.load(file)
        q = ["".join(line[2]) for line in data]
    return q

In [65]:
train_context = load_context('train_qa.txt')
train_question = load_question('train_qa.txt')
train_answer = load_answer('train_qa.txt')

test_context = load_context('test_qa.txt')
test_question = load_question('test_qa.txt')
test_answer = load_answer('test_qa.txt')

In [66]:
print("Train : ", len(train_context))
print("Test : ", len(test_context))

Train :  10000
Test :  1000


In [67]:
def get_sentence_vectorizer(sentences):
    sentence_vectorizer = tf.keras.layers.TextVectorization(standardize = None)
    sentence_vectorizer.adapt(sentences)
    vocab = sentence_vectorizer.get_vocabulary()
    return sentence_vectorizer, vocab

In [68]:
vectorizer, vocab = get_sentence_vectorizer(train_context+test_context+train_answer+test_answer+test_context+test_question+[""])
vocab_size = len(vocab)

In [69]:
vocabulary = {}
for i in range(vocab_size):
    vocabulary[vocab[i].lower()] = i

In [70]:
vocabulary

{'': 0,
 '[unk]': 1,
 'the': 2,
 '.': 3,
 'to': 4,
 'went': 5,
 'sandra': 6,
 'mary': 7,
 'daniel': 8,
 'john': 9,
 'there': 10,
 'journeyed': 11,
 'travelled': 12,
 'moved': 13,
 'back': 14,
 'bedroom': 15,
 'kitchen': 16,
 'office': 17,
 'hallway': 18,
 'garden': 19,
 'bathroom': 20,
 'milk': 21,
 'apple': 22,
 'football': 23,
 'yes': 24,
 'no': 25,
 'got': 26,
 'up': 27,
 'picked': 28,
 'grabbed': 29,
 'took': 30,
 'discarded': 31,
 'dropped': 32,
 'put': 33,
 'down': 34,
 'left': 35,
 'in': 36,
 'is': 37,
 '?': 38}

# Generating the Dataset

In [71]:
def generate_dataset(contexts, questions, answers, sentence_vectorizer):
    context_ids = []
    question_ids = []
    answer_ids = []
    
    for i in range(len(contexts)):
        context = contexts[i]
        question = questions[i]
        answer = answers[i]
        veccontext = [vocabulary[word.lower()] for word in context.split(' ')]
        vecquestion = [vocabulary[word.lower()] for word in question.split(' ')]
        vecanswer = np.zeros(vocab_size)
        vecanswer[vocabulary[answer]] = 1
        context_ids.append(veccontext)
        question_ids.append(vecquestion)
        answer_ids.append(vecanswer)

    return (pad_sequences(context_ids, maxlen=max_story_len),pad_sequences(question_ids, maxlen=max_question_len), np.array(answer_ids))

In [72]:
train_context_vectorised, train_question_vectorised, train_answer_vectorised = generate_dataset(train_context, train_question, train_answer, vectorizer)
test_context_vectorised, test_question_vectorised, test_answer_vectorised  = generate_dataset(test_context, test_question, test_answer, vectorizer)

In [73]:
val_context_vectorised = train_context_vectorised[9500:]
val_question_vectorised = train_question_vectorised[9500:]
val_answer_vectorised = train_answer_vectorised[9500:]

train_context_vectorised = train_context_vectorised[0:9500]
train_question_vectorised = train_question_vectorised[0:9500]
train_answer_vectorised = train_answer_vectorised[0:9500]

# Model
## Encoders
### Input Encoder M

In [13]:
embedding_dim = 128
input_encoder_m = tf.keras.models.Sequential()
input_encoder_m.add(tf.keras.layers.Embedding(input_dim=vocab_size,output_dim= embedding_dim))
input_encoder_m.add(tf.keras.layers.Dropout(0.3))

### Input Encoder C

In [14]:
input_encoder_c = tf.keras.models.Sequential()
input_encoder_c.add(tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(tf.keras.layers.Dropout(0.3))

### Question Encoder

In [15]:
question_encoder = tf.keras.models.Sequential()
question_encoder.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               input_length=max_question_len))
question_encoder.add(tf.keras.layers.Dropout(0.3))

In [16]:
input_sequence = tf.keras.layers.Input((max_story_len,))
question = tf.keras.layers.Input((max_question_len,))

In [17]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [18]:
match = tf.keras.layers.dot([input_encoded_m, question_encoded], axes=(2, 2))
match = tf.keras.layers.Activation('softmax')(match)

In [19]:
response = tf.keras.layers.add([match, input_encoded_c])  
response = tf.keras.layers.Permute((2, 1))(response)  

In [20]:
answer = tf.keras.layers.concatenate([response, question_encoded])
answer = tf.keras.layers.LSTM(32)(answer)
answer = tf.keras.layers.Dropout(0.5)(answer)
answer = tf.keras.layers.Dense(vocab_size)(answer) 
answer = tf.keras.layers.Activation('softmax')(answer)

In [21]:
model = tf.keras.models.Model([input_sequence, question], answer)
model.compile(optimizer=tf.keras.optimizers.legacy.RMSprop(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 156)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, None, 128)            4992      ['input_1[0][0]']             
                                                                                                  
 sequential_2 (Sequential)   (None, 6, 128)               4992      ['input_2[0][0]']             
                                                                                              

In [23]:
import math
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler

initial_learning_rate = 0.01
epochs = 120
decay = initial_learning_rate / epochs

def lr_step_decay(epoch, lr):
    drop_rate = 0.5
    epochs_drop = 20
    return initial_learning_rate * math.pow(drop_rate, math.floor(epoch/epochs_drop))

learning_rate = LearningRateScheduler(lr_step_decay, verbose=1)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.66, patience=5, min_lr=0.0001, verbose=1)  # factor by which the learning rate will be reduced. new_lr = lr * factor


In [24]:
# train
history = model.fit([train_context_vectorised,train_question_vectorised], train_answer_vectorised,batch_size=256,epochs=250,validation_data=([val_context_vectorised,val_question_vectorised], val_answer_vectorised))  # , callbacks=[reduce_lr]

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

# Testing

In [25]:
pred_results = model.predict(([test_context_vectorised, test_question_vectorised]))



In [48]:
#Generate prediction from model
val_max = np.argmax(pred_results[10])
print(pred_results[8])

for key, val in vocabulary.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", val_max)

test_answer[10]

[3.2695882e-17 3.8785576e-17 3.6220327e-17 4.3115535e-17 4.5350812e-17
 4.5113388e-17 3.0191722e-17 4.3461985e-17 4.3389759e-17 4.3100408e-17
 3.7579510e-17 4.9584429e-17 3.6493822e-17 3.7183800e-17 3.3156082e-17
 5.5727581e-17 3.9489649e-17 3.9396058e-17 3.0380273e-17 3.8085632e-17
 3.3483245e-17 3.5864225e-17 3.9802101e-17 4.1081535e-17 4.8921032e-07
 9.9999952e-01 3.7509474e-17 3.9950564e-17 3.9626666e-17 3.1958229e-17
 3.6998445e-17 3.4924015e-17 4.4902557e-17 3.9289802e-17 3.7664902e-17
 4.6127585e-17 3.1324082e-17 3.9110956e-17 3.7097092e-17]
Predicted answer is:  yes
Probability of certainty was:  24


'yes'

In [83]:
def answer(pred, vocabulary):
    val_max = np.argmax(pred)

    for key, val in vocabulary.items():
        if val == val_max:
            k = key
    return k

In [89]:
def accuracy(pred_results, vocabulary, test_answer):
    accuracy = 0
    for i in range(len(pred_results)):
        if (str(answer(pred_results[i],vocabulary)) == test_answer[i]):
            accuracy += 1

    accuracy = accuracy/len(pred_results)
    print("Accuracy is : ", accuracy)

# For a custom test data,

In [90]:
test_file_name = 'test_qa.txt'

In [91]:
test_context = load_context(test_file_name)
test_question = load_question(test_file_name)
test_answer = load_answer(test_file_name)
test_context_vectorised, test_question_vectorised, test_answer_vectorised  = generate_dataset(test_context, test_question, test_answer, vectorizer)
pred_results = model.predict(([test_context_vectorised, test_question_vectorised]))
accuracy(pred_results,vocabulary,test_answer)

Accuracy is :  0.943
