### Data Load

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import time
tf.__version__

In [None]:
# The vocabulary file and the two encoded question/answer files must be in the
# current working directory. Each file is read fully and split on newlines.
# `with` guarantees every file handle is closed once it has been read.

# reading vocabulary
with open('vocabulary.txt', encoding='utf-8', errors='ignore') as vocab_file:
    vocab_lines = vocab_file.read().split('\n')
# reading questions
with open('InsuranceQAquestionanslabelraw.encoded', encoding='utf-8', errors='ignore') as question_file:
    question_lines = question_file.read().split('\n')
# reading answers
with open('InsuranceQAlabel2answerraw.encoded', encoding='utf-8', errors='ignore') as answer_file:
    answer_lines = answer_file.read().split('\n')

In [None]:
# The next three cells print the first two raw lines of each of the three files (vocabulary, questions, answers) so their token encoding can be inspected.

In [None]:
# Header followed by the first two raw vocabulary lines.
print(" -- Vocabulary -- ", vocab_lines[:2], sep="\n")

In [None]:
# Header followed by the first two raw question lines.
print(" -- Questions -- ", question_lines[:2], sep="\n")

In [None]:
# Header followed by the first two raw answer lines.
print(" -- Answers -- ", answer_lines[:2], sep="\n")

In [None]:
# Map the first tab-separated field of every vocabulary line to the second.
# Lines that do not split into exactly two fields are ignored.
id2line = {}
for vocab_entry in vocab_lines:
    fields = vocab_entry.split('\t')
    if len(fields) != 2:
        continue
    token_id, word = fields
    id2line[token_id] = word

In [None]:
# Split every question line on tabs: field 1 is kept as the question's token
# string, field 2 is split on spaces into the list of answer ids for it.
# The last element of each *_lines list is skipped (the file was split on '\n').
convs, ansid = [], []
for question_record in question_lines[:-1]:
    question_fields = question_record.split('\t')
    ansid.append(question_fields[2].split(' '))
    convs.append(question_fields[1])

# Field 1 of every answer line is the answer's token string.
convs1 = []
for answer_record in answer_lines[:-1]:
    convs1.append(answer_record.split('\t')[1])

In [None]:
# Token strings of the first two questions
first_question_tokens = convs[:2]
print(first_question_tokens)

In [None]:
# Answer ids attached to the first two questions
first_answer_ids = ansid[:2]
print(first_answer_ids)

In [None]:
# Token strings of the first two answers
first_answer_tokens = convs1[:2]
print(first_answer_tokens)

In [None]:
# Build one (question, answer) pair per answer id listed for a question.
# `questions` / `answers` hold the encoded token strings; `ques` / `ans` hold
# the same sentences decoded to words through `id2line`.

questions, answers = [], []
for question_tokens, answer_ids in zip(convs, ansid):
    for answer_id in answer_ids:
        questions.append(question_tokens)
        # Answer ids are used as 1-based positions into convs1.
        answers.append(convs1[int(answer_id) - 1])

# Decode each sentence with a single split instead of re-splitting the
# sentence on every loop iteration.
ques = [' '.join(id2line[token] for token in sentence.split(' ')) for sentence in questions]
ans = [' '.join(id2line[token] for token in sentence.split(' ')) for sentence in answers]

In [None]:
# Show the first five decoded question/answer pairs, each followed by a rule.
limit = 0
for pair_index in range(limit, limit + 5):
    print(ques[pair_index], ans[pair_index], "---", sep="\n")

In [None]:
# Number of question entries, then number of answer entries.
for paired_list in (questions, answers):
    print(len(paired_list))

### Processing Text

In [None]:
import re

def clean_text(text):
    """Lower-case `text`, expand common English contractions and strip punctuation.

    Args:
        text: the raw sentence (str).

    Returns:
        The cleaned sentence (str).
    """
    text = text.lower()

    # Applied in order: the specific forms ("won't", "can't") must come before
    # the generic "n't" rule, and "n't" before the trailing "n'" -> "ng" rule.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),  # previously expanded to "that is" by mistake
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
        # Finally drop the remaining punctuation characters.
        (r"[-()\"#/@;:<>{}`+=~|.!?,']", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [None]:
# Run clean_text() over every decoded question and every decoded answer.
clean_questions = [clean_text(raw_question) for raw_question in ques]
clean_answers = [clean_text(raw_answer) for raw_answer in ans]

In [None]:
# Show the first five cleaned question/answer pairs, each followed by a rule.
limit = 0
for pair_index in range(limit, limit + 5):
    print(clean_questions[pair_index], clean_answers[pair_index], '----', sep="\n")

In [None]:
# Word count of every cleaned question, followed by every cleaned answer.
word_counts = [len(sentence.split()) for sentence in clean_questions]
word_counts += [len(sentence.split()) for sentence in clean_answers]

# Wrap the counts in a one-column dataframe so they can be inspected.
lengths = pd.DataFrame(word_counts, columns=['counts'])

In [None]:
# Summary statistics of the sentence lengths at the listed percentiles.
summary_percentiles = [0, 0.25, 0.5, 0.75, 0.85, 0.9, 0.95, 0.99]
lengths.describe(percentiles=summary_percentiles)

In [None]:
# Keep only pairs whose question AND answer have between min_line_length and
# max_line_length words (inclusive), i.e. drop anything shorter than 2 words
# or longer than 100 words.
min_line_length, max_line_length = 2, 100

# Stage 1: drop pairs whose question is too short/long.
short_questions_temp, short_answers_temp = [], []
for question, answer in zip(clean_questions, clean_answers):
    if min_line_length <= len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(answer)

# Stage 2: from the survivors, drop pairs whose answer is too short/long.
short_questions, short_answers = [], []
for question, answer in zip(short_questions_temp, short_answers_temp):
    if min_line_length <= len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(question)

In [None]:
# Report how many pairs survived the length filter.
print("# of questions:", len(short_questions))
print("# of answers:", len(short_answers))
# Scale to a percentage before rounding; rounding first and multiplying by 100
# afterwards prints floating-point artefacts such as 99.72999999999999.
print("% of data used: {}%".format(round(100 * len(short_questions) / len(questions), 2)))

In [None]:
def pad_sentence_batch(sentence_batch, vocab_to_int):
    """Right-pad every sentence with the <PAD> id to the longest length in the batch.

    Args:
        sentence_batch: list of sentences, each a list of token ids.
        vocab_to_int: mapping that contains the '<PAD>' token.

    Returns:
        A new list of padded sentences; the input lists are not modified.
        An empty batch yields an empty list.
    """
    if not sentence_batch:
        return []
    pad_id = vocab_to_int['<PAD>']
    max_sentence = max(len(sentence) for sentence in sentence_batch)
    return [sentence + [pad_id] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [None]:
# Count how often each word occurs across the filtered questions, then answers.
vocab = {}
for corpus in (short_questions, short_answers):
    for sentence in corpus:
        for token in sentence.split():
            vocab[token] = vocab.get(token, 0) + 1

In [None]:
# Count the words whose frequency reaches the threshold (nothing is removed here).
threshold = 1
count = sum(1 for frequency in vocab.values() if frequency >= threshold)

In [None]:
# Total vocabulary size versus the number of words at or above the threshold.
print("Size of total vocab: {}".format(len(vocab)))
print("Size of vocab we will use: {}".format(count))

In [None]:
# Give every word that meets the frequency threshold a unique integer id,
# numbered from 0 in vocabulary order. The loop variable is named `frequency`
# (not `count`) so it does not overwrite the global `count` that the
# vocabulary-size report prints.
kept_words = [word for word, frequency in vocab.items() if frequency >= threshold]

questions_vocab_to_int = {word: word_id for word_id, word in enumerate(kept_words)}

# Questions and answers share the same word -> id assignment, but each gets
# its own dict so that they can be modified independently.
answers_vocab_to_int = dict(questions_vocab_to_int)



In [None]:
# Adding the special tokens to both vocabularies.
codes = ['<PAD>','<EOS>','<UNK>','<GO>']

# Only tokens that are still missing are added. Without this guard, running
# the cell a second time would assign the same id (len + 1) to all four tokens.
# NOTE(review): `len(...) + 1` leaves one unused id between the last word and
# <PAD>; kept as is because seq2seq_model sizes its embeddings with
# `vocab_size + 1`.
for code in codes:
    if code not in questions_vocab_to_int:
        questions_vocab_to_int[code] = len(questions_vocab_to_int)+1

for code in codes:
    if code not in answers_vocab_to_int:
        answers_vocab_to_int[code] = len(answers_vocab_to_int)+1

In [None]:
# Reverse lookups (id -> word), the inverse of the two vocab_to_int mappings.
questions_int_to_vocab = dict(
    (word_id, word) for word, word_id in questions_vocab_to_int.items())
answers_int_to_vocab = dict(
    (word_id, word) for word, word_id in answers_vocab_to_int.items())

In [None]:
# Sizes of the four lookup tables, one per line.
for lookup in (questions_vocab_to_int, questions_int_to_vocab,
               answers_vocab_to_int, answers_int_to_vocab):
    print(len(lookup))

In [None]:
# Encode every sentence as a list of ids; a word missing from the respective
# vocabulary is encoded with that vocabulary's <UNK> id.
unk_question_id = questions_vocab_to_int['<UNK>']
questions_int = [
    [questions_vocab_to_int.get(token, unk_question_id) for token in sentence.split()]
    for sentence in short_questions
]

unk_answer_id = answers_vocab_to_int['<UNK>']
answers_int = [
    [answers_vocab_to_int.get(token, unk_answer_id) for token in sentence.split()]
    for sentence in short_answers
]

In [None]:
# Share of all encoded tokens (questions + answers) that are the <UNK> id.
unk_question_id = questions_vocab_to_int["<UNK>"]
unk_answer_id = answers_vocab_to_int["<UNK>"]

word_count = (sum(len(sentence) for sentence in questions_int)
              + sum(len(sentence) for sentence in answers_int))
unk_count = (sum(sentence.count(unk_question_id) for sentence in questions_int)
             + sum(sentence.count(unk_answer_id) for sentence in answers_int))

unk_ratio = round(unk_count/word_count,4)*100

print("Total number of words:", word_count)
print("Number of times <UNK> is used:", unk_count)
print("Percent of words that are <UNK>: {}%".format(round(unk_ratio,3)))

In [None]:
# Sort the pairs by the length of the encoded question so that the sentences
# in a batch have similar lengths and need less padding.
#
# A single stable sort replaces scanning the whole dataset once per possible
# length; pairs with equal question length keep their original relative order.
# Only lengths 1..max_line_length are kept.
sorted_indices = sorted(
    (index for index, question in enumerate(questions_int)
     if 1 <= len(question) <= max_line_length),
    key=lambda index: len(questions_int[index]))

sorted_questions = [questions_int[index] for index in sorted_indices]
short_questions1 = [short_questions[index] for index in sorted_indices]
sorted_answers = [answers_int[index] for index in sorted_indices]
short_answers1 = [short_answers[index] for index in sorted_indices]


print(len(sorted_questions))
print(len(sorted_answers))
print(len(short_questions1))
print(len(short_answers1))
print()
for i in range(3):
    print(sorted_questions[i])
    print(sorted_answers[i])
    print(short_questions1[i])
    print(short_answers1[i])
    print()

In [None]:
# Spot-check one sorted pair: encoded question, its text, encoded answer, its text.
sample_index = 1547
for aligned_list in (sorted_questions, short_questions1, sorted_answers, short_answers1):
    print(aligned_list[sample_index])

### Seq2Seq helper functions for Encoder and Decoder

In [None]:
def model_inputs():
    """Create the graph placeholders fed on every session run.

    Returns:
        A tuple (input_data, targets, lr, keep_prob): two int32 placeholders
        of shape [None, None] named 'input' and 'targets', and two float32
        placeholders named 'learning_rate' and 'keep_prob'.
    """
    id_matrix_shape = [None, None]
    placeholders = (
        tf.placeholder(tf.int32, id_matrix_shape, name='input'),
        tf.placeholder(tf.int32, id_matrix_shape, name='targets'),
        tf.placeholder(tf.float32, name='learning_rate'),
        tf.placeholder(tf.float32, name='keep_prob'),
    )
    return placeholders

In [None]:
def process_encoding_input(target_data, vocab_to_int, batch_size):
    """Build the decoder input from the targets.

    The last column of `target_data` is sliced off and a column filled with
    the '<GO>' id is concatenated in front of the remainder.
    """
    without_last_column = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    go_column = tf.fill([batch_size, 1], vocab_to_int['<GO>'])
    return tf.concat([go_column, without_last_column], 1)

In [None]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    """Run the embedded inputs through a bidirectional multi-layer LSTM encoder.

    Args:
        rnn_inputs: embedded input sequences.
        rnn_size: number of units of every LSTM cell.
        num_layers: number of stacked LSTM layers per direction.
        keep_prob: keep probability applied to each layer's input (dropout).
        sequence_length: passed through to bidirectional_dynamic_rnn.

    Returns:
        The final state returned by tf.nn.bidirectional_dynamic_rnn.
    """
    def _build_stack():
        # A fresh cell object per layer. Reusing one object via
        # `[cell] * num_layers` is rejected or silently shares weights between
        # layers in TensorFlow releases newer than 1.0 (see release notes).
        layers = [
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(rnn_size), input_keep_prob=keep_prob)
            for _ in range(num_layers)
        ]
        return tf.contrib.rnn.MultiRNNCell(layers)

    # The forward and backward directions get their own stacks for the same reason.
    _, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=_build_stack(), cell_bw=_build_stack(),
                                                   sequence_length=sequence_length, inputs=rnn_inputs,
                                                   dtype=tf.float32)
    return enc_state

In [None]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, decoding_scope,
                         output_fn, keep_prob, batch_size):
    """Build the training branch of the attention decoder.

    The decoder starts from `encoder_state[0]`, attends over an all-zero
    attention tensor of shape [batch_size, 1, dec_cell.output_size], and its
    outputs go through dropout (`keep_prob`) before `output_fn` is applied.

    Returns:
        The result of `output_fn` on the dropped-out decoder outputs.
    """
    num_units = dec_cell.output_size
    zero_attention = tf.zeros([batch_size, 1, num_units])

    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        zero_attention, attention_option="bahdanau", num_units=num_units)

    decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0], keys, values, score_fn, construct_fn, name="attn_dec_train")

    decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        dec_cell, decoder_fn, dec_embed_input, sequence_length, scope=decoding_scope)

    return output_fn(tf.nn.dropout(decoder_outputs, keep_prob))

In [None]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, end_of_sequence_id,
                         maximum_length, vocab_size, decoding_scope, output_fn, keep_prob, batch_size):
    """Build the inference branch of the attention decoder.

    Uses the same all-zero attention tensor as the training branch and starts
    from `encoder_state[0]`. `keep_prob` is accepted but not used here.

    Returns:
        The first value returned by dynamic_rnn_decoder (the inference logits).
    """
    num_units = dec_cell.output_size
    zero_attention = tf.zeros([batch_size, 1, num_units])

    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        zero_attention, attention_option="bahdanau", num_units=num_units)

    decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_inference(
        output_fn, encoder_state[0], keys, values, score_fn, construct_fn,
        dec_embeddings, start_of_sequence_id, end_of_sequence_id, maximum_length, vocab_size,
        name="attn_dec_inf")

    logits, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell, decoder_fn, scope=decoding_scope)

    return logits

In [None]:
def decoding_layer(dec_embed_input, dec_embeddings, encoder_state, vocab_size, sequence_length, rnn_size,
                   num_layers, vocab_to_int, keep_prob, batch_size):
    """Create the decoder cell and build both the training and inference decoders.

    Both branches live in the "decoding" variable scope; the scope is switched
    to reuse before the inference branch is built, so it shares the variables
    created by the training branch.

    Returns:
        (train_logits, infer_logits)
    """
    with tf.variable_scope("decoding") as decoding_scope:
        # A fresh cell object per layer. Reusing one object via
        # `[cell] * num_layers` is rejected or silently shares weights between
        # layers in TensorFlow releases newer than 1.0 (see release notes).
        dec_cell = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(rnn_size), input_keep_prob=keep_prob)
            for _ in range(num_layers)
        ])

        # Linear projection (no activation) of decoder outputs onto the vocabulary.
        weights = tf.truncated_normal_initializer(stddev=0.1)
        biases = tf.zeros_initializer()
        output_fn = lambda x: tf.contrib.layers.fully_connected(x, vocab_size, None,  scope=decoding_scope, weights_initializer = weights, biases_initializer = biases)

        train_logits = decoding_layer_train(encoder_state, dec_cell,  dec_embed_input, sequence_length,  decoding_scope, output_fn, keep_prob, batch_size)

        decoding_scope.reuse_variables()
        # Inference starts at <GO>, stops at <EOS>, and is capped at sequence_length - 1.
        infer_logits = decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'], 
                    sequence_length - 1, vocab_size,  decoding_scope, output_fn, keep_prob, batch_size)

    return train_logits, infer_logits

In [None]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size, sequence_length, answers_vocab_size, 
                  questions_vocab_size, enc_embedding_size, dec_embedding_size, rnn_size, num_layers, 
                  questions_vocab_to_int):
    """Assemble the model: embed the inputs, encode them, then build the decoder.

    NOTE(review): the encoder embedding is sized with `answers_vocab_size`
    while the decoder uses `questions_vocab_size` / `questions_vocab_to_int`.
    This looks swapped; it is harmless only while both vocabularies have the
    same size and ids - confirm.

    Returns:
        (train_logits, infer_logits) as returned by decoding_layer.
    """
    # Encoder side: embedding lookup followed by the recurrent encoder.
    encoder_embedded = tf.contrib.layers.embed_sequence(
        input_data,
        answers_vocab_size+1,
        enc_embedding_size,
        initializer=tf.random_uniform_initializer(0,1))
    encoder_state = encoding_layer(encoder_embedded, rnn_size, num_layers, keep_prob, sequence_length)

    # Decoder side: <GO>-prefixed targets looked up in a separate embedding matrix.
    decoder_input = process_encoding_input(target_data, questions_vocab_to_int, batch_size)
    dec_embeddings = tf.Variable(tf.random_uniform([questions_vocab_size+1, dec_embedding_size], 0, 1))
    decoder_embedded = tf.nn.embedding_lookup(dec_embeddings, decoder_input)

    return decoding_layer(decoder_embedded, dec_embeddings, encoder_state, questions_vocab_size,
                          sequence_length, rnn_size, num_layers, questions_vocab_to_int, keep_prob,
                          batch_size)

In [None]:
# Setting the model parameters
epochs = 50                      # maximum number of passes over the training set
batch_size = 64                  # sentences per batch (also baked into the graph)
rnn_size = 512                   # units of every LSTM cell
num_layers = 2                   # stacked LSTM layers in encoder and decoder
encoding_embedding_size = 512    # width of the encoder embedding
decoding_embedding_size = 512    # width of the decoder embedding
learning_rate = 0.005            # initial learning rate
learning_rate_decay = 0.9        # factor applied to learning_rate after each validation check
min_learning_rate = 0.0001       # floor below which learning_rate is not decayed
keep_probability = 0.75          # value fed to keep_prob during training

In [None]:
tf.reset_default_graph()
# Starting the session
sess = tf.InteractiveSession()
    
# Loading the model inputs    
input_data, targets, lr, keep_prob = model_inputs()

# Sequence length defaults to max_line_length; the training loop feeds the
# width of the current answer batch instead.
sequence_length = tf.placeholder_with_default(max_line_length, None, name='sequence_length')

# Finding shape of the input data for sequence_loss
input_shape = tf.shape(input_data)

# Create the training and inference logits (each input sequence is fed reversed)
train_logits, inference_logits = seq2seq_model( tf.reverse(input_data, [-1]), targets, keep_prob, batch_size, sequence_length, len(answers_vocab_to_int), 
    len(questions_vocab_to_int), encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers,  questions_vocab_to_int)

# Create inference logits tensor
tf.identity(inference_logits, 'logits')

with tf.name_scope("optimization"):
    # Calculating Loss function; every position is weighted 1
    cost = tf.contrib.seq2seq.sequence_loss( train_logits, targets, tf.ones([input_shape[0], sequence_length]))

    # Using Adam Optimizer, driven by the `lr` placeholder. Passing the Python
    # float `learning_rate` here would freeze its initial value into the graph,
    # so the decay applied in the training loop would have no effect.
    optimizer = tf.train.AdamOptimizer(lr)

    # Clipping each gradient to [-5, 5] to guard against exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

In [None]:
def batch_data(questions, answers, batch_size):
    """Yield (padded_questions, padded_answers) numpy arrays, one pair per full batch.

    Only `len(questions) // batch_size` batches are produced, so a trailing
    partial batch is dropped. Questions are padded with the question
    vocabulary's <PAD> id, answers with the answer vocabulary's.
    """
    full_batches = len(questions) // batch_size
    for batch_number in range(full_batches):
        first = batch_number * batch_size
        last = first + batch_size
        padded_questions = pad_sentence_batch(questions[first:last], questions_vocab_to_int)
        padded_answers = pad_sentence_batch(answers[first:last], answers_vocab_to_int)
        yield np.array(padded_questions), np.array(padded_answers)

In [None]:
# Hold out the first 15% of the sorted pairs for validation; train on the rest.
# NOTE(review): the lists are sorted by question length, so the validation set
# consists of the shortest questions only - confirm this is intended.
train_valid_split = int(len(sorted_questions)*0.15)

valid_questions, train_questions = (sorted_questions[:train_valid_split],
                                    sorted_questions[train_valid_split:])
valid_answers, train_answers = (sorted_answers[:train_valid_split],
                                sorted_answers[train_valid_split:])

for question_subset in (train_questions, valid_questions):
    print(len(question_subset))

In [None]:
display_step = 20        # Check training loss after every 20 batches
stop_early = 0           # Consecutive validation checks without improvement
stop = 5                 # If the validation loss does not improve for 5 consecutive checks, stop training
# Validate roughly twice per epoch. Clamped to at least 1 so that a small
# training set cannot make `batch_i % validation_check` divide by zero.
validation_check = max(1, ((len(train_questions))//batch_size//2)-1)
total_train_loss = 0     # Record the training loss for each display step
summary_valid_loss = []     # Record the validation loss for saving improvements in the model

checkpoint= "./best_model.ckpt"   # creating the checkpoint file in the current directory

sess.run(tf.global_variables_initializer())

In [None]:
# Build the Saver once. Creating it inside the loop would add a new set of
# save/restore ops to the graph every time the validation loss improves.
saver = tf.train.Saver()

for epoch_i in range(1, epochs+1):
    for batch_i, (questions_batch, answers_batch) in enumerate(
            batch_data(train_questions, train_answers, batch_size)):
        start_time = time.time()
        _, loss = sess.run(
            [train_op, cost],
            {input_data: questions_batch, targets: answers_batch,  lr: learning_rate, 
             sequence_length: answers_batch.shape[1], keep_prob: keep_probability})

        total_train_loss += loss
        end_time = time.time()
        batch_time = end_time - start_time

        # Periodic report of the training loss accumulated since the last report.
        if batch_i % display_step == 0:
            print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                  .format(epoch_i, epochs, batch_i, 
                          len(train_questions) // batch_size, total_train_loss / display_step, 
                          batch_time*display_step))
            total_train_loss = 0

        if batch_i % validation_check == 0 and batch_i > 0:
            total_valid_loss = 0
            valid_batches = 0
            start_time = time.time()
            # Dropout is disabled (keep_prob 1) while validating.
            for valid_questions_batch, valid_answers_batch in batch_data(
                    valid_questions, valid_answers, batch_size):
                valid_loss = sess.run(
                    cost, {input_data: valid_questions_batch, targets: valid_answers_batch, lr: learning_rate, 
                           sequence_length: valid_answers_batch.shape[1], keep_prob: 1})
                total_valid_loss += valid_loss
                valid_batches += 1
            end_time = time.time()
            batch_time = end_time - start_time
            # Average over the batches actually evaluated; batch_data drops the
            # trailing partial batch, so len(valid_questions) / batch_size
            # would over-count the divisor.
            avg_valid_loss = total_valid_loss / max(valid_batches, 1)
            print('Valid Loss: {:>6.3f}, Seconds: {:>5.2f}'.format(avg_valid_loss, batch_time))
            
            # Reduce learning rate with a minimum value threshold
            learning_rate *= learning_rate_decay
            if learning_rate < min_learning_rate:
                learning_rate = min_learning_rate

            summary_valid_loss.append(avg_valid_loss)
            if avg_valid_loss <= min(summary_valid_loss):
                # Best validation loss so far: reset the patience counter and checkpoint.
                print('New Record!') 
                stop_early = 0
                saver.save(sess, checkpoint)

            else:
                print("No Improvement.")
                stop_early += 1
                if stop_early == stop:
                    break
    
    # The inner `break` only leaves the batch loop; leave the epoch loop too.
    if stop_early == stop:
        print("Stopping Training.")
        break

In [None]:
def question_to_seq(question, vocab_to_int):
    """Clean `question` and encode it as a list of ids.

    Words that are not in `vocab_to_int` are encoded with the '<UNK>' id.
    """
    tokens = clean_text(question).split()
    encoded = []
    for token in tokens:
        encoded.append(vocab_to_int.get(token, vocab_to_int['<UNK>']))
    return encoded

In [None]:
# Selecting a random question from the full lot
question_index = np.random.choice(len(short_questions))
input_question = short_questions[question_index]
print(input_question)

# <PAD> ids of both vocabularies, used for padding and for stripping padding
pad_q = questions_vocab_to_int["<PAD>"]
pad_a = answers_vocab_to_int["<PAD>"]

# Transforming the selected question into its list of word ids
input_question = question_to_seq(input_question, questions_vocab_to_int)

# Applying Padding to the question to reach the max_line_length
input_question = input_question + [pad_q] * (max_line_length - len(input_question))

# The graph is built for a fixed batch_size, so the question is placed in a
# full batch. The filler rows hold <PAD>; zeros would be the id of a real word.
batch_shell = np.full((batch_size, max_line_length), pad_q, dtype=np.int32)

# Setting the input question as the first question
batch_shell[0] = input_question    
    
# Passing input question to the model and keeping the first row of the output
answer_logits = sess.run(inference_logits, {input_data: batch_shell, 
                                            keep_prob: 1.0})[0]

# Most likely word id at every step, with padding removed (computed once)
question_ids = [i for i in input_question if i != pad_q]
answer_ids = [i for i in np.argmax(answer_logits, 1) if i != pad_a]

question_words = [questions_int_to_vocab[i] for i in question_ids]
# An id without a vocabulary entry is shown as <UNK> instead of raising KeyError
answer_words = [answers_int_to_vocab.get(i, '<UNK>') for i in answer_ids]

# Printing the final Answer output by the model 
print('Question')
print('  Word Ids:      {}'.format(question_ids))
print('  Input Words: {}'.format(question_words))
print('\n')
print(' '.join(question_words))

print('\nAnswer')
print('  Word Ids:      {}'.format(answer_ids))
print('  Response Words: {}'.format(answer_words))
print('\n')
print(' '.join(answer_words))