### Imports

In [None]:
import numpy as np
import json
import os
import yaml
import re
import time
import matplotlib.pyplot as plt
import pickle

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, Embedding, Lambda
from tensorflow.keras import Model
from tensorflow.python.ops import lookup_ops
from tensorflow.keras.preprocessing.text import Tokenizer

## Load and clean text

In [None]:
# Function to clean sentence
def clean_sent(text):
    """Normalize a raw sentence before tokenization.

    The text is stripped and lower-cased, every character that is not a
    word character, whitespace, '?' or '.' is removed, and a space is put
    in front of each '?' and '.' so they split off as separate tokens.
    """
    normalized = text.strip().lower()
    # Keep only word characters, whitespace, '?' and '.'
    normalized = re.sub(r'[^\w\s\?\.]', '', normalized)
    # Detach '?' and '.' from the preceding word
    for pattern, spaced in ((r'[\?]', ' ?'), (r'[\.]', ' .')):
        normalized = re.sub(pattern, spaced, normalized)

    return normalized

## Generate vocabulary

In [None]:
# Initialize a few tokens which will be used in the vocabulary
# Special tokens and the fixed vocabulary indices reserved for them
PAD, START, END, UNK = '<pad>', '<sos>', '<eos>', '<unk>'
PAD_IDX, START_IDX, END_IDX, UNK_IDX = 0, 1, 2, 3

In [None]:
# Builds the vocabulary and the embedding table from a GloVe text file.
# (Because the full set of pretrained word embeddings is large, only the
# first `max_words` entries of the file are loaded by default.)
def load_from_glove_vector(path_glove, max_words=20000):
    """Load a vocabulary and its embeddings from a GloVe text file.

    Each line of the file is expected to hold a word followed by its
    space-separated embedding values.

    Args:
        path_glove: Path to the GloVe vectors text file.
        max_words: Maximum number of words to load from the top of the
            file. Pass None to load the full vocabulary.

    Returns:
        A tuple (word_index, index_word, word_embedding, vocab_size):
        word -> index, index -> word, index -> embedding vector (ordered
        by index), and the vocabulary size including the four special
        tokens.

    Raises:
        ValueError: If no word vector could be read from the file.
    """
    print('Loading vocabulary...')
    # Reserve the first indices for the pad, start, end and unk tokens
    word_index = {PAD: PAD_IDX, START: START_IDX, END: END_IDX, UNK: UNK_IDX}
    index_word = {PAD_IDX: PAD, START_IDX: START, END_IDX: END, UNK_IDX: UNK}

    word_embedding = {}
    num_special = len(word_index)
    vocab_size = num_special
    word_emb_sum = 0
    with open(path_glove, 'r', encoding="utf8") as file:
        # Stream the file so a short file cannot cause an IndexError and the
        # whole file never has to be held in memory
        for line in file:
            if max_words is not None and vocab_size - num_special >= max_words:
                break
            # Split word and word embeddings
            split_line = line.rstrip('\n').split(' ')
            # Skip blank or malformed lines that carry no embedding values
            if len(split_line) < 2:
                continue
            word = split_line[0]
            vector = np.array([float(value) for value in split_line[1:]])
            # Save word -> word index and word index -> word
            word_index[word] = vocab_size
            index_word[vocab_size] = word
            # Store index -> word embedding
            word_embedding[vocab_size] = vector
            # Accumulate the sum of word embeddings (used for UNK below)
            word_emb_sum += vector
            # Increment vocabulary size
            vocab_size += 1

    num_words = vocab_size - num_special
    if num_words == 0:
        raise ValueError('No word vectors found in ' + str(path_glove))

    # Get the length of one word embedding
    emb_length = len(word_embedding[vocab_size - 1])

    # Add word embeddings for PAD START END UNK
    # PAD is all 0s, START is all 1s, END is all -1s and UNK is the average
    # of all normal word embeddings (words other than the special tokens)
    word_embedding[PAD_IDX] = np.zeros((emb_length,))
    word_embedding[START_IDX] = np.ones((emb_length,))
    word_embedding[END_IDX] = np.zeros((emb_length,)) - 1
    # Divide by the number of loaded words (vocab_size minus the 4 specials)
    word_embedding[UNK_IDX] = word_emb_sum / num_words

    print('Vocabulary loaded.')

    # Return the embeddings ordered by index so that they can be stacked
    # directly into an embedding matrix
    return word_index,\
        index_word,\
        dict(sorted(word_embedding.items(), key=lambda item: item[0])), \
        vocab_size

In [None]:
# Generate vocabulary using glove vectors.
# The call returns the word -> index map, the index -> word map, the
# index -> embedding map and the vocabulary size.
word_index, index_word, word_embedding, vocab_size = load_from_glove_vector('glove.6B.50d.txt')

In [None]:
# Show the vocabulary size returned by load_from_glove_vector
print('Vocab size:', vocab_size)

In [None]:
# Build a TensorFlow static hash table mapping every vocabulary word to its
# index, so sentences can be tokenized inside the tf.data pipeline. Words
# that are not in the table resolve to the UNK index.
word_index_table_init = tf.lookup.KeyValueTensorInitializer(
    keys=list(word_index),
    values=list(word_index.values()),
)

word_index_table = tf.lookup.StaticHashTable(
    initializer=word_index_table_init,
    default_value=UNK_IDX,
)

### Create dataset

In [None]:
def split_example(example):
    """Turn one comma-separated text line into an (input, output) word pair.

    The line is split on ',' into sentences and each sentence on whitespace
    into words. The input keeps at most 20 words; the output keeps at most
    20 words and is wrapped in START and END tokens.
    """
    # Separate the sentences, then break each sentence into words
    fields = tf.strings.split(example, sep=',')
    words = tf.strings.split(fields)

    source = words[0][:20]
    # START + (up to 20 output words) + END
    target = tf.concat([[START], words[1][:20], [END]], axis=0)
    return source, target

# Read the training file line by line and split every line into a word pair
dataset = tf.data.TextLineDataset('train_set.csv')
dataset = dataset.map(split_example)

In [None]:
# Pull a single element from the dataset to inspect the split sentence pair
print('Dataset example:')
next(iter(dataset.take(1)))

In [None]:
def tokenize(inp, out):
    """Replace the words of an (input, output) pair with vocabulary indices.

    Both sides are looked up in word_index_table.
    """
    return word_index_table.lookup(inp), word_index_table.lookup(out)

# Convert the words of every pair in the dataset to vocabulary indices
dataset = dataset.map(tokenize)

In [None]:
# Pull a single element again to check the index form of the pair
print('Dataset example after tokenization:')
next(iter(dataset.take(1)))

In [None]:
# Number of sentence pairs per training batch
BATCH_SIZE = 64
# Shuffle with a 1000-element buffer, then group into batches whose
# sequences are padded with the PAD index
dataset = dataset.shuffle(1000)
dataset = dataset.padded_batch(BATCH_SIZE, padding_values=word_index[PAD])

In [None]:
# Inspect one shuffled, padded training batch
next(iter(dataset.take(1)))

In [None]:
# Create validation and test datasets with the same split/tokenize steps
# as the training data, batched in groups of 250 padded examples
val_dataset = (
    tf.data.TextLineDataset('val_set.csv')
    .map(split_example)
    .map(tokenize)
    .padded_batch(250, padding_values=word_index[PAD])
)

test_dataset = (
    tf.data.TextLineDataset('test_set.csv')
    .map(split_example)
    .map(tokenize)
    .padded_batch(250, padding_values=word_index[PAD])
)

### Create Model

In [None]:
# Encoder, decoder and attention layers all use the same width
enc_units = dec_units = attention_units = 128

# Word embedding size, read from the vector stored at index 0 (PAD)
word_embedding_size = len(word_embedding[0])

In [None]:
"""
Encoder class which implements LSTM to encode the input sentence
"""
class Encoder(Model):
    """LSTM encoder that turns a batch of token ids into hidden activations."""

    def __init__(self, enc_units, embedding):
        """Store the embedding layer and create the encoder LSTM.

        Args:
            enc_units: Number of LSTM units.
            embedding: Embedding layer applied to the input token ids.
        """
        super().__init__()
        self.enc_units = enc_units

        # Embedding layer supplied by the caller
        self.embedding = embedding

        # LSTM returning the output of every time step plus its final states
        self.lstm = LSTM(
            enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden_state, cell_state):
        """Encode a batch of token ids.

        Args:
            x: Token ids, expected as (batch_size, batch_max_input_length).
            hidden_state: Initial hidden state (batch_size, enc_units).
            cell_state: Initial cell state (batch_size, enc_units).

        Returns:
            The per-step activations followed by the final hidden state and
            the final cell state of the LSTM.
        """
        # (batch_size, batch_max_input_length, embedding_size)
        embedded = self.embedding(x)

        initial_state = [hidden_state, cell_state]
        activations, enc_hidden, enc_cell = self.lstm(
            embedded, initial_state=initial_state)

        return activations, enc_hidden, enc_cell

    def initialize_state(self, batch_size):
        """Return all-zero (hidden, cell) states for the given batch size."""
        state_shape = (batch_size, self.enc_units)
        return tf.zeros(state_shape), tf.zeros(state_shape)

In [None]:
"""
Attention class to compute bahdanau's attention
"""
class Attention(Model):
    """Additive (Bahdanau-style) attention over the encoder activations.

    No padding mask is applied: every encoder position takes part in the
    softmax.
    """

    def __init__(self, attention_units):
        """Create the dense layers used to score encoder positions."""
        super().__init__()
        self.W1 = Dense(attention_units)
        self.W2 = Dense(attention_units)
        self.V = Dense(1)

    def call(self, activations, prev_hidden_state):
        """Compute the context vector and the attention weights.

        Args:
            activations: Encoder activations, expected as
                (batch_size, batch_max_input_length, enc_units).
            prev_hidden_state: Hidden state of the previous decoder step,
                expected as (batch_size, dec_units).

        Returns:
            context: Attention-weighted sum of the activations over axis 1.
            attention: Softmax weights over axis 1, with a trailing
                dimension of size 1 coming from the final Dense(1) layer.
        """
        # Add a time axis so the state broadcasts against every position
        query = tf.expand_dims(prev_hidden_state, axis=1)
        projected_query = self.W1(query)
        projected_activations = self.W2(activations)

        # One score per encoder position (last dimension is 1)
        score = self.V(tf.nn.tanh(projected_query + projected_activations))

        # Normalize the scores across the input positions
        attention = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder activations
        context = tf.reduce_sum(attention * activations, axis=1)

        return context, attention

In [None]:
"""
Decoder class which implements LSTM to generate output sentence
"""
class Decoder(Model):
    """Single-step LSTM decoder that attends over the encoder activations."""

    def __init__(self, dec_units, attention_units, embedding, vocab_size):
        """Create the attention, LSTM and classification layers.

        Args:
            dec_units: Number of LSTM units.
            attention_units: Width of the attention scoring layers.
            embedding: Embedding layer applied to the input token ids.
            vocab_size: Number of output classes of the final dense layer.
        """
        super().__init__()
        self.dec_units = dec_units

        # Attention over the encoder activations
        self.attention = Attention(attention_units)

        # Embedding layer supplied by the caller
        self.embedding = embedding

        # LSTM that also returns its hidden and cell states
        self.lstm = LSTM(
            dec_units,
            recurrent_initializer='glorot_uniform',
            return_state=True,
        )

        # Projects the hidden state onto the vocabulary
        self.dense = Dense(vocab_size)

    def call(self, x, activations, prev_hidden_state, prev_cell_state):
        """Run one decoding step.

        Args:
            x: Previous token ids, expected as (batch_size, 1).
            activations: Encoder activations, expected as
                (batch_size, batch_max_input_length, enc_units).
            prev_hidden_state: Previous hidden state (batch_size, dec_units).
            prev_cell_state: Previous cell state (batch_size, dec_units).

        Returns:
            The vocabulary scores for this step, the new hidden state, the
            new cell state and the attention weights.
        """
        context, attention = self.attention(activations, prev_hidden_state)

        # (batch_size, 1, word_embedding_size)
        embedded = self.embedding(x)

        # Put the context vector in front of the token embedding
        context_step = tf.expand_dims(context, axis=1)
        lstm_input = tf.concat([context_step, embedded], axis=-1)

        # Advance the LSTM by one step from the previous states
        previous_states = [prev_hidden_state, prev_cell_state]
        _, hidden_state, cell_state = self.lstm(
            lstm_input, initial_state=previous_states)

        # (batch_size, vocab_size)
        out = self.dense(hidden_state)

        return out, hidden_state, cell_state, attention

    def initialize_state(self, BATCH_SIZE):
        """Return all-zero (hidden, cell) states for the given batch size."""
        state_shape = (BATCH_SIZE, self.dec_units)
        return tf.zeros(state_shape), tf.zeros(state_shape)

In [None]:
'''
Class that implements Seq2Seq model using LSTM encoder and decoder
'''
class Seq2Seq(Model):
    """Seq2Seq model made of an LSTM encoder and an attention LSTM decoder.

    Encoder and decoder share one frozen embedding layer initialized from
    the pretrained word embeddings.
    """

    def __init__(self, enc_units, dec_units, attention_units,
                 word_embedding, vocab_size, word_embedding_size):
        """Build the shared embedding layer, the encoder and the decoder.

        Args:
            enc_units: Number of encoder LSTM units.
            dec_units: Number of decoder LSTM units.
            attention_units: Width of the attention scoring layers.
            word_embedding: Mapping index -> embedding vector. Its values are
                stacked in iteration order, so it must be ordered by index.
            vocab_size: Number of tokens in the vocabulary.
            word_embedding_size: Length of one embedding vector.
        """
        super(Seq2Seq, self).__init__()

        # Initialize embedding layer from the pretrained vectors (not trained)
        self.embedding = Embedding(
            vocab_size,
            word_embedding_size,
            weights=[np.array(list(word_embedding.values()))],
            trainable=False
        )
        # Initialize encoder
        self.encoder = Encoder(enc_units, self.embedding)

        # Initialize decoder
        self.decoder = Decoder(dec_units, attention_units, self.embedding, vocab_size)

        self.vocab_size = vocab_size

    def call(self, inp, targ):
        """Run the model with teacher forcing.

        Args:
            inp: Input batch (batch_size, batch_max_input_length).
            targ: Ground truth (batch_size, batch_max_output_length).

        Returns:
            Vocabulary scores of shape
            (batch_size, batch_max_output_length - 1, vocab_size), where
            position t holds the prediction made after feeding targ[:, t].
        """
        # Initialize encoder states
        enc_hidden, enc_cell = self.encoder.initialize_state(inp.shape[0])

        # Get encoder output
        activations, enc_hidden, enc_cell = self.encoder(inp, enc_hidden, enc_cell)

        # The decoder starts from the final encoder states, which requires
        # enc_units == dec_units. To lift that requirement, start from
        # self.decoder.initialize_state(targ.shape[0]) instead.
        dec_hidden, dec_cell = enc_hidden, enc_cell

        preds = []

        for t in range(0, targ.shape[1] - 1):
            # Feed the ground-truth token of step t
            dec_input = tf.expand_dims(targ[:, t], axis=1)

            # Get decoder output
            pred, dec_hidden, dec_cell, _ = self.decoder(dec_input, activations, dec_hidden, dec_cell)

            preds.append(pred)  # each entry is (batch_size, vocab_size)

        if not preds:
            # No decoding step to run: return an empty time axis
            return tf.zeros((inp.shape[0], 0, self.vocab_size))

        # Stack along a new time axis. A plain reshape of the
        # (steps, batch_size, vocab_size) tensor would keep the memory order
        # and mix up examples and time steps instead of swapping the axes.
        return tf.stack(preds, axis=1)

    def predict(self, inp):
        """Greedily decode a reply for every sentence of the batch.

        NOTE: this overrides tf.keras.Model.predict with a different
        signature and return type.

        Args:
            inp: Input batch (batch_size, batch_max_input_length).

        Returns:
            A list with one list of token indices per input sentence. Each
            starts with the START index and stops after the END index or
            after 22 generated tokens.
        """
        # Upper bound on the number of generated tokens per sentence
        max_sent_len = 22

        outputs = []

        for sentence in inp:  # (batch_max_input_length,)
            # Initialize encoder states
            enc_hidden, enc_cell = self.encoder.initialize_state(1)  # (1, enc_units)
            # Pass the sentence through encoder
            activations, enc_hidden, enc_cell = \
                self.encoder(tf.expand_dims(sentence, axis=0), enc_hidden, enc_cell)

            # Initialize decoder states from the encoder states
            dec_hidden, dec_cell = enc_hidden, enc_cell

            dec_input = tf.expand_dims([word_index[START]], axis=0)
            sent = [word_index[START]]
            for _ in range(max_sent_len):
                pred, dec_hidden, dec_cell, _ = self.decoder(dec_input, activations, dec_hidden, dec_cell)

                # Get best word
                index = tf.argmax(pred[0]).numpy()

                sent.append(index)

                # End output when EOS token is reached
                if index == word_index[END]:
                    break

                # The chosen token becomes the next decoder input
                dec_input = tf.expand_dims([index], axis=0)

            outputs.append(sent)

        return outputs

In [None]:
# Build the model from the layer sizes, the loaded embeddings and the vocabulary size
seq2seq = Seq2Seq(enc_units, dec_units, attention_units, word_embedding, vocab_size, word_embedding_size)

In [None]:
# Execute this line only if you have saved weights
# (the training loop saves them to this same path after each epoch)
seq2seq.load_weights('model_weights/chatbot_weights')

### Initialize optimizer and loss function

In [None]:
# Adam optimizer used by the training step
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
# L2 regularization strength read by regularized_loss_function
# Greater value of lambda -> greater regularization
lamda = 0.001
# Cross-entropy on raw logits with no reduction, so loss_function can mask
# individual positions before reducing
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(gt, pred):
    """Cross-entropy averaged over the non-padding target tokens.

    Args:
        gt: Ground truth words (batch_size, batch_max_output_length - 1).
        pred: Predicted scores
            (batch_size, batch_max_output_length - 1, vocab_size).

    Returns:
        Scalar tensor: the summed loss of all non-PAD positions divided by
        the number of non-PAD positions (0 when every position is PAD).
    """
    # Loss of every position (batch_size, batch_max_output_length - 1)
    per_token_loss = loss_object(gt, pred)

    # 1 for real tokens, 0 for PAD tokens
    # [3243, 0, 1234, 0, 0, 0] -> [1, 0, 1, 0, 0, 0]
    mask = tf.math.not_equal(gt, word_index[PAD])
    mask = tf.cast(mask, dtype=per_token_loss.dtype)

    # Zero out the loss of padded positions
    masked_loss = per_token_loss * mask

    # Normalize by the number of real tokens rather than by all positions,
    # so the value does not shrink as a batch contains more padding
    return tf.math.divide_no_nan(tf.reduce_sum(masked_loss), tf.reduce_sum(mask))

# Use this function in case your model is overfitting, change lamda to control how
# much regularization you want
def regularized_loss_function(gt, pred, weights):
    """Loss of loss_function plus an L2 penalty on the given weights.

    Args:
        gt: Ground truth words (batch_size, batch_max_output_length - 1).
        pred: Predicted scores
            (batch_size, batch_max_output_length - 1, vocab_size).
        weights: Weights to penalize: a single tensor or a (nested) list of
            tensors/variables such as seq2seq.trainable_variables.

    Returns:
        Scalar tensor: loss + (lamda / 2) * sum(w ** 2) / batch_size.
    """
    # Calculate simple loss across the batch
    loss = loss_function(gt, pred)  # No dims ()

    # A list of tensors does not support `** 2`, so add up the squared norm
    # of every tensor in the (possibly nested) structure
    flat_weights = tf.nest.flatten(weights)
    squared_norm = sum(tf.reduce_sum(tf.square(w)) for w in flat_weights)

    # Use l2 loss coefficient
    l2_loss_coeff = (lamda / 2) * squared_norm  # No dims ()
    # Take the average of l2_loss_coeff across the batch
    l2_loss_coeff = l2_loss_coeff / gt.shape[0]

    # Add l2_loss_coeff to the original loss
    loss += l2_loss_coeff

    return loss

### Train Model

In [None]:
@tf.function
def train(inp, targ):
    """Run one optimization step on a batch and return its loss.

    Args:
        inp: Input batch (batch_size, batch_max_input_length).
        targ: Ground truth (batch_size, batch_max_output_length).
    """
    # Record the forward pass so the tape can differentiate the loss with
    # respect to the model weights
    with tf.GradientTape() as tape:
        preds = seq2seq(inp, targ)

        # Predictions are scored against the target shifted by one token
        batch_loss = loss_function(targ[:, 1:], preds)

    trainable = seq2seq.trainable_variables

    # Gradient of the loss with respect to every trainable weight
    grads = tape.gradient(batch_loss, trainable)

    # Let the optimizer update the weights
    optimizer.apply_gradients(zip(grads, trainable))

    return batch_loss

In [None]:
# Loss histories; the training loop appends one value to each per epoch
train_loss = []
val_loss = []

In [None]:
# Only run this cell if you have saved loss histories
# (written by the training loop below)
# NOTE(review): pickle.load executes arbitrary code from the file - only
# load files that this notebook wrote itself
with open('model_weights/train_loss', mode='rb') as f:
    train_loss = pickle.load(f)
with open('model_weights/val_loss', mode='rb') as f:
    val_loss = pickle.load(f)

In [None]:
# Number of passes over the training data
EPOCHS = 1

# Training loop
for epoch in range(EPOCHS):
    start = time.time()

    # Sum of the batch losses and the validation loss of this epoch
    TRAIN_LOSS = 0
    VAL_LOSS = 0
    
    num_batches = 0

    # -1 will get all batches, if you want to restrict the
    # number of batches, just add a count. 
    # e.g. 1000 will give 1000 batches of size 64
    # NOTE: as written, take(1) trains on a single batch per epoch
    for (batch, (inp, targ)) in enumerate(dataset.take(1)):
        batch_loss = train(inp, targ)

        TRAIN_LOSS += batch_loss

        # Report progress every 200 batches
        if batch % 200 == 0:
            template = 'Epoch {} Batch {} Loss {:.4f}'
            print(template.format(
                epoch + 1,
                batch,
                batch_loss.numpy()
            ))

        num_batches += 1

    # Run the model on the first validation batch only (up to 250 examples)
    for val_inp, val_targ in val_dataset.take(1):
        val_preds = seq2seq(val_inp, val_targ)
    
    # Calculate validation loss
    VAL_LOSS = loss_function(val_targ[:, 1:], val_preds)
    
    # Record the mean training loss per batch and the validation loss
    train_loss.append(TRAIN_LOSS.numpy()  / num_batches)
    val_loss.append(VAL_LOSS.numpy())

    # Save losses and model weights after each epoch    
    seq2seq.save_weights('model_weights/chatbot_weights')

    with open('model_weights/train_loss', mode='wb') as f:
        pickle.dump(train_loss, f)
    with open('model_weights/val_loss', mode='wb') as f:
        pickle.dump(val_loss, f)

    print('\nEpoch {} Training Loss {:.4f}'\
        .format(epoch + 1, TRAIN_LOSS.numpy() / num_batches))

    print('Epoch {} Validation Loss {:.4f}'\
        .format(epoch + 1, VAL_LOSS.numpy()))

    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
# Draw the training and validation loss histories on one set of axes
plt.plot(train_loss, label='Train')
plt.plot(val_loss, label='Validation')
plt.xlabel('EPOCHS')
plt.ylabel('LOSS')
plt.legend()

In [None]:
# Execute this line only if you have saved weights
# (restores the weights saved by the training loop before evaluation)
seq2seq.load_weights('model_weights/chatbot_weights')

In [None]:
# Calculate test set loss
# Only the first test batch (up to 250 examples) is evaluated
for test_inp, test_targ in test_dataset.take(1):
    test_preds = seq2seq(test_inp, test_targ)

# Calculate loss against the target shifted by one token
test_loss = loss_function(test_targ[:, 1:], test_preds)

print('Test loss: ', test_loss.numpy())

In [None]:
def get_bot_output(sentence):
    """Generate the bot's reply to a raw user sentence.

    The sentence is cleaned with clean_sent, split on whitespace, mapped to
    vocabulary indices (unknown words become UNK) and decoded by
    seq2seq.predict. START, END and UNK tokens are left out of the reply.

    Args:
        sentence: Raw text typed by the user.

    Returns:
        The reply as a string; '' when the cleaned sentence has no words.
    """
    # Clean and tokenize sentence
    words = clean_sent(sentence).split()
    # Nothing to encode: do not feed an empty sequence to the model
    if not words:
        return ''

    unk_idx = word_index[UNK]
    token_ids = np.array([[word_index.get(word, unk_idx) for word in words]])

    predicted = seq2seq.predict(token_ids)[0]

    # Tokens that never appear in the reply
    skipped = {word_index[START], word_index[END], unk_idx}
    # '?' and '.' are attached to the previous word without a space;
    # .get avoids a KeyError when the vocabulary lacks one of them
    attached = {word_index.get('?'), word_index.get('.')}

    pieces = []
    for token in predicted:
        token = int(token)
        if token in skipped:
            continue
        word = index_word[token]
        pieces.append(word if token in attached else ' ' + word)

    # Drop the space that would otherwise precede the first word
    return ''.join(pieces).lstrip()

In [None]:
def converse():
    """Chat with the bot on the console until the user types 'end'."""
    # Keep prompting until the sentinel input 'end' is entered
    for user_input in iter(lambda: input('Enter sentence: '), 'end'):
        bot_output = get_bot_output(user_input)
        print('Bot:', bot_output, '\n')

In [None]:
# Start the interactive chat; type 'end' to stop
converse()