<a href="https://colab.research.google.com/github/Utkichaps/HAL-Mimic-2.0/blob/main/Hal_Mimic_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hal Mimic 2.0

This is an improvement over my existing Hal Mimic chatbot. This uses an Encoder-Decoder model along with an attention layer for better context responses compared to the previous chatbot which used two bi-directional LSTM layers.

The model used here was inspired by the Neural Machine Translation model from the DeepLearning.AI attention models course and has been adapted to work as a chatbot.

In [None]:
from datetime import datetime
import random

import random
import numpy as np

#Trax libs:
!pip install trax sentencepiece t5
import trax
import sentencepiece as spm
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training

## Data pre-processing
In this step we upload the WhatsApp .txt chat exports to the runtime environment and convert them into the data needed for training and evaluation.

In [None]:
#Removes emojis 
def deEmojify(inputString):
    """Return `inputString` with every non-ASCII character removed.

    Emojis are dropped, and so is any other character outside the ASCII
    range (accented letters, typographic quotes, ...).
    """
    # 'ignore' silently discards characters that cannot be encoded as ASCII.
    ascii_bytes = inputString.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

In [None]:
#Formats the whatsapp data in proper format. Similar function for different social media pages can be added.
def format_data(file):
  """Parse a WhatsApp chat export into a list of [sender, message] pairs.

  A line is kept only when its first 8 characters parse as a date in
  `%m/%d/%y` format, it contains a "-" and the text after the first "-"
  contains a ":". The text before that ":" is taken as the sender and the
  text after it as the message; both are stripped of surrounding whitespace.
  Every other line (continuation lines of multi-line messages, notices
  without a sender) is skipped.

  Args:
    file: path of the exported chat text file.

  Returns:
    list of two-element lists `[sender, message]`, in file order.
  """
  # The context manager closes the file even if reading fails.
  with open(file, encoding="utf-8") as chat_file:
    raw_text = chat_file.read()

  sentences = []
  for item in deEmojify(raw_text).splitlines():
    try:
      # Only used as a validity check: lines not starting with a date are skipped.
      datetime.strptime(item[0:8], '%m/%d/%y')
    except ValueError:
      continue
    # Drop everything up to the first "-" (the timestamp part of the line).
    _, dash, remainder = item.partition("-")
    sender, colon, message = remainder.partition(":")
    if not dash or not colon:
      # Dated line without a "sender: message" part; indexing it would fail.
      continue
    sentences.append([sender.strip(), message.strip()])
  return sentences

In this step, remember to replace \<Name\> with your own name wherever it appears. Enter it exactly as it appears in the WhatsApp chat file.

In [None]:
#Splits data into the messages received and sent
def split_data(data, name='<Name>'):
  """Split parsed chat messages into received sentences and sent labels.

  Messages whose text is exactly '<Media omitted>' are ignored. Consecutive
  messages from the same side are merged into one string joined by a space.
  If one list ends up longer than the other, the shorter one is padded with
  the word 'okay' so both have the same length.

  NOTE(review): when the chat opens with a message from `name`, labels[0]
  precedes sentences[0] in the conversation - confirm that pairing the two
  by index is what is wanted in that case.

  Args:
    data: list of `[sender, message]` pairs; it is not modified.
    name: sender name, as written in the chat file, whose messages become
      the labels. Defaults to the '<Name>' placeholder.

  Returns:
    (sentences, labels): messages from everyone else, and messages from
    `name`.
  """
  # Filter into a new list so the caller's data is left untouched.
  messages = [entry for entry in data if entry[1] != '<Media omitted>']

  sentences = []
  labels = []
  previous_is_own = None  # None until the first message has been processed
  for entry in messages:
    is_own = entry[0] == name
    bucket = labels if is_own else sentences
    if previous_is_own == is_own:
      # Same side as the previous message: extend the current turn.
      bucket[-1] += " " + entry[1]
    else:
      bucket.append(entry[1])
    previous_is_own = is_own

  if len(sentences) > len(labels):
    labels.append('okay')     # Arbitrary word to keep data of the same size
  elif len(sentences) < len(labels):
    sentences.append('okay')
  return sentences, labels

Put the file names of your WhatsApp chat text files here.

In [None]:
#Add the WhatsApp text files here (as many chats as you need)
# One parsed chat (as returned by format_data) per exported file, in order.
f_sentences = [
    format_data(chat_file)
    for chat_file in ('<file1>', '<file2>', '<file3>')
]

In [None]:
# Merge all chats into one dataset, keeping only the first pair seen for each
# distinct label.
final_sentences = []
final_labels = []
mydict = {}  # labels already added to the dataset
for chat in f_sentences:
  received, sent = split_data(chat)
  print(len(received),len(sent))
  # split_data returns two lists of equal length, so they can be walked in lockstep.
  for sentence, label in zip(received, sent):
    if not mydict.get(label,False):
      mydict[label] = True
      final_sentences.append(sentence)
      final_labels.append(label)
print(final_sentences)
print(final_labels)
print(len(final_sentences))
print(len(final_labels))

In [None]:
# Removes links in the dialogue
def removelinks(sent):
  """Return `sent` without the space-separated words that contain a web link.

  A word is dropped when it contains 'https://' or 'http://'. The remaining
  words are joined back with single spaces, so text without links is
  returned unchanged.

  Args:
    sent: the dialogue line to clean.

  Returns:
    str: the line with link words removed.
  """
  kept_words = [
      word for word in sent.split(" ")
      if 'https://' not in word and 'http://' not in word
  ]
  return " ".join(kept_words)

In [None]:
# Creates a Corpus.txt that contains all the words to develop the vocabulary
# Write every sentence and then every label to Corpus.txt, one per line, with
# link words removed. The context manager closes the file even if a write fails.
# NOTE(review): the file is opened in append mode, so re-running this cell adds
# the same lines to Corpus.txt again - confirm that is intended.
with open("Corpus.txt","a") as file1:
  for l in final_sentences + final_labels:
    # removelinks returns text without links unchanged, so no pre-check is needed.
    file1.write(removelinks(l) + '\n')

In [None]:
#See a sample interaction:
# randrange excludes its upper bound; randint(0, len(...)) could return
# len(final_labels) and make the lookups below raise IndexError.
index = random.randrange(len(final_labels))
print("You:",final_labels[index])
print("Them:",final_sentences[index])

## Processing Dialogues

Now that we have the dialogues as _sentences_ and _labels_ we need to convert these data into the input for our model. We will do this by tokenizing our sentences and putting them into batches to train our model

In [1]:
# Creating a generator for our model
# Switching sentences and labels as sentences is the input and label is the target
def stream(labels, sentences):
  """Yield one (sentence, label) tuple for each entry of `labels`.

  The sentence comes first because it is the model input and the label is
  the target.
  """
  for position, label in enumerate(labels):
    yield (sentences[position], label)

In [None]:
# Splitting our data into train and eval
# The first 90% of the pairs are used for training, the rest for evaluation.
cutoff = int(len(final_labels)*0.9)
train_labels, eval_labels = final_labels[:cutoff], final_labels[cutoff:]
train_sentences, eval_sentences = final_sentences[:cutoff], final_sentences[cutoff:]
print("Length of training data:",len(train_labels))
print("Length of eval data:",len(eval_labels))

In [None]:
# Creating the train and eval streams
# Each stream is a generator of (sentence, label) pairs.
# NOTE(review): a generator is exhausted after one pass over its data - confirm
# that training and evaluation never need more than one pass.
train_stream = stream(train_labels, train_sentences)
eval_stream = stream(eval_labels, eval_sentences)

In [None]:
# Building the vocabulary file from Corpus.txt
spm.SentencePieceTrainer.train(
    input='Corpus.txt',
    model_type='bpe',
    model_prefix='wa',  # the trained model is loaded later as 'wa.model'
    vocab_size=30551,  #Vocab size here is one I have assigned
)

In [None]:
# vocabulary filename
VOCAB_FILE = 'wa.model'
# vocabulary file directory
VOCAB_DIR = '.'
VOCAB_TYPE = 'sentencepiece'

# Tokenize the train and eval streams with the same vocabulary settings.
tokenized_train_stream = trax.data.Tokenize(
    vocab_file=VOCAB_FILE,
    vocab_dir=VOCAB_DIR,
    vocab_type=VOCAB_TYPE,
)(train_stream)
tokenized_eval_stream = trax.data.Tokenize(
    vocab_file=VOCAB_FILE,
    vocab_dir=VOCAB_DIR,
    vocab_type=VOCAB_TYPE,
)(eval_stream)

Here we are using '1' as the EOS token for every dialogue

In [None]:
# Append EOS at the end of each sentence.

# Integer assigned as end-of-sentence (EOS)
EOS = 1

def append_eos(stream):
    """Yield each (inputs, targets) pair of `stream` as two numpy arrays,
    each with the EOS id appended at the end."""
    def _with_eos(tokens):
        return np.array(list(tokens) + [EOS])

    for source_tokens, target_tokens in stream:
        yield _with_eos(source_tokens), _with_eos(target_tokens)

# append EOS to the train data
# Rebind both stream names to generators that yield the same pairs with EOS appended.
tokenized_train_stream = append_eos(tokenized_train_stream)

# append EOS to the eval data
tokenized_eval_stream = append_eos(tokenized_eval_stream)

Here we filter long sentences so that we don't run out of memory.

In [None]:
# length_keys=[0, 1] means we filter both input and target sentences, so the max tokens will be 256 for training and 512 for eval
# Training pairs are filtered with max_length=256, eval pairs with max_length=512.
filtered_train_stream = trax.data.FilterByLength(
    max_length=256, length_keys=[0, 1])(tokenized_train_stream)

filtered_eval_stream = trax.data.FilterByLength(
    max_length=512, length_keys=[0, 1])(tokenized_eval_stream)    

# print a sample input-target pair of tokenized sentences
# NOTE(review): next() presumably consumes this pair from the training stream,
# so it would not be seen again by the batching step - confirm this is acceptable.
train_input, train_target = next(filtered_train_stream)
print('Single tokenized example input:', train_input)
print('Single tokenized example target:', train_target)

These are helper functions to tokenize and detokenize the text

In [None]:
#Encodes string into array of integers
def tokenize(input_str, vocab_file=None, vocab_dir=None, vocab_type=None):
    """Encode a string into a batch of token ids ending with EOS.

    Args:
        input_str (str): text to encode.
        vocab_file (str): filename of the vocabulary.
        vocab_dir (str): directory containing the vocabulary file.
        vocab_type (str): vocabulary type passed on to trax.

    Returns:
        np.ndarray of shape (1, n_tokens + 1): the token ids followed by the
        EOS id (1), with a leading batch axis.
    """
    EOS = 1

    # trax.data.tokenize works on streams, so wrap the single string in an
    # iterator and pull the one tokenized result back out.
    token_stream = trax.data.tokenize(iter([input_str]),
                                      vocab_file=vocab_file, vocab_dir=vocab_dir, vocab_type=vocab_type)
    token_ids = list(next(token_stream))
    token_ids.append(EOS)

    # Reshape to (1, n) to add the batch dimension in front.
    return np.reshape(np.array(token_ids), [1, -1])


def detokenize(integers, vocab_file=None, vocab_dir=None, vocab_type=None):
    """Decode token ids back into a string, stopping at the first EOS.

    Args:
        integers: token ids as a list or array; axes of size 1 (such as a
            batch axis) are removed first.
        vocab_file (str): filename of the vocabulary.
        vocab_dir (str): directory containing the vocabulary file.
        vocab_type (str): vocabulary type passed on to trax.

    Returns:
        The text decoded by trax.data.detokenize from the ids that precede
        the first EOS id (1), or from all ids when there is no EOS.
    """
    EOS = 1

    # np.squeeze turns a single-token input (e.g. just [EOS]) into a 0-d array,
    # which cannot be iterated; atleast_1d restores a 1-D sequence in that case.
    integers = np.atleast_1d(np.squeeze(integers)).tolist()

    # Removing the EOS (and anything after it) to get the original tokens
    if EOS in integers:
        integers = integers[:integers.index(EOS)]
    return trax.data.detokenize(integers, vocab_file=vocab_file, vocab_dir=vocab_dir, vocab_type=vocab_type)

In [None]:
# Example sentences
# Decode the sample pair printed earlier back into text as a sanity check.
print('Single detokenized example input:', detokenize(train_input, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR,vocab_type=VOCAB_TYPE))
print('Single detokenized example target:', detokenize(train_target, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR,vocab_type=VOCAB_TYPE))
print()

In [None]:
# Bucketing to create streams of batches.

# Buckets are defined in terms of boundaries and batch sizes.
# Batch_sizes[i] determines the batch size for items with length < boundaries[i]
# So below, we'll take a batch of 256 sentences of length < 8, 128 if length is
# between 8 and 16, and so on -- and only 2 if length is over 512.
# batch_sizes has one more entry than boundaries: the last one (2) is for
# items longer than the last boundary.
boundaries =  [8,   16,  32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16,    8,   4,  2]

# Create the generators.
train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_eval_stream)

# Add masking for the padding (0s).
train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream)

## Exploring the data

Now that we have processed the input, we can see some examples

In [None]:
# Each batch is a triple: inputs, targets and the loss-weight mask added above.
input_batch, target_batch, mask_batch = next(train_batch_stream)

# let's see the data type of a batch
print("input_batch data type: ", type(input_batch))
print("target_batch data type: ", type(target_batch))

# let's see the shape of this particular batch (batch length, sentence length)
print("input_batch shape: ", input_batch.shape)
print("target_batch shape: ", target_batch.shape)

In [None]:
# pick a random index less than the batch size.
# randrange excludes its upper bound, so index is always a valid row of the batch.
index = random.randrange(len(input_batch))

# use the index to grab an entry from the input and target batch
print('Input sentence: \n', detokenize(input_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR, vocab_type=VOCAB_TYPE), '\n')
print('Tokenized Input sentence: \n ', input_batch[index], '\n')
print('Target sentence: \n', detokenize(target_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR, vocab_type=VOCAB_TYPE), '\n')
print('Tokenized target sentence: \n', target_batch[index], '\n')

## The Model

The model uses an encoder-decoder architecture of LSTMs with an attention layer in between them.

In [None]:
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    """Build the encoder that runs on the input sentence.

    Its activations are used as the keys and values for attention.

    Args:
        input_vocab_size: int: vocab size of the input
        d_model: int: depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
    Returns:
        tl.Serial: an embedding layer followed by the LSTM stack
    """
    # Converts token ids to vectors of size d_model.
    embedding = tl.Embedding(input_vocab_size, d_model)

    # Stack of n_encoder_layers LSTM layers fed by the embeddings.
    lstm_stack = [tl.LSTM(d_model) for _ in range(n_encoder_layers)]

    return tl.Serial(embedding, lstm_stack)

In [None]:
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """ Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.
    
    Args:
        mode: str: 'train' or 'eval'; forwarded to the shift-right layer
        target_vocab_size: int: vocab size of the target
        d_model: int:  depth of embedding (n_units in the LSTM cell)
    Returns:
        tl.Serial: The pre-attention decoder
    """
    
    # create a serial network
    pre_attention_decoder = tl.Serial(
                
        # shift right to insert start-of-sentence token and implement
        # teacher forcing during training; the layer is built with the
        # requested mode instead of always using its default
        tl.ShiftRight(mode=mode),

        # run an embedding layer to convert tokens to vectors
        tl.Embedding(target_vocab_size, d_model),

        # feed to an LSTM layer
        tl.LSTM(d_model)        
    )
    
    return pre_attention_decoder

Prepping the input

In [None]:
def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    """Build the queries, keys, values and padding mask for attention.

    Args:
        encoder_activations fastnp.array(batch_size, padded_input_length, d_model): output from the input encoder
        decoder_activations fastnp.array(batch_size, padded_target_length, d_model): output from the pre-attention decoder
        inputs fastnp.array(batch_size, padded_input_length): padded input tokens

    Returns:
        queries, keys, values and mask for attention. Queries are the decoder
        activations, keys and values are the encoder activations, and the
        mask has shape (batch_size, 1, decoder length, encoder length).
    """
    # Non-zero where the input holds a real token, zero where it is padding (id 0).
    is_token = fastnp.logical_not(fastnp.equal(inputs, 0))

    # Insert singleton axes for attention heads and decoder length.
    batch_size, encoder_length = is_token.shape[0], is_token.shape[1]
    is_token = fastnp.reshape(is_token, (batch_size, 1, 1, encoder_length))

    # Adding zeros broadcasts the mask along the decoder-length axis.
    decoder_length = decoder_activations.shape[1]
    mask = is_token + fastnp.zeros((1, 1, decoder_length, 1))

    return decoder_activations, encoder_activations, encoder_activations, mask

Implementing the model

In [None]:
def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):
    """Build the LSTM sequence-to-sequence model with attention.

    The model takes a pair (input tokens, target tokens): here a tokenized
    received message and the tokenized reply to it.

    Args:
    input_vocab_size: int: vocab size of the input
    target_vocab_size: int: vocab size of the target
    d_model: int:  depth of embedding (n_units in the LSTM cell)
    n_encoder_layers: int: number of LSTM layers in the encoder
    n_decoder_layers: int: number of LSTM layers in the decoder after attention
    n_attention_heads: int: number of attention heads
    attention_dropout: float, dropout for the attention layer
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

    Returns:
    A LSTM sequence-to-sequence model with attention.
    """
    # Sub-networks that run before attention.
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # Attention over the encoder activations, queried by the decoder activations.
    attention = tl.AttentionQKV(
        d_model,
        n_heads=n_attention_heads,
        dropout=attention_dropout,
        mode=mode,
    )

    # LSTM layers that run after attention.
    decoder_lstms = [tl.LSTM(d_model) for _ in range(n_decoder_layers)]

    return tl.Serial(
        # Duplicate input and target tokens: they are needed again later.
        tl.Select([0, 1, 0, 1]),
        # Encoder on the input, pre-attention decoder on the target.
        tl.Parallel(input_encoder, pre_attention_decoder),
        # Produce queries, keys, values and mask.
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),
        # Attention with a residual connection.
        tl.Residual(attention),
        # Drop the attention mask.
        tl.Select([0, 2]),
        # Rest of the RNN decoder.
        decoder_lstms,
        # Project to the target vocabulary size and take log-probabilities.
        tl.Dense(target_vocab_size),
        tl.LogSoftmax(),
    )

In [None]:
# Build the model with its default arguments and print its layer structure.
model = NMTAttn()
print(model)

Training

In [None]:
# Training task: batched training data, cross-entropy loss, Adam optimizer and
# a warmup-then-decay learning-rate schedule.
train_task = training.TrainTask(        
        
    labeled_data= train_batch_stream,
        
    loss_layer= tl.CrossEntropyLoss(),
    
    # Adam optimizer with learning rate of 0.01
    optimizer= trax.optimizers.adam.Adam(0.01),
        
    # 1000 warmup steps with a max value of 0.01
    lr_schedule= trax.lr.warmup_and_rsqrt_decay(1000,0.01),
    
    # Checkpoint every 10 steps
    n_steps_per_checkpoint= 10,      
)

In [None]:
# Evaluation task: reports cross-entropy loss and accuracy on the eval batches.
eval_task = training.EvalTask(
        
    labeled_data=eval_batch_stream,
        
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)



In [None]:
# define the output directory
output_dir = 'output_dir/'

# remove old model if it exists. restarts training.
!rm -f ~/output_dir/model.pkl.gz  

# define the training loop
training_loop = training.Loop(NMTAttn(mode='train'),
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)

The training depends a lot on the dataset that is used. Of course a larger dataset means better training. If your dataset doesn't have a lot of conversations then it could be inaccurate

In [None]:
# Run the training loop; the argument is the number of steps to run.
training_loop.run(50) #Experiment with 20, 50, 100

## Decoding the output

Here we create some helper functions to help decode the output from the model and format the data as a chatbot. We use a greedy approach to predict the next words given a sentence

In [None]:
# instantiate the model we built in eval mode
# Rebuild the model in eval mode for decoding.
model = NMTAttn(mode='eval')

# Load only the weights from the checkpoint file, then wrap the model with
# tl.Accelerate.
model.init_from_file("output_dir/model.pkl.gz", weights_only=True)
model = tl.Accelerate(model)

In [None]:
def next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature):
    """Returns the index of the next token.

    Args:
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        input_tokens (np.ndarray 1 x n_tokens): tokenized representation of the input sentence
        cur_output_tokens (list): tokens generated so far for the reply
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)

    Returns:
        int: index of the next token in the reply
        float: log probability of the next symbol
    """
    token_length = len(cur_output_tokens)

    # Pad with zeros to the smallest power of two that is >= token_length + 1,
    # computed with integers only. There is always at least one padding slot,
    # and the model sees few distinct input lengths while the reply grows.
    padded_length = 1 << token_length.bit_length()
    padded = cur_output_tokens + [0] * (padded_length - token_length)

    # The model expects a batch axis in front: shape (1, padded_length).
    padded_with_batch = np.expand_dims(np.array(padded), axis=0)

    # The model returns the log-probabilities and the target tokens.
    output, _ = NMTAttn((input_tokens, padded_with_batch))

    # The decoder shifts its targets right, so position `token_length` holds
    # the prediction that follows the tokens generated so far. Index it
    # explicitly rather than taking the last position, which is padding
    # whenever more than one padding slot was added.
    log_probs = output[0, token_length, :]

    # Sample the next symbol from the log-probabilities.
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))

    return symbol, float(log_probs[symbol])



In [None]:
def sampling_decode(input_sentence, NMTAttn = None, temperature=0.0, vocab_file=None, vocab_dir=None, vocab_type=None, max_length=256):
    """Generates the model's reply to a sentence, one token at a time.

    Args:
        input_sentence (str): sentence to reply to.
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
        vocab_type (str): vocabulary type passed on to the tokenizer
        max_length (int): maximum number of tokens to generate. Decoding stops
            there even if no EOS was produced, so the loop always terminates.

    Returns:
        tuple: (list, float, str)
            list of int: generated tokens (including EOS when it was produced)
            float: log probability of the last generated token
            str: the detokenized reply
    """
    # encode the input sentence
    input_tokens = tokenize(input_sentence, vocab_file, vocab_dir, vocab_type=vocab_type)

    # initialize the list of output tokens
    cur_output_tokens = []

    # current output token; starts at 0, which differs from EOS
    cur_output = 0

    # value returned when no token is generated at all (max_length <= 0)
    log_prob = 0.0

    EOS = 1
    print("Input tokens to sampling decode",input_tokens)
    # generate until the model emits EOS or the length limit is reached
    while cur_output != EOS and len(cur_output_tokens) < max_length:

        # get the next token and its log probability
        cur_output, log_prob = next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature)

        # append the current output token to the list of output tokens
        cur_output_tokens.append(cur_output)
    print("Output tokens to sampling decode",cur_output_tokens)
    # detokenize the output tokens
    sentence = detokenize(cur_output_tokens, vocab_file, vocab_dir, vocab_type=vocab_type)

    return cur_output_tokens, log_prob, sentence


In [None]:
def greedy_decode_test(sentence, NMTAttn=None, vocab_file=None, vocab_dir=None, vocab_type=None):
    """Run an interactive chat loop with the model.

    Prints `sentence`, then repeatedly prints the model's reply to the
    current sentence and reads the next one from standard input. The loop
    ends when the current sentence is exactly "BYE". Nothing is returned.

    Args:
        sentence (str): the first sentence sent to the chatbot.
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
        vocab_type (str): vocabulary type passed on to the tokenizer
    """
    print(sentence)
    while True:
        if sentence == "BYE":
            break
        # sampling_decode returns (tokens, log probability, text); only the
        # text is needed here.
        reply = sampling_decode(sentence, NMTAttn, vocab_file=vocab_file, vocab_dir=vocab_dir, vocab_type=vocab_type)[2]
        print("Chatbot: " + reply)
        sentence = input()

Final chatbot:

In [None]:
# First sentence sent to the chatbot; type BYE at the prompt to end the chat.
initial_input_sentence= 'How are you doing today'

# The sentence and the model are separate arguments and need a comma between them.
greedy_decode_test(initial_input_sentence, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR, vocab_type=VOCAB_TYPE);