# $Building$ $a$ $Chatbot:$ $PyTorch$

In [None]:
import csv
import random
import re
import unicodedata
import codecs
import numpy as np
import itertools
import math
import os

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataset import random_split
from torch.autograd import Variable

from tqdm import tqdm_notebook, tqdm, notebook

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(device)

# $Part$ $1:$ $Data$ $Preprocessing$

In [None]:
# Folder holding the Cornell movie-dialogs corpus. The raw corpus files are read
# from it and the formatted pairs file is written back into it (file names are
# appended with a leading '/').
path = '/content/drive/MyDrive/UCD_Programmes/Summer/ACM40960_Projects_in_Maths_Modelling/My_Project/dataset/cornell movie-dialogs corpus'

In [None]:
formatted_data_exist = False

## reform the data structure
if not formatted_data_exist:
  # local import: only this cell needs it, to parse the utterance-ID lists
  # without executing arbitrary code
  import ast

  # every field in both raw corpus files is separated by this exact token
  # (surrounding spaces included)
  field_separator = ' +++$+++ '

  # read the raw lines once; they serve both the preview and the parsing below
  with open(path + '/movie_lines.txt', 'r', encoding='iso-8859-1') as file:
    raw_lines = file.readlines()

  # visualize some lines
  for line in raw_lines[:8]:
    print(line.strip())
  print('')

  # split each line of the file into a dictionary of fields (lineID, characterID, movieID, character, text)
  line_fields = ['lineID', 'characterID', 'movieID', 'character', 'text']
  lines = {}
  for line in raw_lines:
    if not line.strip():
      continue  # tolerate blank lines instead of failing on them
    values = line.split(field_separator)
    # pair each field name with the value at the same position
    lineObj = dict(zip(line_fields, values))
    lines[lineObj['lineID']] = lineObj
  del raw_lines  # the parsed dictionary is all that is needed from here on

  # group the parsed lines into conversations as described by 'movie_conversations.txt'
  conv_fields = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']
  conversations = []
  with open(path + '/movie_conversations.txt', 'r', encoding='iso-8859-1') as f:
    for line in f:
      if not line.strip():
        continue
      values = line.split(field_separator)
      convObj = dict(zip(conv_fields, values))

      # convObj['utteranceIDs'] is the text "['L598485', 'L598486', ...]";
      # literal_eval turns it into a list and rejects anything that is not a literal
      lineIDs = ast.literal_eval(convObj['utteranceIDs'].strip())

      # reassemble lines
      convObj['lines'] = [lines[lineID] for lineID in lineIDs]
      conversations.append(convObj)

  # extract (question, answer) pairs: every utterance paired with the one that follows it
  qa_pairs = []
  for conversation in conversations:
    utterances = conversation['lines']
    for current, following in zip(utterances, utterances[1:]):
      inputLine = current['text'].strip()
      targetLine = following['text'].strip()
      # filter wrong samples (if one of the two sentences is empty)
      if inputLine and targetLine:
        qa_pairs.append([inputLine, targetLine])

  if qa_pairs:
    print(qa_pairs[0]) # conversations

In [None]:
# define path to new file (relative to `path`)
datafile = '/formatted_movie_lines.txt'
# question and answer are stored on one row, separated by a tab
delimiter = '\t'

# create a new csv file
print('\nWriting newly formatted file...')
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it some platforms write an extra '\r' on every row
with open(path + datafile, 'w', encoding='utf-8', newline='') as outputfile:
  writer = csv.writer(outputfile, delimiter=delimiter)
  writer.writerows(qa_pairs)

# the pairs are re-read from the file later, so release the in-memory copies
del conversation, conversations, qa_pairs
print("Done writing to file")

In [None]:
# visualize some formatted lines
# (opened in binary mode, so the raw bytes of each row are what gets printed)
with open(path + datafile, 'rb') as file:
  lines = file.readlines()
for line in itertools.islice(lines, 8):
  print(line)

In [None]:
PAD_token = 0 # used for padding short sentences
SOS_token = 1 # start-of-sentence token
EOS_token = 2 # end-of-sentence token

class Vocabulary:
  """
  Bidirectional word <-> index mapping with per-word occurrence counts.

  Indices 0, 1 and 2 are reserved for the PAD, SOS and EOS tokens; real words
  are numbered from 3 upwards in the order they are first added.
  """
  def __init__(self, name):
    self.name = name
    self._reset()

  def _reset(self):
    '''
    (re)initialise the dictionaries so that only the reserved tokens remain
    '''
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
    self.num_words = 3 # PAD, SOS, EOS included

  def addWord(self, word):
    '''
    include the word in the dictionaries, or bump its count if already known
    '''
    if word not in self.word2index:
      self.word2index[word] = self.num_words
      self.word2count[word] = 1
      self.index2word[self.num_words] = word
      self.num_words += 1
    else:
      self.word2count[word] += 1

  def addSentence(self, sentence):
    '''
    add every whitespace-separated word of the sentence
    '''
    for word in sentence.split():
      self.addWord(word)

  def trim(self, min_count):
    '''
    remove words whose count is below min_count and renumber the remaining
    ones; the counts of the kept words are preserved, so trimming again later
    still works on real frequencies
    '''
    kept_counts = {word: count for word, count in self.word2count.items() if count >= min_count}
    total = len(self.word2index)
    # an empty vocabulary has nothing to keep; avoid dividing by zero
    ratio = len(kept_counts) / total if total else 0.0
    print(f'keep_words {len(kept_counts)} / {total} = {ratio:.4f}')

    # reinitialise the dictionaries, then re-register the survivors
    self._reset()
    for word, count in kept_counts.items():
      self.addWord(word)
      self.word2count[word] = count

In [None]:
# strip accents / diacritics from a unicode string
def unicodeToASCII(s):
  """
  Decompose `s` (NFD) and drop the combining marks, so that e.g. an accented
  letter is reduced to its base letter. Characters without a decomposition
  are returned unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  # 'Mn' is the Unicode general category "Mark, nonspacing" (combining accents)
  return ''.join(filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed))

In [None]:
# lowercase, strip accents, isolate sentence punctuation and drop everything
# that is not a letter or one of . ! ?
def normalizeString(s):
  """
  Return a cleaned, single-spaced version of `s`.
  """
  text = unicodeToASCII(s.lower().strip())
  # applied in order:
  #   1. put a space in front of each . ! ? (\1 is the captured punctuation mark)
  #   2. replace every run of characters other than letters and . ! ? by one space
  #   3. collapse runs of whitespace into a single space
  substitutions = (
    (r"([.!?])", r" \1"),
    (r"[^a-zA-Z.!?]+", r" "),
    (r"\s+", r" "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return text.strip()

In [None]:
print("reading and processing file... please wait")
# a context manager guarantees the file handle is closed once the text is read
with open(path + datafile, encoding="utf-8") as file:
  lines = file.read().strip().split('\n')
print(lines[0])
# split every line into its tab-separated sentences and normalize each of them
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print("Done reading!")

voc = Vocabulary("movie-dialogs corpus") # the argument is only a label for the vocabulary

In [None]:
# show the first three normalized pairs
for pair in itertools.islice(pairs, 3):
  print(pair)

In [None]:
# upper bound (exclusive) on the number of words a sentence may have
MAX_LENGTH = 15
def filterPair(p):
  '''
  return True only when both p[0] and p[1] contain fewer than MAX_LENGTH
  whitespace-separated words
  '''
  if len(p[0].split()) >= MAX_LENGTH:
    return False
  return len(p[1].split()) < MAX_LENGTH

def filterPairs(pairs):
  '''
  return a new list holding only the pairs accepted by filterPair
  '''
  return list(filter(filterPair, pairs))

In [None]:
# rows that did not split into at least two sentences cannot form a pair
pairs = list(filter(lambda candidate: len(candidate) > 1, pairs))
print(f"There are {len(pairs)} pairs/conversations in the dataset")
# then drop the pairs whose sentences are too long
pairs = filterPairs(pairs)
print(f"After filtering, there are {len(pairs)} pairs/conversations")

In [None]:
# register the words of every question (index 0) and reply (index 1) in the vocabulary
for pair in pairs:
  for position in (0, 1):
    voc.addSentence(pair[position])
print(f"There are {voc.num_words} words")

In [None]:
# minimum word count threshold for trimming
MIN_COUNT = 15

def trimRareWords(voc, pairs, MIN_COUNT):
  '''
  trim words used fewer than MIN_COUNT times from `voc` (in place) and return
  the pairs whose two sentences consist only of words that survived

  voc       : vocabulary object providing trim(min_count) and word2index
  pairs     : list of [input_sentence, output_sentence]
  MIN_COUNT : minimum number of occurrences for a word to be kept
  '''
  voc.trim(MIN_COUNT)
  known_words = voc.word2index

  # only keep pairs that do not contain trimmed word(s) in their input or output sentence
  keep_pairs = [
    pair for pair in pairs
    if all(word in known_words for word in pair[0].split())
    and all(word in known_words for word in pair[1].split())
  ]

  # an empty input has no meaningful ratio; report 0.0 instead of dividing by zero
  kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
  print(f"Trimmed from {len(pairs)} pairs to {len(keep_pairs)}, {kept_ratio} of total")
  return keep_pairs

# shrink the vocabulary and keep only the pairs made of surviving words
pairs = trimRareWords(voc=voc, pairs=pairs, MIN_COUNT=MIN_COUNT)

In [None]:
# show the first remaining question and its reply, one per line
print(pairs[0][0], pairs[0][1], sep='\n')

# $Preparing$ $the$ $Data$ $for$ $Model$

In [None]:
def indexesFromSentence(voc, sentence):
  """
  Encode a sentence as a list of vocabulary indices.

  inputs: (voc, sentence)
  output: SOS_token, then the index of every whitespace-separated word, then
          EOS_token, e.g. [1, 3, 5, 3, 2]
  A word missing from voc.word2index raises KeyError.
  """
  token_ids = [SOS_token]
  token_ids.extend(voc.word2index[word] for word in sentence.split())
  token_ids.append(EOS_token)
  return token_ids

In [None]:
class CorpusDataset(Dataset):
  """
  Dataset of index-encoded sentence pairs.

  input: (voc, sentence pairs)
  Item i is the tuple (encoded pairs[i][0], encoded pairs[i][1]), each a list
  of vocabulary indices produced by indexesFromSentence.
  """
  def __init__(self, voc, pairs):
    def encode(sentence):
      return indexesFromSentence(voc, sentence)

    # questions and replies are kept in two parallel lists
    self.x = list(map(encode, (pair[0] for pair in pairs)))
    self.y = list(map(encode, (pair[1] for pair in pairs)))
    self.n_samples = len(self.x)

  def __getitem__(self, index):
    question, reply = self.x[index], self.y[index]
    return question, reply

  def __len__(self):
    return self.n_samples

In [None]:
# create the dataset
dataset = CorpusDataset(voc=voc, pairs=pairs)

# the text is no longer needed once it has been encoded;
# drop the references so the memory can be released
del pairs
del pair
del lines

In [None]:
def collate(batch, src_pad, trg_pad, device):
  '''
  turn a list of (input_indices, target_indices) items into padded tensors

  batch   : list of (input, target) pairs, each a list of token indices
  src_pad : padding value used for the inputs
  trg_pad : padding value used for the targets
  device  : device the returned tensors are moved to

  returns (padded_inputs, padded_targets, input_lengths); all three are
  ordered by decreasing input length
  '''
  source_seqs, target_seqs = [], []
  for source, target in batch:
    source_seqs.append(torch.LongTensor(source))
    target_seqs.append(torch.LongTensor(target))

  # pad every sequence to the longest one of this minibatch
  source_batch = pad_sequence(source_seqs, batch_first=True, padding_value=src_pad)
  target_batch = pad_sequence(target_seqs, batch_first=True, padding_value=trg_pad)

  # order the batch from the longest input to the shortest; the same
  # permutation is applied to the targets so pairs stay aligned
  lengths = torch.LongTensor([len(seq) for seq in source_seqs])
  lengths, permutation = torch.sort(lengths, dim=0, descending=True)

  source_batch = source_batch[permutation].to(device)
  target_batch = target_batch[permutation].to(device)
  return source_batch, target_batch, lengths.to(device)

In [None]:
batch_size = 1024
# number of minibatches per epoch (the last one may be smaller)
n_iterations = math.ceil(len(dataset)/batch_size)

def _collate_batch(batch):
  # pad the sentences of one minibatch to a common length, using PAD_token for both sides
  return collate(batch, PAD_token, PAD_token, device)

data_loader = DataLoader(
  dataset,
  batch_size=batch_size,
  shuffle=True,
  collate_fn=_collate_batch,
)

n_iterations

# $Building$ $the$ $Model$

In [None]:
class SelfAttention(nn.Module):
    """Multi-head dot-product attention.

    The embedding dimension is split into `heads` slices of width `head_dim`;
    one bias-free linear projection per role (values / keys / queries) is
    applied to every slice, and the re-joined result goes through `fc_out`.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads  # width of each head's queries, keys and values

        assert (self.head_dim * heads == embed_size), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)  # mixes the concatenated heads

    def forward(self, values, keys, query, mask):
        """Attend from `query` over `keys` / `values`.

        Inputs are (N, length, embed_size) tensors. `mask` is either None or a
        tensor broadcastable to (N, heads, query_len, key_len) in which 0 marks
        positions that must not be attended to. Returns a tensor of shape
        (N, query_len, embed_size).
        """
        batch = query.shape[0]
        value_len = values.shape[1]
        key_len = keys.shape[1]
        query_len = query.shape[1]

        # split the embedding into heads: (N, len, embed) -> (N, len, heads, head_dim)
        per_head = (self.heads, self.head_dim)
        values = values.reshape(batch, value_len, *per_head)
        keys = keys.reshape(batch, key_len, *per_head)
        query = query.reshape(batch, query_len, *per_head)

        projected_values = self.values(values)
        projected_keys = self.keys(keys)
        projected_queries = self.queries(query)

        # dot product of every query with every key, per example and per head
        # (N, query_len, heads, head_dim) x (N, key_len, heads, head_dim)
        #   -> (N, heads, query_len, key_len)
        scores = torch.einsum("nqhd,nkhd->nhqk", projected_queries, projected_keys)

        # masked positions get a hugely negative score so softmax gives them weight 0
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e20)

        # NOTE(review): the scores are scaled by sqrt(embed_size); the usual
        # formulation scales by sqrt(head_dim) -- confirm this is intended
        weights = torch.softmax(scores / (self.embed_size ** 0.5), dim=-1)

        # weighted sum of the values, then merge the heads back together:
        # (N, heads, query_len, key_len) x (N, value_len, heads, head_dim)
        #   -> (N, query_len, heads, head_dim) -> (N, query_len, embed_size)
        context = torch.einsum("nhql,nlhd->nqhd", weights, projected_values)
        merged = context.reshape(batch, query_len, self.heads * self.head_dim)

        return self.fc_out(merged)


class EncoderBlock(nn.Module):
    """Attention sub-layer followed by a position-wise feed-forward sub-layer.

    Each sub-layer is wrapped as dropout(norm(sublayer_output + residual)).
    A single LayerNorm instance (`self.norm`) is used for both sub-layers.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)  # output: (N, query_len, embed_size)
        self.norm = nn.LayerNorm(embed_size)

        hidden_size = forward_expansion * embed_size
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attended = self.attention(value, key, query, mask)

        # residual connection around the attention, then normalization and dropout
        x = self.dropout(self.norm(attended + query))

        # same pattern around the feed-forward network
        transformed = self.feed_forward(x)
        return self.dropout(self.norm(transformed + x))  # (N, query_len, embed_size)


class Encoder(nn.Module):
    """Word + learned position embeddings followed by `num_layers` EncoderBlocks."""

    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):
        super().__init__()
        self.embed_size = embed_size
        self.device = device
        # lookup tables: one row per vocabulary entry / per position below max_length
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                EncoderBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
            )
        self.layers = nn.ModuleList(blocks)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode a (N, seq_length) batch of token indices into (N, seq_length, embed_size)."""
        batch, seq_length = x.shape
        # position ids 0..seq_length-1, repeated for every example of the batch
        positions = torch.arange(0, seq_length).expand(batch, seq_length).to(self.device)
        embedded = self.word_embedding(x) + self.position_embedding(positions)
        hidden = self.dropout(embedded)

        # inside the encoder the values, keys and queries are all the same tensor
        for block in self.layers:
            hidden = block(hidden, hidden, hidden, mask)

        return hidden


class DecoderBlock(nn.Module):
    """Masked self-attention over the target, then an EncoderBlock that
    attends from the result (as query) over the given values / keys.

    The `device` argument is accepted but not used by this block.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super().__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.encoder_block = EncoderBlock(embed_size, heads, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        # self-attention on the target sequence, restricted by trg_mask
        self_attended = self.attention(x, x, x, trg_mask)
        # residual connection, normalization and dropout give the query ...
        query = self.dropout(self.norm(self_attended + x))
        # ... which then attends over `value` / `key`, restricted by src_mask
        return self.encoder_block(value, key, query, src_mask)


class Decoder(nn.Module):
    """Word + learned position embeddings, `num_layers` DecoderBlocks and a
    final linear projection onto the target vocabulary."""

    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super().__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderBlock(embed_size, heads, forward_expansion, dropout, device))
        self.layers = nn.ModuleList(blocks)

        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Return (scores, probabilities), both (N, seq_length, trg_vocab_size).

        `scores` are the raw outputs of the final linear layer and
        `probabilities` is their softmax over the vocabulary dimension.
        """
        batch, seq_length = x.shape
        # position ids 0..seq_length-1, repeated for every example of the batch
        positions = torch.arange(0, seq_length).expand(batch, seq_length).to(self.device)
        hidden = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # the encoder output serves as both values and keys in every block
        for block in self.layers:
            hidden = block(hidden, enc_out, enc_out, src_mask, trg_mask)

        scores = self.fc_out(hidden)
        probabilities = F.softmax(scores, dim=-1)

        return scores, probabilities


class Transformer(nn.Module):
    """Encoder-decoder model: builds the masks, encodes the source and decodes the target."""

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=8,
        forward_expansion=4,
        heads=8,
        dropout=0.2,
        device="cpu",
        max_length=100,
    ):
        super().__init__()

        # the encoder and the decoder share every hyper-parameter except the vocabulary size
        self.encoder = Encoder(
            src_vocab_size, embed_size, num_layers, heads,
            device, forward_expansion, dropout, max_length,
        )
        self.decoder = Decoder(
            trg_vocab_size, embed_size, num_layers, heads,
            forward_expansion, dropout, device, max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Boolean mask of shape (N, 1, 1, src_len): True wherever `src` is not padding."""
        not_padding = src != self.src_pad_idx
        src_mask = not_padding.unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        """Lower-triangular mask of ones, shape (N, 1, trg_len, trg_len), so that
        position i can only attend to positions <= i."""
        batch, trg_len = trg.shape
        causal = torch.ones((trg_len, trg_len)).tril()
        trg_mask = causal.expand(batch, 1, trg_len, trg_len)
        return trg_mask.to(self.device)

    def forward(self, src, trg):
        """Return the decoder's (scores, probabilities) for `trg` given `src`."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        encoded = self.encoder(src, src_mask)
        return self.decoder(trg, encoded, src_mask, trg_mask)

In [None]:
# switches for the checkpoint handling that follows the training loop
save_model = False
load_model = False

print(device)
# source and target share one vocabulary and one padding token
model = Transformer(
  src_vocab_size=voc.num_words,
  trg_vocab_size=voc.num_words,
  src_pad_idx=PAD_token,
  trg_pad_idx=PAD_token,
  device=device,
).to(device)

In [None]:
# define the loss function (padding positions do not contribute to the loss)
loss_function = nn.CrossEntropyLoss(ignore_index = PAD_token)

# set hyper-parameters
learning_rate = 0.01
epoches = 1

# set updating function
optimiser = torch.optim.SGD(model.parameters(), lr = learning_rate)

# make sure dropout is active: the model stays in eval mode if this cell is
# re-run after the chat cell has called model.eval()
model.train()

# train the model
for epoch in range(epoches):
  # running sum of the batch losses and number of batches seen in this epoch
  total_loss = total = 0

  progress_bar = notebook.tqdm(data_loader, desc="Training", leave=False)
  for i, (inputs, targets, lengths) in enumerate(progress_bar):
    # initialise gradients
    optimiser.zero_grad()

    # forwards: the decoder is fed the target without its last token and has to
    # predict the target shifted one position to the left
    output, _ = model(inputs.to(device), targets[:, :-1].to(device))
    pred = output.reshape(-1, output.size(-1)) # pred = (batch * target_length, vocabulary size)
    y = targets[:, 1:].contiguous().view(-1).to(device) # y = (batch * target_length)

    # loss
    loss = loss_function(pred, y)

    # backpropagation
    loss.backward()

    # update the weights
    optimiser.step()

    # record loss
    batch_loss = loss.item()
    total_loss += batch_loss
    total += 1
    if (i+1) % 3 == 0:
      print(f'epoch {epoch+1:2d}/{epoches}, step {i+1:3d}/{n_iterations}, loss {batch_loss:.3f}')
  # report the mean loss over the epoch rather than the loss of the last batch only
  tqdm.write(f'epoch\t #{epoch + 1:2d}\t  train_loss: {total_loss / max(total, 1):.3f}\n')

if save_model:
  ## save the model paramaters
  # define the parameter dictionary
  checkpoint = {
      'epoch': epoches, # number of epochs completed by the loop above
      'model_state': model.state_dict(),
      'optim_state': optimiser.state_dict()
  }

  # save the parameters
  # NOTE: this writes to the working directory, whereas the loading branch
  # below reads from the Drive folder; move the file there before loading
  torch.save(checkpoint, './checkpoint.pth')

if load_model:
  ## load parameters
  load_path = '/content/drive/MyDrive/UCD_Programmes/Summer/ACM40960_Projects_in_Maths_Modelling/My_Project/dataset/'
  loaded_checkpoint = torch.load(os.path.join(load_path, "checkpoint.pth"), map_location=device)
  epoch = loaded_checkpoint["epoch"]
  model.load_state_dict(loaded_checkpoint["model_state"])
  optimiser.load_state_dict(loaded_checkpoint["optim_state"])
  model.to(device)


In [None]:
def sentenceFromIndice(voc, indice):
  """
  Decode a sequence of vocabulary indices into a space-separated string,
  looking every index up in voc.index2word.
  """
  return " ".join(map(voc.index2word.__getitem__, indice))

In [None]:
## execute the model
model.eval() # turn off the dropout layers
with torch.no_grad(): # no gradients are needed for inference
  print("Let's chat! (type 'quit' to exit)")

  exit_list = ['exit', 'see you later', 'bye', 'quit', 'breat', 'q']
  while True:
    sentence = input("You: ")
    if sentence.strip().lower() in exit_list:
      print("Bot: See you!")
      break

    # modify the input sentence: keep only words the model has an embedding for.
    # Adding unseen words to the vocabulary would create indices beyond the
    # embedding table and make the lookup fail. The input is also capped at the
    # sentence length used when the training pairs were filtered.
    known_words = [word for word in normalizeString(sentence).split() if word in voc.word2index]
    sentence = " ".join(known_words[:MAX_LENGTH - 1])
    inputs = torch.tensor([indexesFromSentence(voc, sentence)], device=device)

    # greedy decoding: start from SOS and repeatedly append the most likely
    # next token until EOS is produced or the length limit is reached
    reply = [SOS_token]
    for _ in range(MAX_LENGTH + 1):
      output, _ = model(inputs, torch.tensor([reply], device=device))
      next_token = output[0, -1].argmax().item()
      if next_token == EOS_token:
        break
      reply.append(next_token)

    # drop the leading SOS before turning the indices back into words
    print(f"Bot: {sentenceFromIndice(voc, reply[1:])}")