# Build a chatbot using a *seq2seq* model

## Possible exercises

1. Modify the code to use LSTM cells instead of GRU cells.

## References
- https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

- https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/cf54d584af1322e88020549223e907dc/chatbot_tutorial.ipynb#scrollTo=Tlj9jynLIYsH

In [1]:
import pdb

# Stage 0. Download the data

In [2]:
# Path of the formatted movie-lines file. Each line holds one utterance pair
# separated by a tab, as the sample printed by the next cell shows.
FILE = 'formatted_movie_lines.txt'

In [3]:
def print_lines(file, n=10):
    """Print the first ``n`` raw lines of ``file``, each followed by ``--``.

    The file is opened in binary mode, so every line is printed as a ``bytes``
    repr in which tabs and newlines show up as escape sequences.

    Args:
        file: Path of the file to preview.
        n: Number of leading lines to show. ``None`` shows every line; a
            negative value keeps its list-slice meaning (all but the last
            ``-n`` lines).
    """
    from itertools import islice

    with open(file, 'rb') as datafile:
        if n is not None and n < 0:
            # Slicing relative to the end needs the complete list of lines.
            lines = datafile.readlines()[:n]
        else:
            # Stop reading after n lines instead of loading the whole file.
            lines = list(islice(datafile, n))
    for line in lines:
        print(line)
        print('--')

# Peek at the first three raw lines to check the tab-separated pair layout.
print_lines(FILE, n=3)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
--
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
--
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
--


## Stage 1. Pre-process the dataset and split into `train.csv` and `test.csv`

### Pre-processing

In [4]:
from tqdm import tqdm
import re

def load_pairs(file, n=None):
    """Read tab-separated utterance pairs from a UTF-8 text file.

    Args:
        file: Path of the text file; every non-empty line is split on tabs.
        n: Optional cap on the number of lines to load. ``None`` (the default)
            loads every line.

    Returns:
        A list with one entry per loaded line, each entry being the list of
        tab-separated fields found on that line. An empty file yields ``[]``.
    """
    # The context manager closes the handle even if reading fails.
    with open(file, encoding='utf-8') as datafile:
        text = datafile.read()

    # Blank lines carry no pair; dropping them also makes an empty file
    # produce an empty list instead of a single [''] entry.
    lines = [line for line in text.strip().split('\n') if line]
    if n is not None:
        lines = lines[:n]

    return [line.split('\t') for line in tqdm(lines)]

# Load every raw pair and report the corpus size before any cleaning.
pairs = load_pairs(FILE)
print('Original set size: ', len(pairs))

import unicodedata
def unicode2Ascii(s):
    """Strip combining accents from ``s``.

    The string is decomposed with NFD normalization and every character whose
    Unicode category is ``Mn`` is discarded; all other characters are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def normalize_pairs(pairs):
    """Return a cleaned copy of ``pairs``.

    The first two elements of every pair are lower-cased, stripped of accents
    via ``unicode2Ascii``, and passed through three substitutions; the result
    is a new list of two-element lists.
    """
    # Applied in order: put a space before . ! ?, replace every run of
    # characters other than letters and . ! ? with one space, then collapse
    # whitespace runs.
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )

    def normalize_string(s):
        text = unicode2Ascii(s.lower().strip())
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text.strip()

    cleaned = []
    for item in pairs:
        cleaned.append([normalize_string(item[0]), normalize_string(item[1])])
    return cleaned

def filter_pairs_by_length(pairs, max_length: int = 999999):
    """Keep the pairs whose two sentences are both shorter than ``max_length``.

    Length is the number of pieces obtained by splitting a sentence on single
    spaces, and the comparison is strict: a sentence with exactly
    ``max_length`` pieces is rejected.
    """

    def token_count(sentence):
        return len(sentence.split(' '))

    kept = []
    for pair in pairs:
        if token_count(pair[0]) >= max_length:
            continue
        if token_count(pair[1]) >= max_length:
            continue
        kept.append(pair)
    return kept

# Clean the text first so the length filter counts tokens of the cleaned sentences.
pairs = normalize_pairs(pairs)
# The filter compares with a strict "<", so both sentences of a kept pair have
# fewer than MAX_LENGTH space-separated tokens.
MAX_LENGTH = 10
pairs = filter_pairs_by_length(pairs, max_length=MAX_LENGTH)
print('After filtering:  ', len(pairs))

100%|██████████| 221282/221282 [00:00<00:00, 765419.52it/s]


Original set size:  221282
After filtering:   64271


### Generate `train.csv` and `test.csv`

In [5]:
import pandas as pd
# One row per pair: column 0 is the first sentence of the pair, column 1 the second.
pairs_df = pd.DataFrame(pairs)

from sklearn.model_selection import train_test_split
# Hold out 20% of the pairs for testing; the fixed random_state keeps the split repeatable.
train_pairs, test_pairs = train_test_split(pairs_df, test_size=0.2, random_state=123)

# Written without index and without a header row, so each file contains only the two text columns.
train_pairs.to_csv('train.csv', index=False, header=False)
test_pairs.to_csv('test.csv', index=False, header=False)

# Stage 2. Load data with torchtext API

In [1]:
from torchtext.data import Field
import spacy

# Load spaCy's 'en_core_web_sm' pipeline; tokenizer_fn below uses its tokenizer.
spacy_en = spacy.load('en_core_web_sm')

def tokenizer_fn(text):
    """Split ``text`` with the tokenizer of ``spacy_en`` and return the token strings as a list."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Special tokens: start of sentence, end of sentence, and padding.
BOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
PAD_TOKEN = "<pad>"
# One Field object is shared by both columns, so source and target sentences go
# through the same tokenizer and special tokens. batch_first=True requests
# batches laid out as (batch, sequence).
sentence_processor = Field(tokenize=tokenizer_fn,
                           init_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN,
                           batch_first=True)
# Column order of the CSV files: first column -> 'src', second column -> 'tgt'.
fields = [('src', sentence_processor), ('tgt', sentence_processor)]



In [2]:
from torchtext.data import TabularDataset

# Read the two CSV files from the current directory. skip_header=False because
# the files contain no header row; `fields` names the two columns.
train, test = TabularDataset.splits(
    path='',
    train='train.csv',
    test='test.csv',
    format='csv',
    skip_header=False,
    fields=fields,
)



In [3]:
# Build the vocabulary from the training split only; min_freq=2 sets the
# minimum number of occurrences a token needs to get its own vocabulary entry.
sentence_processor.build_vocab(train, min_freq=2)

In [4]:
# Source and target share this single vocabulary, so a token has the same index
# on both sides. Show the index assigned to the padding token.
sentence_processor.vocab.stoi[PAD_TOKEN]

1

### `Dataloader`s for train, test

In [5]:
import torch
from torchtext.data import BucketIterator

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Batch size passed to both the train and the test iterator below.
batch_size = 64

from torchtext.data.utils import interleave_keys
def sort_key(ex):
    """Return one sort key that combines the ``src`` and ``tgt`` lengths of ``ex``.

    ``interleave_keys`` interleaves the binary representations of the two
    lengths. Example: lengths 5 (101) and 3 (011) give 100111, and lengths
    7 (111) and 1 (001) give 101011.
    """
    src_len = len(ex.src)
    tgt_len = len(ex.tgt)
    return interleave_keys(src_len, tgt_len)
    
# Build one iterator per split. Both use the same batch size and place their
# tensors on `device`. sort_key orders examples by the combined src/tgt length,
# and sort_within_batch=True additionally sorts the examples inside each batch.
train_iter, test_iter = BucketIterator.splits(
    (train, test),
    batch_sizes=(batch_size, batch_size),
    device=device,
#     sort_key=lambda x: len(x.src), # TODO
    sort_key=sort_key,
    sort_within_batch=True,
)



In [6]:
# Pull a single batch to inspect its layout: .src and .tgt are integer tensors
# of token ids shaped (batch, sequence).
# NOTE(review): the saved output below reports a batch of size 16 although
# batch_size is set to 64 above -- the output looks stale; re-run to refresh it.
x = next(iter(train_iter))
print(x)
print(x.src)
print(x.tgt)


[torchtext.data.batch.Batch of size 16]
	[.src]:[torch.LongTensor of size 16x10]
	[.tgt]:[torch.LongTensor of size 16x12]
tensor([[   2,    7,   68,    6,  131,  187,   73, 1058,    4,    3],
        [   2,   46,   21,    6, 1371,   12,  527,    5,    3,    1],
        [   2,   50,    8,    9,   75,   45,    5,    3,    1,    1],
        [   2,   88,   18,    7,   23,    0,    4,    3,    1,    1],
        [   2,    6,   61,    8,    6,  521,   39,    4,    3,    1],
        [   2,   44,  120,   24,   14,  348,   18,    4,    3,    1],
        [   2,  118,   40,    7,  138,   15,    6,    5,    3,    1],
        [   2,    8,   89,  478,   15,   12,  840,    5,    3,    1],
        [   2,   42,    6,  530,   69,   12,  199,    5,    3,    1],
        [   2,   20,    6, 1146,   32,  524,  105,    5,    3,    1],
        [   2,    6,  168,   81,   12,  689, 2999,    5,    3,    1],
        [   2,  260,   26,   29,  661,   12, 8317,    5,    3,    1],
        [   2,   10,    9,   51,   13



# Stage 3. Define the model architecture

In [25]:
import torch.nn as nn

class Encoder(nn.Module):
    """GRU encoder that maps a batch of token ids to one hidden state per layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each token embedding (GRU input size).
        hidden_dim: GRU hidden size.
        n_directions: 1 for a unidirectional GRU, 2 for a bidirectional one.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout passed to the GRU; forced to 0 when ``n_layers == 1``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_directions=1, n_layers=1, dropout=0):
        super().__init__()
        assert n_directions in [1, 2], 'n_directions is either 1 or 2'

        self.n_layers = n_layers
        self.n_directions = n_directions
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # A single-layer GRU has no between-layer connection to apply dropout to.
        gru_dropout = dropout if n_layers != 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            dropout=gru_dropout,
            bidirectional=(n_directions == 2),
            batch_first=True,
        )

    def forward(self, input_seq, hidden_first=None):
        """Encode ``input_seq`` and return the final hidden state summed over directions.

        Args:
            input_seq: Integer tensor of token ids, shaped (batch, seq_len).
            hidden_first: Optional initial hidden state handed to the GRU.

        Returns:
            Tensor shaped (n_layers, batch, hidden_dim).
        """
        embedded = self.embedding(input_seq)
        _, hidden_last = self.gru(embedded, hidden_first)

        # Separate the layer and direction axes, then add the directions
        # together (axis 1) so the result has one state per layer.
        per_direction = hidden_last.view(self.n_layers, self.n_directions, -1, self.hidden_dim)
        return per_direction.sum(dim=1)

In [26]:
import torch.nn.functional as F

class Decoder(nn.Module):
    """Unidirectional GRU decoder that is run one token at a time.

    Args:
        vocab_size: Size of the output vocabulary (and of the embedding table).
        embedding_dim: Size of each token embedding (GRU input size).
        hidden_dim: GRU hidden size; must equal ``embedding_dim``.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability, applied to the token embeddings and,
            when ``n_layers > 1``, between GRU layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers=1, dropout=0):
        super(Decoder, self).__init__()

        assert embedding_dim == hidden_dim, "embedding_dim must equal hidden_dim"

        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # unidirectional!
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          dropout=(0 if n_layers == 1 else dropout),
                          batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_step, hidden):
        """Decode a single time step.

        Args:
            input_step: Integer tensor of current input token ids, shaped (batch,).
            hidden: Previous GRU hidden state, shaped (n_layers, batch, hidden_dim).

        Returns:
            A tuple ``(logits, hidden)``: ``logits`` holds the unnormalized
            scores over the vocabulary, shaped (batch, vocab_size), and
            ``hidden`` is the updated GRU hidden state.
        """
        # Add a length-1 sequence axis: nn.GRU with batch_first=True expects
        # (batch, seq_len=1, input_size).
        input_step = input_step.unsqueeze(1)

        # Regularize the embeddings rather than the output scores: zeroing
        # logits at random would corrupt the predicted distribution.
        embedded = self.dropout(self.embedding(input_step))

        # Forward through unidirectional GRU
        output, hidden = self.gru(embedded, hidden)

        # TODO: add Luong attention over the encoder outputs in the future.

        # Drop the length-1 sequence axis and project to vocabulary scores.
        # Raw logits are returned on purpose: nn.CrossEntropyLoss applies
        # log-softmax itself, and argmax over logits equals argmax over
        # probabilities. (A softmax over dim=1 of the (batch, 1, vocab) tensor
        # would normalize over the length-1 axis and yield all ones.)
        logits = self.fc_out(output.squeeze(1))

        return logits, hidden

In [27]:
import random

class Seq2Seq(nn.Module):
    """Couples an encoder and a step-wise decoder into one sequence-to-sequence model.

    Args:
        encoder: Module called as ``encoder(src)``; its result is used as the
            decoder's initial hidden state.
        decoder: Module called as ``decoder(tokens, hidden)`` once per target
            position; must expose ``vocab_size``.
        device: Device on which the output tensor is allocated.

    The encoder and decoder must agree on ``hidden_dim`` and ``n_layers``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hidden_dim == decoder.hidden_dim
        assert encoder.n_layers == decoder.n_layers

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Decode ``tgt.shape[1] - 1`` steps and collect the decoder outputs.

        Args:
            src: Source batch passed to the encoder.
            tgt: Target token ids, shaped (batch, tgt_len); column 0 is fed to
                the decoder as its first input.
            teacher_forcing_ratio: Probability, drawn independently at every
                step, of feeding the ground-truth token instead of the
                decoder's own highest-scoring token as the next input.

        Returns:
            Tensor shaped (batch, tgt_len, vocab_size). Position 0 along the
            time axis is never written and stays zero.
        """
        n_examples, n_steps = tgt.shape[0], tgt.shape[1]

        # Buffer for the decoder output of every time step.
        predictions = torch.zeros(n_examples, n_steps, self.decoder.vocab_size).to(self.device)

        # The encoder's final hidden state seeds the decoder.
        state = self.encoder(src)

        # The first decoder input is the first target column.
        step_input = tgt[:, 0]

        for step in range(1, n_steps):
            step_output, state = self.decoder(step_input, state)
            predictions[:, step, :] = step_output

            if random.random() < teacher_forcing_ratio:
                # Teacher forcing: feed the ground-truth token for this step.
                step_input = tgt[:, step]
            else:
                # Otherwise feed back the decoder's highest-scoring token.
                step_input = step_output.argmax(1)

        return predictions

In [28]:
# Hyper-parameters. embedding_dim and hidden_dim are kept equal because the
# Decoder asserts embedding_dim == hidden_dim.
vocab_size = len(sentence_processor.vocab)
embedding_dim = 256
hidden_dim = 256
n_layers = 3
dropout = 0.20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The encoder is bidirectional (n_directions=2); its forward() sums the two
# directions, so the hidden state it returns has the hidden_dim and n_layers
# that Seq2Seq checks against the decoder.
encoder = Encoder(vocab_size, embedding_dim, hidden_dim, n_directions=2, n_layers=n_layers, dropout=dropout)
decoder = Decoder(vocab_size, embedding_dim, hidden_dim, n_layers, dropout)

model = Seq2Seq(encoder, decoder, device).to(device)
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9852, 256)
    (gru): GRU(256, 256, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(9852, 256)
    (gru): GRU(256, 256, num_layers=3, batch_first=True, dropout=0.2)
    (fc_out): Linear(in_features=256, out_features=9852, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

#### Test forward pass of the network

In [11]:
# Smoke test: push one training batch through the model. Seq2Seq.forward
# allocates its result as (batch, tgt_len, vocab_size), which is the shape the
# last print should show.
batch = next(iter(train_iter))

print('src shape: ', batch.src.shape)
print('tgt shape: ', batch.tgt.shape)
output = model(batch.src, batch.tgt)
print('output shape: ', output.shape)

src shape:  torch.Size([16, 7])
tgt shape:  torch.Size([16, 11])
output shape:  torch.Size([16, 11, 9852])


In [12]:
import torch.optim as optim

# Cross-entropy loss with masking: target positions equal to PAD_TOKEN_ID are
# excluded from the loss via ignore_index.
PAD_TOKEN_ID = sentence_processor.vocab.stoi[PAD_TOKEN]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_TOKEN_ID)

# Adam over all model parameters. The optimizer keeps references to the
# parameters of the `model` object it is given here, so re-run this cell
# whenever `model` is rebuilt.
optimizer = optim.Adam(model.parameters(), lr=3e-4)

## Train loop

In [29]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Called as ``model(src, tgt)``; its result is indexed as
            (batch_size, tgt_len, vocab_size).
        iterator: Sized iterable of batches exposing ``.src`` and ``.tgt``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(flat_predictions, flat_targets)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.src, batch.tgt)
        n_classes = predictions.shape[-1]

        # Skip time step 0 on both sides and flatten batch and time into one
        # axis for the criterion. reshape() is used instead of view() because
        # slicing along dim 1 generally leaves the tensor non-contiguous.
        flat_predictions = predictions[:, 1:, :].reshape(-1, n_classes)
        flat_targets = batch.tgt[:, 1:].reshape(-1)

        loss = criterion(flat_predictions, flat_targets)
        loss.backward()

        # Rescale the gradients so their total norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [30]:
import time

import math

N_EPOCHS = 10  # number of full passes over train_iter
CLIP = 1       # maximum gradient norm handed to train()


def epoch_time(start_time, end_time):
    """Split the time between two ``time.time()`` readings into whole (minutes, seconds)."""
    elapsed_secs = int(end_time - start_time)
    return divmod(elapsed_secs, 60)


# Kept for the validation step that is still commented out below.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    # TODO: enable once an evaluate() function and a validation iterator exist.
#     valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut1-model.pt')

    # Perplexity is exp(mean cross-entropy loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

KeyboardInterrupt: 

In [15]:
# import pdb