In [0]:
import os
import random
import datasets
import itertools
import unicodedata
import re
import csv
from tqdm import tqdm

In this work, I use PyTorch instead of TensorFlow because I am more comfortable with it and find it easier to use. I was inspired by two tutorials:
* https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
* https://pytorch.org/tutorials/beginner/chatbot_tutorial.html

In [0]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [0]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [0]:
# Length limit handed to the dataset loader; it is also the default number of
# decoding steps used by evaluate().
max_len = 20

# Dataset loading

We load the `opensubs` dataset using the tools provided for this assignment.

In [0]:
# Read the OpenSubtitles corpus with the course-provided `datasets` helper.
# NOTE(review): max_len presumably caps the sentence length in words — confirm
# in datasets.readOpensubsData.
opensubs = datasets.readOpensubsData(path='data/opensubs', max_len=max_len)

OpenSubtitles data files:   0%|          | 7/4637 [00:00<01:06, 69.31it/s]

Loading OpenSubtitles conversations in data/opensubs.




OpenSubtitles data files:  19%|█▉        | 886/4637 [00:27<03:12, 19.53it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Comedy/2003/529_124078_171007_how_to_lose_a_guy_in_10_days.xml with errors.




OpenSubtitles data files:  35%|███▍      | 1619/4637 [00:57<01:58, 25.56it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Comedy/2004/2480_226704_299940_little_black_book.xml with errors.




OpenSubtitles data files:  39%|███▉      | 1830/4637 [01:05<01:44, 26.90it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Drama/2003/1723_68784_89159_big_fish.xml with errors.




OpenSubtitles data files:  46%|████▌     | 2115/4637 [01:13<01:00, 41.43it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Drama/2002/3265_149497_204017_unfaithful.xml with errors.




OpenSubtitles data files:  47%|████▋     | 2173/4637 [01:14<00:54, 45.42it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Drama/2000/179_88528_119102_batoru_rowaiaru.xml with errors.




OpenSubtitles data files:  52%|█████▏    | 2431/4637 [01:21<00:37, 59.00it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Drama/2004/146_206647_272090_eternal_sunshine_of_the_spotless_mind.xml with errors.




OpenSubtitles data files:  63%|██████▎   | 2916/4637 [01:36<00:42, 40.77it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Family/2001/3935_19508_22105_cats__dogs.xml with errors.




OpenSubtitles data files:  71%|███████   | 3272/4637 [01:46<00:30, 45.02it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Horror/1922/1166_134135_184270_nosferatu_eine_symphonie_des_grauens.xml with errors.




OpenSubtitles data files:  73%|███████▎  | 3407/4637 [01:49<00:27, 44.90it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Action/2003/602_152466_207871_batoru_rowaiaru_ii_rekuiemu.xml with errors.




OpenSubtitles data files:  90%|████████▉ | 4165/4637 [02:08<00:08, 57.89it/s]

Skipping file data/opensubs/OpenSubtitles/xml/en/Action/2004/59_84873_113518_appurushdo.xml with errors.


OpenSubtitles data files: 100%|██████████| 4637/4637 [02:22<00:00, 32.61it/s]
100%|██████████| 1648080/1648080 [00:20<00:00, 79743.28it/s]


In [0]:
# Each element is later unpacked as an (input sentence, response sentence) pair.
pairs = opensubs

In [0]:
# Number of sentence pairs available for training.
print(len(pairs))

166067


# Vocabulary

The vocabulary is now created, i.e. a unique integer index (which can serve as a one-hot encoding) for each word of the corpus. We also have to reserve extra entries for padding and for the start and end of a sequence.

In [0]:
# Reserved vocabulary indexes for the special tokens.
PAD_token = 0  # fills short sequences up to the longest one in a batch
SOS_token = 1  # first decoder input of every sequence
EOS_token = 2  # appended to the end of every indexed sentence

In [0]:
class Voc:
    """Bidirectional word <-> index mapping with per-word occurrence counts.

    Indexes 0-2 are reserved for the PAD, SOS and EOS special tokens, so the
    first real word receives index 3.
    """

    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        # The special tokens are only present in the index -> word direction.
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        # Total vocabulary size, special tokens included.
        self.num_words = 3

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.num_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.num_words = new_index + 1

In [0]:
# Fresh vocabulary: at this point it only knows the PAD, SOS and EOS entries.
voc = Voc()

Words from the corpus are added to the vocabulary.

In [0]:
# Both sides of every pair contribute their words to the shared vocabulary.
for input_sequence, output_sequence in pairs:
    for sentence in (input_sequence, output_sequence):
        voc.addSentence(sentence)

In [0]:
# num_words includes the three special tokens (PAD, SOS, EOS).
print("Counted words:", voc.num_words)

Counted words: 25523


# Data preparation for batch

Batching speeds up the training of neural networks by packing several sequences together and feeding them to the network at once. To batch sequences of different lengths, we pad the shorter ones.

In [0]:
def indexesFromSentence(voc, sentence):
    """Translate a space-separated sentence into word indexes ending with EOS.

    Raises KeyError when a word is missing from ``voc.word2index``.
    """
    word_ids = [voc.word2index[token] for token in sentence.split(' ')]
    word_ids.append(EOS_token)
    return word_ids

In [0]:
def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a list of index sequences, padding short ones with ``fillvalue``.

    The result has one tuple per time step, each holding one entry per
    sequence, i.e. a (longest length, number of sequences) layout.
    """
    time_major = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(time_major)

In [0]:
def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same layout as ``l``.

    Args:
        l: iterable of token sequences (e.g. the output of zeroPadding).
        value: token to mask out; defaults to the padding token.

    Returns:
        A list of lists holding 0 where the token equals ``value`` and 1
        everywhere else.
    """
    # Compare against the `value` argument: the previous body ignored it and
    # always tested against the global PAD_token.
    return [[0 if token == value else 1 for token in seq] for seq in l]

In [0]:
# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    """Convert a list of sentences into a padded index tensor plus lengths.

    Returns:
        padVar: LongTensor laid out as (longest sequence, number of sentences).
        lengths: tensor with the unpadded length (EOS included) of each sentence.
    """
    sequences = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor(list(map(len, sequences)))
    padVar = torch.LongTensor(zeroPadding(sequences))
    return padVar, lengths

In [0]:
# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    """Convert a list of target sentences into training tensors.

    Args:
        l: list of space-separated target sentences.
        voc: vocabulary providing ``word2index``.

    Returns:
        padVar: LongTensor laid out as (longest sequence, number of sentences).
        mask: BoolTensor of the same layout, True on real tokens and False
            on padding.
        max_target_len: length (EOS included) of the longest target.
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(len(indexes) for indexes in indexes_batch)
    padList = zeroPadding(indexes_batch)
    # The mask is consumed by masked_select, which expects a boolean mask;
    # uint8 (ByteTensor) masks are deprecated in PyTorch.
    mask = torch.BoolTensor(binaryMatrix(padList))
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [0]:
# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a batch of (input, output) sentence pairs into training tensors.

    Args:
        voc: vocabulary providing ``word2index``.
        pair_batch: sequence of (input sentence, output sentence) pairs; it
            is left untouched.

    Returns:
        (inp, lengths, output, mask, max_target_len) as produced by inputVar
        and outputVar.
    """
    # The encoder packs the inputs with pack_padded_sequence, which by default
    # expects decreasing lengths. Sort a copy so the caller's list is not
    # reordered as a side effect.
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

# RNN with attention

References for this part are the following : 
* seq2seq : https://arxiv.org/abs/1409.3215
* encoder bidirectional RNN : https://arxiv.org/pdf/1406.1078v3.pdf
* attention : https://arxiv.org/abs/1409.0473
* global attention : https://arxiv.org/abs/1508.04025

Videos from weeks 4 and 5 were very useful for understanding what attention means and how to implement it. Thanks for that :)

In [0]:
class EncoderRNN(nn.Module):
    """Two-layer bidirectional GRU encoder over a padded batch of word indexes.

    Args:
        hidden_size: number of features of each GRU direction.
        embedding: ``nn.Embedding`` used to look up word vectors; its
            ``embedding_dim`` is the GRU input size.
    """

    def __init__(self, hidden_size, embedding):
        super(EncoderRNN, self).__init__()
        self.n_layers = 2
        self.hidden_size = hidden_size
        self.embedding_dim = embedding.embedding_dim
        self.embedding = embedding

        # GRU RNN
        self.gru = nn.GRU(self.embedding_dim,
                          self.hidden_size,
                          self.n_layers,
                          dropout=0.1,
                          bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a batch of padded sequences.

        Args:
            input_seq: LongTensor of word indexes, (max_len, batch).
            input_lengths: unpadded length of each sequence, as a tensor (on
                any device) or a list of ints, in decreasing order.
            hidden: optional initial GRU hidden state.

        Returns:
            outputs: (max_len, batch, hidden_size), the sum of the forward
                and backward GRU outputs.
            hidden: final GRU hidden state, (n_layers * 2, batch, hidden_size).
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack_padded_sequence needs the lengths on the CPU; callers may have
        # moved them to the GPU together with the rest of the batch.
        cpu_lengths = torch.as_tensor(input_lengths, dtype=torch.int64, device="cpu")
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, cpu_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [0]:
class Attention(nn.Module):
    """Dot-product attention between one decoder output and all encoder outputs."""

    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights.

        The element-wise product of ``hidden`` and ``encoder_outputs`` is
        summed over the feature axis (dim 2), the two remaining axes are
        swapped, and the scores are normalised along the last axis. The result
        carries an extra axis inserted at position 1.
        """
        scores = (hidden * encoder_outputs).sum(dim=2).transpose(0, 1)
        weights = F.softmax(scores, dim=1)
        return weights.unsqueeze(1)

In [0]:
class DecoderRNN(nn.Module):
    """Two-layer GRU decoder with dot-product attention, run one step at a time."""

    def __init__(self, hidden_size, embedding, output_size):
        super(DecoderRNN, self).__init__()

        # Hyper-parameters kept as attributes
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = 2
        self.dropout = 0.1

        # Layers (sub-modules are registered in the same order as before)
        self.embedding = embedding
        self.embedding_dim = embedding.embedding_dim
        self.embedding_dropout = nn.Dropout(self.dropout)
        self.gru = nn.GRU(
            self.embedding_dim,
            self.hidden_size,
            self.n_layers,
            dropout=0.1,
        )
        self.concat = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

        self.attn = Attention(hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode a single time step for the whole batch.

        Args:
            input_step: word indexes of the current input tokens.
            last_hidden: GRU hidden state from the previous step.
            encoder_outputs: outputs of the encoder for every input position.

        Returns:
            output: softmax probabilities over the vocabulary for each batch
                element.
            hidden: updated GRU hidden state.
        """
        # Embed the current tokens and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # One step through the unidirectional GRU
        gru_out, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights from the current GRU output
        weights = self.attn(gru_out, encoder_outputs)
        # Weighted sum of the encoder outputs -> context vector
        context = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)
        # Luong eq. 5: combine GRU output and context
        merged = torch.cat([gru_out.squeeze(0), context], dim=1)
        attentional = torch.tanh(self.concat(merged))
        # Luong eq. 6: distribution over the next word
        output = F.softmax(self.out(attentional), dim=1)
        return output, hidden

# Loss with mask

We don't compute loss on padding.

In [0]:
def maskNLLLoss(inp, target, mask):
    """Average negative log-likelihood over the non-masked targets of one step.

    Args:
        inp: (batch, vocabulary) tensor of probabilities (softmax already
            applied).
        target: (batch,) tensor with the index of the expected word.
        mask: (batch,) tensor, non-zero/True where the target is a real token
            and zero/False where it is padding.

    Returns:
        (loss, nTotal): the mean loss as a scalar tensor living on the same
        device as ``inp``, and the number of non-masked targets as an int.
    """
    # masked_select expects a boolean mask; convert so that uint8 masks
    # (deprecated in PyTorch) keep working.
    mask = mask.bool()
    nTotal = mask.sum()
    # Probability assigned to the expected word of each batch element
    target_probs = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -torch.log(target_probs)
    # The result is already on the device of `inp`, so no explicit transfer
    # (and no dependency on a global `device`) is needed.
    loss = crossEntropy.masked_select(mask).mean()
    return loss, nTotal.item()

# Training process

In [0]:
hidden_size = 500

# A single embedding table is shared by the encoder and the decoder. Because
# both modules register it as a sub-module, its weights appear in the
# parameter lists of both optimizers below.
embedding = nn.Embedding(voc.num_words, hidden_size)

encoder = EncoderRNN(hidden_size, embedding).to(device)
decoder = DecoderRNN(hidden_size, embedding, voc.num_words).to(device)

# The decoder is trained with a learning rate five times that of the encoder.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0005)

In [0]:
n_iteration = 30000  # number of training steps (one mini-batch per step)
batch_size = 64  # sentence pairs per mini-batch

Generate all training batches

In [0]:
# Pre-build every mini-batch up front: each one is batch_size pairs drawn with
# replacement by random.choice, then converted to padded tensors.
# No random seed is set in the visible cells, so the batches differ between runs.
training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)]) for _ in tqdm(range(n_iteration))]

100%|██████████| 30000/30000 [00:10<00:00, 2946.83it/s]


In [0]:
print_every = 500
start_iteration = 1
print_loss = 0

# Enable dropout for training
encoder.train()
decoder.train()

for iteration in range(start_iteration, n_iteration + 1):

    training_batch = training_batches[iteration - 1]

    # Extract fields from batch
    input_variable, lengths, target_variable, mask, max_target_len = training_batch

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options.
    # `lengths` deliberately stays on the CPU: the encoder passes it to
    # pack_padded_sequence, which requires CPU lengths.
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        # Teacher forcing: the next input is the current target
        decoder_input = target_variable[t].view(1, -1)
        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        # Keep the per-step loss weighted by its token count for reporting
        losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), 50)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), 50)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Per-token average loss of this batch
    print_loss += sum(losses) / n_totals

    # Print progress
    if iteration % print_every == 0:
        print_loss_avg = print_loss / print_every
        print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
        print_loss = 0

Iteration: 500; Percent complete: 1.7%; Average loss: 5.2885
Iteration: 1000; Percent complete: 3.3%; Average loss: 4.6670
Iteration: 1500; Percent complete: 5.0%; Average loss: 4.4669
Iteration: 2000; Percent complete: 6.7%; Average loss: 4.2793
Iteration: 2500; Percent complete: 8.3%; Average loss: 4.1383
Iteration: 3000; Percent complete: 10.0%; Average loss: 4.0242
Iteration: 3500; Percent complete: 11.7%; Average loss: 3.9140
Iteration: 4000; Percent complete: 13.3%; Average loss: 3.8248
Iteration: 4500; Percent complete: 15.0%; Average loss: 3.7060
Iteration: 5000; Percent complete: 16.7%; Average loss: 3.6189
Iteration: 5500; Percent complete: 18.3%; Average loss: 3.5128
Iteration: 6000; Percent complete: 20.0%; Average loss: 3.4160
Iteration: 6500; Percent complete: 21.7%; Average loss: 3.3521
Iteration: 7000; Percent complete: 23.3%; Average loss: 3.2789
Iteration: 7500; Percent complete: 25.0%; Average loss: 3.1891
Iteration: 8000; Percent complete: 26.7%; Average loss: 3.125

# Save network

Save results to be used by a chatbot.

In [0]:
# Everything needed to rebuild the model: the two hyper-parameters, the
# trained weights, and the vocabulary contents (stored as a plain attribute
# dict).
checkpoint = {
    'max_len': max_len,
    'hidden_size': hidden_size,
    'encoder': encoder.state_dict(),
    'decoder': decoder.state_dict(),
    'voc_dict': voc.__dict__,
    'embedding': embedding.state_dict()
}
checkpoint_path = os.path.join('data', 'encoder_decoder.tar')
torch.save(checkpoint, checkpoint_path)

# Evaluation

In [0]:
def evaluate(encoder, decoder, voc, sentence, max_length=max_len):
    """Greedily decode a reply to ``sentence``.

    Args:
        encoder: trained encoder module.
        decoder: trained decoder module.
        voc: vocabulary providing ``word2index`` and ``index2word``.
        sentence: space-separated, already pre-processed input sentence.
        max_length: maximum number of decoding steps.

    Returns:
        The decoded reply as a space-separated string, or an explanatory
        message naming the first word of ``sentence`` missing from ``voc``.

    Note:
        Both modules are switched to evaluation mode and left in that mode.
    """
    # First we check if voc fits to the sentence
    for word in sentence.split(' '):
        if word not in voc.word2index:
            return "I don't know the meaning of %s" % word
    # Evaluation mode (disables dropout)
    encoder.eval()
    decoder.eval()
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Lengths stay on the CPU: the encoder passes them to
    # pack_padded_sequence, which requires CPU lengths.
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    decoded_words = []
    # Inference only: do not record operations for autograd
    with torch.no_grad():
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = encoder(input_batch, lengths)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Most likely word token
            _, best_token = torch.max(decoder_output, dim=1)
            token = best_token.item()
            # Stop at the end-of-sentence token: the decoder is never trained
            # on what follows EOS, so anything emitted after it is noise.
            if token == EOS_token:
                break
            decoded = voc.index2word[token]
            # Special markers never belong to the reply
            if decoded != 'EOS' and decoded != 'PAD':
                decoded_words.append(decoded)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(best_token, 0)
    return ' '.join(decoded_words)

In [0]:
# Smoke test with a greeting; the raw text goes through datasets.extractText
# before being handed to evaluate().
print(evaluate(encoder, decoder, voc, datasets.extractText('Hello')))

hello


In [0]:
# Common small-talk question.
print(evaluate(encoder, decoder, voc, datasets.extractText('How are you ?')))

i m good and paul


In [0]:
# Question that expects a name in the answer.
print(evaluate(encoder, decoder, voc, datasets.extractText('What is your name ?')))

john


In [0]:
# A polite statement rather than a question.
print(evaluate(encoder, decoder, voc, datasets.extractText('Nice to meet you')))

nice to meet you


The following answer is clearly wrong:

In [0]:
# Location question; the recorded reply is off-topic.
print(evaluate(encoder, decoder, voc, datasets.extractText('Where do you live ?')))

united taxi co


and so is the following one:

In [0]:
# Origin question; the recorded reply is off-topic as well.
print(evaluate(encoder, decoder, voc, datasets.extractText('Where do you come from ?')))

when we will come


In [0]:
# 'apples' is missing from the vocabulary, so evaluate() returns its
# unknown-word message instead of decoding a reply.
print(evaluate(encoder, decoder, voc, datasets.extractText('Do you like apples ?')))

I don't know the meaning of apples


In [0]:
# Same question with the singular 'apple', which is in the vocabulary, so
# decoding goes ahead.
print(evaluate(encoder, decoder, voc, datasets.extractText('Do you like apple ?')))

yeah


# Using MyChatbot

Just a few lines to show how `mychatbot`, which runs on `telegram`, is used. Notice that no `GPU` is available on the `Amazon Web Services` cluster (free offer).

In [0]:
import mychatbot
import importlib
# Re-execute the module so that edits made to mychatbot.py since the first
# import are picked up without restarting the kernel.
importlib.reload(mychatbot)

<module 'mychatbot' from '/work/home/tmp/natural-language-processing/honor/mychatbot.py'>

In [0]:
# Build the chatbot from the checkpoint file written by the "Save network" cell.
chatbot = mychatbot.MyChatbot(os.path.join('data', 'encoder_decoder.tar'))

In [0]:
# Ask the packaged chatbot the same questions as in the evaluation section.
for question in (
    'Hello',
    'How are you ?',
    'What is your name ?',
    'Nice to meet you',
    'Where do you live ?',
    'Where do you come from ?',
    'Do you like apples ?',
    'Do you like apple ?',
):
    print(chatbot.get_answer(question))

hello
i m good and paul
john
nice to meet you
united taxi co
when we will come
I don't know the meaning of apples
yeah


That's it !! Thanks for reading this notebook so far :)