## Simplest Seq2Seq

* **Task**: toy "translation" task --- translating a list of letters (from A to H) to the next-letter-list (e.g. ['A', 'B', 'C'] translates as ['B', 'C', 'D']. 
* **Type**: Sutskever et al. (2014). No attention, no bidirection or stacking. Clear-to-the-boot step-by-step demo.
* **PyTorch Version**: 0.3.1
* **Rant**: showy people on Github write convoluted tutorial code (although efficient, sophisticated and all). Doesn't help for beginners at all! This tutorial tells you all you need to know!!

In [113]:
from __future__ import division

import unicodedata
import string
import re
import random
import time
import math
import numpy as np

from io import open

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

### Data Prep

In [151]:
class Indexer:
    """Token-Index mapping."""
    
    def __init__(self, name):
        """
        Args:
            name: name of the indexer.
        """
        self.name = name
        self.word2index = {"SOS": 0, "EOS": 1} # str -> int
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2count = {"SOS": 0, "EOS": 0} # str -> int
        self.nWords = 0  # Count SOS and EOS
    
    def add_sentence(self, sentence):
        """Add a sentence to the dictionary.
        
        Args:
            sentence: a list of tokens (in string).
        """
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        """Add a word to the dictionary.
        
        Args:
            word: a token (in string).
        """
        if word not in self.word2index:
            self.word2index[word] = self.nWords
            self.word2count[word] = 1
            self.index2word[self.nWords] = word
            self.nWords += 1
        else:
            self.word2count[word] += 1 
            
    def get_index(self, word):
        """Word->Index lookup.
        
        Args:
            word: a token (string).
        Returns:
            The index of the word.
        """
        return self.word2index[word] if word in self.word2index else -1
    
    def get_word(self, index):
        """Index->Word lookup.
        
        Args:
            index: index of a token.
        Returns:
            The token under the index. -1 if the index is out of bound.
        """
        return self.index2word[index] if index<self.nWords else -1
    
    def get_sentence_index(self, sentence):
        """Words->Indexs lookup.
        
        Args:
            sentence: a list of token (string).
        Returns:
            A list of indices.
        """
        return [self.get_index(word) for word in sentence]
    
    def get_sentence_word(self, indexSentence):
        """Indexs->Words lookup.
        
        Args:
            indexSentence: a list of indices.
        Returns:
            A list of tokens (string).
        """
        return [self.get_word(index) for index in indexSentence]

In [152]:
# Toy data generation
#   vocab -> A to I
#   length -> 3 to 8
#   task -> translate for the next letter (e.g. A -> B)

VOCAB = [chr(i) for i in range(65,74)] # 'A' -> 'I'
FROM_LEN, TO_LEN = 3, 8
MAX_LENGTH = TO_LEN + 2
SOS, EOS = 'SOS', 'EOS'
INDEXER = Indexer('LetterTranslator')
DATA_SIZE = 3000

def translate_word(word):
    """Find the next letter.
    
    Args:
        word: a letter word (e.g. 'A').
    Returns:
        The next letter to word.
    """
    return VOCAB[VOCAB.index(word)+1]

def translate_sent(sent):
    """Find the next-letter translation of a sentence.
    
    Args:
        sent: a list of letter words.
    Returns:
        The next letters.
    """
    return [translate_word(word) for word in sent]

def generate_pair():
    """Randomly generate a pair of sentences (arg1 translates to arg2).
    
    Returns:
        randInput: a list of letter words.
        randTarget: a list of translation letter words of randInput.
        randInputLen, randTargetLen: lengths of the lists above.
    """
    randInput = list(np.random.choice(VOCAB[:-1], size=random.randint(FROM_LEN,TO_LEN)))
    randTarget = translate_sent(randInput)
    randInputLen, randTargetLen = len(randInput), len(randTarget)
    return randInput, randTarget+[str('EOS')], \
           randInputLen, randTargetLen+1
        # str(): default is utf-8

def generate_data():
    """Randomly generate a set of pairs of sentences (arg1 translates to arg2).
    
    Returns:
        pairs: a pair of lists of torch Variables (torch.LongTensor).
        lengths: lengths of the corresponding lists in pairs.
    """
    pairs, lengths = [], []
    for _ in range(DATA_SIZE):
        randInput,randTarget,randInputLen,randTargetLen = generate_pair()
        INDEXER.add_sentence(randInput)
        INDEXER.add_sentence(randTarget)
        pairs.append([Variable(torch.LongTensor(INDEXER.get_sentence_index(randInput)).view(-1,1)),
                      Variable(torch.LongTensor(INDEXER.get_sentence_index(randTarget)).view(-1,1))])
            # convert sentences to <mt,bc> shape.
            # here bc=1.
        lengths.append([randInputLen,randTargetLen])
    return pairs, lengths

In [153]:
pairs, lengths = generate_data()

### Model

In [168]:
# Simplest Seq2Seq similar to Sutskever et al. (2014)

HIDDEN_SIZE = 20

class EncoderRNN(nn.Module):
    """Simple GRU encoder."""
    
    def __init__(self, inputSize, hiddenSize, nLayers=1):
        # inputSize: vocabulary size.
        # hiddenSize: size for both embedding and GRU hidden.
        super(EncoderRNN, self).__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.nLayers = nLayers
        self.embedding = nn.Embedding(inputSize, hiddenSize)
        self.gru = nn.GRU(hiddenSize, hiddenSize, nLayers)
    
    def forward(self, inputs, inputsLen, hidden):
        # inputs: <mt,bc>
        # hidden: <n_layer*n_direction,bc,h>
        embedded = self.embedding(inputs).view(inputsLen,1,-1) # <mt,bc=1,h>
        output,hidden = self.gru(embedded, hidden)
            # output: <mt,bc=1,h>
            # hidden: <n_layer*n_direction,bc,h>
        return output, hidden
    
    def init_hidden(self):
        return Variable(torch.zeros(self.nLayers,1,self.hiddenSize))
    
class DecoderRNN(nn.Module):
    """Simple GRU decoder."""
    
    def __init__(self, hiddenSize, outputSize, nLayers=1):
        # hiddenSize: encoder final hidden size.
        # outputSize: vocabulary size.
        # NB: nLayers here is related to the size of the encoder.
        #     left as 1 for simplicity here.
        super(DecoderRNN, self).__init__()
        self.hiddenSize = hiddenSize
        self.outputSize = outputSize
        self.nLayers = nLayers
        self.embedding = nn.Embedding(outputSize, hiddenSize)
        self.gru = nn.GRU(hiddenSize, hiddenSize, nLayers)
        self.out = nn.Linear(hiddenSize, outputSize)
        
    def forward(self, input, hidden):
        # input: <mt=1,bc=1>
        # hidden: <n_layer*n_direction,bc,h>
        embedded = self.embedding(input).view(1,1,-1) # <mt=1,bc=1,h>
        output, hidden = self.gru(embedded, hidden) 
            # output: <mt,bc,h>
            # hidden: <n_layer*n_direction,bc,h>
        output = F.log_softmax(self.out(output.squeeze(0)), dim=-1)
            # squeeze: get rid of mt=1 for computing loss.
            # out-Linear: <mt,bc,h> * <h,vocab> -> <mt,bc,vocab>
            # softmax: get log softmax along vocab dim
        return output, hidden

In [155]:
# Demo: how things working inside the encoder and decoder.

sent = pairs[0][0]
sentLen = lengths[0][0]
print("Encoder IN -- (inputs, inputsLen) \n")
print(sent)
print(sentLen), '\n'
e = EncoderRNN(INDEXER.nWords, HIDDEN_SIZE)
eh = e.init_hidden()
eo,eh = e(sent,sentLen,eh)
print("Encoder OUT -- (output, hidden) \n")
print(eo)
print(eh, '\n')
print("=======================\n")
loss = 0
crit = nn.NLLLoss()
di = Variable(torch.LongTensor([[INDEXER.get_index('SOS')]]))
dh = eh
print("Decoder IN -- (input, hidden) \n")
print(di)
print(dh)
d = DecoderRNN(HIDDEN_SIZE, INDEXER.nWords)
target = pairs[0][1]
targetLen = lengths[0][1]
for i in range(targetLen):
#     print('HAHAHA', di.shape, dh.shape)
    do,dh = d(di,dh)
    loss += crit(do,target[i]) 
        # do: <bc,h>
        # target[i]: <bc,>
    tv,ti = do.data.topk(1) # ti: 1x1
    di = Variable(ti) # next input is the index predicted at this step.
    if i==0:
        print("Decoder (1 step) OUT -- (output, hidden) \n")
        print(do)
        print(dh); print
        print("Decoder (1 step) LOSS = %f\n" % loss)

Encoder IN -- (inputs, inputsLen) 

Variable containing:
 0
 1
 2
 3
 4
 4
[torch.LongTensor of size 6x1]

6 

Encoder OUT -- (output, hidden) 

Variable containing:
(0 ,.,.) = 
  0.0643 -0.3000 -0.0849  0.2040  0.4047

(1 ,.,.) = 
  0.3694  0.2474 -0.0463  0.4804  0.3955

(2 ,.,.) = 
  0.1277 -0.3935  0.1632  0.5834  0.2498

(3 ,.,.) = 
  0.3257 -0.5005  0.1710 -0.1195  0.0906

(4 ,.,.) = 
  0.0885  0.2160  0.6881  0.1680  0.3916

(5 ,.,.) = 
 -0.1838  0.3584  0.8152  0.4037  0.4428
[torch.FloatTensor of size 6x1x5]

(Variable containing:
(0 ,.,.) = 
 -0.1838  0.3584  0.8152  0.4037  0.4428
[torch.FloatTensor of size 1x1x5]
, '\n')

Decoder IN -- (input, hidden) 

Variable containing:
 0
[torch.LongTensor of size 1x1]

Variable containing:
(0 ,.,.) = 
 -0.1838  0.3584  0.8152  0.4037  0.4428
[torch.FloatTensor of size 1x1x5]

Decoder (1 step) OUT -- (output, hidden) 

Variable containing:
-2.2933 -2.4409 -1.4867 -1.8607 -2.8049 -2.2017 -2.3026 -2.3134 -2.8091
[torch.FloatTensor of siz

### Trainer

In [170]:
def train_step(inputs, inputsLen, targets, targetsLen,
               encoder, decoder, 
               encoderOptim, decoderOptim, criterion,
               enforcingRatio, clip):
    """One training step (on a single pair of sentences)."""
    # Clear previous grads
    # WHY: Since the backward() function accumulates gradients, 
    #      and you don’t want to mix up gradients between minibatches, 
    #      you have to zero them out at the start of a new minibatch. 
    #      This is exactly like how a general (additive) accumulator 
    #      variable is initialized to 0 in code.
    encoderOptim.zero_grad()
    decoderOptim.zero_grad()
    # Set up loss
    loss = 0
    # Run encoder
    encoderHidden = encoder.init_hidden()
    encoderOutput, encoderHidden = encoder(inputs, inputsLen, encoderHidden)
    # Run decoder
    decoderInput = Variable(torch.LongTensor([[INDEXER.get_index('SOS')]]))
    decoderHidden = encoderHidden
    enforce = random.random() < enforcingRatio
    for di in range(targetsLen):
        decoderOutput, decoderHidden = decoder(decoderInput, decoderHidden)
        loss += criterion(decoderOutput, targets[di])
        if enforce: # i.e. feed gold target tokens in training.
            decoderInput = targets[di] # decoderInput can be 1 or 1x1 
        else:
            topValue,topIndex = decoderOutput.data.topk(1)
            decoderInput = Variable(topIndex)
    # Backprop
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoderOptim.step()
    decoderOptim.step()
    return loss.data[0] / targetsLen

def train(pairs, lengths,
          nEpochs=1, epochSize=1000, lr=1e-4,
          enforcingRatio=0.5, clip=5.0,
          printEvery=100):
    """Train multiple steps."""
    dataSize = len(pairs)
    encoder = EncoderRNN(INDEXER.nWords, HIDDEN_SIZE)
    decoder = DecoderRNN(HIDDEN_SIZE, INDEXER.nWords)
    encoderOptim = optim.Adam(encoder.parameters(),lr)
    decoderOptim = optim.Adam(decoder.parameters(),lr)
    criterion = nn.CrossEntropyLoss()
    averageLoss = 0
    for e in range(nEpochs):
        start = time.time()
        epochLoss = 0
        for step in range(epochSize):
            i = random.choice(range(0,dataSize))
            inputs, targets = pairs[i]
            inputsLen, targetsLen = lengths[i]
            loss = train_step(inputs, inputsLen, targets, targetsLen,
                              encoder, decoder,
                              encoderOptim, decoderOptim, criterion,
                              enforcingRatio, clip)
            if step!=0 and step%printEvery==0:
                print("Step %d average loss = %.4f" % (step, loss))
            epochLoss += loss
        epochLoss /= epochSize
        averageLoss += epochLoss
        print("\nEpoch %d loss = %.4f (time: %.2f)\n" % (e+1,epochLoss,
                                                         time.time()-start))
    averageLoss /= nEpochs
    print("\nGrand average loss = %.4f\n" % averageLoss)
    return encoder, decoder

In [171]:
encoder, decoder = train(pairs, lengths, 
                         nEpochs=20, epochSize=len(pairs),
                         printEvery=500)

Step 500 average loss = 2.0463
Step 1000 average loss = 2.0567
Step 1500 average loss = 1.8398
Step 2000 average loss = 1.8864
Step 2500 average loss = 1.8702

Epoch 1 loss = 2.0020 (time: 27.18)

Step 500 average loss = 1.7942
Step 1000 average loss = 1.8996
Step 1500 average loss = 2.0343
Step 2000 average loss = 1.7231
Step 2500 average loss = 2.0002

Epoch 2 loss = 1.8077 (time: 26.90)

Step 500 average loss = 1.5698
Step 1000 average loss = 1.6615
Step 1500 average loss = 1.6546
Step 2000 average loss = 1.5510
Step 2500 average loss = 1.6964

Epoch 3 loss = 1.6946 (time: 27.44)

Step 500 average loss = 1.8457
Step 1000 average loss = 1.4318
Step 1500 average loss = 1.4433
Step 2000 average loss = 2.1736
Step 2500 average loss = 1.7652

Epoch 4 loss = 1.5972 (time: 27.38)

Step 500 average loss = 1.6271
Step 1000 average loss = 1.7220
Step 1500 average loss = 0.7872
Step 2000 average loss = 1.2104
Step 2500 average loss = 1.4867

Epoch 5 loss = 1.5428 (time: 27.27)

Step 500 averag

### Evaluation

In [179]:
def evaluate(sent, sentLen, target, targetLen,
             encoder, decoder, 
             maxLength):
    encoderHidden = encoder.init_hidden()
    encoderOutput, encoderHidden = encoder(sent, sentLen, encoderHidden)
    decoderInput = Variable(torch.LongTensor([[INDEXER.get_index('SOS')]]))
    decoderHidden = encoderHidden
    prediction = []
    lengthGen = 0
    while True:
        lengthGen += 1
        decoderOutput, decoderHidden = decoder(decoderInput, decoderHidden)
        topValue,topIndex = decoderOutput.data.topk(1)
        decoderInput = Variable(topIndex)
        prediction += list(topIndex.squeeze().numpy())
        if prediction[0] == INDEXER.get_index('EOS') or lengthGen>=maxLength:
            break
    sent = list(sent.data.squeeze().numpy())
    target = list(target.data.squeeze().numpy())
    print("INPUT >> %s" % ' '.join(INDEXER.get_sentence_word(sent)))
    print("PRED >> %s" % ' '.join(INDEXER.get_sentence_word(prediction[:targetLen])))
    print("TRUE >> %s" % ' '.join(INDEXER.get_sentence_word(target)))
    
def random_evaluate(pairs, lengths,
                    encoder, decoder,
                    maxLength=15):
    i = random.choice(range(0,len(pairs)))
    sent, target = pairs[i]
    sentLen, targetLen = lengths[i]
    evaluate(sent, sentLen, target, targetLen, encoder, decoder, maxLength)

In [180]:
random_evaluate(pairs, lengths, encoder, decoder)

INPUT >> G D A C
PRED >> H E B D D
TRUE >> H E B D D
