(Notebook modified from https://course.ccs.neu.edu/ds4440f20/)

# LSTMs and sequence2 sequence models

**Instructions:** Answer the questions below in the notebook itself. Submit on canvas your notebook and a pdf printout of your notebook.

## Let's work through our exercise: learning to add (with strings)

This idea borrowed from the official Keras docs -- I have borrowed some of their data generation code but written PyTorch version. See original version (keras) here: https://github.com/keras-team/keras/blob/master/examples/addition_rnn.py

Additional useful links (torch): https://github.com/bentrevett/pytorch-seq2seq


In [1]:
import numpy as np

import torch

In [2]:
class CharacterTable(object):
    """Given a set of characters:
    + Encode them to a one-hot integer representation
    + Decode the one-hot or integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars, one_hot=False):
        """Initialize character table.
        # Arguments
            chars: Characters that can appear in the input.
        """
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
        self.one_hot = one_hot

    def encode(self, C, num_rows):
        """One-hot encode given string C.
        # Arguments
            C: string, to be encoded.
            num_rows: Number of rows in the returned one-hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        if self.one_hot:
          x = np.zeros((num_rows, len(self.chars)))
          for i, c in enumerate(C):
              x[i, self.char_indices[c]] = 1
        else:
          x = np.zeros(num_rows)
          for i, c in enumerate(C):
            x[i] = self.char_indices[c]
          
        return x

    def decode(self, x, calc_argmax=True):
        """Decode the given vector or 2D array to their character output.
        # Arguments
            x: A vector or a 2D array of probabilities or one-hot representations;
                or a vector of character indices (used with `calc_argmax=False`).
            calc_argmax: Whether to find the character index with maximum
                probability, defaults to `True`.
        """
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in x)

Test the encoding and decoding:

In [3]:
import random
vocabulary = ['a', 'b', 'c', 'd', 'e']

ct = CharacterTable(vocabulary, True)

seq = ''.join([random.choice(vocabulary) for i in range(5)])
seq_enc = ct.encode(seq, 10)
print(seq)
print(seq_enc)
print(ct.decode(seq_enc))
assert seq == ct.decode(seq_enc)[:len(seq)]

print("=" * 80)
ct = CharacterTable(vocabulary, False)
seq_enc = ct.encode(seq, 10)
print(seq)
print(seq_enc)


bddba
[[0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
bddbaaaaaa
bddba
[1. 3. 3. 1. 0. 0. 0. 0. 0. 0.]


Next generate data

In [4]:
# Parameters for the model and dataset.
TRAINING_SIZE = 50000
DIGITS = 3

# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
MAXLEN = DIGITS + 1 + DIGITS

# All the numbers, plus sign and space for padding.
chars = '0123456789+ '
ctable = CharacterTable(chars)

questions, expected = [], []
seen = set()

while len(questions) < TRAINING_SIZE:
    f = lambda: int(''.join(np.random.choice(list('0123456789'))
                    for i in range(np.random.randint(1, DIGITS + 1))))
    a, b = f(), f()
    # Skip any addition questions we've already seen
    # Also skip any such that x+Y == Y+x (hence the sorting).
    key = tuple(sorted((a, b)))
    if key in seen:
        continue
    seen.add(key)
    
    # Pad the data with spaces such that it is always MAXLEN.
    q = '{}+{}'.format(a, b)
    query = q + ' ' * (MAXLEN - len(q))
    questions.append(query)
    ans = str(a + b)
    # Answers can be of maximum size DIGITS + 1.
    ans += ' ' * (DIGITS + 1 - len(ans))
    expected.append(ans)
print('Total addition questions:', len(questions))

Total addition questions: 50000


In [5]:
# e.g.
print(list(zip(questions[:5], expected[:5])))

[('85+4   ', '89  '), ('153+1  ', '154 '), ('13+37  ', '50  '), ('166+7  ', '173 '), ('9+49   ', '58  ')]


Now vectorize, split into train and val data

In [6]:
print('Vectorization...')
# note that the pytorch (nn) Embedding layer wants indices as inputs (not one-hot)
x = np.zeros((len(questions), MAXLEN), dtype=int) # len(chars)), dtype=int)
y = np.zeros((len(questions), DIGITS + 1), dtype=int) #, len(chars)), dtype=int)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, MAXLEN)
    
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, DIGITS + 1)

x[0].shape # MAXLEN x num chars

Vectorization...


(7,)

In [7]:
x[0,:]

array([10,  7,  1,  6,  0,  0,  0])

In [8]:
y[0].shape # (DIGITS + 1) x num chars

(4,)

In [9]:
y[0]

array([10, 11,  0,  0])

Shuffle, split into train/val

In [10]:
# Shuffle (x, y) in unison
indices = np.arange(len(y))
np.random.shuffle(indices)
x = torch.from_numpy(x[indices])
y = torch.from_numpy(y[indices])

# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]

print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print()
print('Validation Data:')
print(x_val.shape)
print(y_val.shape)

Training Data:
torch.Size([45000, 7])
torch.Size([45000, 4])

Validation Data:
torch.Size([5000, 7])
torch.Size([5000, 4])


Now let's define our encoder/decoder model!

In [11]:
x[0,:]

tensor([10,  4,  8,  1,  9,  0,  0])

In [12]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
  
  def __init__(self, input_dim, emb_dim, hidden_dim):
    super().__init__()
    
    self.input_dim = input_dim
    self.embed_dim = emb_dim
    self.hidden_dim = hidden_dim
    
    self.embed_layer = nn.Embedding(self.input_dim, self.embed_dim)
    # batch_first --> (batch, seq, feature)
    self.rnn = nn.LSTM(self.embed_dim, self.hidden_dim, batch_first=True)
    
  def forward(self, x):
    # x_e = (batch x length x dims)
    x_e = self.embed_layer(x)
    outputs, h = self.rnn(x_e)
    return h
        

In [13]:
class Decoder(nn.Module):
  
  def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
    super().__init__()
    
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.embed_dim = embed_dim
    self.output_dim = output_dim
    
    self.embed_layer = nn.Embedding(self.input_dim, self.embed_dim)
    # batch_first --> (batch, seq, feature)
    self.rnn = nn.LSTM(self.embed_dim, self.hidden_dim, batch_first=True)
    self.out = nn.Linear(self.hidden_dim, self.output_dim)
    self.sm = nn.Softmax(dim=-1)
    
  
  def forward(self, x, hidden):
    x_e = self.embed_layer(x)
    output, (h, c) = self.rnn(x_e, hidden)
    out = self.out(h)
    
    y_hat = self.sm(out)
  
    return y_hat, (h, c)

**Q1. Answer the questions below:**

Read about the Encode-Decoder Architecture here: https://d2l.ai/chapter_recurrent-modern/encoder-decoder.html

- In this particular problem, what are the elements of the input sequence (list all possible values)?
- In this particular problem, what are the elements of the output sequence (list all possible values)?
- In this particular problem, which sequence does the Encoder encode?
- Which state of the LSTM does the Encoder use as the encoding of the input sequence (recall that the LSTM calculates a state after each element of the input sequence)?

In [None]:
'''
The elements of the input sequence include the following four strings: “They”,
“are”, “watching”, “.”.

The elements of the output sequence include the following three strings:
“Ils”, “regardent”, “.”.

The encoder encodes the following sequence: “They”, “are”, “watching”, “.”.

The state of the LSTM that the Encoder uses as the encoding of the input
sequence is the final state that is encoded.
'''

In [15]:
import random 

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
       
    def forward(self, x, y, teacher_forcing_ratio=0.5):
        
        # src = [src sent len, batch size]
        # trg = [trg sent len, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = y.shape[0]
        max_len = y.shape[1]
        vocab_size = self.decoder.output_dim
        
        # tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, vocab_size).to("cuda")
        
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden = self.encoder(x)
        
        # first input to the decoder is the <sos> tokens
        y_hat = torch.zeros(batch_size, dtype=torch.long).unsqueeze(1)
        for t in range(0, max_len):
 
            output, hidden = self.decoder(y_hat, hidden)          
            outputs[t] = output
    
            teacher_force = random.random() < teacher_forcing_ratio
            # the 2 arg is the dimension to reduce; skips batch and
            # and pulls max index from each softmax (for each instance)
            max_preds = output.max(2)[1] # will be 1 x batch
            # tranpose to batch x 1
            y_hat = max_preds.transpose(0,1)
            if teacher_force:
              # then replace predictions with the reference
              y_hat = y[:,t].unsqueeze(1)

        
        # need to flip dims around to be 
        # (batch x length x vocab)
        return outputs.permute(1, 0, 2)
  

In [16]:
y[0]

tensor([10,  5,  5,  0])

In [17]:
embed_dim = 32
hidden_dim = 32
vocab_size = len(chars)

encoder = Encoder(vocab_size, embed_dim, hidden_dim)
decoder = Decoder(vocab_size, embed_dim, hidden_dim, vocab_size)

s2s = Seq2Seq(encoder, decoder)

output = s2s(x[:8], y[:8])

output.shape

torch.Size([8, 4, 12])

In [18]:
y_target = y[:8]
y_target.shape

torch.Size([8, 4])

In [19]:
y_target[0,:]

tensor([10,  5,  5,  0])

In [20]:
output[0,:]

tensor([[0.0936, 0.0804, 0.0755, 0.0736, 0.0753, 0.0879, 0.0879, 0.0817, 0.0867,
         0.0653, 0.0897, 0.1023],
        [0.0886, 0.0768, 0.0955, 0.0800, 0.0747, 0.0745, 0.1063, 0.0833, 0.0847,
         0.0631, 0.0877, 0.0848],
        [0.1062, 0.0749, 0.0766, 0.0840, 0.0671, 0.1012, 0.0948, 0.0737, 0.0769,
         0.0636, 0.0857, 0.0953],
        [0.1126, 0.0742, 0.0650, 0.0845, 0.0648, 0.1162, 0.0907, 0.0688, 0.0726,
         0.0635, 0.0876, 0.0994]], device='cuda:0', grad_fn=<SliceBackward>)

In [21]:
y_target.flatten().shape

torch.Size([32])

In [22]:
C = len(chars)

In [23]:
# need to create (N x C) so collapse first two dims
output = output.contiguous().view(-1, C)
# this is just going to be (B x 1) -- B being batch size
y_target = y_target.flatten().cuda()

In [24]:
output.shape

torch.Size([32, 12])

In [25]:
from torch import optim
optimizer = optim.Adam(s2s.parameters())
criterion = nn.NLLLoss()

In [26]:
loss = criterion(output, y_target)
print(loss.item())

-0.08900000154972076


Now, train!

In [27]:
def train(model, x, y, optimizer, criterion, batch_size=16, epochs=10):
    model.train()
    epoch_loss = 0
    for i in range(epochs):
      current_idx = 0 
   
      while (current_idx + batch_size) < x.shape[0]:
        optimizer.zero_grad()
        batch_x, batch_y = x[current_idx:current_idx+batch_size], y[current_idx:current_idx+batch_size]
        
        output = model(batch_x, batch_y)
        
        # flatten
        #output_flat = output.contiguous().view(-1, vocab_size)
        output_flat = output.contiguous().view(-1, vocab_size)
        y_flat = batch_y.view(-1).cuda()
       
        loss = criterion(output_flat, y_flat)
       
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

        current_idx += batch_size
        
      print(f"epoch {i} loss: {epoch_loss:.2f}")
    return model
        

In [28]:
mm = train(s2s, x_train, y_train, optimizer, criterion, epochs=10)

epoch 0 loss: -920.94
epoch 1 loss: -1870.94
epoch 2 loss: -2825.35
epoch 3 loss: -3780.35
epoch 4 loss: -4736.25
epoch 5 loss: -5692.43
epoch 6 loss: -6684.26
epoch 7 loss: -7764.14
epoch 8 loss: -8870.58
epoch 9 loss: -9985.53


Make predictions for first 10 instances in validation set. We are passing in y_val (which we would not actually have in practice!) just for convienence -- note that we set the teacher forcing ratio to 0, and hence this is not used during decoding.



In [29]:
predictions = s2s(x_train[:10], y_train[:10])
predictions = predictions.cpu().detach().numpy()
print(predictions.shape)
ctable.decode(predictions[3,:,:])

(10, 4, 12)


'900 '

In [30]:
y_train0 = y_train[0].cpu().detach().numpy()
ctable.decode(y_train0, calc_argmax=False)

'833 '

In [31]:
predictions = s2s(x_val[:10], y_val[:10], teacher_forcing_ratio=0)

In [32]:
predictions.shape

torch.Size([10, 4, 12])

In [33]:
predictions = predictions.cpu().detach().numpy()


In [34]:
predictions.shape

(10, 4, 12)

In [35]:
x_val0 = x_val[1].cpu().detach().numpy()
x_val0.shape
ctable.decode(x_val0, calc_argmax=False)

'6+759  '

In [36]:
ctable.decode(predictions[0,:,:])

'1111'

In [37]:
predictions[1,0,:]

array([1.0649618e-07, 1.7614746e-10, 7.3748452e-05, 6.4984903e-05,
       3.2821061e-09, 5.8934138e-09, 2.2657009e-08, 3.5061223e-08,
       4.4543650e-03, 9.9523348e-01, 1.0121244e-04, 7.2022958e-05],
      dtype=float32)

In [38]:
y_val0 = y_val[1].cpu().detach().numpy()
ctable.decode(y_val0, calc_argmax=False)

'765 '