In [None]:
# Based on https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

# Install dependencies
!pip install torchdata
!pip install torchinfo
!spacy download en_core_web_sm
!spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.0 MB/s 
Collecting urllib3>=1.25
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 72.6 MB/s 
[?25hCollecting portalocker>=2.0.0
  Downloading portalocker-2.5.1-py2.py3-none-any.whl (15 kB)
Collecting urllib3>=1.25
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 51.2 MB/s 
Installing collected packages: urllib3, portalocker, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed portalocker-2.5.1 torchdata-0.4.1 urllib3-1.25.11
Looking in indexes: https://pypi.org/si

In [None]:
# Mount drive to save/load model

from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Preparing Data

In [None]:
# Importing 2 tokenizers (English and German) from spacy 

import torchtext

# German tokenizer backed by the spaCy model `de_core_news_sm` (downloaded in the install cell)
tokenizer_de = torchtext.data.utils.get_tokenizer('spacy', language='de_core_news_sm')
# English tokenizer backed by the spaCy model `en_core_web_sm`
tokenizer_en = torchtext.data.utils.get_tokenizer('spacy', language='en_core_web_sm')

In [None]:
# Load dataset. 
# English, German pairs

from torchtext.datasets import Multi30k

# Each split is an iterable of (German sentence, English sentence) string pairs
# (see the example printed in the next cell).
train_data = Multi30k(split='train')
valid_data = Multi30k(split='valid')
test_data = Multi30k(split='test')

In [None]:
# Check one example of the dataset

for i in train_data:
  print(i)
  break

('Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.', 'Two young, White males are outside near many bushes.')


In [None]:
# Build vocab and add <sos> and <eos> special tokens

def de_tokens(data_iter):
  """Yield one list of lower-cased German tokens per (German, English) pair."""
  for german_sentence, _english_sentence in data_iter:
    lowered = german_sentence.lower()
    yield tokenizer_de(lowered)

# German vocabulary: tokens seen at least twice in the training split,
# with the two special tokens placed first.
vocab_de = torchtext.vocab.build_vocab_from_iterator(
  de_tokens(train_data),
  min_freq=2,
  specials=['<sos>', '<eos>'],
)

In [None]:
# Check the vocabulary

print(len(vocab_de))
print(vocab_de.get_itos()[:10])

7851
['<sos>', '<eos>', '.', 'ein', 'einem', 'in', 'eine', ',', 'und', 'mit']


In [None]:
# Build vocab and add <sos> and <eos> special tokens

def en_tokens(data_iter):
  """Yield one list of lower-cased English tokens per (German, English) pair."""
  for _german_sentence, english_sentence in data_iter:
    lowered = english_sentence.lower()
    yield tokenizer_en(lowered)

# English vocabulary: tokens seen at least twice in the training split,
# with the two special tokens placed first.
vocab_en = torchtext.vocab.build_vocab_from_iterator(
  en_tokens(train_data),
  min_freq=2,
  specials=['<sos>', '<eos>'],
)



In [None]:
# Check the vocabulary

print(len(vocab_en))
print(vocab_en.get_itos()[:10])

5891
['<sos>', '<eos>', 'a', '.', 'in', 'the', 'on', 'man', 'is', 'and']


In [None]:
# Import torch and get the device

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dataset / Dataloader

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class TranslationDataset(Dataset):
  """Map-style dataset of (English, German) sentences encoded as vocabulary indices.

  Every sentence is lower-cased, tokenized, wrapped in '<sos>' / '<eos>' and
  converted to indices. Tokens that are not in the vocabulary are dropped.

  Args:
    dataset: iterable of (German text, English text) string pairs.
    vocab_de: German vocabulary; must support `token in vocab` and `vocab[token]`.
    vocab_en: English vocabulary with the same interface.
    de_tokenizer: callable mapping a string to a list of German tokens.
      Defaults to the notebook-level `tokenizer_de`.
    en_tokenizer: callable mapping a string to a list of English tokens.
      Defaults to the notebook-level `tokenizer_en`.
  """
  def __init__(self, dataset, vocab_de, vocab_en, de_tokenizer=None, en_tokenizer=None):
    # Fall back to the tokenizers defined earlier in the notebook
    if de_tokenizer is None:
      de_tokenizer = tokenizer_de
    if en_tokenizer is None:
      en_tokenizer = tokenizer_en

    self.dataset = dataset
    self.data_en = []
    self.data_de = []

    for de_text, en_text in dataset:
      # Skip blank pairs explicitly. The previous implementation reported
      # `len - 1`, presumably to hide a trailing empty pair in the raw data
      # (TODO confirm); that also hid the last sample of any clean dataset.
      if not de_text.strip() or not en_text.strip():
        continue

      self.data_en.append(self._encode(en_text, en_tokenizer, vocab_en))
      self.data_de.append(self._encode(de_text, de_tokenizer, vocab_de))

  @staticmethod
  def _encode(text, tokenizer, vocab):
    """Return `text` as a list of single-element index lists, one per known token."""
    tokens = ['<sos>'] + tokenizer(text.lower()) + ['<eos>']
    # One index per row, so the resulting tensor has shape (n_tokens, 1)
    return [[vocab[token]] for token in tokens if token in vocab]

  def __len__(self):
    """Number of sentence pairs that were kept."""
    return len(self.data_en)

  def __getitem__(self, idx):
    """Return the (English, German) LongTensors of pair `idx`, each of shape (n_tokens, 1)."""
    return torch.LongTensor(self.data_en[idx]), torch.LongTensor(self.data_de[idx])
           


In [None]:
# Check one example of the dataset

train_dataset = TranslationDataset(train_data, vocab_de, vocab_en)
train_dataset[0]

(tensor([[   0],
         [  14],
         [  22],
         [  13],
         [  23],
         [ 776],
         [  15],
         [  55],
         [  78],
         [ 200],
         [1310],
         [   3],
         [   1]]), tensor([[   0],
         [  16],
         [  24],
         [ 251],
         [  28],
         [  82],
         [  18],
         [  86],
         [   5],
         [  13],
         [ 108],
         [7645],
         [3169],
         [   2],
         [   1]]))

In [None]:
# Create dataloaders
# Using batch size 1 for simplicity

# NOTE: sentences have different lengths and no collate_fn is supplied, so the
# default collation can only handle a batch size of 1.
BATCH_SIZE = 1
# Shuffle the training pairs every epoch so updates do not always see the same order
dataloader_train = DataLoader(TranslationDataset(train_data, vocab_de, vocab_en), batch_size=BATCH_SIZE, shuffle=True)
# Validation order does not affect the averaged loss, so it stays sequential
dataloader_valid = DataLoader(TranslationDataset(valid_data, vocab_de, vocab_en), batch_size=BATCH_SIZE)

In [None]:
# Check dataloader length

len(dataloader_train)

29000

# Network

In [None]:
from torch import nn

class Encoder(nn.Module):
  """LSTM encoder that reduces a source token sequence to its final (hidden, cell) state.

  Args:
    input_dim: size of the source vocabulary (number of embedding rows).
    emb_dim: embedding size.
    hid_dim: LSTM hidden size.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability applied to the embeddings and, when
      n_layers > 1, between the LSTM layers.
  """
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(input_dim, emb_dim)
    # nn.LSTM applies dropout only between stacked layers; with a single layer
    # a non-zero value does nothing and only triggers a UserWarning.
    rnn_dropout = dropout if n_layers > 1 else 0.0
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=rnn_dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    """Encode `src` and return the final LSTM state.

    Args:
      src: LongTensor of token indices; the notebook passes shape (src_len, 1).

    Returns:
      (hidden, cell): the last hidden and cell states of the LSTM; the
      per-step outputs are discarded.
    """
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)

    return hidden, cell

In [None]:
from torchinfo import summary

# English is the source language here, so the encoder vocabulary is vocab_en
INPUT_DIM = len(vocab_en)
ENC_EMB_DIM = 128
HID_DIM = 64
N_LAYERS = 1
ENC_DROPOUT = 0.3

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)

# Print a layer/parameter summary using a dummy integer input of size (1,)
summary(enc, input_size=(1,), dtypes=[torch.long])


  "num_layers={}".format(dropout, num_layers))


Layer (type:depth-idx)                   Output Shape              Param #
Encoder                                  [1, 64]                   --
├─Embedding: 1-1                         [1, 128]                  754,048
├─Dropout: 1-2                           [1, 128]                  --
├─LSTM: 1-3                              [1, 64]                   49,664
Total params: 803,712
Trainable params: 803,712
Non-trainable params: 0
Total mult-adds (M): 3.93
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 3.21
Estimated Total Size (MB): 3.22

In [None]:
class Decoder(nn.Module):
  """Single-step LSTM decoder that predicts the next target token.

  Args:
    output_dim: size of the target vocabulary (embedding rows and logits).
    emb_dim: embedding size.
    hid_dim: LSTM hidden size.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability applied to the embeddings and, when
      n_layers > 1, between the LSTM layers.
  """
  def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.output_dim = output_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(output_dim, emb_dim)
    # nn.LSTM applies dropout only between stacked layers; with a single layer
    # a non-zero value does nothing and only triggers a UserWarning.
    rnn_dropout = dropout if n_layers > 1 else 0.0
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=rnn_dropout)

    # project the LSTM output onto one logit per target-vocabulary entry
    self.fc_out = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, hidden, cell):
    """Run one decoding step.

    Args:
      input: 1-D LongTensor with the current token index of each batch element.
      hidden: previous LSTM hidden state.
      cell: previous LSTM cell state.

    Returns:
      (prediction, hidden, cell): logits over the target vocabulary and the
      updated LSTM state.
    """
    input = input.unsqueeze(0)  # add a sequence axis of length 1
    embedded = self.dropout(self.embedding(input))
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    prediction = self.fc_out(output.squeeze(0))  # drop the length-1 sequence axis

    return prediction, hidden, cell

In [None]:
# German is the target language here, so the decoder vocabulary is vocab_de
OUTPUT_DIM = len(vocab_de)
DEC_EMB_DIM = 128
# HID_DIM and N_LAYERS are declared again here; they must equal the encoder's
# values because Seq2Seq asserts that both sides match.
HID_DIM = 64
N_LAYERS = 1
DEC_DROPOUT = 0.3

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)

# Print the per-layer parameter counts (no input is given, so no output shapes)
summary(dec)

  "num_layers={}".format(dropout, num_layers))


Layer (type:depth-idx)                   Param #
Decoder                                  --
├─Embedding: 1-1                         1,004,928
├─LSTM: 1-2                              49,664
├─Linear: 1-3                            510,315
├─Dropout: 1-4                           --
Total params: 1,564,907
Trainable params: 1,564,907
Non-trainable params: 0

In [None]:
import random

class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes a target sequence token by token.

  Args:
    encoder: module exposing `hid_dim`, `n_layers` and returning (hidden, cell).
    decoder: module exposing `hid_dim`, `n_layers`, `output_dim` and mapping
      (input, hidden, cell) to (prediction, hidden, cell).
    device: device on which the output buffer is allocated.
  """
  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

    # the encoder state is handed to the decoder as-is, so the shapes must be equal
    assert encoder.hid_dim == decoder.hid_dim, \
      "Hidden dimensions of encoder and decoder must be equal!"
    assert encoder.n_layers == decoder.n_layers, \
      "Encoder and decoder must have equal number of layers!"

  def forward(self, src, trg, teacher_forcing_ratio = 0.5):
    """Return the decoder logits for every target position.

    Args:
      src: source token indices, passed to the encoder unchanged.
      trg: target token indices of shape (trg_len, batch_size); row 0 is
        expected to hold the '<sos>' tokens.
      teacher_forcing_ratio: probability of feeding the ground-truth token
        (instead of the model's own prediction) as the next decoder input.

    Returns:
      Tensor of shape (trg_len, batch_size, decoder.output_dim). Row 0 is
      never written and stays zero.
    """
    # read both sizes from the target instead of hard-coding a batch size of 1
    trg_len, batch_size = trg.shape[0], trg.shape[1]
    trg_vocab_size = self.decoder.output_dim

    # buffer for the decoder outputs, created directly on the target device
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

    # the final encoder state is the initial decoder state
    hidden, cell = self.encoder(src)

    # the first decoder input is the row of <sos> tokens
    input = trg[0,:]

    for t in range(1, trg_len):
      # one decoding step: logits for position t plus the updated LSTM state
      output, hidden, cell = self.decoder(input, hidden, cell)
      outputs[t] = output

      # most likely token according to the model
      top1 = output.argmax(1)

      # with teacher forcing feed the real token next, otherwise the prediction
      teacher_force = random.random() < teacher_forcing_ratio
      input = trg[t] if teacher_force else top1

    return outputs


# Training

In [None]:
model = Seq2Seq(enc, dec, device).to(device)

  "num_layers={}".format(dropout, num_layers))


In [None]:
def init_weights(m):
  """Re-draw every parameter of `m` (sub-modules included) uniformly from [-0.08, 0.08]."""
  for _, weight in m.named_parameters():
    nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5891, 128)
    (rnn): LSTM(128, 64, dropout=0.3)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7851, 128)
    (rnn): LSTM(128, 64, dropout=0.3)
    (fc_out): Linear(in_features=64, out_features=7851, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of scalar values in the trainable parameters of `model`."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 2,368,619 trainable parameters


In [None]:
from torch import optim

# Adam with its default hyper-parameters over all model parameters
optimizer = optim.Adam(model.parameters())
# Cross-entropy over the raw decoder logits (the decoder applies no softmax)
criterion = nn.CrossEntropyLoss()

In [None]:
def train(model, dataloader, optimizer, criterion, clip=1):
  """Train `model` for one pass over `dataloader` and return the mean batch loss.

  Args:
    model: seq2seq model called as `model(src, trg)`.
    dataloader: yields (src, trg) batches; the leading axis added by the
      loader is indexed away, so a batch size of 1 is assumed.
    optimizer: optimizer stepped once per batch.
    criterion: loss called as `criterion(logits, targets)`.
    clip: currently unused - gradient clipping is disabled (see the
      commented-out call below).

  Returns:
    Sum of the batch losses divided by the number of batches.
  """
  model.train()
  epoch_loss = 0

  for src, trg in dataloader:
    # move to the notebook-level device and drop the loader's batch axis
    src, trg = src.to(device)[0], trg.to(device)[0]

    optimizer.zero_grad()

    output = model(src, trg)

    # Skip position 0 (never predicted) and flatten. The class dimension is
    # read from the tensor instead of the notebook global OUTPUT_DIM, which
    # may be stale if cells were re-run with another vocabulary.
    output = output[1:].reshape(-1, output.shape[-1])
    trg = trg[1:].reshape(-1)

    loss = criterion(output, trg)

    loss.backward()

    # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()

  return epoch_loss / len(dataloader)


In [None]:
def evaluate(model, dataloader, criterion):
  """Return the mean batch loss of `model` over `dataloader` without updating it.

  The model is switched to eval mode, gradients are disabled and teacher
  forcing is turned off (ratio 0), so the decoder consumes its own predictions.

  Args:
    model: seq2seq model called as `model(src, trg, teacher_forcing_ratio)`.
    dataloader: yields (src, trg) batches; the leading axis added by the
      loader is indexed away, so a batch size of 1 is assumed.
    criterion: loss called as `criterion(logits, targets)`.

  Returns:
    Sum of the batch losses divided by the number of batches.
  """
  model.eval()
  epoch_loss = 0

  with torch.no_grad():
    for src, trg in dataloader:
      # move to the notebook-level device and drop the loader's batch axis
      src, trg = src.to(device)[0], trg.to(device)[0]
      output = model(src, trg, 0)

      # Skip position 0 and flatten; the class dimension comes from the tensor
      # rather than the notebook global OUTPUT_DIM, which may be stale.
      output = output[1:].reshape(-1, output.shape[-1])
      trg = trg[1:].reshape(-1)

      loss = criterion(output, trg)
      epoch_loss += loss.item()

  return epoch_loss / len(dataloader)

In [None]:
len(dataloader_train), len(dataloader_valid)

(29000, 1014)

In [None]:
def epoch_time(start_time, end_time):
  """Split the span between two timestamps (in seconds) into whole minutes and seconds."""
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - (whole_minutes * 60))
  return whole_minutes, leftover_seconds

In [None]:
vocab_de.get_itos()[1]

'<eos>'

In [None]:
import time
import math

N_EPOCHS = 10

for epoch in range(N_EPOCHS):
  # the measured time covers the training pass and the validation pass together
  start_time = time.time()
  train_loss = train(model, dataloader_train, optimizer, criterion)
  valid_loss = evaluate(model, dataloader_valid, criterion)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # perplexity (PPL) is reported as exp(mean cross-entropy loss)
  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 6m 16s
	Train Loss: 4.421 | Train PPL:  83.208
	 Val. Loss: 4.468 |  Val. PPL:  87.194
Epoch: 02 | Time: 6m 16s
	Train Loss: 3.773 | Train PPL:  43.498
	 Val. Loss: 4.341 |  Val. PPL:  76.755
Epoch: 03 | Time: 6m 16s
	Train Loss: 3.560 | Train PPL:  35.155
	 Val. Loss: 4.210 |  Val. PPL:  67.327
Epoch: 04 | Time: 6m 11s
	Train Loss: 3.428 | Train PPL:  30.808
	 Val. Loss: 4.151 |  Val. PPL:  63.528
Epoch: 05 | Time: 6m 12s
	Train Loss: 3.324 | Train PPL:  27.766
	 Val. Loss: 4.129 |  Val. PPL:  62.100
Epoch: 06 | Time: 6m 11s
	Train Loss: 3.245 | Train PPL:  25.669
	 Val. Loss: 4.079 |  Val. PPL:  59.098
Epoch: 07 | Time: 6m 11s
	Train Loss: 3.186 | Train PPL:  24.194
	 Val. Loss: 4.001 |  Val. PPL:  54.660
Epoch: 08 | Time: 6m 11s
	Train Loss: 3.137 | Train PPL:  23.033
	 Val. Loss: 3.983 |  Val. PPL:  53.701
Epoch: 09 | Time: 6m 11s
	Train Loss: 3.096 | Train PPL:  22.112
	 Val. Loss: 4.005 |  Val. PPL:  54.870
Epoch: 10 | Time: 6m 10s
	Train Loss: 3.066 | Train PPL

In [None]:
# torch.save does not create missing folders, so make sure the target directory exists
import os
os.makedirs('/gdrive/MyDrive/models', exist_ok=True)

# Saves the whole pickled module (not just a state_dict); loading it again
# requires the Encoder / Decoder / Seq2Seq classes to be defined.
torch.save(model, '/gdrive/MyDrive/models/seq2seq.pt')

In [None]:
dataloader_test = DataLoader(TranslationDataset(test_data, vocab_de, vocab_en), batch_size=BATCH_SIZE)

In [None]:
test_loss = evaluate(model, dataloader_test, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.909 | Test PPL:  49.827 |


# Translate

In [None]:
# NOTE(review): this un-pickles a full module, which can execute arbitrary code -
# only load checkpoints you created yourself.
# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one (and vice versa).
model = torch.load('/gdrive/MyDrive/models/seq2seq.pt', map_location=device)
# the pickled Seq2Seq still remembers the device it was saved with; point it at the current one
model.device = device

In [None]:
def to_en_text(x):
  """Decode a tensor of English vocabulary indices into a space-separated string."""
  # fetch the index -> token list once instead of rebuilding it for every token
  itos = vocab_en.get_itos()
  return ' '.join(itos[index] for index in x.flatten().tolist())

def to_de_text(x):
  """Decode a tensor of German vocabulary indices into a space-separated string."""
  # fetch the index -> token list once instead of rebuilding it for every token
  itos = vocab_de.get_itos()
  return ' '.join(itos[index] for index in x.flatten().tolist())

In [None]:
test_dataset = TranslationDataset(test_data, vocab_de, vocab_en)

In [None]:
xx = test_dataset[12][0]
to_en_text(xx)

'<sos> a woman holding a bowl of food in a kitchen . <eos>'

In [None]:
xx = test_dataset[12][1]
to_de_text(xx)

'<sos> eine frau , die in einer küche eine schale mit essen hält . <eos>'

In [None]:
# Greedy decoding of one test sentence with the model loaded above.
SOS_IDX = vocab_de['<sos>']
EOS_IDX = vocab_de['<eos>']
MAX_LEN = 50  # hard stop in case the model never emits <eos>

# Use the loaded model's own encoder/decoder (not the notebook-level enc/dec)
# and make sure dropout is disabled regardless of which cells ran before.
model.to(device).eval()

with torch.no_grad():
  src = test_dataset[12][0].to(device)
  trg = torch.tensor([SOS_IDX], device=device)

  print('src', src.shape)
  hidden, cell = model.encoder(src)

  print('hidden', hidden.shape)
  print('cell', cell.shape)
  print('trg', trg.shape)

  result = []
  for step in range(MAX_LEN):
    output, hidden, cell = model.decoder(trg, hidden, cell)
    # feed the most likely token back in as the next decoder input
    trg = output.argmax(1)
    result.append(trg.item())

    if result[-1] == EOS_IDX:
      break

to_de_text(torch.tensor(result))



src torch.Size([13, 1])
hidden torch.Size([1, 1, 64])
cell torch.Size([1, 1, 64])
trg torch.Size([1])


'eine frau hält eine in einer küche und essen . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>'

In [None]:
# Reference translation: <sos> eine frau , die in einer küche eine schale mit essen hält . <eos>