<a href="https://colab.research.google.com/github/QuadV/ImplementingPapers/blob/main/SequenceToSequenceInNeuralNetworks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard

In [2]:
! python -m spacy download de

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 795kB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=63093f9ccc4c55c4f25ed9759052857f7b511baf3a047d57a9174e6026c6ddfa
  Stored in directory: /tmp/pip-ephem-wheel-cache-zkikj0al/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [3]:
# Load the German pipeline first, then the English one; both are used below only for tokenization.
spacy_ger, spacy_eng = (spacy.load(lang_code) for lang_code in ('de', 'en'))

In [4]:
def tokenizer_ger(text):
  """Split text into a list of token strings using the German spaCy tokenizer.

  Example: 'Hello my name' -> ['Hello', 'my', 'name']
  """
  doc = spacy_ger.tokenizer(text)
  return [token.text for token in doc]

def tokenizer_eng(text):
  """Split text into a list of token strings using the English spaCy tokenizer."""
  doc = spacy_eng.tokenizer(text)
  return [token.text for token in doc]

In [5]:
# Both languages share the same preprocessing: lowercase everything and wrap
# each sentence in start/end-of-sentence markers.
_field_options = dict(lower=True, init_token='<sos>', eos_token='<eos>')
german = Field(tokenize=tokenizer_ger, **_field_options)
english = Field(tokenize=tokenizer_eng, **_field_options)

In [6]:
# German (.de) is the source language, English (.en) the target.
train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

# Build each vocabulary from the training split only: at most 10000 entries,
# dropping tokens that appear fewer than 2 times.
for _field in (german, english):
  _field.build_vocab(train_data, max_size=10000, min_freq=2)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 564kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 169kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 164kB/s]


In [7]:
class Encoder(nn.Module):
  """Multi-layer LSTM encoder that reduces a source sequence to its final recurrent state."""

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
    """
    Args:
      input_size: number of rows in the embedding table (source vocabulary size).
      embedding_size: dimensionality of each token embedding.
      hidden_size: number of features in the LSTM hidden/cell state.
      num_layers: number of stacked LSTM layers.
      dropout: dropout probability, applied to the embeddings and between LSTM layers.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # Submodules are registered in the order dropout -> embedding -> rnn.
    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
    )

  def forward(self, x):
    """Encode a batch of token indices.

    Args:
      x: LongTensor of shape (seq_length, N), i.e. N sequences laid out time-major.

    Returns:
      (hidden, cell): the final LSTM states; the per-step outputs are discarded.
    """
    embedded = self.embedding(x)
    embedded = self.dropout(embedded)
    # embedded: (seq_length, N, embedding_size)
    _, (hidden, cell) = self.rnn(embedded)
    return hidden, cell


In [8]:
class Decoder(nn.Module):
  """Multi-layer LSTM decoder that predicts one target token per call."""

  def __init__(self, input_size, embedding_size, hidden_size, output_size,
               num_layers, dropout):
    """
    Args:
      input_size: number of rows in the embedding table (target vocabulary size).
      embedding_size: dimensionality of each token embedding.
      hidden_size: number of features in the LSTM hidden/cell state.
      output_size: number of scores produced per step; normally equal to
        input_size because the decoder scores every word of its own vocabulary.
      num_layers: number of stacked LSTM layers.
      dropout: dropout probability, applied to the embeddings and between LSTM layers.
    """
    super().__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size

    # Submodules are registered in the order dropout -> embedding -> rnn -> fc.
    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
    )
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden, cell):
    """Run a single decoding step.

    Args:
      x: LongTensor of shape (N,) holding the previous token of each sequence.
      hidden, cell: LSTM state carried over from the encoder or the previous step.

    Returns:
      (predictions, hidden, cell) where predictions has shape (N, output_size).
    """
    # The LSTM is time-major, so present the batch as a sequence of length 1: (1, N).
    step_input = x.unsqueeze(0)

    embedded = self.embedding(step_input)
    embedded = self.dropout(embedded)
    # embedded: (1, N, embedding_size)

    outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    # outputs: (1, N, hidden_size)

    # Drop the length-1 time axis so the caller can store the scores per step directly.
    predictions = self.fc(outputs).squeeze(0)
    return predictions, hidden, cell

In [9]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes a target sequence step by step with teacher forcing."""

  def __init__(self, encoder, decoder):
    """
    Args:
      encoder: module mapping a (src_len, N) tensor to an initial (hidden, cell) state.
      decoder: module mapping (tokens, hidden, cell) to (scores, hidden, cell). It must
        expose its output projection as `fc` so the vocabulary size can be read from it.
    """
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=0.5):
    """Produce per-step vocabulary scores for `target`.

    Args:
      source: LongTensor of shape (src_len, N).
      target: LongTensor of shape (trg_len, N); row 0 holds the start tokens.
      teacher_force_ratio: probability of feeding the ground-truth token (rather
        than the model's own best guess) as the next decoder input.

    Returns:
      Tensor of shape (trg_len, N, vocab_size). Row 0 is left as zeros because the
      start token is never predicted; callers should skip it when computing the loss.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Read the vocabulary size from the decoder itself instead of a notebook global,
    # so the output buffer always matches what the decoder actually emits.
    target_vocab_size = self.decoder.fc.out_features

    # Allocate on the same device as the inputs rather than relying on a global `device`.
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

    hidden, cell = self.encoder(source)
    # grab start token
    x = target[0]

    # Start at t=1: the step taking token t-1 as input predicts token t. Starting at 0
    # would feed <sos> an extra time and shift predictions against the targets.
    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)

      outputs[t] = output
      # output: (N, vocab_size) - argmax over the vocabulary axis gives the predicted word
      best_guess = output.argmax(1)

      # Teacher forcing: sometimes feed the true word, sometimes the model's own prediction.
      x = target[t] if random.random() < teacher_force_ratio else best_guess
    return outputs

In [10]:
def load_checkpoint(checkpoint, model, optimizer):
  """Restore `model` and `optimizer` in place from a checkpoint dict.

  The dict must hold the model weights under 'state_dict' and the optimizer
  state under 'optimizer'.
  """
  print("Loading checkpoint...")
  # Model first, then optimizer.
  for restorable, key in ((model, 'state_dict'), (optimizer, 'optimizer')):
    restorable.load_state_dict(checkpoint[key])

def save_checkpoint(state, filename='model_checkpoint.pth.tar'):
  """Serialize `state` to `filename` with torch.save, announcing the path first."""
  print("Saving checkpoint: {}".format(filename))
  torch.save(state, filename)

In [16]:
def translate_sentence(model, sentence, german, english, max_length, device):
  """Greedily translate one German sentence into a list of English tokens.

  Args:
    model: object exposing `encoder(src)` -> (hidden, cell) and
      `decoder(token, hidden, cell)` -> (scores, hidden, cell). The caller is
      responsible for putting it in eval mode.
    sentence: either a raw string (tokenized here with spaCy) or an iterable of
      token strings.
    german: source field providing `init_token`, `eos_token` and `vocab.stoi`.
    english: target field providing `init_token`, `eos_token`, `vocab.stoi`
      and `vocab.itos`.
    max_length: maximum number of decoding steps.
    device: device on which the input tensors are created.

  Returns:
    List of target tokens, starting with the start-of-sentence token and ending
    with the end-of-sentence token if it was produced within `max_length` steps.
  """
  if isinstance(sentence, str):
    # Only pay for loading the spaCy model when there is raw text to tokenize.
    # Use the bare tokenizer so inference tokenization matches training.
    nlp_ger = spacy.load('de')
    tokens = [tok.text.lower() for tok in nlp_ger.tokenizer(sentence)]
  else:
    tokens = [tok.lower() for tok in sentence]

  # Wrap the source in the same start/end markers the field adds during training.
  tokens = [german.init_token] + tokens + [german.eos_token]
  token_indices = [german.vocab.stoi[tok] for tok in tokens]

  # (src_len, 1): a batch containing just this sentence.
  sentence_tensor = torch.LongTensor(token_indices).unsqueeze(1).to(device)

  # The decoder works in the *target* vocabulary, so its start/end indices must
  # come from `english`, not `german`.
  eos_idx = english.vocab.stoi[english.eos_token]
  outputs = [english.vocab.stoi[english.init_token]]

  with torch.no_grad():
    hidden, cell = model.encoder(sentence_tensor)

    for _ in range(max_length):
      previous_word = torch.LongTensor([outputs[-1]]).to(device)
      output, hidden, cell = model.decoder(previous_word, hidden, cell)
      best_guess = output.argmax(1).item()

      outputs.append(best_guess)

      if best_guess == eos_idx:
        break

  return [english.vocab.itos[idx] for idx in outputs]
  

In [18]:
# Training

# training hyperparameters
num_epochs = 20
learning_rate = 0.001
batch_size = 64

# model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
enc_dropout = 0.5
dec_dropout = 0.5
num_layers = 2

# Tensorboard: `step` counts optimizer updates and is the x-axis of the loss plot.
writer = SummaryWriter('runs/loss_plot')
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    sort_within_batch=True,
    sort_key = lambda x: len(x.src), # batch examples of similar length together to save on padding compute
    device = device
)

encoder_net = Encoder(input_size=input_size_encoder, embedding_size=encoder_embedding_size,
                      hidden_size=hidden_size, num_layers=num_layers, dropout=enc_dropout).to(device)
decoder_net = Decoder(input_size=input_size_decoder, embedding_size=decoder_embedding_size,
                      hidden_size=hidden_size, output_size=output_size, num_layers=num_layers, dropout=dec_dropout).to(device)
model = Seq2Seq(encoder=encoder_net, decoder=decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx) # ignore padding index

if load_model:
  # map_location lets a checkpoint written on GPU be restored on a CPU-only runtime.
  load_checkpoint(torch.load('model_checkpoint.pth.tar', map_location=device), model, optimizer)

# Fixed probe sentence, translated once per epoch to eyeball progress.
sentence = 'Ein Boot wurde von einem großen Team von Pferden gezogen'

for epoch in range(num_epochs):
  print(f'Epoch {epoch} / {num_epochs}')

  # Checkpoint the state as it stands at the start of this epoch.
  checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
  save_checkpoint(checkpoint)

  # Disable dropout while translating the probe sentence.
  model.eval()

  translated_sentence = translate_sentence(model, sentence, german, english, max_length=50, device=device)
  print(f"Translated sentence: {translated_sentence}")

  model.train()

  for batch in train_iterator:
    # The iterator was built with device=device, so batches are already placed there.
    inp_data = batch.src
    target = batch.trg

    output = model(inp_data, target)
    # output shape: (trg_len, batch_size, output_dim)

    # Skip position 0 (the start token is never predicted) and flatten time and
    # batch into one axis, keeping the vocab axis, as CrossEntropyLoss expects.
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, target)

    loss.backward()

    # Clip gradients to keep the LSTM from exploding.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    # Record the loss for TensorBoard; previously `writer`/`step` were never used.
    writer.add_scalar('Training loss', loss.item(), global_step=step)
    step += 1

# The in-loop checkpoint is written before each epoch trains, so without this
# final save the last epoch's weights would never reach disk.
save_checkpoint({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()})
writer.close()

    

RuntimeError: ignored