<a href="https://colab.research.google.com/github/Pmilivojevic/PyTorch/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !python -m spacy download de_core_news_sm
# !pip install -U torchtext==0.6
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
from sklearn.model_selection import train_test_split

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
# Build English/German sentence pairs from the two Europarl text files
# (paired by line position) and write an 80/20 train/test split as
# JSON-lines files.

# Read each corpus inside a context manager so the file handle is closed as
# soon as its contents are in memory.
with open(
    '/content/drive/MyDrive/ColabNotebooks/PyTorch/Datasets/europarl-v7.de-en.en',
    encoding='utf8'
) as english_file:
  english_txt = english_file.read().split('\n')

with open(
    '/content/drive/MyDrive/ColabNotebooks/PyTorch/Datasets/europarl-v7.de-en.de',
    encoding='utf8'
) as german_file:
  german_txt = german_file.read().split('\n')

# Row i of the frame pairs line i of the English file with line i of the
# German file; pandas raises ValueError if the two lists differ in length.
raw_data = {
    'English': english_txt,
    'German': german_txt
}

df = pd.DataFrame(raw_data, columns=['English', 'German'])

# NOTE(review): no random_state is given, so the split differs on every run.
train, test = train_test_split(df, test_size=0.2)

# One JSON object per line, with the keys 'English' and 'German'.
train.to_json(
    '/content/drive/MyDrive/ColabNotebooks/PyTorch/Datasets/train_de.json',
    orient='records',
    lines=True
)
test.to_json(
    '/content/drive/MyDrive/ColabNotebooks/PyTorch/Datasets/test_de.json',
    orient='records',
    lines=True
)

In [None]:
# Load the English and German spaCy pipelines (English first, then German).
spacy_eng, spacy_ger = (
    spacy.load(model_name)
    for model_name in ('en_core_web_sm', 'de_core_news_sm')
)

def tokenize_eng(text):
  """Return the list of token strings spaCy's English tokenizer finds in *text*."""
  doc = spacy_eng.tokenizer(text)
  return [token.text for token in doc]

def tokenize_ger(text):
  """Return the list of token strings spaCy's German tokenizer finds in *text*."""
  doc = spacy_ger.tokenizer(text)
  return [token.text for token in doc]

# Both languages share the same preprocessing options: lowercase the text and
# wrap every sentence in <sos> ... <eos> markers.
_shared_field_options = {
    'lower': True,
    'init_token': '<sos>',
    'eos_token': '<eos>',
}

english = Field(tokenize=tokenize_eng, **_shared_field_options)
german = Field(tokenize=tokenize_ger, **_shared_field_options)

# JSON key -> (attribute name on each example, Field that processes it).
fields = {
    'English': ('eng', english),
    'German': ('ger', german),
}

# Load the JSON-lines splits from the dataset directory.
train_data, test_data = TabularDataset.splits(
    path='/content/drive/MyDrive/ColabNotebooks/PyTorch/Datasets',
    train='train_de.json',
    test='test_de.json',
    format='json',
    fields=fields,
)

# Vocabularies are built from the training split only, using the same
# max_size / min_freq limits for both languages.
english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

In [None]:
class Encoder(nn.Module):
  """LSTM encoder that turns a token-index sequence into a final LSTM state."""

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop):
    """Create the dropout, embedding and LSTM layers.

    Args:
      input_size: number of rows in the embedding table.
      embedding_size: width of each embedding vector (the LSTM input size).
      hidden_size: width of the LSTM hidden and cell states.
      num_layers: number of stacked LSTM layers.
      drop: dropout probability, used both on the embeddings and between
        LSTM layers.
    """
    super().__init__()

    self.hidden_size, self.num_layers = hidden_size, num_layers

    self.dropout = nn.Dropout(drop)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(
        embedding_size,
        hidden_size,
        num_layers,
        dropout=drop,
    )

  def forward(self, x):
    """Encode *x* and return the LSTM's final ``(hidden, cell)`` pair.

    The LSTM is built with the default ``batch_first=False``, so *x* is
    expected as token indices laid out ``(seq_len, batch)``. The per-step
    LSTM outputs are discarded; only the final state is returned.
    """
    embedded = self.embedding(x)
    embedded = self.dropout(embedded)

    _, final_state = self.rnn(embedded)
    hidd, cell = final_state

    return hidd, cell

In [None]:
class Decoder(nn.Module):
  """LSTM decoder that predicts the next token from one token and a state."""

  def __init__(
      self,
      input_size,
      embedding_size,
      hidden_size,
      output_size,
      num_layers,
      drop
  ):
    """Create the dropout, embedding, LSTM and output projection layers.

    Args:
      input_size: number of rows in the embedding table.
      embedding_size: width of each embedding vector (the LSTM input size).
      hidden_size: width of the LSTM hidden and cell states.
      output_size: number of scores produced per example by the final
        linear layer.
      num_layers: number of stacked LSTM layers.
      drop: dropout probability, used both on the embeddings and between
        LSTM layers.
    """
    super().__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(drop)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=drop)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidd, cell):
    """Run a single decoding step.

    Args:
      x: token indices for the current step, one per batch element
        (shape ``(batch,)``).
      hidd: LSTM hidden state to start from.
      cell: LSTM cell state to start from.

    Returns:
      ``(preds, hidd, cell)``: scores of shape ``(batch, output_size)`` and
      the updated hidden and cell states.
    """
    # The LSTM expects a leading sequence dimension; this step has length 1.
    x = x.unsqueeze(0)
    embedding = self.dropout(self.embedding(x))
    # nn.LSTM takes its initial state as ONE (h_0, c_0) tuple; passing the two
    # tensors as separate positional arguments raises a TypeError.
    outputs, (hidd, cell) = self.rnn(embedding, (hidd, cell))
    preds = self.fc(outputs)
    # Drop the length-1 sequence dimension again.
    preds = preds.squeeze(0)

    return preds, hidd, cell

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes one target step at a time."""

  def __init__(self, encoder, decoder):
    """Store the two sub-networks.

    Args:
      encoder: callable mapping a source batch to ``(hidden, cell)``.
      decoder: callable mapping ``(tokens, hidden, cell)`` to
        ``(scores, hidden, cell)``; it must expose its output projection as
        ``decoder.fc`` (an ``nn.Linear``), from which the score width is read.
    """
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=0.5):
    """Produce decoder scores for every target position after the first.

    Args:
      source: source batch with the batch on dimension 1.
      target: target batch with time on dimension 0 and the batch on
        dimension 1.
      teacher_force_ratio: probability, drawn independently at every step,
        of feeding the ground-truth token rather than the model's own best
        guess as the next decoder input.

    Returns:
      Tensor of shape ``(target_len, batch, vocab)``. Row 0 is never written
      and stays all zeros, because decoding starts from ``target[0]``.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Take the score width from the decoder's own output layer instead of a
    # module-level vocabulary object, so the model is self-contained.
    target_vocab_size = self.decoder.fc.out_features

    # Allocate on the same device as the inputs rather than on a global one.
    outs = torch.zeros(
        target_len, batch_size, target_vocab_size, device=source.device
    )

    hidd, cell = self.encoder(source)

    # The first decoder input is the first target row.
    x = target[0]

    for t in range(1, target_len):
      out, hidd, cell = self.decoder(x, hidd, cell)

      outs[t] = out

      best_guess = out.argmax(1)

      x = target[t] if random.random() < teacher_force_ratio else best_guess

    return outs

In [None]:
# Training hyperparameters.
num_epochs = 20
lr = 0.001
batch_size = 64

# Model hyperparameters. German is the source language, English the target.
load_model = False
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
enc_emb_size = 300
dec_emb_size = 300
hidden_size = 1024
num_layers = 2
enc_drop = 0.5
dec_drop = 0.5

# TensorBoard logging; `step` is the global step counter for logged scalars.
writer = SummaryWriter('runs/loss_plot')
step = 0

# BucketIterator needs a sort_key to group examples of similar length. The
# examples expose the German source sentence as `.ger`, so sort on its length
# to keep padding per batch small.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.ger),
    device=device
)

encoder = Encoder(
    input_size_encoder,
    enc_emb_size,
    hidden_size,
    num_layers,
    enc_drop
).to(device)

decoder = Decoder(
    input_size_decoder,
    dec_emb_size,
    hidden_size,
    output_size,
    num_layers,
    dec_drop
).to(device)

model = Seq2Seq(encoder, decoder).to(device)

# Padding positions in the target are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train for `num_epochs` passes over the training iterator, logging the loss
# of every batch to TensorBoard.
for epoch in range(num_epochs):
  print(f'Epoch [{epoch} / {num_epochs}]')

  for batch_idx, batch in enumerate(train_iterator):
    # Named `source` rather than `input` so the builtin is not shadowed.
    source = batch.ger.to(device)
    target = batch.eng.to(device)

    out = model(source, target)

    # Skip position 0 (the model never writes a prediction there) and
    # flatten time and batch so the loss sees (N, vocab) scores against
    # (N,) labels.
    out = out[1:].reshape(-1, out.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()

    loss = criterion(out, target)

    loss.backward()
    # Use the in-place clip_grad_norm_; the un-suffixed name is deprecated.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    # Log a plain float so no autograd graph is kept alive by the writer.
    writer.add_scalar('Training loss', loss.item(), global_step=step)
    step += 1

Epoch [0 / 20]


TypeError: ignored

In [None]:
# Debug pass: print each batch's index and the shape of its English tensor.
for epoch in range(num_epochs):
  print(f'Epoch [{epoch} / {num_epochs}]')

  for batch_idx, batch in enumerate(train_iterator):
    eng_shape = batch.eng.shape
    print(batch_idx, eng_shape, sep='\n')

Epoch [0 / 20]
0
torch.Size([83, 64])
1
torch.Size([102, 64])
2
torch.Size([75, 64])
3
torch.Size([77, 64])
4
torch.Size([79, 64])
5
torch.Size([65, 64])
6
torch.Size([121, 64])
7
torch.Size([115, 64])
8
torch.Size([62, 64])
9
torch.Size([76, 64])
10
torch.Size([77, 64])
11
torch.Size([123, 64])
12
torch.Size([69, 64])
13
torch.Size([86, 64])
14
torch.Size([100, 64])
15
torch.Size([118, 64])
16
torch.Size([78, 64])
17
torch.Size([91, 64])
18
torch.Size([76, 64])
19
torch.Size([81, 64])
20
torch.Size([67, 64])
21
torch.Size([89, 64])
22
torch.Size([70, 64])
23
torch.Size([98, 64])
24
torch.Size([82, 64])
25
torch.Size([75, 64])
26
torch.Size([70, 64])
27
torch.Size([73, 64])
28
torch.Size([99, 64])
29
torch.Size([94, 64])
30
torch.Size([70, 64])
31
torch.Size([81, 64])
32
torch.Size([88, 64])
33
torch.Size([90, 64])
34
torch.Size([78, 64])
35
torch.Size([79, 64])
36
torch.Size([81, 64])
37
torch.Size([137, 64])
38
torch.Size([83, 64])
39
torch.Size([99, 64])
40
torch.Size([103, 64])
41


KeyboardInterrupt: ignored