In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import os

In [2]:
# Seed every random source the notebook relies on so reruns are comparable.
SEED = 2222

torch.manual_seed(SEED)
random.seed(SEED)  # Python's RNG drives the teacher-forcing coin flips in Seq2Seq.forward
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [3]:
# Load the German and English spaCy pipelines (German first, then English);
# only their tokenizers are used by process_de / process_en.
spacy_de, spacy_en = spacy.load('de'), spacy.load('en')

In [4]:
def process_en(text):
    """Tokenize an English string with the spaCy tokenizer and return the token strings."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]
def process_de(text):
    """Tokenize a German string with the spaCy tokenizer and return the token strings."""
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

In [5]:
# Both fields share the same preprocessing (lower-casing, <sos>/<eos> markers);
# only the tokenizer differs: German for the source side, English for the target side.
Source = Field(
    tokenize=process_de,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)
Target = Field(
    tokenize=process_en,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [6]:
# Multi30k train/validation/test splits: '.de' files are read through Source,
# '.en' files through Target.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(Source, Target),
)

In [7]:
# Number of examples in the train / validation / test splits.
tuple(len(split) for split in (train_data, valid_data, test_data))

(29000, 1014, 1000)

In [8]:
# Vocabularies are built from the training split only, with min_freq=2 for both
# languages (source first, then target).
for field in (Source, Target):
    field.build_vocab(train_data, min_freq=2)

In [9]:
# Batch size handed to BucketIterator.splits for all three data splits.
BATCH_SIZE = 128

In [10]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
# Show which backend was selected above ('cuda' or 'cpu').
device.type

'cuda'

In [12]:
# One batch iterator per split, all sharing the same batch size and placing
# their tensors on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [28]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder that also derives the decoder's initial hidden state.

    Args:
        input_dim: number of rows in the source embedding table (source vocabulary size).
        emb_dim: dimensionality of the token embeddings.
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: size of the hidden state produced for the decoder.
        dropout: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        # The two directions' final states are concatenated, hence enc_hid_dim * 2 inputs.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: token indices, [sent len, batch size].

        Returns:
            A pair ``(outputs, hidden)`` where ``outputs`` is
            [sent len, batch size, enc hid dim * 2] and ``hidden`` is
            [batch size, dec hid dim].
        """
        # embedded -> [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))
        # outputs -> [sent len, batch size, enc hid dim * n directions]
        # states  -> [n layers * n directions, batch size, enc hid dim]
        outputs, states = self.rnn(embedded)
        # The last two entries of `states` are the forward and backward final states.
        last_forward, last_backward = states[-2, :, :], states[-1, :, :]
        merged = torch.cat((last_forward, last_backward), dim=1)
        # Project the concatenated states down to the decoder's hidden size.
        hidden = torch.tanh(self.fc(merged))
        return outputs, hidden

In [29]:
class Attention(nn.Module):
    """Scores every source position against the current decoder hidden state.

    The score of a position is ``vec . tanh(attn([hidden ; encoder_output]))``;
    scores are normalised with a softmax over the source positions.

    Args:
        enc_hid_dim: hidden size of one encoder direction (encoder outputs carry twice this).
        dec_hid_dim: hidden size of the decoder state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.vec = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [batch size, src sent len].

        Args:
            hidden: decoder state, [batch size, dec hid dim].
            encoder_outputs: [src sent len, batch size, enc hid dim * 2].
        """
        src_len, batch_size = encoder_outputs.shape[0], encoder_outputs.shape[1]

        # Copy the decoder state once per source position:
        # tiled_hidden -> [batch size, src sent len, dec hid dim]
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # batch_major -> [batch size, src sent len, enc hid dim * 2]
        batch_major = encoder_outputs.permute(1, 0, 2)

        # energy -> [batch size, src sent len, dec hid dim]
        energy = torch.tanh(self.attn(torch.cat((tiled_hidden, batch_major), dim=2)))
        # energy -> [batch size, dec hid dim, src sent len]
        energy = energy.permute(0, 2, 1)

        # One copy of the scoring vector per batch element:
        # scorer -> [batch size, 1, dec hid dim]
        scorer = self.vec.repeat(batch_size, 1).unsqueeze(1)
        # scores -> [batch size, src sent len]
        scores = torch.bmm(scorer, energy).squeeze(1)
        return F.softmax(scores, dim=1)

In [30]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: number of rows in the target embedding table and size of the
            prediction vector (target vocabulary size).
        emb_dim: dimensionality of the target token embeddings.
        enc_hid_dim: hidden size of one encoder direction.
        dec_hid_dim: hidden size of the decoder GRU.
        dropout: dropout probability applied to the embedded input token.
        attention: module called as ``attention(hidden, encoder_outputs)`` that
            returns weights of shape [batch size, src len].
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.emb_dim, self.output_dim = emb_dim, output_dim
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim

        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU consumes the attention context together with the token embedding.
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        # The prediction layer sees the GRU output, the context and the embedding.
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one target position for the whole batch.

        Args:
            input: previous target tokens, [batch size].
            hidden: previous decoder state, [batch size, dec hid dim].
            encoder_outputs: [src sent len, batch size, enc hid dim * 2].

        Returns:
            A pair ``(prediction, hidden)`` with ``prediction`` of shape
            [batch size, output dim] and the new ``hidden`` of shape
            [batch size, dec hid dim].
        """
        # embedded -> [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))

        # attn_weights -> [batch size, 1, src len]
        attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        # batch_major -> [batch size, src sent len, enc hid dim * 2]
        batch_major = encoder_outputs.permute(1, 0, 2)
        # Attention-weighted sum of encoder outputs:
        # context -> [batch size, 1, enc hid dim * 2] -> [1, batch size, enc hid dim * 2]
        context = torch.bmm(attn_weights, batch_major).permute(1, 0, 2)

        # rnn_input -> [1, batch size, (enc hid dim * 2) + emb dim]
        rnn_input = torch.cat((embedded, context), dim=2)
        # A single step of a single-layer GRU:
        # output, hidden -> [1, batch size, dec hid dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # Drop the length-1 leading dimension before the final projection.
        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim=1
        )
        # prediction -> [batch size, output dim]
        prediction = self.out(features)
        return prediction, hidden.squeeze(0)

In [31]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder into a full translation model.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: module exposing ``output_dim`` and returning ``(output, hidden)``
            for one decoding step.
        device: device on which the tensor of predictions is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps and return the per-step predictions.

        Args:
            src: source tokens, [sent len, batch size].
            trg: target tokens, [sent len, batch size]; row 0 seeds the decoder.
            teacher_forcing_ratio: probability, per step, of feeding the ground-truth
                token instead of the model's own prediction as the next input.

        Returns:
            Tensor of shape [trg sent len, batch size, output dim]. Row 0 is never
            written and stays all zeros because decoding starts at t = 1.
        """
        max_len, batch_size = trg.shape[0], trg.shape[1]
        outputs = torch.zeros(
            max_len, batch_size, self.decoder.output_dim, device=self.device
        )

        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the first row of the target batch.
        input = trg[0, :]
        for t in range(1, max_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            # Index of the highest-scoring token for each batch element.
            _, top1 = output.max(1)
            # One coin flip per step decides the next input for the whole batch.
            if random.random() < teacher_forcing_ratio:
                input = trg[t]
            else:
                input = top1

        return outputs

In [32]:
# Vocabulary sizes determine the embedding tables and the prediction layer.
INPUT_DIM, OUTPUT_DIM = len(Source.vocab), len(Target.vocab)
# Encoder and decoder use the same embedding size, hidden size and dropout rate.
ENC_EMB_DIM = DEC_EMB_DIM = 256
ENC_HID_DIM = DEC_HID_DIM = 512
ENC_DROPOUT = DEC_DROPOUT = 0.5

# Construction order (attention, encoder, decoder) is kept so parameter
# initialisation consumes the seeded RNG in the same sequence.
attn = Attention(enc_hid_dim=ENC_HID_DIM, dec_hid_dim=DEC_HID_DIM)
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=DEC_DROPOUT,
    attention=attn,
)

model = Seq2Seq(enc, dec, device).to(device)

In [33]:
# Display the assembled module tree to check the layer sizes.
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
    )
    (embedding): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (out): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [34]:
# Adam over every model parameter; no hyperparameters are overridden here.
optimizer = optim.Adam(model.parameters())

In [35]:
# Look up the index of the padding token in the target vocabulary and tell the
# loss to ignore targets equal to it, so padded positions do not contribute.
pad_idx = Target.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [36]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: called as ``model(src, trg)``; returns [sent len, batch size, output dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss applied to flattened predictions and targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    batch_losses = []
    for batch in iterator:
        src, trg = batch.src, batch.trg  # trg -> [sent len, batch size]
        optimizer.zero_grad()
        logits = model(src, trg)  # logits -> [sent len, batch size, output dim]
        vocab_size = logits.shape[2]
        # Position 0 is skipped on both sides, then everything is flattened:
        # predictions -> [(sent len - 1) * batch size, output dim]
        # targets     -> [(sent len - 1) * batch size]
        loss = criterion(logits[1:].view(-1, vocab_size), trg[1:].view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)

In [37]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    The model is switched to eval mode, gradients are disabled, and the teacher
    forcing ratio is set to 0 so the decoder always consumes its own predictions.

    Args:
        model: called as ``model(src, trg, 0)``; returns [sent len, batch size, output dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss applied to flattened predictions and targets.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg
            logits = model(src, trg, 0)
            # Skip position 0 and flatten, mirroring the training loss.
            flat_logits = logits[1:].view(-1, logits.shape[2])
            flat_targets = trg[1:].view(-1)
            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

In [38]:
# Checkpoint location. Note that MODEL_DIR is the path of the checkpoint *file*
# inside DIR, not a directory.
DIR = 'models'
MODEL_DIR = os.path.join(DIR, 'seq2seq_model.pt')

N_EPOCHS = 10
CLIP = 10  # maximum gradient norm used by train()

best_loss = float('inf')

# Create the checkpoint directory if it is missing; a no-op when it already exists.
os.makedirs(DIR, exist_ok=True)

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # Keep only the weights of the epoch with the lowest validation loss so far.
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), MODEL_DIR)
    print(f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')

| Epoch: 001 | Train Loss: 3.786 | Train PPL:  44.060 | Val. Loss: 3.562 | Val. PPL:  35.239 |
| Epoch: 002 | Train Loss: 2.999 | Train PPL:  20.072 | Val. Loss: 3.309 | Val. PPL:  27.357 |
| Epoch: 003 | Train Loss: 2.615 | Train PPL:  13.666 | Val. Loss: 3.317 | Val. PPL:  27.577 |
| Epoch: 004 | Train Loss: 2.353 | Train PPL:  10.520 | Val. Loss: 3.257 | Val. PPL:  25.983 |
| Epoch: 005 | Train Loss: 2.148 | Train PPL:   8.569 | Val. Loss: 3.324 | Val. PPL:  27.763 |
| Epoch: 006 | Train Loss: 2.006 | Train PPL:   7.435 | Val. Loss: 3.254 | Val. PPL:  25.894 |
| Epoch: 007 | Train Loss: 1.901 | Train PPL:   6.693 | Val. Loss: 3.299 | Val. PPL:  27.089 |
| Epoch: 008 | Train Loss: 1.811 | Train PPL:   6.119 | Val. Loss: 3.333 | Val. PPL:  28.022 |
| Epoch: 009 | Train Loss: 1.749 | Train PPL:   5.748 | Val. Loss: 3.355 | Val. PPL:  28.657 |
| Epoch: 010 | Train Loss: 1.672 | Train PPL:   5.324 | Val. Loss: 3.425 | Val. PPL:  30.725 |


In [39]:
# Restore the checkpoint saved at the best validation loss. map_location remaps
# the stored tensors onto the current device, so a checkpoint written on a GPU
# can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_DIR, map_location=device))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.332 | Test PPL:  28.003 |
