In [1]:
import torch
import random
import math
import torch.nn as nn
from torchtext.datasets import Multi30k
from dataloader import *
from utils import *
from torch.optim import Adam

In [2]:
# --- Reproducibility -------------------------------------------------------
# Training is stochastic (weight init, dropout, and the teacher-forcing coin
# flips drawn with `random.random()`), so seed both generators up front to make
# Restart & Run All repeatable.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)

# --- Training schedule -----------------------------------------------------
N_EPOCHS = 10                     # number of passes over the training split
CLIP = 1                          # forwarded to train(); presumably the gradient-clipping threshold — TODO confirm
best_valid_loss = float('inf')    # running best validation loss, used for checkpointing

# --- Model / batching hyper-parameters -------------------------------------
emb_dim = 256       # token embedding width
hid_dim = 512       # GRU hidden-state width
n_layers = 2        # number of GRU layers
dropout = 0.5       # dropout probability applied to embeddings
batch_size = 128    # sequences per mini-batch

In [3]:
# Fetch the three Multi30k splits (stored under ./data) and pass each one
# through to_map_style_dataset, in train / val / test order.
train_dataset, val_dataset, test_dataset = map(
    to_map_style_dataset, Multi30k(root='data')
)

# Build one loader per split. The helper also returns `etc`, a 4-item bundle
# whose last two entries are used here as the vocabularies.
train_dataloader, val_dataloader, test_dataloader, etc = get_dataloader_and_etc(
    train_dataset,
    val_dataset,
    test_dataset,
    batch_size,
)
(_, _, vocab_de, vocab_en) = etc

# Vocabulary sizes determine the embedding-table and output-layer sizes.
# Presumably German is the source side and English the target — verify
# against get_dataloader_and_etc.
input_dim, output_dim = len(vocab_de), len(vocab_en)

In [4]:
class Encoder(nn.Module):
    """GRU encoder that condenses a source sequence into its final hidden state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Width of each token embedding.
        hid_dim: Width of the GRU hidden state.
        dropout: Dropout probability applied to the embedded tokens.
        n_layers: Number of stacked GRU layers (default 1).
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout, n_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.gru = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers)

    def forward(self, src):
        """Encode a batch of token indices.

        Args:
            src: Index tensor laid out as [src_len, bs].

        Returns:
            The GRU's final hidden state, [n_layers, bs, hid_dim].
        """
        # [src_len, bs] -> [src_len, bs, emb_dim], with dropout on the embeddings
        token_vectors = self.dropout(self.embedding(src))
        # The per-step outputs are not needed; keep only the last hidden state.
        final_state = self.gru(token_vectors)[1]
        return final_state

In [5]:
class Decoder(nn.Module):
    """Single-step GRU decoder that sees the encoder context at every step.

    The context vector is concatenated to the token embedding before the GRU,
    and again (together with the embedding and the new hidden state) before the
    output projection.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and logits).
        emb_dim: Width of each token embedding.
        hid_dim: Width of the GRU hidden state and of the context vector.
        dropout: Dropout probability applied to the embedded token.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.dropout = nn.Dropout(p=dropout)
        # The GRU consumes [embedding ; context].
        self.gru = nn.GRU(input_size=emb_dim + hid_dim, hidden_size=hid_dim)
        # The projection consumes [embedding ; hidden ; context].
        self.fc_out = nn.Linear(in_features=emb_dim + hid_dim * 2, out_features=output_dim)
        self.output_dim = output_dim

    def forward(self, input, hidden, context):
        """Run one decoding step.

        Args:
            input: Current token indices, [1, bs].
            hidden: Previous decoder state, [1, bs, hid_dim].
            context: Encoder context, [1, bs, hid_dim].

        Returns:
            (prediction, hidden): logits of shape [bs, output_dim] and the
            updated state of shape [1, bs, hid_dim].
        """
        # [1, bs] -> [1, bs, emb_dim]
        token_vec = self.dropout(self.embedding(input))

        # [1, bs, emb_dim + hid_dim] goes through the GRU; only the new state is kept.
        gru_in = torch.cat((token_vec, context), dim=2)
        hidden = self.gru(gru_in, hidden)[1]

        # Drop the leading length-1 axis from each piece and join them:
        # [bs, emb_dim + hid_dim * 2]
        features = torch.cat(
            [piece.squeeze(0) for piece in (token_vec, hidden, context)],
            dim=1,
        )

        # [bs, output_dim]
        return self.fc_out(features), hidden


In [6]:
class Seq2Seq(nn.Module):
    """Couples an encoder and a step-wise decoder into a translation model.

    Args:
        encoder: Module mapping `src` to a context tensor.
        decoder: Module called as `decoder(input, hidden, context)`; must
            expose an `output_dim` attribute.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg_len - 1` steps and return the logits of every step.

        Args:
            src: Source indices, [src_len, bs].
            trg: Target indices, [trg_len, bs]; row 0 seeds the decoder.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's previous argmax.

        Returns:
            Tensor [trg_len - 1, bs, output_dim]; slot `t` holds the logits
            produced after consuming position `t`.
        """
        trg_len, n_seqs = trg.shape
        vocab_size = self.decoder.output_dim
        n_steps = trg_len - 1

        # One slot per decoding step: [trg_len - 1, bs, output_dim]
        logits = torch.zeros(n_steps, n_seqs, vocab_size, device=self.device)

        # The encoder state serves both as the fixed context and as the
        # decoder's initial hidden state: [n_layers, bs, hid_dim]
        context = self.encoder(src)
        hidden = context

        # Until the model has produced something, its "previous guess" is the
        # first target row, kept as [1, bs].
        prev_guess = trg[0].unsqueeze(0)

        for step in range(n_steps):
            # One coin flip per step decides between ground truth and own guess.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[step].unsqueeze(0)
            else:
                step_input = prev_guess

            step_logits, hidden = self.decoder(step_input, hidden, context)
            logits[step] = step_logits                            # [bs, output_dim]
            prev_guess = step_logits.argmax(dim=1).unsqueeze(0)   # [1, bs]

        return logits

In [7]:
# Use the GPU only when CUDA is actually usable. `is_available` has to be
# *called*: the bare function object is always truthy, which would select
# 'cuda' even on a CPU-only host and fail at `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): `n_layers` from the config cell is not forwarded, so the encoder
# keeps its default of a single GRU layer. The decoder's GRU is single-layer and
# is initialised from the encoder state, so the two must stay in agreement.
enc = Encoder(input_dim, emb_dim, hid_dim, dropout)
dec = Decoder(output_dim, emb_dim, hid_dim, dropout)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = Adam(model.parameters())
# Targets equal to index 0 do not contribute to the loss
# (presumably the padding token — verify against the vocabulary).
criterion = nn.CrossEntropyLoss(ignore_index=0)

count_parameters(model)
# Apply the custom initialiser to every sub-module; `apply` returns the model,
# so leaving it as the last expression also displays the architecture.
model.apply(init_weights2)

The model has 14,219,781trainable parameters


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (gru): GRU(256, 512)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (gru): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=5893, bias=True)
  )
)

In [None]:
# Batch counts handed to train()/evaluate().
# NOTE(review): this is floor division, so a trailing partial batch is not
# counted (wrapping the floored integer in math.ceil does not change it). If the
# loaders do yield a final partial batch, use
# math.ceil(len(dataset) / batch_size) instead.
t_batch = len(train_dataset) // batch_size
v_batch = len(val_dataset) // batch_size

for epoch in range(N_EPOCHS):
    train(epoch, model, train_dataloader, t_batch, optimizer, criterion, CLIP, device)
    eval_loss = evaluate(model, val_dataloader, v_batch, criterion, device)

    # Checkpoint only on a strict improvement of the validation loss
    # (a NaN loss never compares as smaller, so it never triggers a save).
    if not (eval_loss < best_valid_loss):
        continue
    best_valid_loss = eval_loss
    torch.save(model.state_dict(), 'weight/tut2-model.pt')

In [8]:
# Restore the best checkpoint written by the training loop. `map_location`
# remaps the stored tensors onto the active device, so a checkpoint saved on a
# GPU still loads on a CPU-only host.
model.load_state_dict(torch.load('weight/tut2-model.pt', map_location=device))

# NOTE(review): floor division — a trailing partial batch is not counted
# (wrapping the floored integer in math.ceil does not change it). If the test
# loader yields a final partial batch, use
# math.ceil(len(test_dataset) / batch_size) instead.
t_batch = len(test_dataset) // batch_size

# Last expression of the cell: the returned test loss is displayed.
evaluate(model, test_dataloader, t_batch, criterion, device)

100%|██████████| 7/7 [00:00<00:00,  8.39it/s, eval_loss=2.88]


2.877150365284511