# Introduction

based on : https://github.com/bentrevett/pytorch-seq2seq

Task : De-EN Translation    
Method : seq2seq  
Dataset : 'bentrevett' custom dataset

# 0. Set Environment

In [91]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torchtext
torchtext.disable_torchtext_deprecation_warning()
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from pprint import pprint
import spacy
import random

import subprocess
import os
import sys

import datasets

In [92]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Selected device:", device)

Selected device: cuda


In [93]:
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [94]:
model_dir = './models/seq2seq_Translation_model.pth'

# 1. Data processing

## 1-1. Get Data

In [95]:
dataset = datasets.load_dataset("bentrevett/multi30k")

In [96]:
train_data, valid_data, test_data = (dataset["train"],
                                     dataset["validation"],
                                     dataset["test"])

In [97]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de'],
    num_rows: 1000
})


In [98]:
pprint(train_data[0])

{'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en': 'Two young, White males are outside near many bushes.'}


## 1-2. Tokenize

In [99]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [100]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    de_tokens = [token.text for token in de_nlp.tokenizer(example["de"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    de_tokens = [sos_token] + de_tokens + [eos_token]
    return {"en_tokens": en_tokens, "de_tokens": de_tokens}

In [101]:
max_length = 256

lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {"en_nlp": en_nlp,
             "de_nlp": de_nlp,
             "max_length": max_length,
             "lower": lower,
             "sos_token": sos_token,
             "eos_token": eos_token}

train_data = train_data.map(tokenize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs = fn_kwargs)

In [102]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens'],
    num_rows: 1000
})


In [103]:
pprint(train_data[0])

{'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'de_tokens': ['<sos>',
               'zwei',
               'junge',
               'weiße',
               'männer',
               'sind',
               'im',
               'freien',
               'in',
               'der',
               'nähe',
               'vieler',
               'büsche',
               '.',
               '<eos>'],
 'en': 'Two young, White males are outside near many bushes.',
 'en_tokens': ['<sos>',
               'two',
               'young',
               ',',
               'white',
               'males',
               'are',
               'outside',
               'near',
               'many',
               'bushes',
               '.',
               '<eos>']}


## 1-3. Build Vocab 

In [104]:
min_freq = 1
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [unk_token,
                  pad_token,
                  sos_token,
                  eos_token]

en_vocab = build_vocab_from_iterator(train_data["en_tokens"],
                                     min_freq = min_freq,
                                     specials = special_tokens)

de_vocab = build_vocab_from_iterator(train_data["de_tokens"],
                                     min_freq = min_freq,
                                     specials = special_tokens)

In [105]:
len(en_vocab), len(de_vocab)

(9797, 18669)

In [106]:
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [107]:
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [108]:
tokens = ["i", "love", "watching", "crime", "shows"]

In [109]:
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 6799, 821]

In [110]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', 'crime', 'shows']

## 1-4. Numericalize Text

In [111]:
def numericalize_example(example, en_vocab, de_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    de_ids = de_vocab.lookup_indices(example["de_tokens"])
    return {"en_ids": en_ids, "de_ids": de_ids}

In [112]:
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs = fn_kwargs)

In [113]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1000
})


In [114]:
pprint(train_data[0])

{'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'de_ids': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3],
 'de_tokens': ['<sos>',
               'zwei',
               'junge',
               'weiße',
               'männer',
               'sind',
               'im',
               'freien',
               'in',
               'der',
               'nähe',
               'vieler',
               'büsche',
               '.',
               '<eos>'],
 'en': 'Two young, White males are outside near many bushes.',
 'en_ids': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3],
 'en_tokens': ['<sos>',
               'two',
               'young',
               ',',
               'white',
               'males',
               'are',
               'outside',
               'near',
               'many',
               'bushes',
               '.',
               '<eos>']}


In [115]:

data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(type = data_type, 
                                    columns = format_columns, 
                                    output_all_columns = False)

valid_data = valid_data.with_format(type = data_type,
                                    columns = format_columns,
                                    output_all_columns = False)

test_data = test_data.with_format(type = data_type,
                                  columns = format_columns,
                                  output_all_columns = False)

In [116]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1000
})


In [117]:
pprint(train_data[0])

{'de_ids': tensor([   2,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
        3171,    4,    3]),
 'en_ids': tensor([   2,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
           3])}


## 1-5. Word Embedding

you can add word embedding here

In [118]:
# en_pretrained_embedding =
# de_pretrained_embedding =

## 1-6. Prepare for Data Loading

In [119]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        return item

In [120]:
train_dataset = CustomDataset(train_data)
test_dataset = CustomDataset(test_data)

In [121]:
def custom_collate_fn(batch):

    batch_en_ids = [sample["en_ids"] for sample in batch]
    batch_de_ids = [sample["de_ids"] for sample in batch]

    collate_en = pad_sequence(batch_en_ids,
                              padding_value = pad_index,
                              batch_first = True)
    collate_de = pad_sequence(batch_de_ids,
                              padding_value = pad_index,
                              batch_first = True)
    
    batch = {"en_ids": collate_en,
             "de_ids": collate_de}
    
    return batch

In [122]:
batch_size = 128
pad_index = pad_index
shuffle = True

trainloader = DataLoader(dataset = train_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

validloader = DataLoader(dataset = train_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

testloader = DataLoader(dataset = test_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

# 2. Define Model

## 2-1. Model Structure

In [123]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout_rate, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, inputs):

        x = self.dropout(self.embedding(inputs))

        output, (h, c) = self.LSTM(x)
        
        return h, c

In [124]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers, dropout_rate, pad_idx):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, embedding_dim, padding_idx = pad_idx)
        
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs, h, c):

        x = self.dropout(self.embedding(inputs.unsqueeze(1)))
        
        print(h.size(), c.size())
        output, (h, c) = self.LSTM(x, (h, c))

        logit = self.fc(output.squeeze())

        return logit, h, c

In [125]:
class seq2seq_Translation(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.Encoder = encoder
        self.Decoder = decoder

    def forward(self, inputs, targets, teacher_forcing_ratio):

        target_batch_size = inputs.size(0)
        target_seq_length = targets.size(1)
        target_vocab_size = self.Decoder.output_dim

        logits = torch.zeros(target_batch_size, target_seq_length, target_vocab_size).to(device)

        h_encoder, c_encoder = self.Encoder(inputs)

        x = targets[:, 0]

        h, c = h_encoder, c_encoder
        for t in range(1, target_seq_length):
            logit, h, c = self.Decoder(x, h, c)
            logits[:, t, :] = logit
            
            teacher_force = random.random() < teacher_forcing_ratio

            predicted_vocab = logit.argmax(0)

            x = targets[:, t] if teacher_force else predicted_vocab

        return logits

## 2-2. Hyperparameter & functions

In [126]:
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 300
decoder_embedding_dim = 300
hidden_dim = 128
num_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
pad_index = pad_index
lr = 5e-4

encoder = Encoder(input_dim,
                  encoder_embedding_dim,
                  hidden_dim,
                  num_layers,
                  encoder_dropout,
                  pad_index)

decoder = Decoder(output_dim,
                  decoder_embedding_dim,
                  hidden_dim,
                  num_layers,
                  decoder_dropout,
                  pad_index)

model = seq2seq_Translation(encoder, decoder)

criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 2-3. Weight Initialization

In [127]:
def initialize_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            torch.nn.init.xavier_uniform_(param)
        if 'bias' in name:
            nn.init.constant_(param, 0.0)

In [128]:
model.apply(initialize_weights)

seq2seq_Translation(
  (Encoder): Encoder(
    (embedding): Embedding(18669, 300, padding_idx=1)
    (LSTM): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (Decoder): Decoder(
    (embedding): Embedding(9797, 300, padding_idx=1)
    (LSTM): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=128, out_features=9797, bias=True)
  )
)

In [129]:
pprint("Model's state_dict:")
for name, param in model.named_parameters():
    print(f"Parameter name: {name}")
    print(f"    Size : {param.size()}")
    print(f"    Value: {param}")

"Model's state_dict:"
Parameter name: Encoder.embedding.weight
    Size : torch.Size([18669, 300])
    Value: Parameter containing:
tensor([[ 0.0176,  0.0123, -0.0130,  ...,  0.0082,  0.0014,  0.0100],
        [ 0.0031, -0.0149,  0.0064,  ...,  0.0079, -0.0143, -0.0139],
        [-0.0052, -0.0144, -0.0024,  ..., -0.0143, -0.0132,  0.0173],
        ...,
        [ 0.0121,  0.0175, -0.0127,  ...,  0.0029, -0.0003, -0.0037],
        [ 0.0057, -0.0128, -0.0040,  ..., -0.0007,  0.0061, -0.0090],
        [ 0.0131,  0.0146,  0.0012,  ...,  0.0006, -0.0146,  0.0121]],
       requires_grad=True)
Parameter name: Encoder.LSTM.weight_ih_l0
    Size : torch.Size([512, 300])
    Value: Parameter containing:
tensor([[ 0.0150, -0.0537,  0.0305,  ...,  0.0462,  0.0269,  0.0478],
        [-0.0723, -0.0568, -0.0062,  ...,  0.0343,  0.0125, -0.0799],
        [ 0.0311,  0.0803, -0.0012,  ...,  0.0083,  0.0809,  0.0230],
        ...,
        [ 0.0247,  0.0680, -0.0270,  ..., -0.0469, -0.0583,  0.0703],
     

In [130]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 10,508,125 trainable parameters


In [131]:
model.to(device)
criterion.to(device)

CrossEntropyLoss()

# 3. Train Model

In [132]:
def train_fn(model, criterion, optimizer, trainloader, clip, teacher_forcing_ratio):
        
    model.train()
    
    running_loss = 0.0
    
    for i, batch in enumerate(tqdm(trainloader)):
        inputs = batch['de_ids'].to(device)
        targets = batch['en_ids'].to(device)
        
        optimizer.zero_grad()

        outputs = model(inputs, targets, teacher_forcing_ratio)

        output_dim = outputs.size(2)
        outputs = outputs[:, 1:, :].view(-1, output_dim)
        targets = targets[:, 1:].view(-1)

        loss = criterion(outputs.view(-1), targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(trainloader.dataset)
    
    return epoch_loss

In [133]:
def evaluate_fn(model, validloader, criterion, teacher_forcing_ratio):
    
    model.eval()
    
    running_loss = 0.0
    
    with torch.no_grad():
        for i, batch in enumerate(tqdm(validloader)):
            inputs = batch["de_ids"].to(device)
            targets = batch["en_ids"].to(device)

            outputs = model(inputs, targets, teacher_forcing_ratio)

            output_dim = outputs.size(2)
            outputs = outputs[:, 1:, :].view(-1, output_dim)

            targets = targets[:, 1:].view(-1)

            loss = criterion(outputs, targets)
            
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(validloader.dataset)
        
    return epoch_loss

In [134]:
def train_model(model, criterion, optimizer, trainloader, validloader, clip, teacher_forcing_ratio, num_epochs):
    print("-----Training Started------")

    best_valid_loss = float("inf")
    
    for epoch in range(num_epochs):

        print(f"Epoch [{epoch+1}/{num_epochs}]")
              
        train_loss = train_fn(model,
                              criterion,
                              optimizer,
                              trainloader,
                              clip,
                              teacher_forcing_ratio)
        
        valid_loss = evaluate_fn(model,
                                 criterion,
                                 validloader,
                                 teacher_forcing_ratio = 0)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss

            torch.save(model.state_dict(), model_dir)
            
        print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
        print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")
    
    print("-----Training Completed-----")

In [135]:
num_epochs = 16

clip = 1.0
teacher_forcing_ratio = 0.5

train_model(model, criterion, optimizer, trainloader, validloader, clip, teacher_forcing_ratio, num_epochs)

-----Training Started------
Epoch [1/16]


  0%|          | 0/227 [00:00<?, ?it/s]

torch.Size([2, 128, 128]) torch.Size([2, 128, 128])
torch.Size([2, 128, 128]) torch.Size([2, 128, 128])
torch.Size([2, 128, 128]) torch.Size([2, 128, 128])





RuntimeError: Expected hidden[0] size (2, 9797, 128), got [2, 128, 128]

# 4. Test

# 5. Inference