# Introduction

based on : https://github.com/bentrevett/pytorch-seq2seq

Task : De-EN Translation    
Method : seq2seq  
Dataset : 'bentrevett' custom dataset

# 0. Set Environment

In [577]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torchtext
torchtext.disable_torchtext_deprecation_warning()
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.tensorboard import SummaryWriter

from torch import Tensor
from typing import List, Dict, Tuple, Union, Annotated

import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from pprint import pprint
import spacy
import random
from dataclasses import dataclass, field


import datasets

In [578]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Selected device:", device)

Selected device: cuda


In [579]:
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [580]:
model_dir = './models/seq2seq_Translation_model.pth'

# 1. Data processing

## 1-1. Get Data

In [581]:
dataset = datasets.load_dataset("bentrevett/multi30k")

In [582]:
train_data, valid_data, test_data = (dataset["train"],
                                     dataset["validation"],
                                     dataset["test"])

In [583]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de'],
    num_rows: 1000
})


In [584]:
pprint(train_data[0])

{'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en': 'Two young, White males are outside near many bushes.'}


## 1-2. Tokenize

In [585]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [586]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    de_tokens = [token.text for token in de_nlp.tokenizer(example["de"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    de_tokens = [sos_token] + de_tokens + [eos_token]
    return {"en_tokens": en_tokens, "de_tokens": de_tokens}

In [587]:
max_length = 256

lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {"en_nlp": en_nlp,
             "de_nlp": de_nlp,
             "max_length": max_length,
             "lower": lower,
             "sos_token": sos_token,
             "eos_token": eos_token}

train_data = train_data.map(tokenize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs = fn_kwargs)

In [588]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens'],
    num_rows: 1000
})


In [589]:
pprint(train_data[0])

{'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'de_tokens': ['<sos>',
               'zwei',
               'junge',
               'weiße',
               'männer',
               'sind',
               'im',
               'freien',
               'in',
               'der',
               'nähe',
               'vieler',
               'büsche',
               '.',
               '<eos>'],
 'en': 'Two young, White males are outside near many bushes.',
 'en_tokens': ['<sos>',
               'two',
               'young',
               ',',
               'white',
               'males',
               'are',
               'outside',
               'near',
               'many',
               'bushes',
               '.',
               '<eos>']}


## 1-3. Build Vocab 

In [590]:
min_freq = 1
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [unk_token,
                  pad_token,
                  sos_token,
                  eos_token]

en_vocab = build_vocab_from_iterator(train_data["en_tokens"],
                                     min_freq = min_freq,
                                     specials = special_tokens)

de_vocab = build_vocab_from_iterator(train_data["de_tokens"],
                                     min_freq = min_freq,
                                     specials = special_tokens)

In [591]:
len(en_vocab), len(de_vocab)

(9797, 18669)

In [592]:
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [593]:
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [594]:
tokens = ["i", "love", "watching", "crime", "shows"]

In [595]:
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 6799, 821]

In [596]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', 'crime', 'shows']

## 1-4. Numericalize Text

In [597]:
def numericalize_example(example, en_vocab, de_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    de_ids = de_vocab.lookup_indices(example["de_tokens"])
    return {"en_ids": en_ids, "de_ids": de_ids}

In [598]:
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs = fn_kwargs)

In [599]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1000
})


In [600]:
pprint(train_data[0])

{'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'de_ids': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3],
 'de_tokens': ['<sos>',
               'zwei',
               'junge',
               'weiße',
               'männer',
               'sind',
               'im',
               'freien',
               'in',
               'der',
               'nähe',
               'vieler',
               'büsche',
               '.',
               '<eos>'],
 'en': 'Two young, White males are outside near many bushes.',
 'en_ids': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3],
 'en_tokens': ['<sos>',
               'two',
               'young',
               ',',
               'white',
               'males',
               'are',
               'outside',
               'near',
               'many',
               'bushes',
               '.',
               '<eos>']}


In [601]:

data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(type = data_type, 
                                    columns = format_columns, 
                                    output_all_columns = False)

valid_data = valid_data.with_format(type = data_type,
                                    columns = format_columns,
                                    output_all_columns = False)

test_data = test_data.with_format(type = data_type,
                                  columns = format_columns,
                                  output_all_columns = False)

In [602]:
print(train_data, valid_data, test_data)

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 29000
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1014
}) Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 1000
})


In [603]:
pprint(train_data[0])

{'de_ids': tensor([   2,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
        3171,    4,    3]),
 'en_ids': tensor([   2,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
           3])}


## 1-5. Word Embedding

you can add word embedding here

In [604]:
# en_pretrained_embedding =
# de_pretrained_embedding =

## 1-6. Prepare for Data Loading

In [605]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        return item

In [606]:
train_dataset = CustomDataset(train_data)
test_dataset = CustomDataset(test_data)

In [607]:
def custom_collate_fn(batch):

    batch_en_ids = [sample["en_ids"] for sample in batch]
    batch_de_ids = [sample["de_ids"] for sample in batch]

    collate_en = pad_sequence(batch_en_ids,
                              padding_value = pad_index,
                              batch_first = True)
    collate_de = pad_sequence(batch_de_ids,
                              padding_value = pad_index,
                              batch_first = True)
    
    batch = {"en_ids": collate_en,
             "de_ids": collate_de}
    
    return batch

In [608]:
batch_size = 128
pad_index = pad_index
shuffle = True

trainloader = DataLoader(dataset = train_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

validloader = DataLoader(dataset = train_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

testloader = DataLoader(dataset = test_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

# 2. Define Model

## 2-1. Model Structure

In [609]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim: int, 
                 embedding_dim: int, 
                 hidden_dim: int, 
                 num_layers: int, 
                 dropout_rate: float, 
                 pad_idx: int):
        
        super(Encoder, self).__init__()
        
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.pad_idx = pad_idx
        
        self.embedding = nn.Embedding(num_embeddings=self.input_dim, 
                                      embedding_dim=self.embedding_dim, 
                                      padding_idx=self.pad_idx)
        
        self.LSTM = nn.LSTM(input_size=self.embedding_dim, 
                            hidden_size=self.hidden_dim, 
                            num_layers=self.num_layers, 
                            dropout=self.dropout_rate, 
                            batch_first=True)
        
        self.dropout = nn.Dropout(p=self.dropout_rate)

    def forward(self, 
                inputs: Annotated[Tensor, 'batch_size, seq_length'],
                h0:     Annotated[Tensor, 'num_layers, batch_size, hidden_dim'],
                c0:     Annotated[Tensor, 'num_layers, batch_size, hidden_dim']) -> Tuple[Annotated[Tensor, 'num_layers, batch_size, hidden_dim'], 
                                                                                          Annotated[Tensor, 'num_layers, batch_size, hidden_dim']]:

        # x: Tensor, 'batch_size, seq_length, embedding_dim'
        x = self.dropout(self.embedding(inputs))

        # output: Tensor, 'batch_size, seq_length, hidden_dim'
        # h: Tensor, 'num_layers, batch_size, hidden_dim'
        # c: Tensor, 'num_layers, batch_size, hidden_dim'
        output, (h, c) = self.LSTM(x, (h0, c0))
        

        
        return h, c

In [610]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim: int, 
                 embedding_dim: int, 
                 hidden_dim: int, 
                 num_layers: int, 
                 dropout_rate: float, 
                 pad_idx: int):
        
        super(Decoder, self).__init__()
        
        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.pad_idx = pad_idx
        
        self.embedding = nn.Embedding(num_embeddings=self.output_dim, 
                                      embedding_dim=self.embedding_dim, 
                                      padding_idx=self.pad_idx)
        
        self.LSTM = nn.LSTM(input_size=self.embedding_dim, 
                            hidden_size=self.hidden_dim, 
                            num_layers=self.num_layers, 
                            dropout=self.dropout_rate, 
                            batch_first=True)
        
        self.dropout = nn.Dropout(p=self.dropout_rate)
        
        self.fc = nn.Linear(in_features=self.hidden_dim, 
                            out_features=self.output_dim)

    def forward(self, 
                inputs: Annotated[Tensor, "batch_size, 1"], 
                h:      Annotated[Tensor, "num_layers, batch_size, hidden_dim"], 
                c:      Annotated[Tensor, "num_layers, batch_size, hidden_dim"]) -> Tuple[Annotated[Tensor, "logit : batch_size, 1, output_dim"],
                                                                                          Annotated[Tensor, "h : num_layers, batch_size, hidden_dim"],
                                                                                          Annotated[Tensor, "c : num_layers, batch_size, hidden_dim"]]:

        # x: Tensor, 'batch_size, 1, embedding_dim'
        x = self.dropout(self.embedding(inputs))
        
        # output: Tensor, 'batch_size, 1, hidden_dim'
        # h: Tensor, 'num_layers, batch_size, hidden_dim' 
        # c: Tensor, 'num_layers, batch_size, hidden_dim' 
        output, (h, c) = self.LSTM(x, (h, c))

        # logit: Tensor, 'batch_size, 1, output_dim'
        logit = self.fc(output)

        return logit, h, c

In [611]:
class seq2seq_Translation(nn.Module):
    def __init__(self, Encoder, Decoder):
        super().__init__()
        self.Encoder = Encoder
        self.Decoder = Decoder

        assert (
            self.Encoder.hidden_dim == self.Decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            self.Encoder.num_layers == self.Decoder.num_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, 
                source: Annotated[Tensor, 'batch_size, seq_length'], 
                target: Annotated[Tensor, 'batch_size, seq_length'], 
                teacher_forcing_ratio: float)                                      -> Annotated[Tensor, 'batch_size, seq_length, output_dim']:

        target_batch_size = target.size(0)
        target_seq_length = target.size(1)
        target_vocab_size = self.Decoder.output_dim

        source_batch_size = source.size(0)

        logit_seq = torch.zeros(target_batch_size, target_seq_length, target_vocab_size).to(device)

        h0 = torch.zeros(self.Encoder.num_layers, source_batch_size, self.Encoder.hidden_dim).to(device)
        c0 = torch.zeros(self.Encoder.num_layers, source_batch_size, self.Encoder.hidden_dim).to(device)

        # h_encoder: Tensor, 'num_layers, batch_size, hidden_dim'
        # c_encoder: Tensor, 'num_layers, batch_size, hidden_dim'         
        h_encoder, c_encoder = self.Encoder(source, h0, c0)

        # x: Tensor, 'batch_size, 1'
        x = target[:, 0].unsqueeze(1)

        h, c = h_encoder, c_encoder
        for t in range(1, target_seq_length):

            # logit: Tensor, 'batch_size, 1, output_dim'
            # h: Tensor, 'num_layers, batch_size, hidden_dim'
            # c: Tensor, 'num_layers, batch_size, hidden_dim'
            logit, h, c = self.Decoder(x, h, c)

            # logit_seq: Tensor, 'batch_size, seq_length, output_dim'
            logit_seq[:, t, :] = logit.squeeze(1)
            
            teacher_force = random.random() < teacher_forcing_ratio

            # predicted_vocab: Tensor, 'batch_size, 1'
            predicted_vocab = logit.argmax(2)

            # x: Tensor, 'batch_size, 1, output_dim'
            x = target[:, t].unsqueeze(1) if teacher_force else predicted_vocab

        return logit_seq

## 2-2. Hyperparameter & functions

In [612]:
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 300
decoder_embedding_dim = 300
hidden_dim = 128
num_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
pad_index = pad_index
lr = 5e-4

encoder = Encoder(input_dim= input_dim,
                  embedding_dim= encoder_embedding_dim,
                  hidden_dim= hidden_dim,
                  num_layers= num_layers,
                  dropout_rate= encoder_dropout,
                  pad_idx= pad_index)

decoder = Decoder(output_dim= output_dim,
                  embedding_dim= decoder_embedding_dim,
                  hidden_dim= hidden_dim,
                  num_layers= num_layers,
                  dropout_rate= decoder_dropout,
                  pad_idx= pad_index)

model = seq2seq_Translation(encoder, decoder)

pprint("Model's state_dict:")
for name, param in encoder.named_parameters():
    print(f"Parameter name: {name}")
    print(f"    Size : {param.size()}")
    print(f"    Value: {param}")


criterion = nn.CrossEntropyLoss(ignore_index= pad_index)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

"Model's state_dict:"
Parameter name: embedding.weight
    Size : torch.Size([18669, 300])
    Value: Parameter containing:
tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.2539,  0.9364,  0.7122],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        ...,
        [ 1.3966, -0.0912, -0.7094,  ...,  1.3549,  0.0225, -0.0448],
        [-0.2576,  1.1460, -0.1860,  ...,  0.1697, -0.1309,  0.7459],
        [ 0.2618, -0.9730, -0.8111,  ..., -1.0894,  0.3437, -0.2883]],
       requires_grad=True)
Parameter name: LSTM.weight_ih_l0
    Size : torch.Size([512, 300])
    Value: Parameter containing:
tensor([[ 0.0056,  0.0860,  0.0265,  ...,  0.0139, -0.0010, -0.0183],
        [ 0.0641,  0.0734, -0.0102,  ..., -0.0379, -0.0698, -0.0288],
        [-0.0585, -0.0168,  0.0138,  ...,  0.0799, -0.0206,  0.0139],
        ...,
        [-0.0581, -0.0245, -0.0166,  ...,  0.0131, -0.0610,  0.0242],
        [ 0.0712, -0.

## 2-3. Weight Initialization

In [613]:
def initialize_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            torch.nn.init.xavier_uniform_(param)
        if 'bias' in name:
            nn.init.constant_(param, 0.0)

In [614]:
model.apply(initialize_weights)

seq2seq_Translation(
  (Encoder): Encoder(
    (embedding): Embedding(18669, 300, padding_idx=1)
    (LSTM): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (Decoder): Decoder(
    (embedding): Embedding(9797, 300, padding_idx=1)
    (LSTM): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=128, out_features=9797, bias=True)
  )
)

In [615]:
pprint("Model's state_dict:")
for name, param in model.named_parameters():
    print(f"Parameter name: {name}")
    print(f"    Size : {param.size()}")
    print(f"    Value: {param}")

"Model's state_dict:"
Parameter name: Encoder.embedding.weight
    Size : torch.Size([18669, 300])
    Value: Parameter containing:
tensor([[ 0.0176,  0.0123, -0.0130,  ...,  0.0082,  0.0014,  0.0100],
        [ 0.0031, -0.0149,  0.0064,  ...,  0.0079, -0.0143, -0.0139],
        [-0.0052, -0.0144, -0.0024,  ..., -0.0143, -0.0132,  0.0173],
        ...,
        [ 0.0121,  0.0175, -0.0127,  ...,  0.0029, -0.0003, -0.0037],
        [ 0.0057, -0.0128, -0.0040,  ..., -0.0007,  0.0061, -0.0090],
        [ 0.0131,  0.0146,  0.0012,  ...,  0.0006, -0.0146,  0.0121]],
       requires_grad=True)
Parameter name: Encoder.LSTM.weight_ih_l0
    Size : torch.Size([512, 300])
    Value: Parameter containing:
tensor([[ 0.0150, -0.0537,  0.0305,  ...,  0.0462,  0.0269,  0.0478],
        [-0.0723, -0.0568, -0.0062,  ...,  0.0343,  0.0125, -0.0799],
        [ 0.0311,  0.0803, -0.0012,  ...,  0.0083,  0.0809,  0.0230],
        ...,
        [ 0.0247,  0.0680, -0.0270,  ..., -0.0469, -0.0583,  0.0703],
     

In [616]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 10,508,125 trainable parameters


In [617]:
model.to(device)
criterion.to(device)

CrossEntropyLoss()

# 3. Train Model

In [618]:
def train_fn(model, trainloader, criterion, optimizer, clip, teacher_forcing_ratio):
        
    model.train()
    
    running_loss = 0.0
    
    for batch in tqdm(trainloader):
        
        # source: Tensor, 'batch_size, seq_length'
        # target: Tensor, 'batch_size, seq_length'
        source = batch['de_ids'].to(device)
        target = batch['en_ids'].to(device)
        
        optimizer.zero_grad()

        # logit_seq: Tensor, 'batch_size, seq_length, output_dim'
        logit_seq = model(source = source, 
                          target = target, 
                          teacher_forcing_ratio = teacher_forcing_ratio)
        
        output_dim = model.Decoder.output_dim

        # logits = 'batch_size * seq_length - 1), output_dim'
        # target: Tensor, 'batch_size * (seq_length - 1)'
        logits = logit_seq[:, 1:, :].reshape(-1, output_dim)
        target = target[:, 1:].reshape(-1)

        loss = criterion(logits, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item() * source.size(0)

    epoch_loss = running_loss / len(trainloader.dataset)
    
    return epoch_loss

In [619]:
def evaluate_fn(model, validloader, criterion, teacher_forcing_ratio):
    
    model.eval()
    
    running_loss = 0.0
    
    with torch.no_grad():
        for batch in tqdm(validloader):

            # source: Tensor, 'batch_size, seq_length'
            # target: Tensor, 'batch_size, seq_length'
            source = batch['de_ids'].to(device)
            target = batch['en_ids'].to(device)

            # logit_seq: Tensor, 'batch_size, seq_length, output_dim'
            logit_seq = model(source = source, 
                              target = target, 
                              teacher_forcing_ratio = teacher_forcing_ratio)

            output_dim = model.Decoder.output_dim

            # logits: Tensor, 'batch_size * (seq_length - 1), output_dim'
            # target: Tensor, 'batch_size * (seq_length - 1)'
            logits = logit_seq[:, 1:, :].reshape(-1, output_dim)
            target = target[:, 1:].reshape(-1)

            loss = criterion(logits, target)
            
            running_loss += loss.item() * source.size(0)
        
        epoch_loss = running_loss / len(validloader.dataset)
        
    return epoch_loss

In [620]:
def train_model(model, criterion, optimizer, trainloader, validloader, clip, teacher_forcing_ratio, num_epochs):
    print("-----Training Started------")

    best_valid_loss = float("inf")
    
    for epoch in range(num_epochs):

        print(f"Epoch [{epoch+1}/{num_epochs}]")
              
        train_loss = train_fn(model= model,
                              trainloader= trainloader,
                              criterion= criterion,
                              optimizer= optimizer,
                              clip = clip,
                              teacher_forcing_ratio= teacher_forcing_ratio)
        
        valid_loss = evaluate_fn(model= model,
                                 validloader= validloader,
                                 criterion= criterion,
                                 teacher_forcing_ratio= 0.0)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss

            torch.save(model.state_dict(), model_dir)
            
        print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
        print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")
    
    print("-----Training Completed-----")

In [621]:
num_epochs = 16

clip = 1.0
teacher_forcing_ratio = 0.5

train_model(model, criterion, optimizer, trainloader, validloader, clip, teacher_forcing_ratio, num_epochs)

-----Training Started------
Epoch [1/16]


100%|██████████| 227/227 [00:25<00:00,  8.75it/s]
100%|██████████| 227/227 [00:15<00:00, 14.35it/s]


	Train Loss:   5.966 | Train PPL: 389.767
	Valid Loss:   5.352 | Valid PPL: 210.935
Epoch [2/16]


100%|██████████| 227/227 [00:26<00:00,  8.59it/s]
100%|██████████| 227/227 [00:15<00:00, 14.47it/s]


	Train Loss:   5.309 | Train PPL: 202.160
	Valid Loss:   5.234 | Valid PPL: 187.585
Epoch [3/16]


100%|██████████| 227/227 [00:25<00:00,  8.82it/s]
100%|██████████| 227/227 [00:15<00:00, 14.40it/s]


	Train Loss:   5.180 | Train PPL: 177.697
	Valid Loss:   5.095 | Valid PPL: 163.204
Epoch [4/16]


100%|██████████| 227/227 [00:25<00:00,  8.83it/s]
100%|██████████| 227/227 [00:15<00:00, 14.34it/s]


	Train Loss:   4.995 | Train PPL: 147.710
	Valid Loss:   5.012 | Valid PPL: 150.146
Epoch [5/16]


100%|██████████| 227/227 [00:25<00:00,  8.87it/s]
100%|██████████| 227/227 [00:15<00:00, 14.44it/s]


	Train Loss:   4.832 | Train PPL: 125.488
	Valid Loss:   4.991 | Valid PPL: 147.093
Epoch [6/16]


100%|██████████| 227/227 [00:26<00:00,  8.65it/s]
100%|██████████| 227/227 [00:15<00:00, 14.43it/s]


	Train Loss:   4.712 | Train PPL: 111.233
	Valid Loss:   4.934 | Valid PPL: 138.959
Epoch [7/16]


100%|██████████| 227/227 [00:26<00:00,  8.58it/s]
100%|██████████| 227/227 [00:15<00:00, 14.35it/s]


	Train Loss:   4.602 | Train PPL:  99.667
	Valid Loss:   4.871 | Valid PPL: 130.504
Epoch [8/16]


100%|██████████| 227/227 [00:26<00:00,  8.53it/s]
100%|██████████| 227/227 [00:15<00:00, 14.41it/s]


	Train Loss:   4.478 | Train PPL:  88.071
	Valid Loss:   4.753 | Valid PPL: 115.910
Epoch [9/16]


100%|██████████| 227/227 [00:26<00:00,  8.56it/s]
100%|██████████| 227/227 [00:15<00:00, 14.31it/s]


	Train Loss:   4.375 | Train PPL:  79.450
	Valid Loss:   4.692 | Valid PPL: 109.051
Epoch [10/16]


100%|██████████| 227/227 [00:26<00:00,  8.59it/s]
100%|██████████| 227/227 [00:15<00:00, 14.36it/s]


	Train Loss:   4.286 | Train PPL:  72.704
	Valid Loss:   4.640 | Valid PPL: 103.579
Epoch [11/16]


100%|██████████| 227/227 [00:26<00:00,  8.65it/s]
100%|██████████| 227/227 [00:15<00:00, 14.31it/s]


	Train Loss:   4.219 | Train PPL:  67.934
	Valid Loss:   4.599 | Valid PPL:  99.336
Epoch [12/16]


100%|██████████| 227/227 [00:25<00:00,  8.79it/s]
100%|██████████| 227/227 [00:15<00:00, 14.42it/s]


	Train Loss:   4.164 | Train PPL:  64.307
	Valid Loss:   4.556 | Valid PPL:  95.200
Epoch [13/16]


100%|██████████| 227/227 [00:26<00:00,  8.55it/s]
100%|██████████| 227/227 [00:15<00:00, 14.28it/s]


	Train Loss:   4.105 | Train PPL:  60.631
	Valid Loss:   4.518 | Valid PPL:  91.676
Epoch [14/16]


100%|██████████| 227/227 [00:26<00:00,  8.57it/s]
100%|██████████| 227/227 [00:15<00:00, 14.31it/s]


	Train Loss:   4.047 | Train PPL:  57.201
	Valid Loss:   4.494 | Valid PPL:  89.507
Epoch [15/16]


100%|██████████| 227/227 [00:26<00:00,  8.58it/s]
100%|██████████| 227/227 [00:15<00:00, 14.43it/s]


	Train Loss:   4.004 | Train PPL:  54.794
	Valid Loss:   4.434 | Valid PPL:  84.252
Epoch [16/16]


100%|██████████| 227/227 [00:26<00:00,  8.54it/s]
100%|██████████| 227/227 [00:15<00:00, 14.29it/s]


	Train Loss:   3.960 | Train PPL:  52.479
	Valid Loss:   4.379 | Valid PPL:  79.739
-----Training Completed-----


# 4. Test

In [623]:
test_loss = evaluate_fn(model, testloader, criterion, teacher_forcing_ratio = 0)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

100%|██████████| 8/8 [00:00<00:00, 13.65it/s]

| Test Loss: 4.603 | Test PPL:  99.784 |





# 5. Inference