In [1]:
!pip install datasets --quiet
!pip install evaluate --quiet

In [2]:
import numpy as np 
import pandas as pd 
import os

In [3]:
import torch  
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data 
import math
import copy

filepath = '/kaggle/working/'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
import kagglehub

path = kagglehub.dataset_download("hungnm/englishvietnamese-translation")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/englishvietnamese-translation


In [5]:
import os
import pandas as pd
from datasets import Dataset

data_path = '/kaggle/input/englishvietnamese-translation/'

with open(os.path.join(data_path, 'en_sents'), 'r') as f:
    en_sents = f.readlines()

with open(os.path.join(data_path, 'vi_sents'), 'r') as f:
    vi_sents = f.readlines()

en_sents = [sent.strip() for sent in en_sents]
vi_sents = [sent.strip() for sent in vi_sents]

data_df = pd.DataFrame({
    'en': en_sents,
    'vi': vi_sents
})
print(data_df.head())
data= Dataset.from_pandas(data_df)

print(data)


                                                  en  \
0         Please put the dustpan in the broom closet   
1                             Be quiet for a moment.   
2                                          Read this   
3  Tom persuaded the store manager to give him ba...   
4        Friendship consists of mutual understanding   

                                                  vi  
0      xin vui lòng đặt người quét rác trong tủ chổi  
1                                    im lặng một lát  
2                                            đọc này  
3  tom thuyết phục người quản lý cửa hàng trả lại...  
4             tình bạn bao gồm sự hiểu biết lẫn nhau  
Dataset({
    features: ['en', 'vi'],
    num_rows: 254090
})


In [6]:
from datasets import Dataset, DatasetDict

# Split the dataset into train, validation, and test sets (e.g., 80% train, 10% validation, 10% test)
train_data, temp_data = data.train_test_split(test_size=0.2).values()  
valid_data, test_data = temp_data.train_test_split(test_size=0.5).values()  

split_data = DatasetDict({
    'train': train_data,
    'validation': valid_data,
    'test': test_data
})

# Check the splits
print(f"Train data: {len(split_data['train'])}")
print(f"Validation data: {len(split_data['validation'])}")
print(f"Test data: {len(split_data['test'])}")

print("Train data example:", split_data['train'][0])
print("Validation data example:", split_data['validation'][0])
print("Test data example:", split_data['test'][0])


Train data: 203272
Validation data: 25409
Test data: 25409
Train data example: {'en': 'He feels a good deal better than yesterday', 'vi': 'anh ấy cảm thấy một thỏa thuận tốt hơn so với ngày hôm qua'}
Validation data example: {'en': 'Your arms and legs have grown strong enough', 'vi': 'cánh tay và chân của bạn đã phát triển đủ mạnh'}
Test data example: {'en': 'You should just talk to Tom.', 'vi': 'bạn chỉ nên nói chuyện với tom.'}


In [7]:
SOURCE_LANG = 'en'
TARGET_LANG = 'vi'

UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3 
SPECIAL_SYMBOLS = ['<unk>', '<pad>', '<sos>', '<eos>']

In [8]:
pip install torch==2.3.0 torchtext==0.18.0


Note: you may need to restart the kernel to use updated packages.


In [9]:
import torch
import torchtext

print(torch.__version__)
print(torchtext.__version__)


2.3.0+cu121
0.18.0+cpu


In [10]:

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

token_transform = {}
vocab = {}

token_transform[SOURCE_LANG] = get_tokenizer('basic_english')
token_transform[TARGET_LANG] = get_tokenizer('basic_english')



In [11]:
token_transform['en']("hello it's me")

['hello', 'it', "'", 's', 'me']

In [12]:
def tokenize_example(example, sos_token, eos_token, token_transform, src_lang, tgt_lang):
    en_tokens = token_transform['en'](example['en'])
    vi_tokens = token_transform['vi'](example['vi'])

    en_tokens = ([sos_token] + en_tokens + [eos_token])
    vi_tokens = ([sos_token] + vi_tokens + [eos_token])

    return {"en_tokens": (en_tokens), "vi_tokens":(vi_tokens)}


In [13]:
fn_kwargs = {
    'sos_token': '<sos>',
    'eos_token': '<eos>',
    'token_transform': token_transform,
    'src_lang': SOURCE_LANG,
    'tgt_lang': TARGET_LANG,
    }
train_data = train_data.map(tokenize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs = fn_kwargs)

Map:   0%|          | 0/203272 [00:00<?, ? examples/s]

Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

In [14]:
print(train_data[0])

{'en': 'He feels a good deal better than yesterday', 'vi': 'anh ấy cảm thấy một thỏa thuận tốt hơn so với ngày hôm qua', 'en_tokens': ['<sos>', 'he', 'feels', 'a', 'good', 'deal', 'better', 'than', 'yesterday', '<eos>'], 'vi_tokens': ['<sos>', 'anh', 'ấy', 'cảm', 'thấy', 'một', 'thỏa', 'thuận', 'tốt', 'hơn', 'so', 'với', 'ngày', 'hôm', 'qua', '<eos>']}


In [15]:

for lang in [SOURCE_LANG, TARGET_LANG]:
    vocab[lang] = build_vocab_from_iterator(
        train_data[lang + '_tokens'],
        min_freq = 1,
        specials = SPECIAL_SYMBOLS,
        special_first = True    # Special tokens get indices 0,1,2,3
    )
    vocab[lang].set_default_index(UNK_IDX) # # Default index for OOV words

In [16]:

print(vocab['vi'].get_itos()[:10])
print(len(vocab['vi']))
print(vocab['en'].get_itos()[:10])
print(len(vocab['en']))

['<unk>', '<pad>', '<sos>', '<eos>', 'tôi', '.', 'bạn', 'không', 'tom', 'có']
6594
['<unk>', '<pad>', '<sos>', '<eos>', '.', "'", 'i', 'the', 'to', 'tom']
19294


In [17]:

def numericalize_example(example, vocab, src_lang, tgt_lang):
    en_ids = torch.tensor(vocab[src_lang].lookup_indices(example['en_tokens']))
    vi_ids = torch.tensor(vocab[tgt_lang].lookup_indices(example['vi_tokens']))

    return {'en_ids': en_ids, 'vi_ids': vi_ids}

In [18]:
fn_kwargs = {
    'vocab': vocab,
    'src_lang': SOURCE_LANG,
    'tgt_lang': TARGET_LANG,
    }
train_data = train_data.map(numericalize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs = fn_kwargs)

Map:   0%|          | 0/203272 [00:00<?, ? examples/s]

Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

In [19]:
train_data[0]

{'en': 'He feels a good deal better than yesterday',
 'vi': 'anh ấy cảm thấy một thỏa thuận tốt hơn so với ngày hôm qua',
 'en_tokens': ['<sos>',
  'he',
  'feels',
  'a',
  'good',
  'deal',
  'better',
  'than',
  'yesterday',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'anh',
  'ấy',
  'cảm',
  'thấy',
  'một',
  'thỏa',
  'thuận',
  'tốt',
  'hơn',
  'so',
  'với',
  'ngày',
  'hôm',
  'qua',
  '<eos>'],
 'en_ids': [2, 15, 1138, 11, 85, 489, 154, 109, 207, 3],
 'vi_ids': [2, 12, 16, 108, 55, 10, 906, 812, 70, 69, 693, 21, 67, 130, 87, 3]}

In [20]:

def collate_fn(batch):
    source_batch = [torch.tensor(sample[SOURCE_LANG + "_ids"]) for sample in batch]
    target_batch = [torch.tensor(sample[TARGET_LANG + "_ids"]) for sample in batch]

    source_batch = nn.utils.rnn.pad_sequence(source_batch, padding_value = PAD_IDX, batch_first = True)
    target_batch = nn.utils.rnn.pad_sequence(target_batch, padding_value = PAD_IDX, batch_first = True)
    return source_batch, target_batch 

In [21]:
from torch.utils.data import DataLoader
BATCH_SIZE = 16
train_data_loader = torch.utils.data.DataLoader(
    dataset = train_data,
    batch_size = BATCH_SIZE,
    collate_fn = collate_fn,
    shuffle = True
)

test_data_loader = DataLoader(
    dataset = test_data,
    batch_size = BATCH_SIZE,
    collate_fn = collate_fn,
)

valid_data_loader = DataLoader(
    dataset = valid_data,
    batch_size = BATCH_SIZE,
    collate_fn = collate_fn,
)

In [22]:

class MultiHeadAttention(nn.Module):

    def __init__(self, attention_head, model_dimension):
        super(MultiHeadAttention, self).__init__()
        assert model_dimension % attention_head == 0, "dimension of model must be divisible by the attention head"

        self.attention_head = attention_head
        self.model_dimension = model_dimension
        self.d_k = self.model_dimension // self.attention_head

        self.W_q = nn.Linear(model_dimension, model_dimension, bias = False) # Query transformation
        self.W_k = nn.Linear(model_dimension, model_dimension, bias = False) # Key transformation
        self.W_v = nn.Linear(model_dimension, model_dimension, bias = False) # Value transformation
        self.W_o = nn.Linear(model_dimension, model_dimension) # Output transformation

    def scaled_dot_products(self, Q, K, V, mask = None):

        
        attention_score = (Q @ K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            mask = mask.unsqueeze(1)
            attention_score = attention_score.masked_fill(mask == 0, value = -1e9)

        # Attention_probability is computed via softmax function
        attention_probability = torch.softmax(attention_score, dim = -1)

        output = (attention_probability @ V)
        return output


    def split_heads(self, X):

        batch_size, seq_len, model_dimension = X.shape
        output = X.view(batch_size, seq_len, self.attention_head, self.d_k)
        output = output.transpose(1, 2)
        return output


    def combine_heads(self, X):
        batch_size, _, seq_len, d_k = X.shape
        output = X.transpose(1, 2).contiguous().view(batch_size, seq_len, self.model_dimension)
        return output

    def forward(self, Q, K, V, mask = None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        attention_output = self.scaled_dot_products(Q, K, V, mask)

        output = self.W_o(self.combine_heads(attention_output))  #to mix information from all heads.
        return output 

In [23]:
class PositionWiseFeedForwardNetwork(nn.Module):
    def __init__(self, model_dimension, feed_forward_dimension):

        super(PositionWiseFeedForwardNetwork, self).__init__()
        self.model_dimension = model_dimension
        self.feed_forward_dimension = feed_forward_dimension

        self.fc1 = nn.Linear(model_dimension, feed_forward_dimension)
        self.fc2 = nn.Linear(feed_forward_dimension, model_dimension)
        self.relu = nn.ReLU()

    def forward(self, X):
        return self.fc2(self.relu(self.fc1(X)))

In [24]:

class PositionalEncoding(nn.Module):
    def __init__(self, model_dimension, max_seq_len, dropout):
        super(PositionalEncoding, self).__init__()
        self.model_dimension = model_dimension
        self.max_seq_len = max_seq_len
        self.dropout = nn.Dropout(dropout)
        positional_encoding = torch.zeros(max_seq_len, model_dimension)
        # Each row will store the position info for one word position.

        position = torch.arange(0, max_seq_len, dtype = torch.float).unsqueeze(1)

        # Creates numbers [0, 1, 2, ..., max_seq_len-1] for word positions.

        # position has shape (max_seq_len, 1)
        # positional_encoding has shape (max_seq_len, model_dimension)
        div_term = torch.exp(torch.arange(0, model_dimension, 2).float() * -(math.log(10000.0) / model_dimension))
#Even indices (0,2,4...): Sine waves.

# Odd indices (1,3,5...): Cosine waves.

# Result: Each dimension gets a unique wavelength, capturing position information

        positional_encoding[:, 0::2] = torch.sin(position * div_term)
        positional_encoding[:, 1::2] = torch.cos(position * div_term)
        #Uses mathematical waves to encode positions

        positional_encoding = positional_encoding.unsqueeze(0)
        
        self.register_buffer('pe', positional_encoding) #Not updated during backpropagation


    def forward(self, X):
        return self.dropout(X + (self.pe[:, :X.shape[1], :]))
    

    

In [25]:
class EncoderBlock(nn.Module):
    def __init__(self, model_dimension, attention_heads, feed_forward_dimension, dropout):
        super(EncoderBlock, self).__init__()

        self.attention = MultiHeadAttention(attention_heads, model_dimension)
        self.feed_forward_network = PositionWiseFeedForwardNetwork(model_dimension, feed_forward_dimension)
        self.norm1 = nn.LayerNorm(model_dimension)
        self.norm2 = nn.LayerNorm(model_dimension)

        self.dropout = nn.Dropout(dropout)

    def forward(self, X, mask):
        # X is a tensor of shape (batch_size, seq_len, model_dimmension)
        attention_output = self.attention(X, X, X, mask)
        X = self.norm1(X + self.dropout(attention_output))
        feed_forward_output = self.feed_forward_network(X)
        X = self.norm2(X + self.dropout(feed_forward_output))

        # Return tensor is the same size
        return X


In [26]:
class DecoderBlock(nn.Module):
    def __init__(self, model_dimension, attention_heads, feed_forward_dimension, dropout):
        super(DecoderBlock, self).__init__()

        self.attention = MultiHeadAttention(attention_heads, model_dimension)
        self.feed_forward_network = PositionWiseFeedForwardNetwork(model_dimension, feed_forward_dimension)
        self.norm1 = nn.LayerNorm(model_dimension)
        self.norm2 = nn.LayerNorm(model_dimension)
        self.norm3 = nn.LayerNorm(model_dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X, encoder_output, source_mask, target_mask):

        attention_output = self.attention(X, X, X, target_mask)
        X = self.norm1(X + self.dropout(attention_output))
 
        attention_output = self.attention(X, encoder_output, encoder_output, source_mask)
        
        X = self.norm2(X + self.dropout(attention_output))
            
        feed_forward_output = self.feed_forward_network(X)
        X = self.norm3(X + self.dropout(feed_forward_output))

        return X

In [27]:
class Transformer(nn.Module):
    def __init__(self, model_dimension, attention_heads, feed_forward_dimension,
                 source_vocab_size, target_vocab_size, num_layers, max_seq_len, dropout):
        super(Transformer, self).__init__()

        self.model_dimension = model_dimension
        self.attention_heads = attention_heads
        self.feed_forward_dimension = feed_forward_dimension

        self.positional_encoding = PositionalEncoding(model_dimension, max_seq_len, dropout)
        #Maps source token IDs to embeddings of size model_dimension
        self.encoder_embedding = nn.Embedding(source_vocab_size, model_dimension)
        #Maps target token IDs to embeddings of size model_dimension
        self.decoder_embedding = nn.Embedding(target_vocab_size, model_dimension)
        self.encoders = nn.ModuleList([EncoderBlock(model_dimension, attention_heads, feed_forward_dimension, dropout) for _ in range(num_layers)])
        self.decoders = nn.ModuleList([DecoderBlock(model_dimension, attention_heads, feed_forward_dimension, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)
        #Final projection layer to target_vocab_size (for predicting output tokens).
        self.fc = nn.Linear(model_dimension, target_vocab_size)
    
    
    def generate_mask(self, source_sentence, target_sentence):
#   SRC-> (batch_size, src_seq_len)
#TARGET -> (batch_size, tgt_seq_len)

        batch_size = source_sentence.shape[0]
        max_target_len = target_sentence.shape[1]

        #zero out attention scores for padding tokens before softmax.
        source_mask = (source_sentence != PAD_IDX).unsqueeze(1).int().to(device)
        
        # target_mask has shape (batch_size, 1, max_tgt_len)
        #HANDLE VARIABLE LENGTHS
        target_mask = (target_sentence != PAD_IDX).unsqueeze(1).int().to(device)

        #Look-Ahead Mask: Upper triangular matrix (0 for future tokens, 1 for past/current).
        no_peak_mask = 1 - torch.triu(torch.ones((1, max_target_len, max_target_len)), diagonal=1).type(torch.int).to(device)

        target_mask = target_mask & no_peak_mask

        return source_mask, target_mask

    def forward(self, source_sentence, target_sentence):

        source_mask, target_mask = self.generate_mask(source_sentence, target_sentence)
        #Maps source token IDs to embeddings of size model_dimension,ADD POS ENCODING AND DROPOUT
        source_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(source_sentence)))
        target_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(target_sentence)))

        encoder_output = source_embedded
        for encoder_layer in self.encoders:
            encoder_output = encoder_layer(encoder_output, source_mask)
        
        decoder_output = target_embedded
        for decoder_layer in self.decoders:
            decoder_output = decoder_layer(decoder_output, encoder_output, source_mask, target_mask)

        output = self.fc(decoder_output)
        return output 

In [28]:
model_dimension = 512 #Dimensionality of token embeddings and model outputs
attention_heads = 8
feed_forward_dimension = 2048
source_vocab_size = len(vocab[SOURCE_LANG])
target_vocab_size = len(vocab[TARGET_LANG])
num_layers = 6
max_seq_len = 1000
dropout = 0.1


model = Transformer(
    model_dimension,
    attention_heads,
    feed_forward_dimension,
    source_vocab_size,
    target_vocab_size,
    num_layers,
    max_seq_len,
    dropout,
)

model= nn.DataParallel(model) #Enables parallel training across multiple GPUs
model.to(device)

DataParallel(
  (module): Transformer(
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder_embedding): Embedding(19294, 512)
    (decoder_embedding): Embedding(6594, 512)
    (encoders): ModuleList(
      (0-5): 6 x EncoderBlock(
        (attention): MultiHeadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=False)
          (W_k): Linear(in_features=512, out_features=512, bias=False)
          (W_v): Linear(in_features=512, out_features=512, bias=False)
          (W_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (feed_forward_network): PositionWiseFeedForwardNetwork(
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affin

In [29]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 54,453,698 trainable parameters


In [30]:
optimizer = optim.Adam(model.parameters(), lr = 1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [31]:
def train_fn(model, data_loader, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        
        source_sentence = batch[0].to(device)
        target_sentence = batch[1].to(device)

        batch_size = source_sentence.shape[0]
        target_len = target_sentence.shape[1]

        target_input = target_sentence[:, :-1] 
        target_output = target_sentence[:, 1:] 

        optimizer.zero_grad()

        output = model(source_sentence, target_input)
        output = output.reshape(batch_size * (target_len - 1), output.shape[-1])
        target_output = target_output.reshape(batch_size * (target_len - 1))
        loss = criterion(output, target_output)
        
        if i % (len(data_loader) // 10) == 0:
            print(i // (len(data_loader) // 10), "%", end = ' ') 
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) #Prevents exploding gradients 
        optimizer.step() #Updates model weights 
        epoch_loss += loss.item()
        
    print()
    return epoch_loss / len(data_loader)

In [32]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            source_sentence = batch[0].to(device)
            target_sentence = batch[1].to(device)

            # source_sentence has shape (batch_size, max_src_len)
            # target_sentence has shape (batch_size, max_tgt_len)
            
            batch_size = source_sentence.shape[0]
            target_len = target_sentence.shape[1]
            target_input = target_sentence[:, :-1]
            target_output = target_sentence[:, 1:]
            
            output = model(source_sentence, target_input)
            output = output.reshape(batch_size * (target_len - 1), output.shape[-1])
            target_output = target_output.reshape(batch_size * (target_len - 1))
            
            loss = criterion(output, target_output)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [33]:
import tqdm

n_epochs = 3
clip = 1
best_valid_loss = float("inf")
for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(model, train_data_loader, optimizer, criterion, clip, device)
    valid_loss = evaluate_fn(model, valid_data_loader, criterion, device)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), filepath + "model.pth")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|          | 0/3 [00:00<?, ?it/s]

0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 33%|███▎      | 1/3 [26:08<52:16, 1568.27s/it]

	Train Loss:   1.859 | Train PPL:   6.419
	Valid Loss:   1.025 | Valid PPL:   2.788
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 67%|██████▋   | 2/3 [52:20<26:10, 1570.66s/it]

	Train Loss:   0.908 | Train PPL:   2.479
	Valid Loss:   0.760 | Valid PPL:   2.139
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


100%|██████████| 3/3 [1:18:25<00:00, 1568.66s/it]

	Train Loss:   0.677 | Train PPL:   1.969
	Valid Loss:   0.669 | Valid PPL:   1.953





In [34]:
filepath = '/kaggle/working/'
torch.save(model.state_dict(), filepath + "model.pt")

In [35]:
filepath = '/kaggle/working/'
model.load_state_dict(torch.load(filepath + "model.pt", map_location = device))
model

DataParallel(
  (module): Transformer(
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder_embedding): Embedding(19294, 512)
    (decoder_embedding): Embedding(6594, 512)
    (encoders): ModuleList(
      (0-5): 6 x EncoderBlock(
        (attention): MultiHeadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=False)
          (W_k): Linear(in_features=512, out_features=512, bias=False)
          (W_v): Linear(in_features=512, out_features=512, bias=False)
          (W_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (feed_forward_network): PositionWiseFeedForwardNetwork(
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affin

In [36]:
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"\tTest Loss: {test_loss:7.3f} | Train PPL: {np.exp(test_loss):7.3f}")

	Test Loss:   0.671 | Train PPL:   1.957


In [37]:
# identifies which positions in the input sentence are padding tokens
def get_pad_mask(sentence, pad_idx):
    mask = (sentence != pad_idx).unsqueeze(1).int().to(device)
    return mask

#prevent the decoder from attending to future 
def get_no_peak_mask(sentence):
    sentence_len = sentence.shape[1]
    no_peak_mask = 1 - torch.triu(torch.ones((1, sentence_len, sentence_len)), diagonal=1).type(torch.int).to(device)
    return no_peak_mask 

In [38]:
import torch.nn.functional as F

#Perform greedy decoding using a trained Transformer model.
def greedy_decode(model, sentence, max_len = 100):
    model.eval()
    
    input_tokens = token_transform['en'](sentence)

    input_ids = [SOS_IDX] + vocab['en'].lookup_indices(input_tokens) + [EOS_IDX]

    input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)

    input_mask = get_pad_mask(input_tensor, PAD_IDX)
    
    with torch.no_grad():
        input_embedding = model.module.encoder_embedding(input_tensor)
        input_embedding = model.module.positional_encoding(input_embedding)
        encoder_output = input_embedding

        #SELF ATTENTION,FFN AND ADD & NORM
        for encoder_layer in model.module.encoders:
            encoder_output = encoder_layer(encoder_output, input_mask)
            
    output_ids = [SOS_IDX]

    for i in range(max_len):
        output_tensor = torch.tensor(output_ids).unsqueeze(0).to(device)
        output_mask = get_pad_mask(output_tensor, PAD_IDX) & get_no_peak_mask(output_tensor)
        with torch.no_grad():
            output_embedding = model.module.decoder_embedding(output_tensor)
            output_embedding = model.module.positional_encoding(output_embedding)
            decoder_output = output_embedding

            for decoder_layer in model.module.decoders:
                decoder_output = decoder_layer(decoder_output, encoder_output, input_mask, output_mask)
            output = model.module.fc(decoder_output)
            
        output = F.softmax(output, dim = -1) 
        #elect the token with the highest probability using argmax().
        output_id = output.argmax(dim = -1)[:, -1].item()
        output_ids.append(output_id)

# The End of Sentence (EOS) token is generated.
        if len(output_ids) > max_len or output_id == EOS_IDX:
            break
    
    output_tokens = [vocab['vi'].get_itos()[idx] for idx in output_ids]
    return output_tokens 

In [39]:
text = 'i am studying artificial intelligence'
print(' '.join(greedy_decode(model, text, 500)))

<sos> tôi đang học thông minh <eos>


In [40]:
translations = [greedy_decode(model, example['en'].lower()) for example in test_data]

In [41]:
references = [example['vi_tokens'] for example in test_data]

In [42]:
print(references[0])
print(translations[0])

['<sos>', 'bạn', 'chỉ', 'nên', 'nói', 'chuyện', 'với', 'tom', '.', '<eos>']
['<sos>', 'bạn', 'chỉ', 'nên', 'nói', 'chuyện', 'với', 'tom', '<eos>']


In [43]:
predictions = [example[1:-1] for example in translations]
references = [example[1: -1] for example in references]

In [44]:
references = [[example] for example in references]
print(predictions[900])
print(references[900])

['tôi', 'sẽ', 'không', 'bỏ', 'lỡ', 'nó', 'vì', 'bất', 'cứ', 'điều', 'gì', '.']
[['tôi', 'sẽ', 'không', 'bỏ', 'lỡ', 'nó', 'cho', 'bất', 'cứ', 'điều', 'gì', '.']]


In [45]:
from torchtext.data.metrics import bleu_score

# focusing on the overlap of n-grams (sequences of words).
score = bleu_score(predictions, references)
print(score)

0.6441022563551347
