In [1]:
from torch.utils.data import Dataset, DataLoader
import torch
import math
from torch import nn
import torch.nn.functional as F

def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

# Device picked once at start-up and reused by later cells.
device = get_device()

In [None]:
# Read the parallel corpus: one sentence per line, English and Vietnamese in separate files.
# The files are iterated line by line instead of using str.splitlines(): splitlines() also breaks
# on separators such as \x0b, \x0c, \x85 and \u2028, so a sentence containing one of them would be
# split in one file only and shift that file's line numbers relative to the other.
# NOTE(review): this may be the cause of the length mismatch reported below - confirm on the data.
# The encoding is given explicitly so the Vietnamese text does not depend on the platform default.
with open("/kaggle/input/phomt-donotpublic/PhoMT/tokenization/train/train.en", 'r', encoding="utf-8") as f:
    eng_sentences = [line.rstrip("\n") for line in f]

with open("/kaggle/input/phomt-donotpublic/PhoMT/tokenization/train/train.vi", 'r', encoding="utf-8") as f:
    vie_sentences = [line.rstrip("\n") for line in f]

In [None]:
# Line counts of the Vietnamese and English files, printed in that order.
print(len(vie_sentences), len(eng_sentences), sep="\n")

I will limit how much of the data is displayed here, as required by the agreement I accepted in order to use this dataset.

In [None]:
# Trying to find mismatch in indexes since lengths are different
# index = len(eng_sentences)//100000*93000
# print(index)
# print(eng_sentences[index])
# print("-------------------------")
# print(vie_sentences[index])

# It seems the mismatch happens somewhere between these two indexes.
# Using only the first 92% of the data is good enough for me; it is simpler than locating the exact mismatch.
# index = len(eng_sentences)//100000*92000
# print(index)
# print(eng_sentences[index])
# print("-------------------------")
# print(vie_sentences[index])

Only the first 92% of the data is assumed to be correctly aligned (i.e. not mislabeled).

In [4]:
from transformers import BertTokenizer, AutoTokenizer

# Pretrained tokenizers, one per language: the uncased BERT vocabulary for English and the
# PhoBERT vocabulary for Vietnamese. Only the tokenizers are loaded here, not the models.
eng_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
vie_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [6]:
import os
from tqdm import tqdm
import pandas as pd
import re
import string
import unicodedata

def unicode_to_ascii(s):
    """Strip combining marks (Unicode category 'Mn') from `s` after NFD decomposition.

    Accented Latin letters lose their accents; characters that do not decompose are kept as-is.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


# Ordered (pattern, replacement) rewrites applied by clean_eng_text. Order matters: the specific
# "won't"/"can't" rules must run before the generic "n't" rule.
_ENG_REWRITES = (
    (r"i'm", "i am"),
    (r"\r", ""),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),      # was rewritten to "that is" by mistake
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped-g forms ("goin'") only: the apostrophe must end the word, otherwise possessives
    # such as "john's" were turned into "johngs".
    (r"n'(?!\w)", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
)


def clean_eng_text(text):
    """Normalise an English sentence before tokenization.

    Steps, in order: lowercase and strip, remove accents, expand common contractions, delete all
    ASCII punctuation, turn every remaining non-word character into a space, and finally delete
    every whitespace-delimited token that contains a digit (together with the spaces after it).

    Args:
        text: raw sentence.

    Returns:
        The cleaned sentence as a string.
    """
    text = unicode_to_ascii(text.lower().strip())
    for pattern, replacement in _ENG_REWRITES:
        text = re.sub(pattern, replacement, text)
    # string.punctuation already contains every character of the former hand-written
    # punctuation class, so a single translate pass is enough.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r"(\W)", " ", text)
    # Raw string: '\S' and '\d' are invalid escape sequences in a normal string literal.
    text = re.sub(r"\S*\d\S*\s*", "", text)
    return text

def clean_vie_text(text):
    """Normalise a Vietnamese sentence before tokenization.

    Lowercases and strips the text, deletes all ASCII punctuation, then deletes every
    whitespace-delimited token that contains a digit (together with the spaces after it).
    Diacritics are kept.

    Args:
        text: raw sentence.

    Returns:
        The cleaned sentence as a string.
    """
    text = text.lower().strip()
    # string.punctuation is a superset of the former hand-written punctuation class,
    # so one translate pass removes everything the two passes used to remove.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Raw string: '\S' and '\d' are invalid escape sequences in a normal string literal.
    text = re.sub(r'\S*\d\S*\s*', '', text)
    return text

# Back up the processed data to save sometime in next run
# Reuse the cached token ids when available; otherwise clean, tokenize and cache them.
if os.path.exists("/kaggle/input/phomt-processed/processed.parquet"):
    print("loading processed file")
    df = pd.read_parquet("/kaggle/input/phomt-processed/processed.parquet")
    eng_tokens = df['eng_tokens']
    vie_tokens = df['vie_tokens']
    print('done')

else:
    print("Tokenizing file")
    # Last sentence index that is kept (the first ~92% of the corpus, assumed to be aligned).
    # `index` used to be assigned only in a commented-out cell, so this branch raised NameError
    # on a fresh kernel; the formula is the one from that cell.
    index = len(eng_sentences)//100000*92000

    # Truncate before cleaning/tokenizing so no work is spent on sentences that are discarded,
    # and keep the raw lists untouched so re-running this cell gives the same result.
    print("Processing english file")
    eng_cleaned = [clean_eng_text(sentence) for sentence in eng_sentences[:index+1]]
    eng_tokens = [eng_tokenizer.encode(sentence) for sentence in eng_cleaned]

    print("Processing vietnamese file")
    vie_cleaned = [clean_vie_text(sentence) for sentence in vie_sentences[:index+1]]
    vie_tokens = [vie_tokenizer.encode(sentence) for sentence in vie_cleaned]

    df = pd.DataFrame({'eng_tokens': eng_tokens, 'vie_tokens': vie_tokens})
    eng_tokens = df['eng_tokens']
    vie_tokens = df['vie_tokens']
    df.to_parquet("processed.parquet")

loading processed file
done


In [7]:
# Keep only the first 80000 sentence pairs.
eng_tokens, vie_tokens = eng_tokens[:80000], vie_tokens[:80000]

In [8]:
# Number of token ids in every sentence, per language (tqdm only shows progress).
len_eng_tokens = list(map(len, tqdm(eng_tokens)))
len_vie_tokens = list(map(len, tqdm(vie_tokens)))

100%|██████████| 80000/80000 [00:00<00:00, 1050687.22it/s]
100%|██████████| 80000/80000 [00:00<00:00, 998735.95it/s]


In [None]:
# Decode the first sentence pair back to text as a sanity check of the token ids.
print(eng_tokenizer.decode(eng_tokens[0]), vie_tokenizer.decode(vie_tokens[0]), sep="\n")

In [9]:
import numpy as np

# The 99th percentile of the token counts is used as the per-language context length.
context_length_eng, context_length_vie = (
    np.percentile(lengths, 99) for lengths in (len_eng_tokens, len_vie_tokens)
)
print("99% of english sentences are under ", context_length_eng, " tokens")
print("99% of vietnamese sentences are under ", context_length_vie, " tokens")

99% of english sentences are under  62.0  tokens
99% of vietnamese sentences are under  75.0  tokens


In [24]:
# Keep only the sentence pairs where BOTH sides fit in their context length.
# The length lists are converted to arrays explicitly: comparing a plain Python list with a
# scalar only worked because the threshold happened to be a NumPy scalar (reflected operator)
# and would raise TypeError with an ordinary int/float.
within_eng = np.asarray(len_eng_tokens) <= context_length_eng
within_vie = np.asarray(len_vie_tokens) <= context_length_vie
indexes = within_eng & within_vie

# The same boolean mask is applied to both Series so the pairs stay aligned.
eng_tokens = eng_tokens[indexes]
vie_tokens = vie_tokens[indexes]

**Weight of losses for classes/tokens to counter the frequency of tokens**

In [None]:
# flattened_values = np.concatenate(vie_tokens.to_list())
# value_counts = pd.Series(flattened_values).value_counts()

# weights = 1.0 / np.log(value_counts + 1)
# weights = weights / weights.mean()

# num_classes = len(vie_tokenizer.get_vocab().keys())
# class_weights = torch.zeros(num_classes)

# for i in range(num_classes):
#     class_weights[i] = weights[i] if i in weights else 1

# class_weights = class_weights.to(device)
    
# print("Frequency of each label:")
# print(value_counts)
# print("\nWeights for each label (inverse of frequency):")
# print(weights)

In [25]:
# Apply the same filter to the length arrays, then shrink each context length to the longest
# sentence that is actually left.
len_eng_tokens = np.asarray(len_eng_tokens)[indexes]
len_vie_tokens = np.asarray(len_vie_tokens)[indexes]

context_length_eng = len_eng_tokens.max()
context_length_vie = len_vie_tokens.max()

In [None]:
import matplotlib.pyplot as plt

# Overlaid histograms of sentence lengths, one bin per token count.
eng_bins = range(min(len_eng_tokens), context_length_eng + 1, 1)
vie_bins = range(min(len_vie_tokens), context_length_vie + 1, 1)
plt.hist(len_eng_tokens, bins=eng_bins, alpha=0.4, color="red")
plt.hist(len_vie_tokens, bins=vie_bins, alpha=0.4, color="blue")
labels = ['English', "Vietnamese"]
plt.legend(labels)
plt.xlabel("length of sentence")

In [None]:
class TextDataset(Dataset):
    """Dataset of (english_tokens, vietnamese_tokens) pairs taken from two parallel sequences."""

    def __init__(self, eng_tokens, vie_tokens):
        # Both sequences are stored as given; item i of one is paired with item i of the other.
        self.eng_tokens, self.vie_tokens = eng_tokens, vie_tokens

    def __len__(self):
        # The English side defines the dataset size.
        return len(self.eng_tokens)

    def __getitem__(self, idx):
        english, vietnamese = self.eng_tokens[idx], self.vie_tokens[idx]
        return english, vietnamese

# Both Series are turned into plain Python lists, so TextDataset indexes them by position
# (the Series were filtered with a boolean mask earlier in the notebook).
dataset = TextDataset(eng_tokens.to_list(), vie_tokens.to_list())

In [None]:
def _identity_collate(samples):
    """Return the list of (eng, vie) samples unchanged; padding is done inside the model."""
    return samples


train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=_identity_collate)
iterator = iter(train_loader)

In [None]:
# Look at the first batch only: a batch is a list of (eng, vie) pairs, unzipped here
# into one tuple per language.
for batch_num, batch in enumerate(iterator):
    eng_batch, vie_batch = zip(*batch)
    print(eng_batch[0])
    print("DONE")
    print(vie_batch[0])
    break

**Transformer architecture**

In [2]:
# Score written into masked positions; large enough that softmax gives them ~0 weight.
NEG_INFTY = -1e9

def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: query, key and value tensors. When `mask` is given they must be 4-D
            (the attention modules in this file pass batch x heads x length x head_dim).
        mask: optional tensor; positions where it equals 0 get a score of NEG_INFTY.
            It is applied after swapping the first two axes of the scores, so it has to
            broadcast against that swapped layout.

    Returns:
        (values, attention): the attention-weighted values and the softmax weights.
    """
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(q.size()[-1])
    if mask is not None:
        # Swap axes 0 and 1, fill the blocked positions, then swap back.
        swapped = scores.permute(1, 0, 2, 3)
        scores = swapped.masked_fill(mask == 0, NEG_INFTY).permute(1, 0, 2, 3)
    attention = F.softmax(scores, dim=-1)
    return torch.matmul(attention, v), attention

def create_masks(eng_batch, vie_batch, max_eng_length=None, max_vie_length=None):
    """Build the three attention masks for one batch of sentence pairs.

    Every mask holds 1 where attention is allowed and 0 where it is blocked
    (scaled_dot_product overwrites the score wherever the mask equals 0).

    Args:
        eng_batch: sequence of English token sequences; only their lengths are used.
        vie_batch: sequence of Vietnamese token sequences, parallel to `eng_batch`;
            only their lengths are used.
        max_eng_length: padded English length. Defaults to the notebook-level
            `context_length_eng`.
        max_vie_length: padded Vietnamese length. Defaults to the notebook-level
            `context_length_vie`.

    Returns:
        (encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask) with shapes (N, E, E), (N, V, V) and (N, V, E), where
        N = len(eng_batch), E = max_eng_length and V = max_vie_length. The decoder
        self-attention mask is the padding mask multiplied by a lower-triangular
        look-ahead mask.
    """
    # Fall back to the globals at call time so existing two-argument calls keep working.
    if max_eng_length is None:
        max_eng_length = context_length_eng
    if max_vie_length is None:
        max_vie_length = context_length_vie
    # The context lengths may be NumPy scalars (or floats coming from np.percentile).
    max_eng_length, max_vie_length = int(max_eng_length), int(max_vie_length)

    num_sentences = len(eng_batch)
    look_ahead_mask = torch.tril(torch.ones([max_vie_length, max_vie_length]))
    encoder_self_attention_mask = torch.ones([num_sentences, max_eng_length, max_eng_length])
    decoder_padding_mask_self_attention = torch.ones([num_sentences, max_vie_length, max_vie_length])
    decoder_cross_attention_mask = torch.ones([num_sentences, max_vie_length, max_eng_length])

    for idx in range(num_sentences):
        eng_length = len(eng_batch[idx])
        vie_length = len(vie_batch[idx])
        # Block padding both as a key (columns) and as a query (rows).
        encoder_self_attention_mask[idx, :, eng_length:] = 0
        encoder_self_attention_mask[idx, eng_length:, :] = 0
        decoder_padding_mask_self_attention[idx, :, vie_length:] = 0
        decoder_padding_mask_self_attention[idx, vie_length:, :] = 0
        decoder_cross_attention_mask[idx, :, eng_length:] = 0
        decoder_cross_attention_mask[idx, vie_length:, :] = 0

    decoder_self_attention_mask = look_ahead_mask * decoder_padding_mask_self_attention
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding, recomputed on every call (no learned parameters)."""

    def __init__(self, d_model, seq_len):
        super().__init__()
        self.d_model, self.seq_len = d_model, seq_len

    def forward(self):
        """Return a (seq_len, d_model) tensor: sine in even columns, cosine in odd columns."""
        positions = torch.arange(self.seq_len).reshape(self.seq_len, 1)
        denominator = torch.pow(10000, torch.arange(0, self.d_model, 2).float() / self.d_model)
        angles = positions / denominator
        # Stacking on a new last axis and flattening interleaves sin/cos per frequency.
        interleaved = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        return torch.flatten(interleaved, start_dim=1, end_dim=2)
    
class SentenceEmbedding(nn.Module):
    """Pads a batch of token-id sequences, embeds it and adds positional encodings.

    Args:
        max_sequence_length: length every sentence is padded (or truncated) to.
        d_model: embedding size.
        vocab_size: number of rows of the embedding table.
        pad_token: token id used for padding.
    """

    def __init__(self, max_sequence_length, d_model, vocab_size, pad_token):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(0.1)
        self.pad_token = pad_token

    def batch_padding(self, batch, for_target=False):
        """Turn a batch of variable-length token sequences into one padded LongTensor.

        Args:
            batch: sequence of token-id sequences (lists or 1-D arrays).
            for_target: when True each sentence is shifted left by one token (its first
                token is dropped), which yields the labels for next-token prediction.

        Returns:
            LongTensor of shape (len(batch), max_sequence_length), padded with `pad_token`
            and placed on the device of this module's embedding weights.
        """
        max_len = int(self.max_sequence_length)
        padded = torch.full((len(batch), max_len), self.pad_token, dtype=torch.long)
        for row, sentence in enumerate(batch):
            tokens = sentence[1:] if for_target else sentence
            # Truncate in both modes; previously only targets were truncated and an
            # over-long source sentence produced a ragged batch that could not be stacked.
            tokens = torch.as_tensor(np.asarray(tokens[:max_len], dtype=np.int64))
            padded[row, :tokens.numel()] = tokens
        # Follow the module's own device instead of re-detecting CUDA, so a model kept on
        # the CPU still works on a machine that has a GPU.
        return padded.to(self.embedding.weight.device)

    def forward(self, x):
        """Embed a batch of raw token sequences; returns (batch, max_sequence_length, d_model)."""
        x = self.batch_padding(x)
        x = self.embedding(x)
        pos = self.position_encoder().to(x.device)
        return self.dropout(x + pos)
    
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with a single fused projection for queries, keys and values."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        """Attend `x` (batch, length, d_model) to itself under `mask`; the shape is preserved."""
        batch_size, sequence_length, _ = x.size()
        # Project once, then give every head its own 3 * head_dim slice.
        fused = self.qkv_layer(x).reshape(
            batch_size, sequence_length, self.num_heads, 3 * self.head_dim
        )
        q, k, v = fused.permute(0, 2, 1, 3).chunk(3, dim=-1)
        values, _ = scaled_dot_product(q, k, v, mask)
        # Put the heads back next to each other along the feature axis.
        merged = values.permute(0, 2, 1, 3).reshape(
            batch_size, sequence_length, self.num_heads * self.head_dim
        )
        return self.linear_layer(merged)
    
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x):
        """Map `x` from d_model to `hidden` features and back to d_model."""
        expanded = self.relu(self.linear1(x))
        return self.linear2(self.dropout(expanded))
        
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by dropout,
    a residual connection and layer normalisation (post-norm).

    Args:
        d_model: feature size of the input and output.
        ffn_hidden: hidden size of the feed-forward network.
        num_heads: number of attention heads.
        drop_prob: dropout probability, used for both residual branches and inside the
            feed-forward network.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadSelfAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)
        # drop_prob is forwarded, as DecoderLayer does; it used to be left out here, so the
        # FFN silently kept its 0.1 default whatever value was configured.
        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x, self_attention_mask):
        """Apply the block to `x` under `self_attention_mask`; the shape is preserved."""
        # No clone is needed for the residual: nothing below modifies `x` in place.
        residual_x = x
        x = self.attention(x, mask=self_attention_mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)
        residual_x = x
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x

class SequentialEncoder(nn.Sequential):
    """nn.Sequential variant whose layers all receive the same attention mask."""

    def forward(self, *inputs):
        """Take (x, self_attention_mask) and run x through every layer in order."""
        hidden, self_attention_mask = inputs
        for layer in self:
            hidden = layer(hidden, self_attention_mask)
        return hidden
    
class Encoder(nn.Module):
    """Sentence embedding followed by a stack of `num_layers` EncoderLayer blocks."""

    def __init__(self, d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 vocab_size,
                 pad_token):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, vocab_size, pad_token)
        encoder_layers = [
            EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialEncoder(*encoder_layers)

    def forward(self, x, self_attention_mask):
        """Embed the raw token batch `x`, then apply every layer with the same mask."""
        embedded = self.sentence_embedding(x)
        return self.layers(embedded, self_attention_mask)
    
class MultiHeadCrossAttention(nn.Module):
    """Multi-head attention whose keys/values come from `x` and whose queries come from `y`."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model, 2 * d_model)
        self.q_layer = nn.Linear(d_model, d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        """Let every position of `y` attend over `x`; the result has the shape of `y`."""
        batch_size, eng_length, d_model = x.size()
        _, vie_length, _ = y.size()
        # Keys and values share one fused projection of x, split per head.
        k, v = (
            self.kv_layer(x)
            .reshape(batch_size, eng_length, self.num_heads, 2 * self.head_dim)
            .permute(0, 2, 1, 3)
            .chunk(2, dim=-1)
        )
        q = (
            self.q_layer(y)
            .reshape(batch_size, vie_length, self.num_heads, self.head_dim)
            .permute(0, 2, 1, 3)
        )
        values, _ = scaled_dot_product(q, k, v, mask)
        merged = values.permute(0, 2, 1, 3).reshape(batch_size, vie_length, d_model)
        return self.linear_layer(merged)
    
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention over the encoder output and
    a feed-forward network, each followed by dropout, a residual connection and LayerNorm.

    Args:
        d_model: feature size of the input and output.
        ffn_hidden: hidden size of the feed-forward network.
        num_heads: number of attention heads.
        drop_prob: dropout probability used throughout the block.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadSelfAttention(d_model, num_heads)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model, num_heads)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)

        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        """Update the decoder states `y` using the encoder output `x`.

        The residuals are kept by reference: none of the sub-layers modifies its input in
        place, so the former `.clone()` calls only copied the activations for nothing.
        """
        residual = y
        y = self.self_attention(y, mask=self_attention_mask)
        y = self.dropout1(y)
        y = self.layer_norm1(y + residual)

        residual = y
        y = self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        y = self.dropout2(y)
        y = self.layer_norm2(y + residual)

        residual = y
        y = self.ffn(y)
        y = self.dropout3(y)
        y = self.layer_norm3(y + residual)
        return y
    
class SequentialDecoder(nn.Sequential):
    """nn.Sequential variant that threads `y` through the layers while `x` and both masks
    are passed unchanged to every layer."""

    def forward(self, *inputs):
        """Take (x, y, self_attention_mask, cross_attention_mask) and return the final y."""
        x, hidden, self_attention_mask, cross_attention_mask = inputs
        for layer in self:
            hidden = layer(x, hidden, self_attention_mask, cross_attention_mask)
        return hidden
    
class Decoder(nn.Module):
    """Sentence embedding followed by a stack of `num_layers` DecoderLayer blocks."""

    def __init__(self,
                d_model,
                ffn_hidden,
                num_heads,
                drop_prob,
                num_layers,
                max_sequence_length,
                vocab_size,
                pad_token):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, vocab_size, pad_token)
        decoder_layers = [
            DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialDecoder(*decoder_layers)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        """Embed the raw target batch `y` and decode it against the encoder output `x`."""
        embedded = self.sentence_embedding(y)
        return self.layers(x, embedded, self_attention_mask, cross_attention_mask)
    
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a final linear projection to the Vietnamese vocabulary."""

    def __init__(self, 
                d_model, 
                ffn_hidden, 
                num_heads, 
                drop_prob,
                num_layers,
                max_eng_length,
                max_vie_length,
                eng_vocab_size,
                vie_vocab_size,
                eng_pad_token,
                vie_pad_token):
        super().__init__()
        # Hyper-parameters shared by the encoder and the decoder stacks.
        shared = (d_model, ffn_hidden, num_heads, drop_prob, num_layers)
        self.encoder = Encoder(*shared, max_eng_length, eng_vocab_size, eng_pad_token)
        self.decoder = Decoder(*shared, max_vie_length, vie_vocab_size, vie_pad_token)
        self.linear = nn.Linear(d_model, vie_vocab_size)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self,
               x,
               y,
               encoder_self_attention_mask=None,
               decoder_self_attention_mask=None,
               decoder_cross_attention_mask=None):
        """Encode `x`, decode `y` against it and return one vocabulary-sized score vector
        per target position."""
        memory = self.encoder(x, encoder_self_attention_mask)
        decoded = self.decoder(memory, y, decoder_self_attention_mask, decoder_cross_attention_mask)
        return self.linear(decoded)

In [50]:
# Model hyper-parameters.
d_model = 384
# NOTE(review): the DataLoader cell in this notebook hard-codes batch_size=32 and does not read
# this value - confirm which one is intended.
batch_size = 80
ffn_hidden = d_model * 4
num_heads = 8
drop_prob = 0.1
num_layers = 4

# Vocabulary sizes and special-token ids are taken from the two pretrained tokenizers.
eng_vocab_size=len(eng_tokenizer.get_vocab())
vie_vocab_size=len(vie_tokenizer.get_vocab())
eng_pad_token=eng_tokenizer.pad_token_id
vie_pad_token=vie_tokenizer.pad_token_id
# The tokenizer's sep/cls tokens serve as the end- and beginning-of-sentence markers during decoding.
vie_end_token=vie_tokenizer.sep_token_id
vie_bos_token=vie_tokenizer.cls_token_id

# The padded sequence lengths are the context lengths computed from the filtered data.
transformer = Transformer(d_model=d_model, 
                        ffn_hidden=ffn_hidden, 
                        num_heads=num_heads, 
                        drop_prob=drop_prob,
                        num_layers=num_layers,
                        max_eng_length=context_length_eng,
                        max_vie_length=context_length_vie,
                        eng_vocab_size=eng_vocab_size,
                        vie_vocab_size=vie_vocab_size,
                        eng_pad_token=eng_pad_token,
                        vie_pad_token=vie_pad_token)

In [None]:
# Alternative kept for reference: the same loss with per-class weights.
# criterian = nn.CrossEntropyLoss(weight=class_weights, ignore_index=vie_tokenizer.pad_token_id,
#                                 reduction='none', label_smoothing=0.1)
# Per-token loss (reduction='none') with label smoothing; the training loop does the averaging.
criterian = nn.CrossEntropyLoss(ignore_index=vie_tokenizer.pad_token_id,
                                reduction='none', label_smoothing=0.1)
# When computing the loss, we are ignoring cases when the label is the padding token
# Xavier-initialise every parameter with more than one dimension (weight matrices and
# embedding tables); 1-D parameters such as biases keep their default initialisation.
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
# Same device selection as at the top of the notebook, repeated here.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
import random

def _set_learning_rate(optimizer, lr):
    """Set `lr` on every parameter group of `optimizer` without touching its state."""
    for group in optimizer.param_groups:
        group["lr"] = lr


transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 65
# English sentences used for a quick qualitative check during training.
evaluations = ["i am so hungry i could eat a horse", 
               "i am unemployed and in urgent need of a job",
               "i can see some progress in this project",
               "that will prove something",
               "i wil write some articles about this",
               "if they are hiring i will apply",
               "thing needs to change at certain point",
               "this is just a scratch",
               "what is the question again?"]

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    # Step the learning rate down in place. Building a new Adam instance (as before) threw
    # away its running moment estimates and restarted bias correction at every drop.
    if epoch == 45:
        print("reduce lr to 3e-5")
        _set_learning_rate(optim, 3e-5)
    if epoch == 55:
        print("reduce lr to 1e-5")
        _set_learning_rate(optim, 1e-5)
    for batch_num, batch in enumerate(train_loader):
        # The periodic evaluation below switches to eval mode, so switch back every step.
        transformer.train()
        # A batch is a list of (eng, vie) pairs; unzip it into one tuple per language.
        eng_batch, vie_batch = zip(*batch)
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, vie_batch)
        optim.zero_grad()
        vie_predictions = transformer(eng_batch,
                                     vie_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device))
        # Labels are the target sentences shifted left by one token (next-token prediction).
        labels = transformer.decoder.sentence_embedding.batch_padding(vie_batch, for_target=True)

        # Predictions and labels already live on the model's device.
        loss = criterian(
            vie_predictions.view(-1, vie_vocab_size),
            labels.view(-1)
        )

        # The criterion returns one loss per token (0 for padding); average over real tokens only.
        valid_indicies = labels.view(-1) != vie_pad_token
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()
  
        if batch_num % 1000 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            transformer.eval()
            vie = [[vie_bos_token]]
            eval_text = random.choice(evaluations)
            context = [eng_tokenizer.encode(eval_text)]
            print("Evaluating: ", eval_text)

            # Greedy decoding, one token per step; no gradients are needed here.
            with torch.no_grad():
                for word_counter in range(context_length_vie):
                    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = \
                                            create_masks(context, vie)
                    vie_predictions = transformer(context,
                                                 vie,
                                                 encoder_self_attention_mask.to(device),
                                                 decoder_self_attention_mask.to(device),
                                                 decoder_cross_attention_mask.to(device))
                    # Position `word_counter` holds the prediction for the next token.
                    next_token_prob_distribution = vie_predictions[0][word_counter]
                    next_token_index = int(torch.argmax(next_token_prob_distribution))
                    if next_token_index == vie_end_token:
                        break
                    vie = [ vie[0] + [next_token_index] ]

            print(f"Evaluation: {vie_tokenizer.decode(vie[0])}")
            print("-------------------------------------------")

In [None]:
# Save only the weights (state_dict), not the module object, to the file "myTranslator".
torch.save(transformer.state_dict(), "myTranslator")

In [51]:
# Load the trained weights onto the CPU and switch the model to inference mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source;
# recent PyTorch versions accept weights_only=True for plain state_dicts - confirm it is
# available in this environment before adding it.
transformer.load_state_dict(torch.load("/kaggle/input/mytranslator/pytorch/default/1/myTranslator", map_location=torch.device('cpu')))
transformer.eval()

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(30522, 384)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadSelfAttention(
          (qkv_layer): Linear(in_features=384, out_features=1152, bias=True)
          (linear_layer): Linear(in_features=384, out_features=384, bias=True)
        )
        (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=384, out_features=1536, bias=True)
          (linear2): Linear(in_features=1536, out_features=384, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (dropout2): Dropout(p=0.1, inplace

In [52]:
def translate(eng_sentence):
    """Greedily translate one English sentence and print the input and the result.

    Decoding starts from the begin-of-sentence token and appends the highest-scoring token
    at each step until the end token is predicted or `context_length_vie` steps have run.
    The decoded text is printed as returned by the tokenizer (special tokens included);
    nothing is returned.

    Args:
        eng_sentence: English text, tokenized as given.
    """
    # NOTE(review): the training data appears to be passed through clean_eng_text before
    # tokenization, while this text is not - confirm whether inputs should be cleaned too.
    vie = [[vie_bos_token]]
    print("English sentence:")
    print(eng_sentence)
    context = [eng_tokenizer.encode(eng_sentence)]
    print("----------------------------------------------------------------------------")
    # Put the masks on the device the model actually lives on rather than on the global
    # `device`; the checkpoint-loading cell in this notebook loads the weights onto the CPU.
    model_device = next(transformer.parameters()).device
    # Inference only: skip building the autograd graph on every decoding step.
    with torch.no_grad():
        for word_counter in range(context_length_vie):
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = \
                                    create_masks(context, vie)
            vie_predictions = transformer(context,
                                         vie,
                                         encoder_self_attention_mask.to(model_device),
                                         decoder_self_attention_mask.to(model_device),
                                         decoder_cross_attention_mask.to(model_device))
            # Position `word_counter` holds the prediction for the next token.
            next_token_prob_distribution = vie_predictions[0][word_counter]
            next_token_index = int(torch.argmax(next_token_prob_distribution))
            if next_token_index == vie_end_token:
                break
            vie = [ vie[0] + [next_token_index] ]
        
    print("Vietnamese translation:")
    print(vie_tokenizer.decode(vie[0]))

In [53]:
# Author's reading of the Vietnamese output, in English: "will you do it for me"
translate("will you do it for me?")

English sentence:
will you do it for me?
----------------------------------------------------------------------------
Vietnamese translation:
<s> anh có làm điều đó cho tôi không


In [54]:
# Author's reading of the Vietnamese output, in English: "some random surprising things"
translate("just some random things")

English sentence:
just some random things
----------------------------------------------------------------------------
Vietnamese translation:
<s> một vài điều bất ngờ


In [55]:
# Author's reading of the Vietnamese output, in English: "i hope this helps some of you"
# NOTE(review): the recorded output below echoes a slightly different input sentence, so it
# seems to come from an earlier run of this cell - re-run to refresh it.
translate("i hope this work helps some of you")

English sentence:
i hope this works help some of you
----------------------------------------------------------------------------
Vietnamese translation:
<s> tôi hi vọng điều này làm việc một vài người trong số các bạn
