### 1. Data

In [1]:
import torch 
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [7]:
# Toy English (source-side) corpus: one sentence per training example.
corpus_en = ["good morning", "ai books"]
data_size_en = len(corpus_en)

# Cap on the English vocabulary size, and the fixed number of token ids
# each English sentence is encoded to.
vocab_size_en, sequence_length_en = 7, 3

In [8]:
tokenizer_en = get_tokenizer("basic_english")


def yield_token(examples):
    """Lazily yield the token list of every sentence in ``examples``."""
    yield from map(tokenizer_en, examples)


# Build the English vocabulary from the tokenized corpus; the special
# symbols are registered alongside the corpus tokens.
vocab_en = build_vocab_from_iterator(
    iterator=yield_token(corpus_en),
    max_tokens=vocab_size_en,
    specials=["<unk>", "<pad>", "<eos>"],
)

# Tokens missing from the vocabulary are looked up as "<unk>".
vocab_en.set_default_index(vocab_en["<unk>"])
vocab_en.get_stoi()

{'morning': 6,
 'good': 5,
 'books': 4,
 'ai': 3,
 '<eos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [20]:

def vectorize_en(text, vocab_en, sequence_length_en):
    """Encode an English sentence as a fixed-length list of token ids.

    The sentence is split on whitespace (no lowercasing or punctuation
    handling), every token is looked up in ``vocab_en``, ``<eos>`` is
    appended, and the result is right-padded with ``<pad>``.

    Over-long sentences are truncated *before* ``<eos>`` is appended, so the
    end-of-sequence marker is never cut off.

    Args:
        text: Raw sentence.
        vocab_en: Token -> id mapping supporting ``vocab_en[token]``; must
            contain ``"<eos>"`` and ``"<pad>"``.
        sequence_length_en: Length of the returned list.

    Returns:
        A list of exactly ``sequence_length_en`` ids (empty when the
        requested length is zero or negative).
    """
    eos_id = vocab_en["<eos>"]
    pad_id = vocab_en["<pad>"]
    target_length = max(sequence_length_en, 0)

    # Reserve the last slot for <eos>; only the content tokens are truncated.
    kept_tokens = text.split()[:max(target_length - 1, 0)]
    token_ids = [vocab_en[token] for token in kept_tokens] + [eos_id]

    # Only has an effect when target_length is 0 (no room even for <eos>).
    token_ids = token_ids[:target_length]

    return token_ids + [pad_id] * (target_length - len(token_ids))


# Encode every English sentence to a fixed-length row of token ids.
corpus_ids_en = [
    vectorize_en(text_en, vocab_en, sequence_length_en) for text_en in corpus_en
]

# Stack the rows into a single integer tensor, one row per sentence.
en_data = torch.tensor(corpus_ids_en, dtype=torch.long)
en_data

tensor([[5, 6, 2],
        [3, 4, 2]])

In [16]:
# Toy Vietnamese (target-side) corpus, aligned index-by-index with the
# English sentences.
corpus_vn = ["chào buổi sáng", "sách ai"]
data_size_vn = len(corpus_vn)

# Cap on the Vietnamese vocabulary size, and the number of positions the
# decoder sees per sentence.
vocab_size_vn, sequence_length_vn = 9, 4

In [24]:
# NOTE(review): "basic_english" is an English-oriented normalizer; it is
# reused here for the whitespace-separated toy corpus — confirm it is
# adequate before applying it to real Vietnamese text.
tokenizer_vi = get_tokenizer("basic_english")

# Tokenize lazily with map() instead of redefining the `yield_token` helper a
# second time: a duplicate top-level definition would silently rebind the
# English helper to the Vietnamese tokenizer.
vocab_vi = build_vocab_from_iterator(
    iterator=map(tokenizer_vi, corpus_vn),
    max_tokens=vocab_size_vn,
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
)

# Tokens missing from the vocabulary are looked up as "<unk>".
vocab_vi.set_default_index(vocab_vi["<unk>"])
vocab_vi.get_stoi()

{'sáng': 8,
 'sách': 7,
 'chào': 6,
 'buổi': 5,
 'ai': 4,
 '<eos>': 3,
 '<sos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [25]:
def vectorize_vi(text, vocab_vi, sequence_length_vn):
    """Encode a Vietnamese sentence as a fixed-length list of token ids.

    The sentence is split on whitespace (no lowercasing or punctuation
    handling), wrapped as ``<sos> tokens... <eos>``, and right-padded with
    ``<pad>``.

    Over-long sentences are truncated *before* ``<eos>`` is appended, so the
    end-of-sequence marker is never cut off.

    Args:
        text: Raw sentence.
        vocab_vi: Token -> id mapping supporting ``vocab_vi[token]``; must
            contain ``"<sos>"``, ``"<eos>"`` and ``"<pad>"``.
        sequence_length_vn: Length of the returned list.

    Returns:
        A list of exactly ``sequence_length_vn`` ids (empty when the
        requested length is zero or negative).
    """
    sos_id = vocab_vi["<sos>"]
    eos_id = vocab_vi["<eos>"]
    pad_id = vocab_vi["<pad>"]
    target_length = max(sequence_length_vn, 0)

    # Two slots are reserved for <sos> and <eos>; only content is truncated.
    kept_tokens = text.split()[:max(target_length - 2, 0)]
    token_ids = [sos_id] + [vocab_vi[token] for token in kept_tokens] + [eos_id]

    # Only has an effect when target_length < 2 (no room for both markers).
    token_ids = token_ids[:target_length]

    return token_ids + [pad_id] * (target_length - len(token_ids))

# Encode to sequence_length_vn + 1 ids per sentence: the extra position is
# consumed when each row is split into a shifted decoder input and target.
corpus_ids_vi = [
    vectorize_vi(text_vi, vocab_vi, sequence_length_vn + 1) for text_vi in corpus_vn
]

data_vi = torch.tensor(corpus_ids_vi, dtype=torch.long)
data_vi

tensor([[2, 6, 5, 8, 3],
        [2, 7, 4, 3, 1]])

In [26]:
# Teacher-forcing pairs built from the same encoded row:
#   decoder input = row without its last id
#   target        = row without its first id (input shifted left by one)
input_vn_data = torch.tensor(
    [ids[:-1] for ids in corpus_ids_vi], dtype=torch.long
)
output_vn_data = torch.tensor(
    [ids[1:] for ids in corpus_ids_vi], dtype=torch.long
)

# print
print(input_vn_data)
print(output_vn_data)

tensor([[2, 6, 5, 8],
        [2, 7, 4, 3]])
tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


### 2. Model

In [30]:
class Encoder(nn.Module):
    def __init__(self, vocab_size_en, embed_dim, n_heads):
        super().__init__()
        self.embeddding = nn.Embedding(
            num_embeddings=vocab_size_en, 
            embedding_dim=embed_dim, 
        )

        self.encoder = nn.TransformerEncoderLayer(
            d_model=embed_dim, 
            nhead=n_heads, 
            dim_feedforward=6, 
            batch_first=True, 
            bias=True, 
            dropout=0.0
        )

    def forward(self, src): # [N, sequence_length]
        embedding = self.embeddding(src) # [N, sequence_length, embed_dim]
        context = self.encoder(embedding) # [N, sequence_length, embed_dim]
        return context # [N, sequence_length, embed_dim]

In [28]:
# Sanity check of the encoder input shape: (number of sentences, source sequence length).
en_data.shape

torch.Size([2, 3])

In [31]:

# Seed PyTorch's RNG before the first random parameter initialisation so the
# model weights (and therefore the training run) are repeatable after
# "Restart Kernel -> Run All".
torch.manual_seed(0)

embed_dim, n_heads = 6, 1
# Pass the shared `n_heads` variable rather than a literal so the encoder
# follows the same setting that is used when the decoder is built.
encoder_layer = Encoder(vocab_size_en=vocab_size_en, embed_dim=embed_dim, n_heads=n_heads)
context = encoder_layer(en_data)
print(context.shape)

torch.Size([2, 3, 6])


In [32]:
class Decoder(nn.Module):
    def __init__(self, vocab_size_vi, embed_dim, n_heads, sequence_length_vn):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size_vi, 
                                      embedding_dim=embed_dim)
        self.mask = torch.triu(input=torch.ones(sequence_length_vn, sequence_length_vn), diagonal=1).bool()
        self.decoder = nn.TransformerDecoderLayer(
            d_model=embed_dim, 
            nhead=n_heads, 
            dim_feedforward=6, 
            bias=True, 
            batch_first=True, 
            dropout=0.0
        )

        self.linear = nn.Linear(embed_dim, vocab_size_vi)

    def forward(self, input, context): # input: [N, sequence_len_en]; context: [N, sequence_len_vi, embed_dim]
        embedding = self.embedding(input) # [N, sequence_len_vi, embed_dim]
        output = self.decoder(embedding, context, tgt_mask=self.mask) # [N, sequence_len_vi, embed_dim]
        predictions = self.linear(output) # [N, sequence_len_vi, vocab_size_vi]
        return predictions.permute(0, 2, 1) # [N, vocab_size_vi, sequence_len_vi]

In [33]:
# Decoder input ids: each row is the encoded target sentence without its last id.
input_vn_data

tensor([[2, 6, 5, 8],
        [2, 7, 4, 3]])

In [37]:
# Build the decoder and smoke-test it once on the shifted target ids together
# with the encoder context computed earlier.
decoder_layer = Decoder(
    vocab_size_vi=vocab_size_vn,
    embed_dim=embed_dim,
    n_heads=n_heads,
    sequence_length_vn=sequence_length_vn,
)
output = decoder_layer(input=input_vn_data, context=context)
print(output.shape)

torch.Size([2, 9, 4])


In [38]:
class Seq2Seq(nn.Module):
    """Thin wrapper that chains an encoder and a decoder."""

    def __init__(self, encoder, decoder):
        """Store the two sub-models under ``self.encoder`` / ``self.decoder``."""
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, sequence_en, sequence_vi):
        """Encode ``sequence_en`` and decode ``sequence_vi`` against that context.

        Returns whatever ``self.decoder(sequence_vi, context)`` returns.
        """
        return self.decoder(sequence_vi, self.encoder(sequence_en))
    

# Chain the already-built encoder and decoder, then smoke-test one full
# forward pass on the training pair.
model = Seq2Seq(encoder_layer, decoder_layer)
output = model(sequence_en=en_data, sequence_vi=input_vn_data)
print(output.shape)

torch.Size([2, 9, 4])


In [39]:
# Target shape is [N, sequence_length]; the model output printed above is
# [N, vocab_size, sequence_length], i.e. the class scores sit on dim 1.
output_vn_data.shape

torch.Size([2, 4])

### 3. Train 

In [41]:
# Token-level cross-entropy over the class scores on dim 1.
# NOTE(review): "<pad>" target positions are not excluded from the loss, so
# the model is also trained to emit "<pad>" — confirm this is intended.
criterion = nn.CrossEntropyLoss()

# Adam over every parameter of the combined encoder/decoder model.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.05,
)

In [42]:
# 35 full-batch optimisation steps on the two-sentence corpus; the loss of
# each step is printed so the decrease can be followed.
for step in range(35):
    # Forward pass with teacher forcing: scores are [N, vocab_size, sequence_length].
    outputs = model(en_data, input_vn_data)
    loss = criterion(outputs, output_vn_data)

    # Clear stale gradients, back-propagate, and update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())

2.4717018604278564
2.0004782676696777
1.7141073942184448
1.456430435180664
1.2691161632537842
1.1028152704238892
0.946965217590332
0.8271654844284058
0.7355573177337646
0.6192213892936707
0.5331056118011475
0.4421507716178894
0.3705432713031769
0.3046678900718689
0.24402225017547607
0.1969178020954132
0.14596310257911682
0.11385639011859894
0.08223186433315277
0.07041727006435394
0.050214268267154694
0.05643398314714432
0.04028531163930893
0.02726457640528679
0.021223798394203186
0.01758846826851368
0.013833574019372463
0.010011861100792885
0.006904289126396179
0.0048629166558384895
0.003721024375408888
0.003095417283475399
0.0027761294040828943
0.0024280103389173746
0.0020527122542262077


In [43]:
# Re-run the trained model on the training pair and pick, for every target
# position, the id with the highest score (scores live on dim 1).
outputs = model(en_data, input_vn_data)
print(outputs.argmax(dim=1))

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


In [44]:
# Ground-truth target ids, shown for comparison with the argmax predictions printed above.
output_vn_data

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])