### 1. Data

In [1]:
import torch 
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [3]:
corpus_en = [
    "good morning",
    "ai books"    
]
data_size_en = len(corpus_en)

# max vocabulary size and sequence length
vocab_size_en = 7
sequence_length_en = 3

In [4]:
tokenizer_en = get_tokenizer("basic_english")

def yield_token(examples):
    for text in examples:
        yield tokenizer_en(text)
    
vocab_en = build_vocab_from_iterator(iterator=yield_token(corpus_en), 
                                     max_tokens=vocab_size_en, 
                                     specials=["<unk>", "<pad>", "<eos>"])

vocab_en.set_default_index(vocab_en["<unk>"])
vocab_en.get_stoi()

{'morning': 6,
 'good': 5,
 'books': 4,
 'ai': 3,
 '<eos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [5]:

def vectorize_en(text, vocab_en, sequence_length_en):
    tokens = text.split()
    token_ids = [vocab_en[token] for token in tokens] + [vocab_en["<eos>"]]

    tokenn_ids_pad = token_ids[:sequence_length_en] + [vocab_en["<pad>"]] * (sequence_length_en - len(token_ids))
    return tokenn_ids_pad


corpus_ids_en = []
for sentence in corpus_en:
    corpus_ids_en.append(vectorize_en(sentence, vocab_en, sequence_length_en))


en_data = torch.tensor(corpus_ids_en, dtype=torch.long)
en_data

tensor([[5, 6, 2],
        [3, 4, 2]])

In [6]:
corpus_vn = [
    "chào buổi sáng",
    "sách ai"
]
data_size_vn = len(corpus_vn)

# max vocabulary size and sequence length
vocab_size_vn = 9
sequence_length_vn = 4

In [7]:
tokenizer_vi = get_tokenizer("basic_english")

def yield_token(examples):
    for text in examples:
        yield tokenizer_vi(text)


vocab_vi = build_vocab_from_iterator(iterator=yield_token(corpus_vn), 
                                     max_tokens=vocab_size_vn, 
                                     specials=["<unk>", "<pad>", "<sos>", "<eos>"])

vocab_vi.set_default_index(vocab_vi["<unk>"])
vocab_vi.get_stoi()

{'sáng': 8,
 'sách': 7,
 'chào': 6,
 'buổi': 5,
 'ai': 4,
 '<eos>': 3,
 '<sos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [8]:
def vectorize_vi(text, vocab_vi, sequence_length_vn):
    tokens = text.split()
    token_ids = [vocab_vi["<sos>"]] + [vocab_vi[token] for token in tokens] + [vocab_vi["<eos>"]]
    token_ids_pad = token_ids[:sequence_length_vn] + [vocab_vi["<pad>"]] * (sequence_length_vn - len(token_ids))

    return token_ids_pad

corpus_ids_vi = []

for sentence in corpus_vn:
    corpus_ids_vi.append(vectorize_vi(sentence, vocab_vi, sequence_length_vn + 1))

data_vi = torch.tensor(corpus_ids_vi, dtype=torch.long)
data_vi

tensor([[2, 6, 5, 8, 3],
        [2, 7, 4, 3, 1]])

In [9]:
input_vn_data = []
output_vn_data = []

for vector in corpus_ids_vi:
    input_vn_data.append(vector[:-1])
    output_vn_data.append(vector[1:])

input_vn_data = torch.tensor(input_vn_data, dtype=torch.long)
output_vn_data = torch.tensor(output_vn_data, dtype=torch.long)

# print
print(input_vn_data)
print(output_vn_data)

tensor([[2, 6, 5, 8],
        [2, 7, 4, 3]])
tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


### 2. Model

In [10]:
class Encoder(nn.Module):
    def __init__(self, vocab_size_en, embed_dim, hidden_dim):
        super().__init__()

        self.embed_model = nn.Embedding(
            num_embeddings=vocab_size_en, 
            embedding_dim=embed_dim
        )

        self.rnn = nn.RNN(input_size=embed_dim, 
                          hidden_size=hidden_dim, 
                          num_layers=1, 
                          batch_first=True, 
                          bidirectional=False)
    
    def forward(self, x): # shape: [N, sequence_len]
        embedding = self.embed_model(x) # [N, sequence_len, embed_dim]
        output_rnn, hidden_rnn = self.rnn(embedding) # ([N, sequence_len, hidden_dim], [num_layers, N, hidden_dim])
        return hidden_rnn # [num_layers, N, hidden_dim]


In [12]:
en_data.shape

torch.Size([2, 3])

In [18]:

embed_dim, hidden_dim = 6, 6
encoder = Encoder(vocab_size_en=vocab_size_vn, embed_dim=embed_dim, hidden_dim=hidden_dim)
hidden_context = encoder(en_data)
print(hidden_context.shape)
print(hidden_context)

torch.Size([1, 2, 6])
tensor([[[ 0.4592,  0.4791, -0.8377, -0.6844,  0.1064,  0.3388],
         [ 0.4733,  0.5765, -0.7849, -0.5096,  0.0684,  0.0345]]],
       grad_fn=<StackBackward0>)


In [16]:
class Decoder(nn.Module):
    def __init__(self, vocab_size_vn, embed_dim, hidden_dim):
        super().__init__()

        self.embed_model = nn.Embedding(
            num_embeddings=vocab_size_vn, 
            embedding_dim=embed_dim
        )

        self.decoder = nn.RNN(input_size=embed_dim, 
                              hidden_size=hidden_dim, 
                              batch_first=True,
                              dropout=0.0, 
                              bidirectional=False, 
                              num_layers=1)
        
        self.linear = nn.Linear(in_features=hidden_dim, out_features=vocab_size_vn)

    
    def forward(self, input, hidden): # input: [N, sequence_len_vn]; hidden: [num_layer, N, hidden_dim]
        embedding = self.embed_model(input) # [N, sequence_len_vn, embed_dim]
        output_rnn, hidden_rnn = self.decoder(embedding, hidden) # ([N, sequence_len, hidden_dim], [num_layers, N, hidden_dim])
        prediction = self.linear(output_rnn) # [N, sequence_len_vn, vocab_size_vn]
        return prediction.permute(0, 2, 1) # [N, vocab_size_vn, sequence_len_vn]
    

In [19]:
decoder = Decoder(vocab_size_vn=vocab_size_vn, embed_dim=embed_dim, hidden_dim=hidden_dim)
output = decoder(input_vn_data, hidden_context)
print(output.shape)

torch.Size([2, 9, 4])


In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    
    def forward(self, sequence_en, sequence_vn):
        hidden = self.encoder(sequence_en)
        output = self.decoder(sequence_vn, hidden)
        return output
    
model = Seq2Seq(encoder=encoder, decoder=decoder)
outputs = model(en_data, input_vn_data)
print(outputs.shape)
print(model)

torch.Size([2, 9, 4])
Seq2Seq(
  (encoder): Encoder(
    (embed_model): Embedding(9, 6)
    (rnn): RNN(6, 6, batch_first=True)
  )
  (decoder): Decoder(
    (embed_model): Embedding(9, 6)
    (decoder): RNN(6, 6, batch_first=True)
    (linear): Linear(in_features=6, out_features=9, bias=True)
  )
)


### 3. Train

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [23]:
for _ in range(35):
    optimizer.zero_grad()
    outputs = model(en_data, input_vn_data)
    loss = criterion(outputs, output_vn_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

2.2895143032073975
1.9922749996185303
1.7623039484024048
1.5574148893356323
1.367702603340149
1.202600359916687
1.0652351379394531
0.9447047114372253
0.8351659774780273
0.7307093143463135
0.6370806097984314
0.5528579950332642
0.482318252325058
0.4173257052898407
0.36150291562080383
0.31254658102989197
0.26987481117248535
0.23350460827350616
0.20302051305770874
0.17732924222946167
0.15517204999923706
0.13568568229675293
0.11836788803339005
0.10298648476600647
0.08963319659233093
0.07849900424480438
0.06941844522953033
0.06194249168038368
0.05568516254425049
0.05039449781179428
0.04588649421930313
0.042005036026239395
0.03861824795603752
0.03562465310096741
0.03295455127954483


In [24]:
outputs = model(en_data, input_vn_data)
print(torch.argmax(outputs, axis=1))

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


In [25]:
output_vn_data

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])