## Data

In [18]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [19]:
# Tokenizer used by every text helper in this notebook
tokenizer = get_tokenizer('basic_english')


def yield_tokens(examples):
    """Lazily yield the token list of each sentence in ``examples``.

    Args:
        examples: Iterable of raw sentences (strings).

    Yields:
        Whatever ``tokenizer`` returns for each sentence, in input order.
    """
    yield from map(tokenizer, examples)

# Tokenize and numericalize your samples
def vectorize_en(text, vocab, sequence_length, tokenize=None):
    """Convert a source sentence into a fixed-length list of token ids.

    The sentence is tokenized, mapped through ``vocab``, terminated with
    ``<eos>`` and right-padded with ``<pad>`` up to ``sequence_length``.
    When the sentence is too long, word tokens are dropped from the end so
    the closing ``<eos>`` survives (a plain slice of the full list would cut
    it off and leave the sequence unterminated).

    Args:
        text: Raw sentence.
        vocab: Object supporting ``vocab[token] -> int``; must contain the
            ``<eos>`` and ``<pad>`` entries.
        sequence_length: Length of the returned list.
        tokenize: Optional callable mapping a string to a list of tokens.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        A list of exactly ``sequence_length`` ids (empty if it is <= 0).
    """
    if tokenize is None:
        tokenize = tokenizer
    sequence_length = max(sequence_length, 0)

    word_ids = [vocab[token] for token in tokenize(text)]
    # Reserve the last slot for <eos>; the outer slice only matters when
    # sequence_length is 0 and there is no room for <eos> at all.
    max_words = max(sequence_length - 1, 0)
    token_ids = (word_ids[:max_words] + [vocab["<eos>"]])[:sequence_length]
    token_ids += [vocab["<pad>"]] * (sequence_length - len(token_ids))
    return token_ids

def vectorize_vn(text, vocab, sequence_length, tokenize=None):
    """Convert a target sentence into a fixed-length list of token ids.

    The sentence is tokenized, mapped through ``vocab``, wrapped as
    ``<sos> ... <eos>`` and right-padded with ``<pad>`` up to
    ``sequence_length``. When the sentence is too long, word tokens are
    dropped from the end so the closing ``<eos>`` survives (a plain slice of
    the full list would cut it off and leave the sequence unterminated).

    Args:
        text: Raw sentence.
        vocab: Object supporting ``vocab[token] -> int``; must contain the
            ``<sos>``, ``<eos>`` and ``<pad>`` entries.
        sequence_length: Length of the returned list.
        tokenize: Optional callable mapping a string to a list of tokens.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        A list of exactly ``sequence_length`` ids (empty if it is <= 0).
    """
    if tokenize is None:
        tokenize = tokenizer
    sequence_length = max(sequence_length, 0)

    word_ids = [vocab[token] for token in tokenize(text)]
    # Two slots are reserved for <sos> and <eos>; the outer slice only
    # matters when sequence_length < 2 and both markers cannot fit.
    max_words = max(sequence_length - 2, 0)
    framed = [vocab["<sos>"]] + word_ids[:max_words] + [vocab["<eos>"]]
    token_ids = framed[:sequence_length]
    token_ids += [vocab["<pad>"]] * (sequence_length - len(token_ids))
    return token_ids

In [20]:
# Toy English (source-side) corpus: one sentence per training sample
corpus_en = ["good morning", "ai books"]
data_size_en = len(corpus_en)

# vocab_size_en: cap on the vocabulary (3 special tokens + 4 distinct words)
# sequence_length_en: fixed length of every vectorized sentence (2 words + <eos>)
vocab_size_en, sequence_length_en = 7, 3

In [21]:
# English vocabulary: special tokens come first, then the corpus words,
# limited to vocab_size_en entries in total.
vocab_en = build_vocab_from_iterator(
    yield_tokens(corpus_en),
    specials=["<unk>", "<pad>", "<eos>"],
    max_tokens=vocab_size_en,
)
# Tokens missing from the vocabulary are looked up as <unk>
vocab_en.set_default_index(vocab_en["<unk>"])
vocab_en.get_stoi()

{'books': 4,
 '<unk>': 0,
 '<eos>': 2,
 '<pad>': 1,
 'morning': 6,
 'ai': 3,
 'good': 5}

In [22]:
# Turn every English sentence into a fixed-length list of token ids
corpus_ids_en = [
    vectorize_en(sentence, vocab_en, sequence_length_en)
    for sentence in corpus_en
]

# Encoder input tensor: one row per sentence
en_data = torch.tensor(corpus_ids_en, dtype=torch.long)
print(en_data)

tensor([[5, 6, 2],
        [3, 4, 2]])


In [23]:
# Toy Vietnamese (target-side) corpus, aligned sentence-by-sentence with corpus_en
corpus_vn = ["chào buổi sáng", "sách ai"]
data_size_vn = len(corpus_vn)

# vocab_size_vn: cap on the vocabulary (4 special tokens + 5 distinct words)
# sequence_length_vn: length of the decoder input / label sequences
vocab_size_vn, sequence_length_vn = 9, 4

In [24]:
# Vietnamese vocabulary: special tokens come first, then the corpus words,
# limited to vocab_size_vn entries in total.
vocab_vn = build_vocab_from_iterator(
    yield_tokens(corpus_vn),
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    max_tokens=vocab_size_vn,
)
# Tokens missing from the vocabulary are looked up as <unk>
vocab_vn.set_default_index(vocab_vn["<unk>"])
vocab_vn.get_stoi()

{'<unk>': 0,
 'chào': 6,
 '<eos>': 3,
 '<pad>': 1,
 '<sos>': 2,
 'ai': 4,
 'buổi': 5,
 'sách': 7,
 'sáng': 8}

In [25]:
# Vectorize with one extra position (sequence_length_vn + 1): the sequence is
# later split into a decoder input and a one-step-shifted label, each of
# length sequence_length_vn.
corpus_ids_vn = [
    vectorize_vn(sentence, vocab_vn, sequence_length_vn + 1)
    for sentence in corpus_vn
]

print(corpus_ids_vn)

[[2, 6, 5, 8, 3], [2, 7, 4, 3, 1]]


In [26]:
# Teacher-forcing pairs: the decoder input is every token but the last,
# the label is the same sequence shifted left by one position.
input_vn_data = torch.tensor(
    [ids[:-1] for ids in corpus_ids_vn], dtype=torch.long
)
label_vn_data = torch.tensor(
    [ids[1:] for ids in corpus_ids_vn], dtype=torch.long
)

print(input_vn_data)
print(label_vn_data)

tensor([[2, 6, 5, 8],
        [2, 7, 4, 3]])
tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


## Model

In [28]:
class Encoder(nn.Module):
    """Single-layer RNN encoder that summarises a source sentence.

    Token ids are embedded, fed through an ``nn.RNN`` (batch-first), and only
    the final hidden state is returned; the per-step outputs are discarded.

    Args:
        vocab_size_en: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size_en, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size_en, embedding_dim=embedding_dim
        )
        self.rnn = nn.RNN(
            input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True
        )

    def forward(self, src):
        """Encode ``src`` of shape [batch_size, seq_length].

        Returns:
            Final hidden state of shape [1, batch_size, hidden_dim].
        """
        token_vectors = self.embedding(src)  # [batch_size, seq_length, embedding_dim]
        # nn.RNN returns (per-step outputs, final hidden state); keep the latter
        final_hidden = self.rnn(token_vectors)[1]
        return final_hidden

In [29]:
# Seed PyTorch's global RNG before the first randomly initialised layer is
# created, so that the weights built from here on (and therefore the printed
# losses and predictions) are identical on every Restart & Run All.
torch.manual_seed(0)

# Embedding and hidden sizes shared by the encoder and the decoder
embedding_dim, hidden_dim = 6, 6
encoder = Encoder(vocab_size_en, embedding_dim, hidden_dim)

# Smoke test: final hidden state should be [1, batch_size, hidden_dim]
hidden_sample = encoder(en_data)
print(hidden_sample.shape)

torch.Size([1, 2, 6])


In [31]:
class Decoder(nn.Module):
    """Single-layer RNN decoder that scores every target position in one pass.

    The target-side input tokens are embedded, run through an ``nn.RNN``
    (batch-first) that starts from the supplied hidden state, and each RNN
    output is projected onto the target vocabulary.

    Args:
        vocab_size_vn: Number of rows in the embedding table and number of
            output classes.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size_vn, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size_vn, embedding_dim=embedding_dim
        )
        self.rnn = nn.RNN(
            input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=vocab_size_vn)

    def forward(self, input, hidden):
        """Score the vocabulary at every position of ``input``.

        Args:
            input: Token ids, shape [batch_size, seq_length].
            hidden: Initial RNN state, shape [1, batch_size, hidden_dim].

        Returns:
            Logits of shape [batch_size, vocab_size_vn, seq_length]; the
            class dimension is moved to position 1.
        """
        token_vectors = self.embedding(input)            # [batch_size, seq_length, embedding_dim]
        step_outputs = self.rnn(token_vectors, hidden)[0]  # [batch_size, seq_length, hidden_dim]
        logits = self.fc_out(step_outputs)               # [batch_size, seq_length, vocab_size_vn]
        # Swap the last two axes -> [batch_size, vocab_size_vn, seq_length]
        return logits.permute(0, 2, 1)

In [36]:
# Build the decoder and smoke-test it on the encoder's hidden state;
# the expected shape is [batch_size, vocab_size_vn, seq_length].
decoder = Decoder(
    vocab_size_vn=vocab_size_vn,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)
outputs = decoder(input_vn_data, hidden_sample)
print(outputs.shape)

torch.Size([2, 9, 4])


In [33]:
class Seq2Seq_Model(nn.Module):
    """Encoder-decoder wrapper.

    The encoder's output for the source sequence is passed to the decoder as
    its second argument, alongside the target-side input sequence.

    Args:
        encoder: Module called as ``encoder(sequence_en)``.
        decoder: Module called as ``decoder(sequence_vn, hidden)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, sequence_en, sequence_vn):
        """Return the decoder output for ``sequence_vn`` conditioned on ``sequence_en``."""
        return self.decoder(sequence_vn, self.encoder(sequence_en))

In [34]:
# Combine the encoder and decoder instances into one trainable model
model = Seq2Seq_Model(encoder=encoder, decoder=decoder)
print(model)

Seq2Seq_Model(
  (encoder): Encoder(
    (embedding): Embedding(7, 6)
    (rnn): RNN(6, 6, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(9, 6)
    (rnn): RNN(6, 6, batch_first=True)
    (fc_out): Linear(in_features=6, out_features=9, bias=True)
  )
)


In [37]:
# End-to-end smoke test of the untrained model; print the shape of its output
outputs = model(sequence_en=en_data, sequence_vn=input_vn_data)
print(outputs.shape)

torch.Size([2, 9, 4])


## Train

In [38]:
# Token-level cross-entropy. No ignore_index is set, so <pad> positions in
# the labels contribute to the loss like any other token.
criterion = nn.CrossEntropyLoss()

# Adam over all encoder and decoder parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.05)

In [39]:
# Full-batch training: the whole toy corpus is one batch, 35 update steps.
for _ in range(35):
    # Forward pass with teacher forcing (gold target tokens as decoder input)
    outputs = model(en_data, input_vn_data)
    loss = criterion(outputs, label_vn_data)

    # Clear stale gradients, back-propagate, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Loss of this step, computed before the weight update above
    print(loss.item())

2.249560594558716
1.9828377962112427
1.7630938291549683
1.5635037422180176
1.3691974878311157
1.190489649772644
1.036076307296753
0.9050320386886597
0.791242241859436
0.6925269961357117
0.6101727485656738
0.5432384014129639
0.48622769117355347
0.43204283714294434
0.3798092007637024
0.33621659874916077
0.3014173209667206
0.27165430784225464
0.24489396810531616
0.2201252430677414
0.19674460589885712
0.17500600218772888
0.15543237328529358
0.13775195181369781
0.1215820461511612
0.10711026191711426
0.09449587017297745
0.08354666829109192
0.07402341067790985
0.06580005586147308
0.058771610260009766
0.052790336310863495
0.047688186168670654
0.04331155866384506
0.0395352765917778


In [40]:
# Predicted token id at every position: argmax over the vocabulary axis
# (dim 1). The decoder is still fed the gold input tokens here.
outputs = model(en_data, input_vn_data)
print(outputs.argmax(dim=1))

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


In [41]:
# Ground-truth target ids, displayed for comparison with the predicted ids
label_vn_data

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])