### 1. PREPARE DATA

In [28]:
import torch 
import torch.nn as nn


corpus = [
    "ăn quả nhớ kẻ trồng cây",
    "có chí thì nên"
]

data_size = len(corpus)


# Define the max vocabulary size and sequence length
vocab_size = 15
sequence_length = 7
data_size, vocab_size, sequence_length

(2, 15, 7)

In [29]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# define tokenizer function
tokenizer  = get_tokenizer("basic_english")
# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)
    
# Create vocabulary
vocab = build_vocab_from_iterator(iterator=yield_tokens(corpus), 
                                  specials=["<unk>", "<pad>", "<sos_topic1>", "<sos_topic2>", "<eos>"],
                                  max_tokens=vocab_size)
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()


{'trồng': 13,
 '<unk>': 0,
 'có': 7,
 '<pad>': 1,
 '<sos_topic2>': 3,
 'thì': 12,
 'ăn': 14,
 '<sos_topic1>': 2,
 '<eos>': 4,
 'chí': 5,
 'nên': 10,
 'cây': 6,
 'quả': 11,
 'kẻ': 8,
 'nhớ': 9}

In [30]:
data_X = []
data_y = []


corpus[0] = '<sos_topic1> ' + corpus[0] + ' <eos>'
corpus[1] = '<sos_topic2> ' + corpus[1] + ' <eos>'


for vector in corpus:
    vector = vector.split()
    data_X.append(vector[:-1])
    data_y.append(vector[1:])

for x, y in zip(data_X, data_y):
    print("X: ", x)
    print("y: ", y)
    print("=" * 100)

X:  ['<sos_topic1>', 'ăn', 'quả', 'nhớ', 'kẻ', 'trồng', 'cây']
y:  ['ăn', 'quả', 'nhớ', 'kẻ', 'trồng', 'cây', '<eos>']
X:  ['<sos_topic2>', 'có', 'chí', 'thì', 'nên']
y:  ['có', 'chí', 'thì', 'nên', '<eos>']


In [32]:
# Tokenize and numericalize your samples

def vectorize(X, y, vocab, sequence_length):
    X_ids = [vocab[token] for token in X][:sequence_length]
    X_ids = X_ids + [vocab["<pad>"]] * (sequence_length - len(X))

    y_ids = [vocab[token] for token in y][:sequence_length]
    y_ids = y_ids + [vocab["<pad>"]] * (sequence_length - len(y))
    
    return X_ids, y_ids

data_X_ids = []
data_y_ids = []

for X, y in zip(data_X, data_y):
    X_ids, y_ids = vectorize(X, y, vocab, sequence_length)
    data_X_ids.append(X_ids)
    data_y_ids.append(y_ids)


for X_ids, y_ids in zip(data_X_ids, data_y_ids):
    print("X: ", X_ids)
    print("y: ", y_ids)
    print("=" * 100)

X:  [2, 14, 11, 9, 8, 13, 6]
y:  [14, 11, 9, 8, 13, 6, 4]
X:  [3, 7, 5, 12, 10, 1, 1]
y:  [7, 5, 12, 10, 4, 1, 1]


In [39]:
data_X_ids = torch.tensor(data_X_ids, dtype=torch.long)
print(data_X_ids.shape)

data_y_ids = torch.tensor(data_y_ids, dtype=torch.long)
print(data_y_ids.shape)

torch.Size([2, 7])
torch.Size([2, 7])


  data_y_ids = torch.tensor(data_y_ids, dtype=torch.long)


In [40]:
data_X_ids

tensor([[ 2, 14, 11,  9,  8, 13,  6],
        [ 3,  7,  5, 12, 10,  1,  1]])

### 2.TRAINING WITH RNN

In [48]:
class TG_RNN(nn.Module):

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, 
                                      embedding_dim=embed_dim)
        
        self.model = nn.RNN(input_size=embed_dim, hidden_size=embed_dim, batch_first=True)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)
    
    def forward(self, x): # shape_x: [N, sequence_len]
        
        x = self.embedding(x) # [N, sequence_len, embed_dim]
        output_rnn, hidden_rnn = self.model(x) # ouput_rnn: [N, sequence_len, hidden_dim], hidden_rnn: [num_layers, N, hidden_dim]
        x = self.linear(output_rnn)
        return x.permute(0, 2, 1) # [N, vocab_size, sequence_len] -> match with CrossEntropyLoss in Pytorch [request shape: [N, num_classes, sequence_len]]


model = TG_RNN(vocab_size=vocab_size, embed_dim=8)
print(model)
model(data_X_ids).shape

TG_RNN(
  (embedding): Embedding(15, 8)
  (model): RNN(8, 8, batch_first=True)
  (linear): Linear(in_features=8, out_features=15, bias=True)
)


torch.Size([2, 15, 7])

In [49]:
criterion = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(params=model.parameters(), lr=0.05)

for _ in range(40):
    optim.zero_grad()
    outputs = model(data_X_ids) # [N, vocab_size, sequence_length]
    loss = criterion(outputs, data_y_ids)
    print(loss.item())
    loss.backward()
    optim.step()


2.8961081504821777
2.550565004348755
2.2825536727905273
2.0461933612823486
1.8105595111846924
1.583443522453308
1.3702161312103271
1.1732375621795654
0.9928973317146301
0.8317399621009827
0.694553017616272
0.5818873643875122
0.4905191957950592
0.41640955209732056
0.3557373881340027
0.30517321825027466
0.2622518837451935
0.22549180686473846
0.194116473197937
0.16760122776031494
0.1452823430299759
0.12647469341754913
0.11062461137771606
0.0972549244761467
0.08592425286769867
0.07623473554849625
0.0678509920835495
0.06052184849977493
0.05410873144865036
0.04857390746474266
0.043866924941539764
0.03984333202242851
0.036343902349472046
0.03326483443379402
0.030547330155968666
0.02815130352973938
0.026040833443403244
0.024180758744478226
0.02253827638924122
0.021085234358906746


In [57]:
outputs.shape

torch.Size([2, 15, 7])

In [58]:
outputs[0]

tensor([[ 0.1924, -0.7306,  0.0602, -1.0373, -1.0373, -2.6971, -1.7461],
        [-4.2316,  3.4399,  1.2369, -0.7927, -1.0359,  2.2874,  0.7000],
        [ 0.6310, -0.0917, -0.5092, -0.8957, -1.9059, -3.6407, -1.2883],
        [ 0.5385, -0.2688,  0.3680, -0.6990, -1.1670, -2.2496, -0.8685],
        [ 0.9083,  3.6521,  0.1939, -1.3532, -5.7020,  2.2908,  9.0254],
        [-1.5692, -0.8419, -2.1892,  4.7700,  1.8199, -2.5830, -3.9032],
        [-3.4662,  0.7420, -1.7625, -1.1869,  1.6814,  8.1480,  0.9097],
        [ 0.1939, -0.5840,  0.4655, -2.2933, -1.4949,  2.5773,  3.6119],
        [ 3.0281,  2.2016, -4.9895,  8.2871,  0.6823,  0.7786, -1.4901],
        [ 2.6617, -5.0407,  7.8242, -4.2683,  4.3222, -3.2356, -0.5837],
        [-2.1894, -3.4701,  2.3557, -5.1361,  0.3279, -0.1154,  2.2946],
        [-1.0556,  7.9106, -5.8047,  2.5957, -7.5132,  1.6718,  4.1276],
        [-1.5776, -3.3483, -1.8648, -0.1453,  5.6113,  4.3646, -2.6107],
        [ 2.8453, -6.8360,  2.6319,  0.4222,  9.874

```
1 từ tương ứng với 15 class được dự đoán -> lấy argmax theo chiều 15 (chiều 1)

```

In [63]:
outputs[0].T[0] # ~ các class của từ thứ nhất được dự đoán 

tensor([ 0.1924, -4.2316,  0.6310,  0.5385,  0.9083, -1.5692, -3.4662,  0.1939,
         3.0281,  2.6617, -2.1894, -1.0556, -1.5776,  2.8453,  8.2451],
       grad_fn=<SelectBackward0>)

In [64]:
max(outputs[0].T[0]) # ~ index 14 

tensor(8.2451, grad_fn=<UnbindBackward0>)

In [54]:
torch.argmax(outputs, 1)

tensor([[14, 11,  9,  8, 13,  6,  4],
        [ 7,  5, 12, 10,  4,  1,  1]])

In [55]:
data_y_ids

tensor([[14, 11,  9,  8, 13,  6,  4],
        [ 7,  5, 12, 10,  4,  1,  1]])

### 3. INFERENCE

In [66]:
promt = '<sos_topic2> có'
promt = promt.split()
promt_ids = [vocab[token] for token in promt][:sequence_length]
promt_ids = promt_ids + [vocab["<pad>"]] * (sequence_length - len(promt))

print(promt_ids)

[3, 7, 1, 1, 1, 1, 1]


In [73]:

id2label = {id: label for label, id in vocab.get_stoi().items()}

for i in range(sequence_length - len(promt)):
    promt_tensor = torch.tensor(promt_ids, dtype=torch.long).reshape(1, -1)
    outputs = model(promt_tensor)
    outputs = torch.argmax(outputs, axis=1)   
    next_id = outputs[0][len(promt)+i-1]

    promt_ids[len(promt)+i] = next_id.item()
    print(promt_ids)
    prompt_token = [id2label[id] for id in promt_ids]
    print(prompt_token) 

[3, 7, 5, 1, 1, 1, 1]
['<sos_topic2>', 'có', 'chí', '<pad>', '<pad>', '<pad>', '<pad>']
[3, 7, 5, 12, 1, 1, 1]
['<sos_topic2>', 'có', 'chí', 'thì', '<pad>', '<pad>', '<pad>']
[3, 7, 5, 12, 10, 1, 1]
['<sos_topic2>', 'có', 'chí', 'thì', 'nên', '<pad>', '<pad>']
[3, 7, 5, 12, 10, 4, 1]
['<sos_topic2>', 'có', 'chí', 'thì', 'nên', '<eos>', '<pad>']
[3, 7, 5, 12, 10, 4, 1]
['<sos_topic2>', 'có', 'chí', 'thì', 'nên', '<eos>', '<pad>']
