In [1]:
import torch 
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [7]:
# Limits for the English side: maximum vocabulary size and sequence length.
vocab_size_en = 7
sequence_length_en = 3

# Toy English corpus, one sentence per entry.
corpus_en = ["good morning", "ai books"]
data_size_en = len(corpus_en)

In [8]:
tokenizer_en = get_tokenizer("basic_english")


def yield_token(examples):
    """Lazily yield the `tokenizer_en` output for each text in `examples`."""
    yield from (tokenizer_en(sample) for sample in examples)


# Build the English vocabulary from the tokenized corpus, limited by
# `vocab_size_en` and seeded with the special tokens listed below.
vocab_en = build_vocab_from_iterator(
    iterator=yield_token(corpus_en),
    max_tokens=vocab_size_en,
    specials=["<unk>", "<pad>", "<eos>"],
)

# Use the <unk> index as the vocabulary's default index.
vocab_en.set_default_index(vocab_en["<unk>"])
vocab_en.get_stoi()

{'morning': 6,
 'good': 5,
 'books': 4,
 'ai': 3,
 '<eos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [20]:

def vectorize_en(text, vocab_en, sequence_length_en):
    """Convert an English sentence into a fixed-length list of token ids.

    The sentence is split on whitespace, each token is looked up in
    ``vocab_en``, an ``<eos>`` id is appended, and the result is right-padded
    with ``<pad>`` ids up to ``sequence_length_en``.

    Over-long sentences are truncated *before* ``<eos>`` is appended, so the
    end-of-sequence marker is always kept instead of being sliced off.

    Args:
        text: Sentence to encode.
        vocab_en: Token-to-id mapping indexable with ``[]``; must contain
            ``"<eos>"`` and ``"<pad>"``.
        sequence_length_en: Length of the returned list. Values <= 0 yield
            an empty list.

    Returns:
        A list of exactly ``max(sequence_length_en, 0)`` token ids.
    """
    target_length = max(sequence_length_en, 0)

    # NOTE(review): this is a plain whitespace split, while the vocabulary in
    # this notebook is built with the "basic_english" tokenizer -- confirm the
    # inputs are already normalized the same way.
    # Reserve one slot for <eos> so truncation can never drop it.
    tokens = text.split()[:max(target_length - 1, 0)]

    token_ids = [vocab_en[token] for token in tokens] + [vocab_en["<eos>"]]
    token_ids = token_ids[:target_length]  # only matters when target_length == 0

    padding = [vocab_en["<pad>"]] * (target_length - len(token_ids))
    return token_ids + padding


# Encode every English sentence into a row of token ids.
corpus_ids_en = [
    vectorize_en(sentence, vocab_en, sequence_length_en)
    for sentence in corpus_en
]

# Stack the rows into a single integer tensor.
en_data = torch.tensor(corpus_ids_en, dtype=torch.long)
en_data

tensor([[5, 6, 2],
        [3, 4, 2]])

In [16]:
# Limits for the Vietnamese side: maximum vocabulary size and sequence length.
vocab_size_vn = 9
sequence_length_vn = 4

# Toy Vietnamese corpus, one sentence per entry.
corpus_vn = ["chào buổi sáng", "sách ai"]
data_size_vn = len(corpus_vn)

In [17]:
# NOTE(review): the "basic_english" tokenizer is applied to Vietnamese text
# here -- confirm its normalization is appropriate for this corpus.
tokenizer_vi = get_tokenizer("basic_english")

# Tokens are fed through an inline generator expression instead of a second
# `yield_token` definition: re-defining that helper in this cell would
# silently shadow the one declared for the English vocabulary.
vocab_vi = build_vocab_from_iterator(
    iterator=(tokenizer_vi(text) for text in corpus_vn),
    max_tokens=vocab_size_vn,
    specials=["<unk>", "<pad>", "<eos>", "<sos>"],
)

# Use the <unk> index as the vocabulary's default index.
vocab_vi.set_default_index(vocab_vi["<unk>"])
vocab_vi.get_stoi()

{'sáng': 8,
 'sách': 7,
 'chào': 6,
 'buổi': 5,
 'ai': 4,
 '<sos>': 3,
 '<eos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [19]:
def vectorize_vi(text, vocab_vi, sequence_length_vn):
    """Convert a Vietnamese sentence into a fixed-length list of token ids.

    The sentence is split on whitespace, each token is looked up in
    ``vocab_vi``, the ids are wrapped as ``<sos> ... <eos>``, and the result
    is right-padded with ``<pad>`` ids up to ``sequence_length_vn``.

    Over-long sentences are truncated *before* the markers are added, so the
    ``<eos>`` marker is kept instead of being sliced off (whenever the
    requested length has room for both markers).

    Args:
        text: Sentence to encode.
        vocab_vi: Token-to-id mapping indexable with ``[]``; must contain
            ``"<sos>"``, ``"<eos>"`` and ``"<pad>"``.
        sequence_length_vn: Length of the returned list. Values <= 0 yield
            an empty list.

    Returns:
        A list of exactly ``max(sequence_length_vn, 0)`` token ids.
    """
    target_length = max(sequence_length_vn, 0)

    # NOTE(review): this is a plain whitespace split, while the vocabulary in
    # this notebook is built with a torchtext tokenizer -- confirm the inputs
    # are already normalized the same way.
    # Reserve two slots (<sos> and <eos>) so truncation can never drop <eos>.
    tokens = text.split()[:max(target_length - 2, 0)]

    token_ids = (
        [vocab_vi["<sos>"]]
        + [vocab_vi[token] for token in tokens]
        + [vocab_vi["<eos>"]]
    )
    # Only trims when target_length < 2, i.e. there is no room for both markers.
    token_ids = token_ids[:target_length]

    padding = [vocab_vi["<pad>"]] * (target_length - len(token_ids))
    return token_ids + padding

# Encode every Vietnamese sentence into a row of token ids.
# NOTE(review): the length passed is `sequence_length_vn + 1`; presumably the
# extra position allows shifted decoder input/target slices -- confirm.
corpus_ids_vi = [
    vectorize_vi(sentence, vocab_vi, sequence_length_vn + 1)
    for sentence in corpus_vn
]

# Stack the rows into a single integer tensor.
data_vi = torch.tensor(corpus_ids_vi, dtype=torch.long)
data_vi

tensor([[3, 6, 5, 8, 2],
        [3, 7, 4, 2, 1]])