## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy dataset: two short sentences, one per class.
corpus = ["gậy ông đập lưng ông", "có làm mới có ăn"]
data_size = len(corpus)

# Class index for each sentence, aligned with `corpus` by position
# (0 = negative, 1 = positive).
labels = [0, 1]

# Upper bound on the number of vocabulary entries, and the fixed number of
# token ids every sample is truncated/padded to.
vocab_size, sequence_length = 8, 5

In [2]:
# Tokenizer callable obtained from torchtext's "basic_english" preset;
# it is called on a sentence string wherever text needs to be split into tokens.
tokenizer = get_tokenizer("basic_english")

def yield_tokens(examples):
    """Lazily yield the tokenized form of each text in `examples`.

    Args:
        examples: iterable of strings to tokenize.

    Yields:
        The result of calling the module-level `tokenizer` on each text,
        in the same order as `examples`.
    """
    yield from map(tokenizer, examples)

# Build the vocabulary from the tokenized corpus, capped at `vocab_size`
# entries, with "<unk>" and "<pad>" registered as special tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)

# Tokens that are not in the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

# Display the token -> index mapping.
vocab.get_stoi()

{'ông': 3,
 '<unk>': 0,
 '<pad>': 1,
 'mới': 7,
 'có': 2,
 'gậy': 4,
 'làm': 5,
 'lưng': 6}

In [3]:
def vectorize(text, vocab, sequence_length):
    """Turn a sentence into a fixed-length tensor of token ids.

    The text is split with the module-level `tokenizer`, every token is
    looked up in `vocab`, and the id list is cut to `sequence_length`
    entries or right-padded with the "<pad>" id up to that length.

    Args:
        text: sentence to convert.
        vocab: object indexable by token string; must contain "<pad>".
        sequence_length: number of ids in the returned tensor.

    Returns:
        1-D `torch.long` tensor of token ids.
    """
    tokens = tokenizer(text)

    # Look up every token first, then truncate to the requested length.
    ids = []
    for token in tokens:
        ids.append(vocab[token])
    ids = ids[:sequence_length]

    # A non-positive count yields an empty list, so long inputs get no padding.
    pad_count = sequence_length - len(tokens)
    ids.extend([vocab["<pad>"]] * pad_count)

    return torch.tensor(ids, dtype=torch.long)

# One fixed-length id tensor per corpus sentence, in corpus order.
corpus_ids = [vectorize(sentence, vocab, sequence_length) for sentence in corpus]

In [4]:
# Show the id tensor produced for each sentence.
for sentence_ids in corpus_ids:
    print(sentence_ids)

tensor([4, 3, 0, 6, 3])
tensor([2, 5, 7, 2, 0])


## Model

In [5]:
# Layer sizes are named and derived instead of hard-coded, so the classifier
# keeps working if `sequence_length` or the embedding width changes.
embedding_dim = 2   # width of each token embedding
num_classes = 2     # one logit per label value (0 and 1)

# Token ids -> embeddings -> flattened per-sample vector -> class logits.
embedding = nn.Embedding(vocab_size, embedding_dim)
flatten = nn.Flatten()
# After flattening, each sample has sequence_length * embedding_dim features.
fc = nn.Linear(sequence_length * embedding_dim, num_classes)

model = nn.Sequential(embedding, flatten, fc)

## Train with full data

In [6]:
# Build a freshly initialized model for the training run.
# Parameter initialization draws from torch's global RNG, so seed it first to
# make the run reproducible under Restart & Run All.
# NOTE: outputs recorded from an earlier, unseeded run will not match exactly.
torch.manual_seed(0)

embedding_dim = 2   # width of each token embedding
num_classes = 2     # one logit per label value (0 and 1)

embedding = nn.Embedding(vocab_size, embedding_dim)
flatten = nn.Flatten()
# After flattening, each sample has sequence_length * embedding_dim features.
fc = nn.Linear(sequence_length * embedding_dim, num_classes)
model = nn.Sequential(embedding, flatten, fc)

In [7]:
# Adam updates every parameter of `model`; the loss is cross-entropy computed
# from the model's raw outputs and the integer class labels.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

In [8]:
# Build the training batch from the vectorized corpus and its labels rather
# than re-typing the token ids by hand, so the batch cannot drift from `corpus`.
input_data = torch.stack(corpus_ids)                  # one row of token ids per sentence
label_data = torch.tensor(labels, dtype=torch.long)   # class index per sentence

# Full-batch training: every step uses all samples at once.
num_epochs = 40
for _ in range(num_epochs):
    optimizer.zero_grad()                  # clear gradients from the previous step
    outputs = model(input_data)
    loss = criterion(outputs, label_data)
    print(loss.item())                     # loss measured before this step's update
    loss.backward()
    optimizer.step()

0.8623576164245605
0.7884228229522705
0.7197414636611938
0.6560235023498535
0.597088634967804
0.5427716374397278
0.4928787350654602
0.44717830419540405
0.40541279315948486
0.36731454730033875
0.3326173722743988
0.3010646104812622
0.2724122405052185
0.24643026292324066
0.22290310263633728
0.20162859559059143
0.18241725862026215
0.16509178280830383
0.1494859755039215
0.13544461131095886
0.12282281368970871
0.11148591339588165
0.10130936652421951
0.09217812120914459
0.08398692309856415
0.07663922756910324
0.07004741579294205
0.06413175165653229
0.058820582926273346
0.054049041122198105
0.049759045243263245
0.04589856043457985
0.04242105036973953
0.0392848439514637
0.03645308315753937
0.03389272093772888
0.03157456964254379
0.029472611844539642
0.027563821524381638
0.025827649980783463


In [9]:
# Forward pass on the training batch: print the raw outputs, then the same
# values normalized with softmax over the last dimension.
outputs = model(input_data)
print(outputs)
print(outputs.softmax(dim=-1))

tensor([[ 1.5890, -2.2094],
        [-1.7302,  1.8935]], grad_fn=<AddmmBackward0>)
tensor([[0.9781, 0.0219],
        [0.0260, 0.9740]], grad_fn=<SoftmaxBackward0>)


In [10]:
# Index of the largest output per sample; the expected labels are [0, 1].
outputs.argmax(dim=1)

tensor([0, 1])