## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy corpus: two short, lowercase sentences.
corpus = ["i want a dog", "books are expensive"]

# One tag per word, aligned position-by-position with the sentences above.
#   0 = noun/pronoun, 1 = verb, 2 = anything else
labels = [
    [0, 1, 2, 0],  # i / want / a / dog
    [0, 1, 2],     # books / are / expensive
]

# vocab_size: maximum number of vocabulary entries (the corpus has 7 distinct words).
# sequence_length: 4, the word count of the longer sentence.
vocab_size, sequence_length = 7, 4

In [2]:
# Flatten the corpus into one list of whitespace-separated words.
corpus_words = [word for sentence in corpus for word in sentence.split()]

data_size = len(corpus_words)
print(corpus_words)

# Flatten the per-sentence tag lists into one list aligned with corpus_words.
# The loop variable must NOT be called `labels`: rebinding that name would
# replace the nested label list with its last row, so re-running this cell
# (or any later use of `labels`) would fail or see corrupted data.
label_words = [tag for sentence_labels in labels for tag in sentence_labels]

# Every word needs exactly one tag; catch corpus/label mismatches early.
assert len(label_words) == data_size, "number of labels must match number of words"
print(label_words)

['i', 'want', 'a', 'dog', 'books', 'are', 'expensive']
[0, 1, 2, 0, 0, 1, 2]


In [3]:
# torchtext's 'basic_english' tokenizer, used to split each sentence into tokens
tokenizer = get_tokenizer('basic_english')


def yield_tokens(examples):
    """Lazily yield one list of tokens per text in `examples`."""
    yield from map(tokenizer, examples)


# Build the token -> index vocabulary from the corpus, capped at vocab_size entries
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    max_tokens=vocab_size,
)
# Show the resulting string-to-index mapping
vocab.get_stoi()

{'i': 5, 'a': 0, 'are': 1, 'dog': 3, 'books': 2, 'expensive': 4, 'want': 6}

In [4]:
# Look up the vocabulary index of every corpus word and pack the indices
# into a 1-D integer (long) tensor.
token_ids = torch.tensor(
    [vocab[word] for word in corpus_words],
    dtype=torch.long,
)
print(token_ids)

tensor([5, 6, 0, 3, 2, 1, 4])


## Train with full data

In [5]:
# Seed the global RNG so the random weight initialisation below is repeatable:
# re-running the notebook on the same PyTorch build then starts training from
# the same weights instead of producing different losses on every run.
torch.manual_seed(0)

embedding_dim = 4  # length of each word vector
num_classes = 3    # tags: 0 = noun/pronoun, 1 = verb, 2 = other

# Lookup table mapping a token id to an embedding_dim-dimensional vector
embedding = nn.Embedding(vocab_size, embedding_dim)
# Linear classifier over the flattened embedding; in_features == embedding_dim
# because each sample fed to the model holds a single token
fc = nn.Linear(embedding_dim, num_classes)
# Collapses all dimensions after the batch dimension into one
flatten = nn.Flatten()
# token id -> embedding -> flatten -> class logits
model = nn.Sequential(embedding, flatten, fc)

In [6]:
# Adam over every model parameter (learning rate 0.1), trained against a
# cross-entropy objective computed on the model's raw logits.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()

In [7]:
# Build the training tensors from the data prepared in the earlier cells rather
# than re-typing the literals, so they cannot drift out of sync with the corpus,
# the labels or the vocabulary.
# Each word is its own sample, so the ids become a column of shape (num_words, 1):
#   tensor([[5], [6], [0], [3], [2], [1], [4]]) for the corpus above
input_data = token_ids.unsqueeze(1)
# One class id per word: tensor([0, 1, 2, 0, 0, 1, 2]) for the labels above
label_data = torch.tensor(label_words, dtype=torch.long)
assert input_data.shape[0] == label_data.shape[0], "every word needs exactly one label"

num_epochs = 30  # full-batch gradient steps
for _ in range(num_epochs):
    optimizer.zero_grad()                  # clear gradients left from the previous step
    outputs = model(input_data)            # class logits, one row per word
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.0302027463912964
0.7981552481651306
0.6376959681510925
0.5274735689163208
0.44960522651672363
0.3918246626853943
0.34478044509887695
0.3014029860496521
0.2576104998588562
0.21222460269927979
0.16613294184207916
0.12167713791131973
0.08220910280942917
0.0509515181183815
0.02927975542843342
0.016021504998207092
0.008621891029179096
0.00469199800863862
0.00263197161257267
0.0015381263801828027
0.0009400874259881675
0.000600572966504842
0.000399845332140103
0.0002762766380328685
0.00019734578381758183
0.00014517587260343134
0.00010964039393002167
8.47893170430325e-05
6.700028461636975e-05
5.3976680646883324e-05


In [8]:
# Inference only: run the forward pass without autograd so no computation graph
# is built and the printed tensors carry no grad_fn.
with torch.no_grad():
    outputs = model(input_data)  # raw class logits, one row per word
print(outputs)
# Normalise each row of logits into class probabilities (rows sum to 1)
print(torch.softmax(outputs, dim=-1))

tensor([[  7.6817,  -8.1848,  -6.0648],
        [ -6.1424,  10.2136,  -1.0948],
        [ -1.1817,  -4.7183,   7.0409],
        [  7.8595,  -8.3286,  -4.0635],
        [  8.4120, -10.0803,  -3.4524],
        [ -5.1597,  10.0293,  -2.1230],
        [ -3.1693,  -4.9210,  11.1362]], grad_fn=<AddmmBackward0>)
tensor([[1.0000e+00, 1.2862e-07, 1.0715e-06],
        [7.8825e-08, 9.9999e-01, 1.2268e-05],
        [2.6845e-04, 7.8148e-06, 9.9972e-01],
        [9.9999e-01, 9.3239e-08, 6.6363e-06],
        [9.9999e-01, 9.3095e-09, 7.0364e-06],
        [2.5322e-07, 9.9999e-01, 5.2766e-06],
        [6.1258e-07, 1.0628e-07, 1.0000e+00]], grad_fn=<SoftmaxBackward0>)


In [9]:
# Index of the highest-scoring class for each word.
# For reference, the training labels (label_data) are [0, 1, 2, 0, 0, 1, 2].
torch.argmax(outputs, dim=1)

tensor([0, 1, 2, 0, 0, 1, 2])