## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

corpus = [
    "i want a dog",
    "books are quite expensive"    
]
data_size = len(corpus)

# 0: noun/pronoun - 1: verb - others - 2
labels = [[0, 1, 2, 0],
          [0, 1, 2, 2]]

# Define the max vocabulary size and sequence length
vocab_size = 8
sequence_length = 4

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(corpus),
                                  max_tokens=vocab_size)
vocab.get_stoi()

{'i': 5,
 'a': 0,
 'are': 1,
 'quite': 6,
 'dog': 3,
 'books': 2,
 'expensive': 4,
 'want': 7}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length):
    tokens = tokenizer(text)
    token_ids = [vocab[token] for token in tokens]
    return torch.tensor(token_ids, dtype=torch.long)

# Vectorize the samples
corpus_ids = []
for sentence in corpus:
    corpus_ids.append(vectorize(sentence, vocab, sequence_length))

In [4]:
for v in corpus_ids:
    print(v)

tensor([5, 7, 0, 3])
tensor([2, 1, 6, 4])


## Model

In [5]:
class POS_Model(nn.Module):
    def __init__(self, vocab_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 4)
        self.fc = nn.Linear(4, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.fc(x)
        return x.permute(0, 2, 1)

model = POS_Model(vocab_size, 3)

In [6]:
input_1 = torch.tensor([[5, 7, 0, 3]], dtype=torch.long)
output = model(input_1)
print(output.shape)

torch.Size([1, 3, 4])


## Train with full data

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)

In [8]:
input_data = torch.tensor( [[5, 7, 0, 3],
                            [2, 1, 7, 4]], dtype=torch.long)
label_data = torch.tensor([[0, 1, 2, 0],
                           [0, 1, 2, 2]], dtype=torch.long)

for _ in range(30):
    optimizer.zero_grad()
    outputs = model(input_data)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.0044304132461548
0.948207437992096
0.9011487364768982
0.861030101776123
0.8250694274902344
0.7915202379226685
0.7594226598739624
0.7282181978225708
0.6976487636566162
0.6676508188247681
0.6382557153701782
0.6095340847969055
0.5815730094909668
0.554461658000946
0.5282794833183289
0.5030937790870667
0.4789634048938751
0.4559418261051178
0.4340704083442688
0.4133661985397339
0.3938120901584625
0.37535619735717773
0.3579255938529968
0.34144851565361023
0.3258778750896454
0.31120485067367554
0.29745540022850037
0.2846730053424835
0.2728950083255768
0.26213353872299194


In [9]:
outputs = model(input_data)
o_softmax = torch.softmax(outputs, axis=1)

print(o_softmax[0, :, 0])
print(o_softmax[0, :, 1])
print(o_softmax[0, :, 2])
print(o_softmax[0, :, 3])

print()
print(o_softmax[1, :, 0])
print(o_softmax[1, :, 1])
print(o_softmax[1, :, 2])
print(o_softmax[1, :, 3])

tensor([0.9853, 0.0131, 0.0016], grad_fn=<SelectBackward0>)
tensor([0.0065, 0.4874, 0.5061], grad_fn=<SelectBackward0>)
tensor([1.8751e-05, 1.1899e-01, 8.8099e-01], grad_fn=<SelectBackward0>)
tensor([9.7668e-01, 2.2620e-02, 6.9982e-04], grad_fn=<SelectBackward0>)

tensor([0.9872, 0.0106, 0.0022], grad_fn=<SelectBackward0>)
tensor([0.0636, 0.7345, 0.2019], grad_fn=<SelectBackward0>)
tensor([0.0065, 0.4874, 0.5061], grad_fn=<SelectBackward0>)
tensor([0.0187, 0.1055, 0.8758], grad_fn=<SelectBackward0>)


In [10]:
# labels = [[0, 1, 2, 0],
#          [0, 1, 2, 2]]

o_softmax.argmax(axis=1)

tensor([[0, 2, 2, 0],
        [0, 1, 2, 2]])