## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy corpus: two short sentences used for the token-level tagging demo
corpus = ["i like cats and dogs", "books are expensive"]
data_size = len(corpus)

# One tag per token -- 0: noun/pronoun, 1: verb, 2: other
# (label 3 is added later, during vectorization, for padded positions)
labels = [
    [0, 1, 0, 2, 0],
    [0, 1, 2],
]

# Maximum vocabulary size and the fixed (padded) sequence length
vocab_size, sequence_length = 10, 5

In [2]:
# torchtext's built-in 'basic_english' tokenizer
tokenizer = get_tokenizer('basic_english')


def yield_tokens(examples):
    """Lazily yield the token list of every text in `examples`."""
    yield from (tokenizer(text) for text in examples)


# Build the vocabulary from the corpus; the two special tokens are listed
# explicitly and the total size is capped at `vocab_size`.
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)
# Tokens missing from the vocabulary resolve to the "<unk>" id
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'books': 4,
 '<unk>': 0,
 'expensive': 7,
 'dogs': 6,
 'and': 2,
 '<pad>': 1,
 'are': 3,
 'cats': 5,
 'i': 8,
 'like': 9}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label, pad_label=3):
    """Convert a sentence and its per-token labels into fixed-length id tensors.

    The text is tokenized with the notebook-level `tokenizer` and mapped to
    vocabulary ids. Ids and labels are then truncated or right-padded so that
    both have exactly `sequence_length` entries.

    Args:
        text: raw sentence to tokenize.
        vocab: mapping from token to integer id; must contain "<pad>".
        sequence_length: length of both returned tensors.
        sequence_label: one integer label per token of `text`.
        pad_label: label written at padded positions (default 3, the index
            the notebook's loss is configured to ignore).

    Returns:
        (token_ids, labels): two 1-D `torch.long` tensors of length
        `sequence_length`.

    Raises:
        ValueError: if the number of labels differs from the number of tokens.
    """
    tokens = tokenizer(text)
    if len(sequence_label) != len(tokens):
        raise ValueError(
            f"expected {len(tokens)} labels for {text!r}, got {len(sequence_label)}"
        )

    # Truncate first so over-long sentences still yield fixed-length tensors.
    token_ids = [vocab[token] for token in tokens][:sequence_length]
    # Copy the labels so the caller's list is never modified.
    label_ids = list(sequence_label)[:sequence_length]

    pad_count = sequence_length - len(token_ids)
    token_ids += [vocab["<pad>"]] * pad_count
    label_ids += [pad_label] * pad_count

    return (
        torch.tensor(token_ids, dtype=torch.long),
        torch.tensor(label_ids, dtype=torch.long),
    )

# Vectorize the samples
sentence_vecs = []
label_vecs = []
# The loop variable is deliberately NOT named `labels`: reusing that name
# would rebind the module-level `labels` list to the last sentence's tags,
# so re-running this cell (or any later use of `labels`) would break.
for sentence, sentence_labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, sentence_labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
# Show the padded token-id tensor of each sentence
for token_id_vec in sentence_vecs:
    print(token_id_vec)

tensor([8, 9, 5, 2, 6])
tensor([4, 3, 7, 1, 1])


In [5]:
# Show the padded label tensor of each sentence
for tag_vec in label_vecs:
    print(tag_vec)

tensor([0, 1, 0, 2, 0])
tensor([0, 1, 2, 3, 3])


## Model

In [6]:
class POS_Model(nn.Module):
    """Token-level tagger: embedding -> single-layer RNN -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        num_classes: number of output classes per token.
        embedding_dim: size of each token embedding (default 4).
        hidden_dim: size of the RNN hidden state (default 3).
    """

    def __init__(self, vocab_size, num_classes, embedding_dim=4, hidden_dim=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.recurrent = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return per-token class logits.

        Args:
            x: integer token ids of shape (batch, seq_len).

        Returns:
            Logits of shape (batch, num_classes, seq_len). The class axis is
            moved to position 1 because that is the layout
            nn.CrossEntropyLoss expects for sequence targets.
        """
        embedded = self.embedding(x)               # (batch, seq_len, embedding_dim)
        hidden_states, _ = self.recurrent(embedded)  # (batch, seq_len, hidden_dim)
        logits = self.fc(hidden_states)            # (batch, seq_len, num_classes)
        return logits.permute(0, 2, 1)

# Tags 0-2 plus label 3, which marks padded positions
num_classes = 4
# Seed the RNG before building the model so weight initialisation (and hence
# the training run that follows) is reproducible on Restart & Run All.
torch.manual_seed(0)
model = POS_Model(vocab_size, num_classes)

# Smoke test: one sequence of 5 token ids should give logits of shape
# (1, num_classes, 5)
data = torch.tensor([[3, 2, 3, 2, 1]]).long()
output = model(data)
print(output.shape)

torch.Size([1, 4, 5])


## Train with full data

In [7]:
# Padded positions carry label 3, so they are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=3)
# Adam over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.2)

In [8]:
# Build the batch from the vectorized samples instead of re-typing the ids by
# hand, so the tensors always agree with the vocabulary and labels built above.
input_data = torch.stack(sentence_vecs)  # (num_sentences, sequence_length)
label_data = torch.stack(label_vecs)     # same shape; padded positions hold label 3

# Full-batch training: every step uses the whole (tiny) dataset
for _ in range(10):
    optimizer.zero_grad()
    outputs = model(input_data)            # (batch, num_classes, seq_len)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.22955322265625
0.9621938467025757
0.7904495596885681
0.6373740434646606
0.5405081510543823
0.4464695155620575
0.3311528265476227
0.20919638872146606
0.14388348162174225
0.10053516179323196


In [9]:
# Inference only: no gradients are needed, which also makes .detach() unnecessary
with torch.no_grad():
    outputs = model(input_data)
    # Normalise over the class axis (dim 1) to get per-token probabilities
    o_softmax = torch.softmax(outputs, dim=1)

# Print one probability vector per token, with a blank line between sentences.
# Looping over the tensor's own shape avoids hard-coding batch/sequence sizes.
batch_size, _, seq_len = o_softmax.shape
for sentence_idx in range(batch_size):
    if sentence_idx > 0:
        print()
    for token_idx in range(seq_len):
        print(o_softmax[sentence_idx, :, token_idx].numpy())

[9.7099161e-01 2.0625002e-03 2.6429052e-02 5.1696226e-04]
[5.2879187e-03 9.8827237e-01 5.9400392e-03 4.9965602e-04]
[9.7054142e-01 2.0601545e-03 2.6861716e-02 5.3675903e-04]
[2.6241654e-01 3.3003416e-02 7.0450753e-01 7.2555369e-05]
[9.7104925e-01 2.0592909e-03 2.6374552e-02 5.1680766e-04]

[9.7104734e-01 2.0594087e-03 2.6376387e-02 5.1682291e-04]
[5.2862829e-03 9.8827851e-01 5.9363842e-03 4.9875694e-04]
[0.04853906 0.00514811 0.94501555 0.0012973 ]
[2.6773903e-01 3.2728899e-02 6.9946116e-01 7.0995375e-05]
[2.6768428e-01 7.3921956e-02 6.5826005e-01 1.3370054e-04]


In [10]:
# Expected tags (* = padded position, ignored by the loss):
# [[0, 1, 0, 2, 0],
#  [0, 1, 2, *, *]]

# Most probable class per token
torch.argmax(o_softmax, dim=1)

tensor([[0, 1, 0, 2, 0],
        [0, 1, 2, 2, 2]])