## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

corpus = [
    "i like cats and dogs",
    "books are expensive"    
]
data_size = len(corpus)

# 0: noun/pronoun - 1: verb - others - 2
labels = [[0, 1, 0, 2, 0],
          [0, 1, 2]]

# Define the max vocabulary size and sequence length
vocab_size = 10
sequence_length = 5

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(corpus),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'books': 4,
 '<unk>': 0,
 'expensive': 7,
 'dogs': 6,
 'and': 2,
 '<pad>': 1,
 'are': 3,
 'cats': 5,
 'i': 8,
 'like': 9}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label):
    tokens = tokenizer(text)
    
    token_ids = [vocab[token] for token in tokens]    
    token_ids = token_ids + [vocab["<pad>"]] * (sequence_length - len(tokens))
    sequence_label = sequence_label + [3] * (sequence_length - len(tokens))
    
    return torch.tensor(token_ids, dtype=torch.long), torch.tensor(sequence_label, dtype=torch.long)

# Vectorize the samples
sentence_vecs = []
label_vecs = []
for sentence, labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
for v in sentence_vecs:
    print(v)

tensor([8, 9, 5, 2, 6])
tensor([4, 3, 7, 1, 1])


In [5]:
for v in label_vecs:
    print(v)

tensor([0, 1, 0, 2, 0])
tensor([0, 1, 2, 3, 3])


## Model

In [6]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=embed_dim, 
                                          num_heads=num_heads)
        self.ffn = nn.Linear(in_features=embed_dim, 
                             out_features=ff_dim)
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, query, key, value):
        attn_output, _ = self.attn(query, key, value)
        out_1 = self.layernorm_1(query + attn_output)
        ffn_output = self.ffn(out_1)
        x = self.layernorm_2(out_1 + ffn_output)
        return x

In [7]:
class POS_Model(nn.Module):
    def __init__(self, vocab_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 5)
        self.transformer = TransformerBlock(5, 1, 5)
        self.fc = nn.Linear(5, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x, x, x)
        x = self.fc(x)
        return x.permute(0, 2, 1)

num_classes = 4
model = POS_Model(vocab_size, num_classes)
print(model)

# test
data = torch.tensor([[3, 2, 3, 2, 1]]).long()
output = model(data)
print(output.shape)

POS_Model(
  (embedding): Embedding(10, 5)
  (transformer): TransformerBlock(
    (attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=5, out_features=5, bias=True)
    )
    (ffn): Linear(in_features=5, out_features=5, bias=True)
    (layernorm_1): LayerNorm((5,), eps=1e-05, elementwise_affine=True)
    (layernorm_2): LayerNorm((5,), eps=1e-05, elementwise_affine=True)
  )
  (fc): Linear(in_features=5, out_features=4, bias=True)
)
torch.Size([1, 4, 5])


## Train with full data

In [8]:
criterion = nn.CrossEntropyLoss(ignore_index=3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.2)

In [9]:
input_data = torch.tensor( [[8, 9, 5, 2, 6],
                            [4, 3, 7, 1, 1]], dtype=torch.long)
label_data = torch.tensor([[0, 1, 0, 2, 0],
                           [0, 1, 2, 3, 3]], dtype=torch.long)

for _ in range(15):
    optimizer.zero_grad()
    outputs = model(input_data)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.3730652332305908
0.9424481987953186
0.8031702637672424
0.6116276979446411
0.558436393737793
0.3493475914001465
0.2334384322166443
0.14082038402557373
0.9800198078155518
0.24309507012367249
0.28033679723739624
0.12968376278877258
0.12041982263326645
0.09843364357948303
0.07493124902248383


In [10]:
outputs = model(input_data)
o_softmax = torch.softmax(outputs, axis=1)

print(o_softmax[0, :, 0].detach().numpy())
print(o_softmax[0, :, 1].detach().numpy())
print(o_softmax[0, :, 2].detach().numpy())
print(o_softmax[0, :, 3].detach().numpy())
print(o_softmax[0, :, 4].detach().numpy())

print()
print(o_softmax[1, :, 0].detach().numpy())
print(o_softmax[1, :, 1].detach().numpy())
print(o_softmax[1, :, 2].detach().numpy())
print(o_softmax[1, :, 3].detach().numpy())
print(o_softmax[1, :, 4].detach().numpy())

[9.9209279e-01 7.6937717e-03 1.7033132e-04 4.3163454e-05]
[2.8505813e-02 9.7148895e-01 1.6684228e-07 5.0659983e-06]
[0.6495717  0.11612322 0.23328903 0.00101611]
[2.3904698e-02 4.4751982e-07 9.7608352e-01 1.1419012e-05]
[9.9863368e-01 4.1596583e-04 9.1880467e-04 3.1544521e-05]

[9.6370602e-01 3.6170281e-02 7.2652954e-05 5.1175593e-05]
[2.7319780e-02 9.7267514e-01 1.6742366e-07 5.0073004e-06]
[2.0866888e-02 2.7849629e-07 9.7912335e-01 9.4053612e-06]
[9.9363291e-01 6.1104833e-03 2.1325971e-04 4.3309446e-05]
[4.0800458e-01 5.9195185e-01 5.8988999e-06 3.7583854e-05]


In [11]:
# [[0, 1, 0, 2, 0],
#  [0, 1, 2, *, *]]

o_softmax.argmax(axis=1)

tensor([[0, 1, 0, 2, 0],
        [0, 1, 2, 0, 1]])