In [17]:
import torch
import torch.nn as nn

# 1. Tokenize

In [18]:
from torchtext.data import  get_tokenizer

tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("The animal did not cross the street because it was tired.")
tokens  # simple tokenization

['the',
 'animal',
 'did',
 'not',
 'cross',
 'the',
 'street',
 'because',
 'it',
 'was',
 'tired',
 '.']

In [19]:
# 1. Build Vocab
from torchtext.vocab import build_vocab_from_iterator

vocab = build_vocab_from_iterator(
    [tokens],
)

print(vocab.stoi)  # dict/vocab created with ids, stoi (string to int)

1lines [00:00, 5584.96lines/s]

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001E0A8FF1310>>, {'<unk>': 0, '<pad>': 1, 'the': 2, '.': 3, 'animal': 4, 'because': 5, 'cross': 6, 'did': 7, 'it': 8, 'not': 9, 'street': 10, 'tired': 11, 'was': 12})





In [20]:
vocab['the']  # access token id from vocab

2

In [21]:
for token in tokens:
    print(vocab[token])

2
4
7
9
6
2
10
5
8
12
11
3


In [22]:
# 2. torch Tensor  
token_ids = [vocab[token] for token in tokens]  # loop for store only ids in list
token_ids = torch.tensor(token_ids, dtype=torch.long)  # create torch tensor
print(f"token_ids: {token_ids}")

token_ids: tensor([ 2,  4,  7,  9,  6,  2, 10,  5,  8, 12, 11,  3])


In [28]:
# 3. Create Embedding
vocab_size = len(vocab) # 12
embedding_dim = 64

embedding = nn.Embedding(vocab_size, embedding_dim)
X = embedding(token_ids)

In [29]:
print(X.shape)


torch.Size([12, 64])


# 2. Linear Transformation

In [27]:
d_model = 64
d_k = d_v = 64

W_q = torch.randn(d_model, d_k)
W_k = torch.randn(d_model, d_k)
W_v = torch.randn(d_model, d_k)

# 3. Compute attention Score

In [30]:
Q = torch.matmul(X, W_q)
K = torch.matmul(X, W_k)
V = torch.matmul(X, W_v)

In [31]:
Q.shape

torch.Size([12, 64])

In [None]:
score = torch.matmul(Q, K.T)  # Q -> ([12, 64]),  K.T -> ([64, 12]), score -> ([12, 12])

In [38]:
score.shape

torch.Size([12, 12])

In [54]:
d_k = torch.tensor(d_k, dtype=torch.float)

  d_k = torch.tensor(d_k, dtype=torch.float)


In [55]:
scores = score / torch.sqrt(d_k)

In [45]:
scores.shape

torch.Size([12, 12])

In [None]:
softmax = nn.Softmax(dim=-1)  # attention is applied across keys dimension.
attn_weight = softmax(scores)

In [57]:
output = torch.matmul(attn_weight, V)

In [60]:
output.shape

torch.Size([12, 64])