# 1. Tokenize

In [1]:
import re
import numpy as np
from gensim.models import Word2Vec

text = "The animal did not cross the street because it was tired."
tokens = re.findall(r"\w+", text.lower())


In [2]:
tokens[0]

'the'

# 2. Word Embeddings (Text--> Vectors)

In [3]:
sentences = [tokens]

word2vec_model = Word2Vec(
    sentences,
    vector_size=64,
    window=3,
    min_count=1,
    sg=1
)

X = np.array([word2vec_model.wv[word] for word in tokens])  # (seq_len, d_model)

In [4]:
X.shape         # (seq_len, d_model)

(11, 64)

In [5]:
X[0]  # embedding for 0th position "the"

array([-8.37855041e-04,  3.69424000e-04,  7.97398388e-03,  1.40769891e-02,
       -1.45358592e-02, -1.11200139e-02,  1.00919884e-02,  1.40202940e-02,
       -7.83660635e-03, -5.88026829e-03,  1.15320385e-02, -2.39604898e-03,
       -7.08845817e-03,  1.02407057e-02, -7.59400055e-03, -2.83752754e-03,
        4.49465588e-03,  1.54980272e-03, -1.29456483e-02, -1.47637781e-02,
        1.14246346e-02,  7.92228431e-03,  1.05588958e-02,  1.19197741e-03,
        9.92326625e-03, -5.32088429e-03, -1.47875212e-03,  9.01339576e-03,
       -1.17525589e-02, -6.15016185e-03, -1.17368475e-02, -1.45319104e-03,
        1.49033107e-02, -1.14361979e-02, -3.64651345e-03, -3.02772038e-03,
        1.26209948e-02, -9.26702470e-03,  7.05663115e-05, -7.42770918e-03,
       -1.50055476e-02,  7.82389566e-03, -1.36868525e-02, -6.86222687e-03,
       -5.48437238e-05, -4.62783501e-04, -1.19706877e-02,  1.50230359e-02,
        7.78446533e-03,  1.44267865e-02, -1.27467457e-02,  7.02468492e-03,
       -6.46418147e-03,  

# 3. Positional Encoding

In [6]:
def positional_encoding(seq_len, d_model):
    PE = np.zeros((seq_len, d_model))
    for pos in range(seq_len):
        for i in range(0, d_model, 2):
            PE[pos, i] = np.sin(pos / (10000 ** (i/d_model)))
            PE[pos, i+1] = np.cos(pos / (10000 ** (i/d_model)))
    return PE


In [7]:
X[0]

array([-8.37855041e-04,  3.69424000e-04,  7.97398388e-03,  1.40769891e-02,
       -1.45358592e-02, -1.11200139e-02,  1.00919884e-02,  1.40202940e-02,
       -7.83660635e-03, -5.88026829e-03,  1.15320385e-02, -2.39604898e-03,
       -7.08845817e-03,  1.02407057e-02, -7.59400055e-03, -2.83752754e-03,
        4.49465588e-03,  1.54980272e-03, -1.29456483e-02, -1.47637781e-02,
        1.14246346e-02,  7.92228431e-03,  1.05588958e-02,  1.19197741e-03,
        9.92326625e-03, -5.32088429e-03, -1.47875212e-03,  9.01339576e-03,
       -1.17525589e-02, -6.15016185e-03, -1.17368475e-02, -1.45319104e-03,
        1.49033107e-02, -1.14361979e-02, -3.64651345e-03, -3.02772038e-03,
        1.26209948e-02, -9.26702470e-03,  7.05663115e-05, -7.42770918e-03,
       -1.50055476e-02,  7.82389566e-03, -1.36868525e-02, -6.86222687e-03,
       -5.48437238e-05, -4.62783501e-04, -1.19706877e-02,  1.50230359e-02,
        7.78446533e-03,  1.44267865e-02, -1.27467457e-02,  7.02468492e-03,
       -6.46418147e-03,  

# 4. X = Word Embedding + Positional encoding

In [8]:
X = X + positional_encoding(len(tokens), 64)

In [9]:
X.shape

(11, 64)

# 5. Linear Transformation

In [None]:
d_model = 64   
d_k = d_v = 64

W_Q = np.random.randn(d_model, d_k) / np.sqrt(d_model)
W_K = np.random.randn(d_model, d_k) / np.sqrt(d_model)
W_V = np.random.randn(d_model, d_v) / np.sqrt(d_model)


In [11]:
W_Q[0]

array([ 0.08114857, -0.10132557,  0.11255051, -0.05111816,  0.04884283,
        0.05232499, -0.02108134,  0.29522415,  0.01974165, -0.15762554,
        0.10042332, -0.1210086 , -0.08069123, -0.00272692,  0.04663462,
       -0.08285211, -0.01796971,  0.09060436,  0.00491947,  0.11533925,
        0.11332528,  0.02083777,  0.0721533 ,  0.05267379,  0.17753561,
       -0.01031737, -0.05902668,  0.14941398,  0.02299105, -0.05141859,
        0.05353298, -0.18154463,  0.13640388,  0.00902625, -0.03425967,
       -0.04496862, -0.35514213, -0.348364  ,  0.02640592, -0.0341356 ,
       -0.0608589 , -0.2023079 , -0.19564655,  0.12100009,  0.18389604,
       -0.03478337, -0.05298018,  0.0747127 ,  0.07380973,  0.1290488 ,
       -0.05146612, -0.01735571, -0.00059557, -0.07908442,  0.01409753,
       -0.15366008, -0.16403711,  0.03634654, -0.02057443,  0.25937149,
        0.36932576,  0.070443  ,  0.05920465,  0.04002644])

In [12]:
Q = np.matmul(X, W_Q)
K = np.matmul(X, W_K)
V = np.matmul(X, W_V)

In [13]:
Q.shape

(11, 64)

# Compute Attention Score

In [14]:
score = np.matmul(Q, np.transpose(K))
print(score.shape)

(11, 11)


In [15]:
d_k = Q.shape[1]
scores = score / np.sqrt(d_k)    # Scaling 

In [16]:
scores.shape

(11, 11)

In [17]:
def softmax(x):
    e_x = np.exp(x-np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)

attention_weight = softmax(scores)

In [18]:
attention_weight

array([[0.10093945, 0.11481339, 0.10491636, 0.08981258, 0.0800526 ,
        0.08109951, 0.09032111, 0.09479318, 0.09019084, 0.08036821,
        0.07269278],
       [0.10214885, 0.1204062 , 0.10918833, 0.09069054, 0.0779121 ,
        0.07769066, 0.08709022, 0.09273594, 0.08960213, 0.07997425,
        0.07256077],
       [0.10373144, 0.12318574, 0.11106848, 0.09179593, 0.07788926,
        0.076627  , 0.08551132, 0.09103374, 0.08843543, 0.07896008,
        0.07176157],
       [0.10269771, 0.11827235, 0.10699439, 0.09161986, 0.08045604,
        0.07934897, 0.0870246 , 0.09090931, 0.08798895, 0.08004736,
        0.07464045],
       [0.10191114, 0.11073654, 0.10037418, 0.09028457, 0.08370068,
        0.08342478, 0.08935055, 0.09101217, 0.08745431, 0.08201485,
        0.07973624],
       [0.10031372, 0.10569569, 0.09722418, 0.09090369, 0.08705194,
        0.08622907, 0.08918346, 0.08873778, 0.08539765, 0.08367095,
        0.08559188],
       [0.10044133, 0.10813684, 0.10151192, 0.09521589, 0.

In [19]:
output = np.matmul(attention_weight, V)
print(output.shape)


(11, 64)


In [20]:
idx = tokens.index("it")

print(f"'it' idx number in sentence: {idx},\n attention weight of 'it': {attention_weight[idx]}")

'it' idx number in sentence: 8,
 attention weight of 'it': [0.11105224 0.13400847 0.12875428 0.1118805  0.08962231 0.07484336
 0.06904648 0.06776598 0.06874615 0.07114346 0.07313677]


In [21]:
idx_it = tokens.index("it")
idx_animal = tokens.index("animal")

row = attention_weight[idx_it]

# sort attention scores
sorted_indices = np.argsort(row)[::-1]

for i in sorted_indices[:5]:
    print(tokens[i], row[i])


animal 0.1340084731350856
did 0.12875428328802235
not 0.11188049529675209
the 0.11105223551680474
cross 0.08962231051666668
