https://towardsdatascience.com/illustrated-self-attention-2d627e33b20a

https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
np.random.rand(10).reshape(5,2)

array([[0.24595806, 0.80297821],
       [0.91356724, 0.3644435 ],
       [0.85240105, 0.06672613],
       [0.12761768, 0.4752835 ],
       [0.84502346, 0.26101756]])

In [39]:
text = "how are you doing"
print(text)

how are you doing


In [40]:
# Define the tokens
tokens = text.split()
tokens

['how', 'are', 'you', 'doing']

In [51]:
# Define the query vector
query = np.random.randn(len(tokens),len(tokens))
query

array([[ 0.17893869, -0.8460428 ,  0.45129722, -0.41137148],
       [ 1.14494046,  0.53410844, -0.23760124, -1.08492737],
       [-0.12321856,  0.87362529, -0.40952103, -0.42694098],
       [-0.54553616,  0.60786712, -0.84776332,  1.20422583]])

In [52]:
# Define the matrix of keys
keys = np.random.randn(len(tokens), len(tokens))
print(keys.shape)
keys

(4, 4)


array([[ 2.43169513, -0.30909618,  1.25203021, -0.920183  ],
       [-1.84807596,  1.18086692, -0.53522638,  1.48934727],
       [-0.72982072, -0.44740646, -1.32930932, -0.39467154],
       [ 0.10700546,  0.45875577,  2.53498869,  1.12647362]])

In [53]:
# Define the matrix of values
values = np.random.randn(len(tokens), len(tokens))
print(values.shape)
values

(4, 4)


array([[ 0.57437671,  0.90382719,  0.36601466, -1.01685409],
       [-0.76705726, -1.04074081, -0.56039836, -0.28129068],
       [-0.80001236,  0.61584185,  0.45932101, -0.65200065],
       [ 0.93936566, -0.5717334 , -0.2347111 ,  0.18640325]])

In [54]:
# Calculate the attention scores
scores = np.dot(query, keys.T)

scores

array([[ 1.64020774, -2.18397742, -0.18962513,  0.31165462],
       [ 3.31990305, -2.97388912, -0.33052934, -1.45691831],
       [-0.68953308,  0.84267554,  0.41194343, -1.13147334],
       [-3.68400046,  3.9732582 ,  0.77784592, -0.57205458]])

In [55]:
# Apply softmax to the scores to get the attention weights
attention_weights = np.exp(scores) / np.sum(np.exp(scores))
attention_weights

array([[5.33206782e-02, 1.16432213e-03, 8.55478988e-03, 1.41225258e-02],
       [2.86007927e-01, 5.28468200e-04, 7.43045528e-03, 2.40896734e-03],
       [5.18922000e-03, 2.40177228e-02, 1.56123131e-02, 3.33556851e-03],
       [2.59789398e-04, 5.49701096e-01, 2.25100617e-02, 5.83609490e-03]])

In [56]:
# Calculate the attention value
attention_value = np.dot(attention_weights, values)
attention_value

array([[ 0.03615533,  0.044175  ,  0.01947835, -0.05749211],
       [ 0.16018937,  0.26115044,  0.10723449, -0.29537261],
       [-0.02479913, -0.01259841, -0.00517199, -0.02159012],
       [-0.4340291 , -0.56133561, -0.29898696, -0.16847867]])

In [57]:
# Print the attention weights and attention value
print("Attention weights:", attention_weights)
print("Attention value:", attention_value)

Attention weights: [[5.33206782e-02 1.16432213e-03 8.55478988e-03 1.41225258e-02]
 [2.86007927e-01 5.28468200e-04 7.43045528e-03 2.40896734e-03]
 [5.18922000e-03 2.40177228e-02 1.56123131e-02 3.33556851e-03]
 [2.59789398e-04 5.49701096e-01 2.25100617e-02 5.83609490e-03]]
Attention value: [[ 0.03615533  0.044175    0.01947835 -0.05749211]
 [ 0.16018937  0.26115044  0.10723449 -0.29537261]
 [-0.02479913 -0.01259841 -0.00517199 -0.02159012]
 [-0.4340291  -0.56133561 -0.29898696 -0.16847867]]


In [60]:
# %%writefile -a self_attention.py

def self_attention(text):
    import numpy as np

    # Define the tokens
    #tokens = ['how', 'are', 'you', 'doing', 'today']
    tokens = text.split()

    # Define the query vector
    query = np.random.randn(len(tokens),len(tokens))

    # Define the matrix of keys
    keys = np.random.randn(len(tokens), len(tokens))

    # Define the matrix of values
    values = np.random.randn(len(tokens), len(tokens))

    # Calculate the attention scores
    scores = np.dot(query, keys.T)

    # Apply softmax to the scores to get the attention weights
    attention_weights = np.exp(scores) / np.sum(np.exp(scores))

    # Calculate the attention value
    attention_value = np.dot(attention_weights, values)

    # Print the attention weights and attention value
    return {"Attention weights":attention_weights,"Attention value ": attention_value}


In [61]:
self_attention("How are you")

{'Attention weights': array([[0.03466487, 0.16967173, 0.02008044],
        [0.0387055 , 0.07074764, 0.03003003],
        [0.12138279, 0.0065977 , 0.5081193 ]]),
 'Attention value ': array([[-0.07045921,  0.30256486,  0.02967586],
        [-0.04790429,  0.19714705,  0.02780509],
        [-0.43380583,  0.22494098, -0.06037294]])}