In [25]:
from importlib.metadata import version

# Report the installed PyTorch build so the printed results below can be reproduced.
torch_version = version("torch")
print("torch version:", torch_version)

torch version: 2.3.0+cu121


In [26]:
import torch
# Toy embeddings for the example sentence: one 3-dimensional row per token.
inputs = torch.tensor(
    [
        [0.43, 0.15, 0.89],  # your
        [0.55, 0.87, 0.66],  # journey
        [0.57, 0.85, 0.64],  # starts
        [0.22, 0.58, 0.33],  # with
        [0.77, 0.25, 0.10],  # one
        [0.05, 0.80, 0.55],  # step
    ]
)

Step 1: Compute unnormalized attention scores

Attention scores between the query x2 and every input token (including x2 itself)

In [27]:
query = inputs[1]  # the 2nd input token is the query

# One unnormalized score per input token: the dot product of that token with the query.
num_tokens = inputs.shape[0]
attn_scores_2 = torch.empty(num_tokens)
for idx in range(num_tokens):
    attn_scores_2[idx] = torch.dot(inputs[idx], query)

print(attn_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


The dot product multiplies two vectors element-wise and sums the resulting products.



In [28]:
# Spell out torch.dot by hand: multiply matching components, then add them up.
res = 0
for x_elem, q_elem in zip(inputs[0], query):
    res += x_elem * q_elem

# Both lines should print the same value.
print(res)
print(torch.dot(inputs[0], query))

tensor(0.9544)
tensor(0.9544)


Step 2: Normalize the unnormalized attention scores so that they sum up to 1


In [29]:
# Simplest normalization: divide every score by the total of all scores.
score_total = attn_scores_2.sum()
attn_weights_2_tmp = attn_scores_2 / score_total

print("Attention Weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())

Attention Weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: tensor(1.0000)


Using softmax for normalization is common and recommended.

In [30]:
def softmax_naive(x):
    """Hand-written softmax over dimension 0 of ``x``.

    Computes ``exp(x) / exp(x).sum(dim=0)`` after shifting ``x`` by its maximum
    along dimension 0. The shift cancels out mathematically, but it keeps
    ``torch.exp`` from overflowing to ``inf`` (which would turn the result
    into ``nan``) when the scores are large.

    Args:
        x: Tensor of unnormalized scores.

    Returns:
        Tensor shaped like ``x`` whose entries along dimension 0 are
        non-negative and sum to 1.
    """
    if x.numel() > 0:  # an empty tensor has no maximum to subtract
        x = x - torch.amax(x, dim=0, keepdim=True)
    exp_x = torch.exp(x)  # computed once, reused for numerator and denominator
    return exp_x / exp_x.sum(dim=0)

# Normalize the scores for query x2 with the hand-written softmax.
attn_weights_naive = softmax_naive(attn_scores_2)
naive_total = attn_weights_naive.sum()
print("Attention weights:", attn_weights_naive)
print("Sum:", naive_total)

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [31]:
# PyTorch's built-in softmax (optimized); dim=0 normalizes across the six scores.
attn_weights_2 = attn_scores_2.softmax(dim=0)
weights_total = attn_weights_2.sum()
print("attention weights:", attn_weights_2)
print("sum:", weights_total)

attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum: tensor(1.)


Compute the context vector

---


Multiply each input token by its attention weight and sum the resulting vectors.





In [32]:
query = inputs[1]  # the 2nd input token is the query

# Build the context vector as a weighted sum of all input tokens,
# printing the running total after each token is added.
context_vector_2 = torch.zeros(query.shape[0])
print(context_vector_2)
for weight, token in zip(attn_weights_2, inputs):
    context_vector_2 += weight * token
    print(context_vector_2)

tensor([0., 0., 0.])
tensor([0.0596, 0.0208, 0.1233])
tensor([0.1904, 0.2277, 0.2803])
tensor([0.3234, 0.4260, 0.4296])
tensor([0.3507, 0.4979, 0.4705])
tensor([0.4340, 0.5250, 0.4813])
tensor([0.4419, 0.6515, 0.5683])


Computing attention weights for all input tokens

In [33]:
# Shape is (number of input tokens, embedding size per token).
inputs.shape

torch.Size([6, 3])

In [34]:
# Pairwise unnormalized attention scores: entry [i, j] is the dot product
# of input token i with input token j.
num_tokens = inputs.shape[0]  # size taken from the data instead of a hard-coded 6
attn_scores = torch.empty(num_tokens, num_tokens)
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


The same result can be computed more efficiently with a matrix multiplication.

In [35]:
# One matrix product yields every pairwise dot product at once:
# row i of `inputs` times column j of its transpose is dot(x_i, x_j).
inputs_transposed = inputs.T
attn_scores = inputs @ inputs_transposed
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [36]:
# Normalize along the last dimension so that each row
# (the scores for one query token) becomes a set of weights.
attn_weights = torch.softmax(attn_scores, dim=-1)
print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [37]:
# Sanity check: the values in each row should sum to 1.
row2_sum = attn_weights[1].sum()
all_row_sums = attn_weights.sum(dim=-1)
print(row2_sum)
print("all rows sum", all_row_sums)

tensor(1.)
all rows sum tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [38]:
# Every context vector at once: row i is the sum of all input tokens,
# each weighted by row i of the attention weights.
all_context_vecs = torch.matmul(attn_weights, inputs)
print(all_context_vecs)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


**Implementing self attention with trainable weights**

---




* the self-attention mechanism used in GPT and other popular LLMs
* also called scaled dot-product attention
* the weight matrices are updated during training
* three trainable weight matrices: Wq (query), Wk (key), Wv (value)
* input and output dimensions are the same in GPT, but here we choose different ones