<a href="https://colab.research.google.com/github/OussamaHaff/machine-learning-upskilling/blob/main/02-llms-from-scratch/05-self-attention/simple_self_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Self Attention Without Weights

## Input Embeddings

In [3]:
import torch

# Toy 3-dimensional token embeddings, one row per word of the sentence
# "Your journey starts with one step".
embedding_rows = [
    [0.43, 0.15, 0.89],  # Your
    [0.55, 0.87, 0.66],  # journey
    [0.57, 0.85, 0.64],  # starts
    [0.22, 0.58, 0.33],  # with
    [0.77, 0.25, 0.10],  # one
    [0.05, 0.80, 0.55],  # step
]
inputs = torch.tensor(embedding_rows)

print(inputs)

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])


## Calculating Attention Scores

The attention score between the query token and each input token is the dot product of their embeddings.




In [11]:
# [0.43, 0.15, 0.89]  [0.55, 0.87, 0.66]  [0.57, 0.85, 0.64]  [0.22, 0.58, 0.33]  [0.77, 0.25, 0.10]  [0.05, 0.80, 0.55]
#       Your           >>> journey <<<            starts              with               one              step
#                           query

# Query token: the embedding of "journey" (second row of `inputs`).
journey_embedding_query_token = inputs[1]
print("Query token embeddings:\n", journey_embedding_query_token)

# Un-normalised attention scores for the query: one dot product between the
# query embedding and each input embedding (the query itself included).
# Stacking the scalar results avoids pre-allocating an uninitialised buffer.
journey_attention_scores = torch.stack(
  [torch.dot(x_i, journey_embedding_query_token) for x_i in inputs]
)

print("Self Attention scores for query:\n", journey_attention_scores)



Query token embeddings:
 tensor([0.5500, 0.8700, 0.6600])
Self Attention scores for query:
 tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


## Attention Scores Normalisation

### Basic Normalisation

In [13]:
# [0.43, 0.15, 0.89]  [0.55, 0.87, 0.66]  [0.57, 0.85, 0.64]  [0.22, 0.58, 0.33]  [0.77, 0.25, 0.10]  [0.05, 0.80, 0.55]
#       Your           >>> journey <<<            starts              with               one              step
#                           query
#
# Attention Scores W_2T
#       0.9544             1.4950                 1.4754             0.8434             0.7070              1.0865

# Simplest normalisation: divide each score by the total of all scores so the
# resulting weights sum to 1.
journey_attention_weights = journey_attention_scores / journey_attention_scores.sum()
print("Normalised self attention weights:\n", journey_attention_weights)
print("Normalised self attention weights sum:\n", journey_attention_weights.sum())

Normalised self attention weights:
 tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Normalised self attention weights sum:
 tensor(1.0000)


### Softmax Normalisation - Naive

In [18]:
# [0.43, 0.15, 0.89]  [0.55, 0.87, 0.66]  [0.57, 0.85, 0.64]  [0.22, 0.58, 0.33]  [0.77, 0.25, 0.10]  [0.05, 0.80, 0.55]
#       Your           >>> journey <<<            starts              with               one              step
#                           query
#
# Attention Scores W_2T
#       0.9544             1.4950                 1.4754             0.8434             0.7070              1.0865
#
# Normalised Weights
#       0.1455             0.2278                  0.2249            0.1285             0.1077              0.1656

def softmax_naive(x, dim=0):
  """Textbook softmax: exp(x) divided by the sum of exp(x) along `dim`.

  Args:
    x: Tensor of scores.
    dim: Dimension along which the outputs sum to 1. Defaults to 0, which
      matches the original single-argument behaviour.

  Returns:
    Tensor of the same shape as `x` whose entries along `dim` sum to 1.

  Note:
    This is intentionally the naive formulation: no maximum is subtracted
    before exponentiation, so very large scores can overflow in `torch.exp`.
    Prefer `torch.softmax` outside of demonstrations.
  """
  exp_x = torch.exp(x)  # computed once and reused for numerator and denominator
  # keepdim=True keeps the reduced axis so the division broadcasts for any `dim`.
  return exp_x / exp_x.sum(dim=dim, keepdim=True)

# Normalise the raw scores with the hand-written softmax helper.
journey_attention_weights_sfmx_nv = softmax_naive(journey_attention_scores)

print("Softmax Normalisation - Naive")
print("Normalised self attention weights:\n", journey_attention_weights_sfmx_nv)
print("Normalised self attention weights sum:\n", journey_attention_weights_sfmx_nv.sum())

Softmax Normalisation - Naive
Normalised self attention weights:
 tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Normalised self attention weights sum:
 tensor(1.)


### Softmax Normalisation - PyTorch (best)

In [20]:
# [0.43, 0.15, 0.89]  [0.55, 0.87, 0.66]  [0.57, 0.85, 0.64]  [0.22, 0.58, 0.33]  [0.77, 0.25, 0.10]  [0.05, 0.80, 0.55]
#       Your           >>> journey <<<            starts              with               one              step
#                           query
#
# Attention Scores W_2T
#       0.9544             1.4950                 1.4754             0.8434             0.7070              1.0865
#
# Normalised Weights
#       0.1455             0.2278                  0.2249            0.1285             0.1077             0.1656
#
# Normalised Weights - Softmax Naive
#       0.1385             0.2379                  0.2333            0.1240             0.1082            0.1581


# Normalise the raw scores with PyTorch's built-in softmax along dimension 0.
journey_attention_weights_sfmx = torch.softmax(journey_attention_scores, dim=0)
print("Softmax Normalisation - PyTorch")
print("Normalised self attention weights:\n", journey_attention_weights_sfmx)
print("Normalised self attention weights sum:\n", journey_attention_weights_sfmx.sum())

Softmax Normalisation - PyTorch
Normalised self attention weights:
 tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Normalised self attention weights sum:
 tensor(1.)


## Calculating Context Vector

In [22]:
# [0.43, 0.15, 0.89]  [0.55, 0.87, 0.66]  [0.57, 0.85, 0.64]  [0.22, 0.58, 0.33]  [0.77, 0.25, 0.10]  [0.05, 0.80, 0.55]
#       Your           >>> journey <<<            starts              with               one              step
#                           query
#
# Normalised Weights - Softmax PyTorch
#       0.1385              0.2379                  0.2333            0.1240            0.1082            0.1581


# Context vector for the query token "journey": the sum of every input
# embedding scaled by its softmax attention weight.
journey_embedding_query_token = inputs[1]
context_vector = torch.zeros(journey_embedding_query_token.shape)
for attention_weight, token_embedding in zip(journey_attention_weights_sfmx, inputs):
  context_vector += attention_weight * token_embedding

print(context_vector)

tensor([0.4419, 0.6515, 0.5683])
