# Simplified version

In [113]:
import math
from typing import List

import requests
import torch
import tiktoken

from playground_gpt.embedding import Embedding

In [81]:
# Download the Shakespeare text file as a single string.
# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being silently stored
# as if it were the corpus.
response = requests.get(
    "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt",
    timeout=30,
)
response.raise_for_status()
shakespeare_complete = response.text

In [82]:
# GPT-2 encoding from tiktoken; used to turn text into integer token ids.
tokenizer = tiktoken.get_encoding("gpt2")
# Take the vocabulary size from the tokenizer itself (50257 for "gpt2") so the
# embedding table cannot drift out of sync with the encoding.
# NOTE(review): the meaning of input_size=1 is defined by playground_gpt's
# Embedding class, which is not visible here — confirm.
embedder = Embedding(input_size=1, embed_size=3, vocab_size=tokenizer.n_vocab)

In [83]:
# Short sample sentence used as the running example in the following cells.
text = "Your journey starts with one step"

Here we have the tokens for "Your journey starts with one step":

```
Your        -->    7120
journey     -->    7002
starts      -->    4940
with        -->    351
one         -->    530
step        -->    2239
```

In [84]:
# Encode the sample sentence with the GPT-2 tokenizer: one integer id per token.
token_ids: List[int] = tokenizer.encode(text)
token_ids

[7120, 7002, 4940, 351, 530, 2239]

Here we have the embeddings for those tokens; each one is a vector with `n=3` components (`embed_size=3`):

```
Your      --->    [1.8105, 1.2218, 1.2664]   
journey   --->    [1.3837, 1.3195, 1.8066]   
starts    --->    [1.5646, 0.4626, 1.3061]   
with      --->    [1.2394, 1.4075, 1.4538]   
one       --->    [1.2369, 1.2138, 1.5761]   
step      --->    [1.3719, 1.2501, 1.3292]   
```

In [85]:
# Turn every token id into an embedding vector; the displayed result has one
# row per token and embed_size (3) columns.
# NOTE(review): positional_embed lives in playground_gpt.embedding; presumably it
# combines token and positional embeddings, judging by its name — confirm.
embeddings = embedder.positional_embed(token_ids)
embeddings



tensor([[1.8105, 1.2218, 1.2664],
        [1.3837, 1.3195, 1.8066],
        [1.5646, 0.4626, 1.3061],
        [1.2394, 1.4075, 1.4538],
        [1.2369, 1.2138, 1.5761],
        [1.3719, 1.2501, 1.3292]])

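Now we need to calculate the attention weights. For each token $i$, we take the dot product of its embedding $X_i$ with the embedding of every token in the sequence: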

$$ w_i = [ X_i \cdot X_0, X_i \cdot X_1, \dots , X_i \cdot X_{N-1} ] $$

Stacking the rows $w_i$ gives $\vec{w}$, an `NxN` matrix, where N is the number of input tokens.

We also need to normalize $\vec{w}$, and for that we use the softmax function, defined as:

$$\sigma(\vec{z})_i = \frac{e^{z_i}}{\sum_j e^{z_j}}$$

Softmax is used because it makes every weight positive and makes each row sum to 1, so a row can be read as a probability distribution over the tokens.

In [118]:
def naive_softmax(vector: torch.Tensor) -> torch.Tensor:
    """Compute the softmax of a 1-D tensor straight from its definition.

    softmax(z)_i = exp(z_i) / sum_j exp(z_j)

    The largest entry is subtracted from every score before exponentiating.
    Mathematically the common factor cancels, so the result is unchanged, but
    every exponent is now <= 0 and large scores no longer overflow.

    Args:
        vector: 1-D tensor of scores.

    Returns:
        A tensor of the same length in torch's default floating dtype whose
        entries are non-negative and sum to 1. An empty input yields an empty
        tensor.
    """
    if vector.numel() == 0:
        # max() is undefined on an empty tensor and there is nothing to normalize.
        return torch.empty(0)
    # Shift first (in the input's own dtype), then move to the default float
    # dtype so integer inputs can be exponentiated as well.
    shifted = (vector - vector.max()).to(torch.get_default_dtype())
    exponentials = torch.exp(shifted)
    return exponentials / exponentials.sum()


def calculate_attention_weight(
    query: torch.Tensor, context: torch.Tensor
) -> torch.Tensor:
    """Return the normalized attention weights of one query over a context.

    Each row of ``context`` is scored by its dot product with ``query``; the
    scores are then passed through a softmax so they sum to 1.

    Args:
        query: 1-D embedding of the token doing the attending.
        context: 2-D tensor with one embedding per row.

    Returns:
        1-D tensor with one weight per row of ``context``.
    """
    n_rows = context.shape[0]
    scores = torch.empty(n_rows)
    for position in range(n_rows):
        scores[position] = torch.dot(query, context[position])
    # torch.softmax is used here because it copes better with very
    # large or very small scores.
    return torch.softmax(scores, dim=0)


def self_attention(embeddings: torch.Tensor) -> torch.Tensor:
    """Build the full attention-weight matrix one row at a time.

    Row ``i`` of the result is ``calculate_attention_weight`` applied with
    ``embeddings[i]`` as the query and the whole ``embeddings`` tensor as the
    context.

    Args:
        embeddings: 2-D tensor with one token embedding per row.

    Returns:
        Square tensor with one row and one column per token.
    """
    n_tokens = embeddings.shape[0]
    attention = torch.empty((n_tokens, n_tokens))
    for row in range(n_tokens):
        attention[row] = calculate_attention_weight(embeddings[row], embeddings)
    return attention

In [132]:
def fast_self_attention(embeddings: torch.Tensor) -> torch.Tensor:
    """Compute simplified self-attention weights with a single matrix product.

    The product of the embeddings with their own transpose holds every pairwise
    dot product x_i . x_j, i.e. the same values two nested for loops would
    produce. A softmax over the last dimension then normalizes each row.

    Args:
        embeddings: tensor of shape (N, d), or (..., N, d) with leading batch
            dimensions.

    Returns:
        Tensor of shape (N, N) (or (..., N, N)); every row along the last
        dimension sums to 1.
    """
    # Swapping only the last two dimensions matches `.T` for a 2-D input and
    # additionally works for batched input, where `.T` would reverse all dims.
    scores = embeddings @ embeddings.transpose(-2, -1)
    return torch.softmax(scores, dim=-1)

In [133]:
# Attention weights for the sample sentence: row i holds the softmax-normalized
# dot products of token i with every token, so each row sums to 1.
weights = fast_self_attention(embeddings)
weights

tensor([[0.2571, 0.2652, 0.0685, 0.1455, 0.1334, 0.1303],
        [0.1929, 0.3227, 0.0542, 0.1569, 0.1511, 0.1223],
        [0.2313, 0.2514, 0.1167, 0.1318, 0.1408, 0.1281],
        [0.1994, 0.2957, 0.0535, 0.1676, 0.1519, 0.1320],
        [0.1939, 0.3017, 0.0606, 0.1611, 0.1539, 0.1288],
        [0.2165, 0.2794, 0.0631, 0.1601, 0.1473, 0.1336]])

In [146]:
# Context vectors: each output row is the attention-weighted sum of all token
# embeddings, so the result has the same shape as `embeddings`.
weights @ embeddings

tensor([[1.4637, 1.2253, 1.4892],
        [1.4296, 1.2436, 1.5268],
        [1.4623, 1.1847, 1.4832],
        [1.4304, 1.2437, 1.5150],
        [1.4301, 1.2376, 1.5178],
        [1.4412, 1.2335, 1.5039]])