In [3]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-11-05 10:48:52--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2024-11-05 10:48:53 (2.41 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [4]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [6]:
# let's look at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [7]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text))) # set removes duplicate items, then convereted to a list, thens orted
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [8]:
stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # encoder takes a string and outputs a list  of integers
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


NOTE: Typically, people use sub-word level tokenizers (SentenceGram from Google and Tiktoken form OpenAI), but to keep it simple, we'll use a character level tokenizer.

In [9]:
# encoding the entire text
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [10]:
# train and test split
n = int(0.9*len(data)) # first 90% will be train
train_data = data[:n]
val_data = data[n:]

In [11]:
# hyperparameters
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

NOTE: We use block_size + 1 above because when we feed it into the transformer, it's always with a length of context + 1. Here's how the first few examples go:
- (18 -> 47)
- (18, 47) -> (56)
- (18, 47, 56) -> (57)
- ...
- (18, 47, 56, 57, 58,  1, 15, 47) -> (58)
- (n number of examples) -> (element of index n + 1))

In [12]:
# sampling data
torch.manual_seed(1337)
batch_size = 4 # how many independent dequences we'll  process in parallel
block_size = 8 # maximum context length for predictions
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y
    
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

# First Baseline: Bigram

In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__() # same constructor as the nn.Module class
        # each token directly reads off the logies for the nest token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self,idx,targets=None):

        # idx and targets are both (B, T) tensors of integers
        logits = self.token_embedding_table(idx) # B, T, C (batch by time by channel, in this case 4, 8, 65)

        if targets is None:
            loss = None
        else:
            # Pytorch only accepts as the input as a B, C, T so we have to rearrange
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)
    
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of inde=ices in the current context
        for _ in range (max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on last time step because it's the 9th element and a prediction of what comes after the 8 characters
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
            

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate( idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


### Training the Bigram

In [14]:
# create a PyTorch optimizer

# in makemore, we always used SDG as the optimizer, but Adam is better going forward
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) # typical good learning rate is 3e-4 but for smaller networks we can get away with a ;arger rate

In [15]:
batch_size = 32
for steps in range(10000):

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.382369041442871


In [16]:
print(decode(m.generate( idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


lso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
BEY:! Indy; by s afreanoo adicererupa anse tecorro llaus a!
OLeneerithesinthengove fal amas trr
TI ar I t, mes, n IUSt my w, fredeeyove
THek' merer, dd
We ntem lud engitheso; cer ize helorowaginte the?
Thak orblyoruldvicee chot, p,
Bealivolde Th li


# Single-Head Self Attention

### The mathematical trick in self-attention

We want to couple the 8 tokes in each batch in a way that allows each token to have a context of its preceding tokens. The easiest and most simple way to do this would be to take the average of each every preceding token, which would act as a feature vector for that token. Obviously, a lot of context would be lost here, but this is the first step to building a self-attention block.

![title](singeHeadAttention.png)

In [17]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [18]:
# We want x[b,t] = mean_{i<=t} x[b,i]

# bag of words model
xbow = torch.zeros(B,T,C)
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # gives us all of the elements from 0 to the T'th element
        xbow[b,t] = torch.mean(xprev,0)


In [19]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [20]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [21]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3)) # returns the bottom left triangular part of a matrix, which allows us to do a running average
a = a / torch.sum(a, 1, keepdim=True) # sums the matrix so that the rows all add up to 1
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=', a)
print("b=", b)
print('c', c)

a= tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b= tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


Notice how the first row is equal, and then every row after that is different - that is because they are vertical averages of the preceding elements. This is a convenient way of doing what we did above. Now we'll implement it below.

In [22]:
# Version 1

xbow = torch.zeros(B,T,C)
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # gives us all of the elements from 0 to the T'th element
        xbow[b,t] = torch.mean(xprev,0)

xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In version 2, each element has an associated weight to it, showing how relevant it is to predict the next token.

In [23]:
# Version 2

wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) --> (B, T, C) - in each element of the batch, a (T, T) @ (T, C) operation is happening, as shown above
torch.allclose(xbow, xbow2)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [24]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # instead of 1s and 0s, we're using 0s and -inf to make the future tokens completely irrelevant
wei = F.softmax(wei, dim=-1) # applying softmax on -inf will just return 0s and take an average of each row
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

The elements from the lower triangular part tell you how relevant each element is for the next token and it increases as you traverse down.

### Explanation of Self-Attention

Each vector has a key and a query. The key is what the token contains and the query is what the token is looking for. To ge the affinities of all the tokens and to understand how significant they are, we take the dot product of the key of any given token and the queries of all previous tokens. This will give us context on how important every other token is to the given token. If a given key and query are specifically aligned, it will have a higher dot product result, which will allow the token to learn more about that other token.

The masked fill is used to block out future tokens while performing dot product and the softmax is used to normale the results, as there weill be some negative affinities, so we exponentiate them to get rid of them.

In addition, we have a value v because rather than aggregating the values in x directly, applying another linear layer on it and storing the values in v acts as a private value for the signle head. Essentially, it stores the dat that it will communicate to previous tokens if it finds them interesting.

**Detailed Explanation**

Values in Self-Attention
In self-attention, values (v) represent the information that each token or node will communicate to other tokens, based on their affinities.
- Think of each token as having private information stored in a vector 'x'.
- For the purposes of self-attention, each token produces three vectors: keys (k), queries (q), and values (v).
- Queries represent what a token is looking for, keys represent what a token contains, and values represent what a token will share.
- Affinities between tokens are calculated by taking the dot product of a query with all the keys.
- These affinities determine how much information from each token's value vector will be aggregated into the output of the self-attention mechanism.
Here's a breakdown of the process:
1. Each token produces keys, queries, and values by applying linear transformations to its private information vector 'x'.
2. The query of each token is dot-producted with the keys of all other tokens, resulting in a matrix of affinities or weights ('way').
3. This 'way' matrix undergoes masking and softmax to create a normalised distribution representing how much information each token should aggregate from others.
4. The values of each token are then aggregated using the weighted sum determined by the normalised 'way' matrix.
Therefore, values (v) play a crucial role in determining the information flow in self-attention. They represent the specific information that each token contributes to the communication process, which ultimately influences the output of the self-attention mechanism.

**Q&A**

Why the Dot Product for Affinities?

The dot product is specifically used to calculate affinities in self-attention due to its ability to represent alignment and similarity between vectors. This mathematical property makes it well-suited for determining how much one token should "pay attention" to another.
While other operations might be mathematically valid, they don't necessarily capture the desired relationship between keys and queries. For example:
- Element-wise multiplication: This would only consider the correspondence between individual elements of the vectors, ignoring the overall relationship between them. It wouldn't reflect the idea of a query searching for information represented by a key.
- Euclidean distance: This measures the distance between two vectors, with a smaller distance indicating greater similarity. However, in self-attention, we're interested in the degree of alignment or matching, not just proximity. A query and a key could be close in Euclidean space without necessarily being aligned in the direction that represents the desired information exchange.

The dot product, on the other hand, directly quantifies the alignment between two vectors. It's sensitive to both the magnitude and direction of the vectors, capturing the idea that a query is seeking specific information represented by the key's direction.
Imagine a simplified scenario where a query seeks information about vowels. This query might be represented by a vector pointing in a specific direction associated with vowel characteristics. Keys from vowel tokens would also have vectors pointing in similar directions.
The dot product between the query and a key from a vowel token would be high due to their strong alignment. Conversely, keys from consonant tokens would have vectors pointing in different directions, resulting in lower dot products with the vowel query.
It's important to remember that the model learns to generate keys and queries that yield meaningful dot products during training. Initially, the linear transformations applied to 'x' to produce keys and queries might be random, but the training process adjusts their weights to capture relevant relationships.
This learning process essentially refines the dot product's ability to represent affinities that align with the desired task, making it a powerful tool for data-dependent information flow in self-attention.


In [47]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# single Head performing self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False) # although x is (4,8,32), applying a linear layer reshapes it to (32,32), since the total # of elements stays the same
k = key(x)
q = query(x)
# (B, T, 16) @ (B, 16, T) ---> (B, T, T)
wei = q @ k.transpose(-2, -1) # flipping around the last two dimensions

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # instead of 1s and 0s, we're using 0s and -inf to make the future tokens completely irrelevant
wei = F.softmax(wei, dim=-1) # applying softmax on -inf will just return 0s and take an average of each row


v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [28]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [29]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

Few notes about transformers:

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [48]:
k = key(x)
q = query(x)
# (B, T, 16) @ (B, 16, T) ---> (B, T, T)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

NOTE: above, we have to multiply wei by 1/sqrt(head_size) to normalize the values because if we don't, the values in wei would become either very positive or very negative and this would cause the softmax to turn the values into essentially one-hot vectors since the softmax would sharpen its outputs.

# Multi-Head Attention

Performing multiple single head attentions in parallel and concatenating them. **Check VSCode for the code.**

![title](multiHeadAttention.png)

# Feedforward Layer

We add a normal linear layer, followed by a non linearity to perform the computation. The attention mechanisms allow the tokens to communicate and assign relative weights for how important they are, but the linear layer is needed to allow them to train on that data and comptute values. 

<!-- ![title](transformer.webp) -->

# Attention Blocks

 We'll now implement the attention blocks as shown in the diagram below to speed up the computation.

<div>
<img src="transformer.webp" width="500"/>
</div>

### Residual Connections

The arrows in the diagram above, skipping the multi head attention blocks and directly going to the add & norm blocks are called residual connections. These essentially act as shown in the diagram below, where the gradients of the output initially channel all the way back to the input, since addition propogates the gradients equally. The blocks on the side aren't intially there, but they slowly start to build gradients over time, which helps with optimization.
![title](residualConnections.png)

### Layer Normalization

Same as batch norm, but across the layer, instead of the batch. Therefore, we delete the buffers and the training conditions.

NOTE: In the diagram, addition and normalization are performed AFTER the multi-headed attention block, but in recent years, its become more common to use layernorm right before the attention block.

In [49]:
class BatchNorm1d: # modeled after torch.nn.BatchNorm1d(num_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True, device=None, dtype=None)
    def __init__(self, dim, eps=1e-5, momentum=0.1): # 

        # saving the values
        self.eps = eps
        self.gamma = torch.ones(dim) # gain 
        self.beta = torch.zeros(dim) # bias

    def __call__(self, x):
        # calculate the fwd pass
        xmean = x.mean(1, keepdim=True) # batch mean
        xvar = x.var(1, keepdim=True, unbiased=True) # batch variance
        # change the dimensions above in xmean and xvar from 0 to 1

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance (this is the formula used in PyTorch)
        self.out = self.gamma * xhat + self.beta 
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
x.shape

torch.Size([32, 100])

# Final Notes

<div>
<img src="transformer.webp" width="500"/>
</div>

### Encoder vs. Decoder:
If you observe the image above, there's a block to the left of the block we just implemented, called the encoder block. What we implemented right now is the decoder block. The encoder block is used only when there's tasks related to translation involved - it encodes the tokens from one language into vectors and the decoder block decodes those vectors in another language.

The arrow connecting the two blocks is used to perform **cross-attention**. What makes the decoder a decoder is the triangular mask, which is used for language modeling. The encoder has a transformer applied over the text, but without a triangular mask, which allow all of the tokens to talk to each other as much as they want. This context is then fed into the decoder. The decoder is still trained over whatever text we want it to train on to output coherent text, but it has added context with the embeddings from the other language.

To put in transformer terms, the queries are still coming from x, but there's additional keys and values are coming from the encoder output, feeding into every single block of the decoder. 

-----------------------------------------------------------

# EXERCISES:

1. The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention` into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).

### Don't understand fully come back to later

In [58]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C,N = 4,8,32,3
x = torch.randn(B,T,C,N)

# single Head performing self-attention
head_size = 16
key = nn.Linear(C*N, head_size*3, bias=False)
query = nn.Linear(C*N, head_size*3, bias=False)
value = nn.Linear(C*N, head_size*3, bias=False) # although x is (4,8,32), applying a linear layer reshapes it to (32,32), since the total # of elements stays the same
k = key(x.view(32,96))
q = query(x.view(32,96))
# (B, T, 48) @ (B, 48, T) ---> (B, T, T)
wei = q @ k.transpose(-2, -1) # flipping around the last two dimensions

tril = torch.tril(torch.ones(32, 32))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # instead of 1s and 0s, we're using 0s and -inf to make the future tokens completely irrelevant
wei = F.softmax(wei, dim=-1) # applying softmax on -inf will just return 0s and take an average of each row


v = value(x.view(32,96))
out = wei @ v

out.shape

torch.Size([32, 48])

In [66]:
# version 4: self-attention!

# C
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# single Head performing self-attention
n_head = 4
# head_size = 16
key = nn.Linear(C, head_size, bias=False) # -> 32, 16
query = nn.Linear(C, head_size, bias=False) # -> 32, 16
value = nn.Linear(C, head_size, bias=False) # although x is (4,8,32), applying a linear layer reshapes it to (32,32), since the total # of elements stays the same
k = key(x) # -> (4, 8, 16)
q = query(x) # -> (4, 8, 16)
print(q.shape)
k = k.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
o90q = q.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
# (B, T, 16) @ (B, 16, T) ---> (B, T, T)
wei = q @ k.transpose(-2, -1) # flipping around the last two dimensions

tril = torch.tril(torch.ones(n_head, C // n_head))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # instead of 1s and 0s, we're using 0s and -inf to make the future tokens completely irrelevant
wei = F.softmax(wei, dim=-1) # applying softmax on -inf will just return 0s and take an average of each row


v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])


RuntimeError: shape '[4, 8, 4, 8]' is invalid for input of size 512

In [59]:
wei

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.1871e-03, 9.9881e-01, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.1622e-01, 1.2857e-01, 7.5521e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [9.0579e-02, 1.9958e-03, 2.9385e-04,  ..., 9.5752e-04, 0.0000e+00,
         0.0000e+00],
        [1.5396e-03, 1.3047e-03, 4.8368e-03,  ..., 9.5784e-02, 1.7049e-03,
         0.0000e+00],
        [2.5644e-03, 1.9540e-05, 4.1811e-03,  ..., 3.7376e-05, 1.7229e-03,
         3.0560e-05]], grad_fn=<SoftmaxBackward0>)