- EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention` into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).
- EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does your Transformer learn to add? Once you have this, swole doge project: build a calculator clone in GPT, for all of +-*/. Not an easy problem. You may need Chain of Thought traces.)
- EX3: Find a dataset that is very large, so large that you can't see a gap between train and val loss. Pretrain the transformer on this data, then initialize with that model and finetune it on tiny shakespeare with a smaller number of steps and lower learning rate. Can you obtain a lower validation loss by the use of pretraining?
- EX4: Read some transformer papers and implement one additional feature or change that people seem to use. Does it improve the performance of your GPT?

In [17]:
import numpy as np

In [18]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [41]:
import mlp_template as funcs

#consts
batch_size = 64

embedding_size = 384
block_size = 256

attention_heads_num = 6
blocks_num = 6


max_iters = 20000
dropout = 0.2
lr = 3e-4

vocab_size = 65
device = "cpu"


eval_iters = 200
eval_interval = 500



In [42]:
with open('tiny_shackespear.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [43]:
print(len(text))

1115394


In [44]:
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [45]:
chars = sorted(list(set(text)))
print(chars)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [46]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

char_to_idx = {char:idx for idx, char in enumerate(chars)}
idx_to_char = {idx:char for idx, char in enumerate(chars)}
encode = lambda str: [char_to_idx[char] for char in str]
decode = lambda indxses: [idx_to_char[idx] for idx in indxses]

In [47]:
torch.manual_seed(1337)

batch_size = 4
block_size = 8

In [48]:
def get_batch(data):
    start_idxs = torch.randint(len(data) - block_size - 1, (batch_size, ))
    batch = torch.stack([data[idx:idx + block_size] for idx in start_idxs])
    
    val_next = torch.tensor([data[idx+block_size] for idx in start_idxs])
    batch_next = torch.cat((batch[:, 1:block_size], val_next.view((batch_size, 1))), dim = 1)
    batch, batch_next = batch.to(device), batch_next.to(device)
    return batch, batch_next


In [49]:
@torch.no_grad()
def estimate_loss(dataset: dict):
    out = {}
    model.eval()
    for key, data in dataset.items():
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x_batch, y_batch = get_batch(data)
            logits, loss = model(x_batch, y_batch)
            losses[k] = loss.item()
        out[key] = losses.mean()
    model.train()
    return out

In [50]:
class AttentionHead(nn.Module):
    """one head of self attention"""
    
    def __init__(self, head_size, embedding_size, block_size):
        super().__init__()
        self.key = nn.Linear(embedding_size, head_size, bias = False)
        self.query = nn.Linear(embedding_size,  head_size, bias = False)
        self.value = nn.Linear(embedding_size, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        
        wei = q @ k.transpose(-2, -1) * (C**(-0.5)) #variation normalization
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf')) #(B, T, T), so why :T?
        wei = F.softmax(wei, dim = -1)
        wei = self.dropout(wei)
        
        #what is aggregate the value?
        v = self.value(x) 
        out = wei @ v
        return out
    

In [51]:

class MultiHeadAttention(nn.Module):
    """multile heads of self_attention in parallel"""
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        #better to send a value
        self.heads = nn.ModuleList(AttentionHead(head_size, embedding_size, block_size) for _ in range(num_heads))
        #how does this proj and skip connections works?
        self.proj = nn.Linear(embedding_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.dropout(out)
        out = self.proj(out)
        return out


In [52]:

class FeedForward(nn.Module):
    """linear + reLu"""
    
    def __init__(self, embedding_size):
        super().__init__()
        self.layer = nn.Sequential(
            #giving ability to think on gathered data
            #why just cat multiheaded attention
            #so how much this 4 * brings descibe power?
            #TODO: define 4 sm else
            nn.Linear(embedding_size, 4 * embedding_size),
            nn.ReLU(),
            #projection layer, going back into residual pathway???
            nn.Linear(4 * embedding_size, embedding_size),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.layer(x)


In [53]:
class Block(nn.Module):
    """Transformer block: communication then computation"""#??
    
    def __init__(self, embedding_size, heads_num):
        super().__init__()
        head_size = embedding_size // heads_num
        self.sa = MultiHeadAttention(heads_num, head_size)
        self.ffwd = FeedForward(embedding_size)
        self.ln1 = nn.LayerNorm(embedding_size)
        self.ln2 = nn.LayerNorm(embedding_size)
    
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
    


In [54]:

class TransformerModel(nn.Module):
    
    def __init__(self, vocab_size, block_size, embedding_size):
        super().__init__()
        
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_size)
        self.position_embedding_table = nn.Embedding(block_size, embedding_size)
        self.blocks = nn.Sequential(
            *[Block(embedding_size, heads_num=4) for _ in range(blocks_num)]
        )
        self.ln_f = nn.LayerNorm(embedding_size)
        self.sa_head = MultiHeadAttention(attention_heads_num, embedding_size//attention_heads_num)
        self.ffwd = FeedForward(embedding_size)
        self.lm_head = nn.Linear(embedding_size, vocab_size)#lm_head makes logits
    
    def forward(self, idx, targets = None):
        B, T = idx.shape 
        
        token_embeddings = self.token_embedding_table(idx)
        #what stuff is here
        positional_embedding = self.position_embedding_table(torch.arange(T, device = device))
        x = token_embeddings + positional_embedding
        x = self.blocks(x)
        x = self.ln_f(x)
        
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    

In [55]:

#work out this     
def generate(self, idx, max_new_tokens):
# idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
    
        # get the predictions
        logits, loss = self(idx[:, -block_size:]) #crop the content?
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

In [62]:
device = "cpu"

In [63]:
model = TransformerModel(vocab_size, block_size, embedding_size)
model.to(device)

TransformerModel(
  (token_embedding_table): Embedding(65, 384)
  (position_embedding_table): Embedding(8, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0): AttentionHead(
            (key): Linear(in_features=384, out_features=96, bias=False)
            (query): Linear(in_features=384, out_features=96, bias=False)
            (value): Linear(in_features=384, out_features=96, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (1): AttentionHead(
            (key): Linear(in_features=384, out_features=96, bias=False)
            (query): Linear(in_features=384, out_features=96, bias=False)
            (value): Linear(in_features=384, out_features=96, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (2): AttentionHead(
            (key): Linear(in_features=384, out_features=96, bias=False)
            (query): Linear(in_features=384, out_featur

In [82]:
embedding_size, block_size, batch_size, vocab_size

(384, 8, 4, 65)

In [65]:
data = torch.tensor(encode(text), dtype = torch.long) 
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [66]:
x_batch, y_batch = get_batch(train_data)

In [67]:
logits, loss = model(x_batch, targets = y_batch)

In [69]:
logits.shape

torch.Size([32, 65])

In [86]:
loss

tensor(4.6007, grad_fn=<NllLossBackward0>)

In [73]:
idx = x_batch
B, T = idx.shape

In [85]:
idx.shape

torch.Size([4, 8])

In [74]:
positional_embedding = model.position_embedding_table(torch.arange(T, device = device))
positional_embedding.shape

torch.Size([8, 384])

In [77]:
token_embeddings = model.token_embedding_table(idx)
token_embeddings.shape

torch.Size([4, 8, 384])

In [81]:
blocks_num

6

In [78]:
x = token_embeddings + positional_embedding
x = model.blocks(x)

In [79]:
x.shape

torch.Size([4, 8, 384])

In [80]:
x = model.ln_f(x)
x.shape

torch.Size([4, 8, 384])

In [84]:
x = model.lm_head(x)
x.shape

torch.Size([4, 8, 65])

In [75]:
x_batch.shape

torch.Size([4, 8])

In [57]:
torch.manual_seed(1337)

batch_size = 4
block_size = 8

def get_batch(data):
    start_idxs = torch.randint(len(data) - block_size - 1, (batch_size, ))
    batch = torch.stack([data[idx:idx + block_size] for idx in start_idxs])
    
    val_next = torch.tensor([data[idx+block_size] for idx in start_idxs])
    batch_next = torch.cat((batch[:, 1:block_size], val_next.view((batch_size, 1))), dim = 1)
    return batch, batch_next
    
    

In [58]:
vocab_size = len(chars)
block_size = 8
embedding_dim = 10

In [60]:
x_batch, y_batch = get_batch(train_data)

NameError: name 'train_data' is not defined

In [None]:
#consts
embedding_size = 32
block_size = 8
attention_heads_num = 4

device = "cpu"

In [None]:
class AttentionHead(nn.Module):
    """one head of self attention"""
    
    def __init__(self, head_size, embedding_size, block_size):
        super().__init__()
        self.key = nn.Linear(embedding_size, head_size, bias = False)
        self.query = nn.Linear(embedding_size,  head_size, bias = False)
        self.value = nn.Linear(embedding_size, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        
        wei = q @ k.transpose(-2, -1) * (C**(-0.5)) #variation normalization, key - query table seek
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf')) #(B, T, T), so why :T?
        wei = F.softmax(wei, dim = -1)
        
        #what is aggregate the value?
        v = self.value(x) 
        out = wei @ v
        return out
    
class MultiHeadAttention(nn.Module):
    """multile heads of self_attention in parallel"""
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        #better to send a value
        self.heads = nn.ModuleList(AttentionHead(head_size, embedding_size, block_size) for _ in range(num_heads))

    def forward(self, x):
        return torch.cat([h[x] for h in self.heads], dim=-1)


In [None]:
class BatchNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        
        #params
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        
        #buffers (running momentum)
        #No grad!
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)
    
    def __call__(self, x):
        dim = 1
                    
        x_mean = x.mean(dim, keepdim=True)
        x_var = x.var(dim, keepdim=True)
        
        self.out =  self.gamma * ((x - x_mean) / torch.sqrt(x_var + self.eps)) + self.beta
        
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

In [None]:
torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
x.shape

torch.Size([32, 100])

In [None]:
x[:, 0].mean(), x[:, 0].std()

(tensor(0.1469), tensor(0.8803))

In [None]:
x[0, :].mean(), x[0, :].std()

(tensor(-9.5367e-09), tensor(1.0000))

#### Model bitewise

In [None]:
model = BigramModel(vocab_size, block_size, embedding_size)


In [None]:
model = model.to(device)

In [None]:
x_batch, y_batch = get_batch(train_data)

In [None]:
x_batch.shape

torch.Size([4, 8])

In [None]:
B, T = x_batch.shape
token_embeddings = model.token_embedding_table(x_batch)
positional_embedding = model.position_embedding_table(torch.arange(T, device = device))
x = token_embeddings + positional_embedding

In [None]:
x_batch.shape, y_batch.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [None]:
model(x_batch, targets = y_batch)

TypeError: 'AttentionHead' object is not subscriptable

In [None]:
x_batch.shape

torch.Size([4, 8])

In [None]:
token_embeddings.shape

torch.Size([4, 8, 32])

In [None]:
B, T

(4, 8)

In [None]:
positional_embedding.shape

torch.Size([8, 32])

In [None]:
x.shape

torch.Size([4, 8, 32])

In [None]:
multiple_head = [AttentionHead(embedding_size//4, embedding_size, block_size) for _ in range(4)]

In [None]:
attention_head = AttentionHead(embedding_size//4, embedding_size, block_size)

In [None]:
multiple_head[0]

AttentionHead(
  (key): Linear(in_features=32, out_features=8, bias=False)
  (query): Linear(in_features=32, out_features=8, bias=False)
  (value): Linear(in_features=32, out_features=8, bias=False)
)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
batch_size = 32
for steps in range(10000):
    x_batch, y_batch = get_batch(train_data)
    
    logits, loss = model(x_batch, y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

TypeError: 'AttentionHead' object is not subscriptable

In [None]:
data_dict = {
    'train': 1,
    'eval':2
}
for d, key in data_dict.items():
    print("d: ", d, "key", key)
    

d:  train key 1
d:  eval key 2


In [None]:
estimate_loss({'train': train_data, 'val':val_data})

{'train': tensor(2.5107), 'val': tensor(2.5186)}

In [None]:
@torch.no_grad()
def estimate_loss(dataset: dict):
    out = {}
    model.eval()
    for key, data in dataset.items():
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x_batch, y_batch = get_batch(data)
            logits, loss = model(x_batch, y_batch)
            losses[k] = loss.item()
        out[data] = losses.mean()
    model.train()
    return out

### Class bitesize

In [None]:
print(''.join(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist())))

IndexError: index out of range in self

In [None]:
idx = torch.zeros((1, 1), dtype=torch.long)

In [None]:
torch.cat((idx, idx+1), dim=1)[:,-1, :]

IndexError: too many indices for tensor of dimension 2

In [None]:
print(model.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100))

TypeError: 'AttentionHead' object is not subscriptable

In [None]:
logits = token_embedding_table(torch.zeros((1, 1), dtype=torch.long))
B, T, C = logits.shape
print("B:",B,"T:",T,"C:",C)
logits = logits.view(B*T, C)
logits.shape

B: 1 T: 1 C: 65


torch.Size([1, 65])

In [None]:
logits, loss = (idx)
logits = logits[:, -1, :]
probs = F.softmax(logits, dim=1)
idx_next = torch.multinomial(probs, num_samples=1)
idx = torch.cat((idx, idx_next), dim=1)

In [None]:
logits[:, -1, :]

IndexError: too many indices for tensor of dimension 2

In [None]:
token_embedding_table = nn.Embedding(vocab_size, vocab_size)

In [None]:
token_embedding_table(x_batch).shape

torch.Size([4, 8, 65])

In [None]:
loss.shape

torch.Size([])

In [None]:
x_batch

tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])

In [None]:
y_batch

tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])

In [None]:
vocab_size

65

In [None]:
B,T,C = logits.shape

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
model = BigramLanguageModel(vocab_size)
logits, loss = model(x_batch, y_batch)
print(logits.shape)#4*8
print(loss)

B: 4 T: 8 C: 65
torch.Size([32, 65])
tensor(4.8948, grad_fn=<NllLossBackward0>)


In [None]:
a = torch.tensor([[1,-1], [2, -2], [3, -3], [4,-4]])
a.shape

torch.Size([4, 2])

In [None]:
torch.zeros((1, 1), dtype=torch.long)

tensor([[0]])

In [None]:
a[:, -1]

tensor([-1, -2, -3, -4])

In [None]:
model.generate(0, 10)

TypeError: forward() missing 1 required positional argument: 'targets'

### Self attention trick

In [None]:
torch.manual_seed(1337)
B,T,C = 4, 8, 2

x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

#### Efficient way

In [None]:
x_bag_of_words = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        x_prev = x[b, :t+1]
        x_bag_of_words[b, t] = torch.mean(x_prev, 0)

In [None]:
x_bag_of_words[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a_1 = a / torch.sum(a, 1, keepdim=True)
print(a)
b = torch.randint(0, 10, (3, 2)).float()
x = a @ b
print(x)
x_1 = a_1 @ b 
print(x_1)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


#### Continue:

In [None]:
stuff = torch.tril(torch.ones(T,T))
stuff = stuff / stuff.sum(1, keepdim = True)
stuff

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
x_bag_of_words_2 = stuff @ x #(T, T) @ (B, T, C) - pytorch will make (B, T, T) @ (B, T, C)
x_bag_of_words_2

RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x8 and 3x2)

In [None]:
x_bag_of_words[0]

torch.Size([8, 2])

In [None]:
x_bag_of_words_2.shape

torch.Size([4, 8, 2])

In [None]:
torch.allclose(x_bag_of_words, x_bag_of_words_2)

True

In [None]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
weights

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
tril = torch.tril(torch.ones(T, T))
weights = torch.zeros((T,T))
weights = weights.masked_fill(tril == 0, float('-inf'))

In [None]:
# How does this work?
# 1\1
# e^0 = 1 \2
# e^0 = 1 \3

weights = F.softmax(weights, dim=0)

In [None]:
weights

tensor([[0.1250, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1250, 0.1429, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1250, 0.1429, 0.1667, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1250, 0.1429, 0.1667, 0.2000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1250, 0.1429, 0.1667, 0.2000, 0.2500, 0.0000, 0.0000, 0.0000],
        [0.1250, 0.1429, 0.1667, 0.2000, 0.2500, 0.3333, 0.0000, 0.0000],
        [0.1250, 0.1429, 0.1667, 0.2000, 0.2500, 0.3333, 0.5000, 0.0000],
        [0.1250, 0.1429, 0.1667, 0.2000, 0.2500, 0.3333, 0.5000, 1.0000]])

In [None]:
x_bag_of_words_3 = weights @ x
torch.allclose(x_bag_of_words, x_bag_of_words_3)

True

In [None]:
torch.manual_seed(1337)
B,T,C = 4, 8, 32

x = torch.randn(B, T, C)

head_size = 16
#(batch_size, 32 (embed dim), 16(hidden_size))
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
k = key(x) # [4, 8, 16]
q = query(x) # [4, 8, 16]

#dot producting key and query (16, 16) but why do we need weights,
#why squeeshing the embed dims?
#if that makes similarity, what embedding then do?
weights = q @ k.transpose(-2, -1) * (C**(-0.5))# (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim=-1)


In [None]:
out = weights @ x


In [None]:
lm_head = nn.Linear(C, vocab_size)
lm_head(out).shape

torch.Size([4, 8, 65])

#### Scaled attention 

In [None]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
#dot prod 16*16 (variance adds?) 
# / sqrt(head_size)? /4?
wei = q @ k.transpose(-2, -1) * head_size**(-0.5)
print(wei.var())
#This makes softmax even to all tokens

tensor(0.8980)


In [None]:
x_batch, y_batch = get_batch(train_data)

In [None]:
x_batch.shape

torch.Size([32, 8])

In [None]:
print(k.shape, q.shape, wei.shape)

torch.Size([4, 8, 16]) torch.Size([4, 8, 16]) torch.Size([4, 8, 8])


In [None]:
k.var()

tensor(1.0700)

In [None]:
q.var()

tensor(0.9006)

In [None]:
wei.var()

tensor(1.0879)

#### Encoder-Decoder cross attention