In [2]:
# Read the whole training corpus into memory as one string.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

print("Length of dataset in characters: ", len(text))

Length of dataset in characters:  1115394


## Unique characters that occur in this text

In [3]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## Tokenisation - converting text (string) into sequences of integers

Very simple character-level tokenisation. If you want to, you can use a more advanced tokeniser instead, such as OpenAI's `tiktoken`, which implements byte-pair encoding (BPE).

**Example of tiktoken use:**

```python
import tiktoken
enc = tiktoken.get_encoding("gpt2") #using encoding that was used during gpt2 training
assert enc.decode(enc.encode("Hello world!")) == "Hello world!"
```

In [6]:
#Lookup tables between characters and their integer ids (ids follow the order of `chars`)
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder - take a string (a word, for example) and return the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder - take a list of integer ids and return the string they spell."""
    return ''.join(itos[i] for i in l)


#Examples of encoding and decoding
print(encode('Hello there!'))
print(decode(encode('General Kenobi!')))

[20, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43, 2]
General Kenobi!


## Encoding the entire text dataset and storing it in a torch.Tensor

In [8]:
import torch

#Encode the whole corpus once and keep it as a tensor of int64 (torch.long) token ids
data = torch.tensor(encode(text), dtype=torch.long)

#Sanity check: one id per character of the corpus
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


  data = torch.tensor(encode(text), dtype=torch.long)


Now the entire text in our dataset is 1:1 converted and represented by integers.

Here is what the first 1000 characters look like in this representation.

In [9]:
#Show the integer ids of the first 1000 characters
print(data[:1000])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

## Separate dataset into training and validation sets

In [10]:
#The first 90% of the tokens become the training set; the remaining 10% are held out for validation.
n = int(0.9 * len(data))

train_data, val_data = data[:n], data[n:]

## Definition of the maximum chunk length (the chunk of the text dataset that we will train our model on - one chunk at a time)

In [11]:
block_size = 8 #number of input tokens in one training chunk

#block_size + 1 tokens are shown: the extra token is the target that follows the last input token
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [12]:
#One chunk packs block_size training examples:
#every prefix of x is a context, and the token right after that prefix (taken from y) is its target
x = train_data[:block_size]
y = train_data[1:block_size + 1] #x shifted left by one token

for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"When input is {context} the target most likely is {target}")

When input is tensor([18]) the target most likely is 47
When input is tensor([18, 47]) the target most likely is 56
When input is tensor([18, 47, 56]) the target most likely is 57
When input is tensor([18, 47, 56, 57]) the target most likely is 58
When input is tensor([18, 47, 56, 57, 58]) the target most likely is 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target most likely is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target most likely is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target most likely is 58


## Definition of batch dimension

In [18]:
torch.manual_seed(1337) #fixed seed so the randomly sampled batch below is reproducible
batch_size = 4 #How many independent sequences will be processed in parallel
block_size = 8 #What is the maximum context length for predictions

#Function generating a small batch of data of inputs x and targets y
def get_batch(split):
    """Sample a random batch of (input, target) windows from one dataset split.

    Reads the notebook-level globals train_data, val_data, block_size and batch_size.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y): two stacked tensors with batch_size rows of block_size tokens each,
        where row i of y is row i of x shifted one token to the right in the source data.
    """
    source = train_data if split == 'train' else val_data
    #Random window starts; the upper bound leaves room for the one-token-shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y

#Draw one training batch and show its inputs and targets
xb, yb = get_batch('train')
print("Inputs:", xb.shape, xb, sep="\n")
print("Targets:", yb.shape, yb, sep="\n")

print("-----------------")

#Spell out every (context -> target) example packed in the batch
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b, t]
        print(f"When input is {context.tolist()} the targets are {target}")

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
-----------------
When input is [24] the targets are 43
When input is [24, 43] the targets are 58
When input is [24, 43, 58] the targets are 5
When input is [24, 43, 58, 5] the targets are 57
When input is [24, 43, 58, 5, 57] the targets are 1
When input is [24, 43, 58, 5, 57, 1] the targets are 46
When input is [24, 43, 58, 5, 57, 1, 46] the targets are 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the targets are 39
When input is [44] the targets are 53
When input is [44, 53] the targets are 56
When input is [44, 53, 56] the targets are 1
When input is [44, 53, 56, 1] the targets are 58
When inpu

In [19]:
#Input to the transformer: the batch of token ids sampled above, one row per sequence
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


# BIGRAM LANGUAGE MODEL

In [33]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the logits for the next token are looked up from the current token alone.

    Args:
        vocab_size: number of distinct tokens; the lookup table has shape (vocab_size, vocab_size).
    """

    def __init__(self, vocab_size):
        super().__init__()
        #Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None): #targets are optional because generate() has none to give
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is None.
            With targets, logits is flattened to (B*T, C) and loss is a scalar tensor.
        """
        #(B, T, C) - B(batch_size), T(time - block_size), C(channel - vocab_size)
        logits = self.token_embedding_table(idx)

        if targets is None:
            return logits, None

        #F.cross_entropy expects the class dimension C to be the 2nd dimension, i.e. (B, C, T) for 3-D input.
        #Instead we stretch B and T out into one dimension, so C becomes the 2nd dimension of a 2-D tensor.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        #We have to do the same thing to targets
        targets = targets.view(B*T)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    #Generation function takes idx (one block/sequence) that is (B, T) and extends it to (B, T + max_new_tokens)
    #For example we give it idx with 8 characters (8 time steps) and want to generate 3 more time steps
    #It will sample 3 new characters from the predicted distribution and our new idx will be (B, T+3)
    @torch.no_grad() #sampling never needs gradients; without this every step would build an autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens and append them to idx.

        Args:
            idx: (B, T) tensor of integer token ids in the current context.
            max_new_tokens: number of tokens to sample for every sequence in the batch.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            #Get the predictions (no targets, so the returned loss is None and is ignored)
            logits = self(idx)[0]

            #Focus only on the last time step
            logits = logits[:, -1, :] #Becomes (B, C)

            #Apply softmax to get probabilities
            probs = F.softmax(logits, dim = -1) #(B, C)

            #Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) #(B, 1)

            #Append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim = 1) # (B, T+1)
        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

#Generating
#A (1, 1) tensor holding 0 is how we kick off the generation (0 is the integer standing for the line break - new line character)
idx_descripition = torch.zeros((1, 1), dtype = torch.long)
#[0] drops the batch dimension of the generated sequence, leaving a one-dimensional run of time steps
#which we convert to a plain python list and then decode
print(decode(m.generate(idx = idx_descripition, max_new_tokens=100)[0].tolist()))

print("\nGenerated output is garbage because we have random untrained model.")

torch.Size([256, 65])
tensor(4.7673, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3

Generated output is garbage because we have random untrained model.


For now our model does not use history. We feed it the entire sequence, but it looks only at the last piece (the last character). It is silly, but we want this function to stay this way because eventually the history will be used.

## Training BIGRAM model

In [34]:
#Create a PyTorch optimizer
#Other optimizers (SGD, for example) could be used here as well; AdamW is a popular choice that works extremely well
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [35]:
batch_size = 32 #replaces the batch_size of 4 used in the demo above; get_batch reads this global

for step in range(20000):
    #Sample a batch of data and evaluate the loss on it
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    #Clear the old gradients, backpropagate and take one optimizer step
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

#Loss of the last training batch only
print(loss.item())

2.4832067489624023


In [37]:
#Sample 300 new tokens from the trained model, starting from a single token with id 0, and decode them to text
print(decode(m.generate(idx = torch.zeros((1, 1), dtype = torch.long), max_new_tokens=300)[0].tolist()))


fithods misue, knild he I:
Whe! toudirer' My ayosbly louroura s m', uthos s reveprthoukerdi't avorure fotemowe.
Whamo es t, tstt g t RTRushy,
WAsbr spr my ou pl y,
Witoft at o s me,
Whabr'the Cicuomants awonte qungur thme wrar d parsupl by:
'sul ve ave,
Kconit ped bim; fam elathelch easutlll teye A 


#### Progress

As we can see, our model made some progress. The output is far from ideal, but we can't expect more from a bigram model.

## The mathematical trick in self-attention

In [38]:
torch.manual_seed(1337)

#Toy input: B sequences in the batch, T time steps each, C channels per time step
B, T, C = 4, 8, 2
x = torch.rand((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [39]:
#We want xbow[b, t] = mean_{i <= t} x[b, i]
#To use history we just average the current token with all previous tokens (a very weak and lossy method, but we will improve it later)
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] #tokens 0..t of sequence b
        xbow[b, t] = xprev.mean(dim=0)

In [40]:
#First sequence of the batch, before averaging
x[0]

tensor([[0.0783, 0.4956],
        [0.6231, 0.4224],
        [0.2004, 0.0287],
        [0.5851, 0.6967],
        [0.1761, 0.2595],
        [0.7086, 0.5809],
        [0.0574, 0.7669],
        [0.8778, 0.2434]])

In [41]:
#The same sequence after averaging: row t is the mean of rows 0..t of x[0]
xbow[0]

tensor([[0.0783, 0.4956],
        [0.3507, 0.4590],
        [0.3006, 0.3156],
        [0.3717, 0.4108],
        [0.3326, 0.3806],
        [0.3953, 0.4140],
        [0.3470, 0.4644],
        [0.4134, 0.4368]])

In [42]:
#EXAMPLE 1:

#Toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True) #every row now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

#Thanks to the lower-triangular matrix with its rows turned into averages, we can use history with a simple matrix product

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [45]:
#EXAMPLE 2:

#Using the matrix multiply described in the previous cell for a weighted aggregation
w = torch.tril(torch.ones(T, T))
w = w / w.sum(dim=1, keepdim=True) #every row now sums to 1

#w is (T, T) while x is (B, T, C): PyTorch broadcasts w across the batch dimension of x
xbow2 = w @ x #(T, T) @ (B, T, C) = (B, T, C)
torch.allclose(xbow, xbow2)

True

In [46]:
#EXAMPLE 3:

#Using softmax function
tril = torch.tril(torch.ones(T, T))
#Start from all-zero scores and put -inf wherever tril is 0, i.e. above the diagonal
w = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
#softmax gives the -inf entries weight 0 and spreads each row evenly over the remaining entries
w = F.softmax(w, dim=-1)
xbow3 = w @ x
torch.allclose(xbow, xbow3)

True

# Self-attention

We don't want it to be a simple average. Each token may find different tokens more or less interesting, and we want that to be data dependent.

For example, we may look for some consonants in the past, and we want to know what those consonants are and have this information flow to us.

We still want to gather information from the past, but we want it to be data-dependent information. That's the problem that self-attention solves.

### The way self-attention solves it

Every single node or token at each position will emit **2 vectors**:
- **query** - we can describe it as "What am I looking for?"
- **key** - we can describe it as "What do I contain?"

Having these 2 vectors, we simply take the **dot product** between the keys and the queries. So **my query** is dot-producted with **all the keys of all the other tokens**.

That dot product now becomes **weights**.

If the key and the query are sort of aligned, they will interact to a very high amount, and then I will get to learn more about that specific token as opposed to any other token in the sequence.

In [51]:
torch.manual_seed(1337)

B, T, C = 4, 8, 32
x = torch.rand(B, T, C)

#Implement a single HEAD of self-attention
#Every single token produces its key and query independently, without any communication
head_size = 16 #hyperparameter
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)

k = key(x) #(B, T, 16[head_size])
q = query(x) #(B, T, 16[head_size])

#The communication starts now: every single query is dot-producted with every single key
w = q @ k.transpose(-2, -1) #transposing last 2 dimensions ---------- now will be (B, T, 16) @ (B, 16, T) = (B, T, T)

tril = torch.tril(torch.ones(T, T))
w = w.masked_fill(tril == 0, float('-inf')) #AD.1
w = F.softmax(w, dim = -1)

#What gets aggregated are the values v, not the raw tokens (the raw-token variant would be: out = w @ x)
v = value(x)
out = w @ v

#Now the weights (weighted aggregation) are a data-dependent function of the keys and queries of these nodes

#Ad. 1. If we want ALL the nodes constantly talking to each other we just delete this line of code allowing them to do so.
#       In this case it is called 'encoder block'. It is useful for example when we try to predict the sentiment of a given text.
#       All nodes can talk to each other, which makes the sentiment prediction better.
#
#       In 'decoder block' this line is ALWAYS present. We make sure that we mask nodes with the triangular matrix, disallowing nodes
#       to talk to nodes from the future.
#
#       Both of those cases are allowed because attention doesn't care about it.

#Kept as the last expression so that the notebook displays the output shape
out.shape


torch.Size([4, 8, 16])

In [50]:
#Now every sequence in the batch gets its own weights instead of the uniform average, because different sequences have different tokens at different positions
#So now our weights are data dependent
w[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4409, 0.5591, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2975, 0.3373, 0.3652, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2211, 0.2898, 0.2236, 0.2654, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1832, 0.2163, 0.1954, 0.2437, 0.1614, 0.0000, 0.0000, 0.0000],
        [0.1330, 0.2227, 0.1784, 0.2159, 0.1044, 0.1456, 0.0000, 0.0000],
        [0.1283, 0.1367, 0.1385, 0.1522, 0.1083, 0.1341, 0.2021, 0.0000],
        [0.1064, 0.1332, 0.1265, 0.1445, 0.0940, 0.1200, 0.1231, 0.1524]],
       grad_fn=<SelectBackward0>)

### Explanation

**[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],** \
 **[0.4409, 0.5591, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],** \
 **[0.2975, 0.3373, 0.3652, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],** \
 **[0.2211, 0.2898, 0.2236, 0.2654, 0.0000, 0.0000, 0.0000, 0.0000],** \
 **[0.1832, 0.2163, 0.1954, 0.2437, 0.1614, 0.0000, 0.0000, 0.0000],** \
 **[0.1330, 0.2227, 0.1784, 0.2159, 0.1044, 0.1456, 0.0000, 0.0000],** \
 **[0.1283, 0.1367, 0.1385, 0.1522, 0.1083, 0.1341, 0.2021, 0.0000],** \
 **[0.1064, 0.1332, 0.1265, 0.1445, 0.0940, 0.1200, 0.1231, 0.1524]],**
 

We can look at the **last row** for an example.

The last row is for the **8th** token. This token knows what content it has and knows what position it is in.

Based on that, it creates a **query** like *'Hey, I'm looking for this kind of stuff. I'm a vowel and I'm at the 8th position. I'm looking for any consonants at positions up to 4'*.

All the other nodes create **keys**, and maybe one of the channels could mean *I am a consonant and I am in a position up to 4*. That key would have a high number in that specific channel.

That's how the **query** and the **key**, when they are dot-producted, can find each other and create a high affinity. When they have a high affinity, like token **4** in the **8th** row, it means token 4 is interesting for token 8. In this situation, through the softmax, I will aggregate **a lot** of its information into my position, and so I'll get to learn a lot about it.



### Notes:

- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additionally multiplies the weights `w` by 1/sqrt(head_size). This makes it so that when the inputs Q, K have unit variance, `w` has unit variance too, and softmax stays diffuse and does not saturate too much. Illustration below.

### Difference between self-attention and cross-attention

In **self-attention** all **keys**, **queries** and **values** come from the same source (from x). We say that the nodes are self-attending.

In an encoder-decoder transformer we can have a case where the **queries** are produced from x but the keys and values come from a whole separate **external source** (sometimes from encoder blocks that encode some context that we'd like to condition on). So in this case the queries are produced by our internal nodes, but the rest is produced by external nodes off to the side. We are producing queries and reading information from the side. That attention is called **cross-attention**.

In [52]:
#Lastly we have to scale our weights.
#We multiply them by 1/sqrt(head_size), because we don't want the values going into softmax to be too extreme.
#Otherwise softmax would become way too peaky and sharpen towards the max value, so basically every node would gather information from just one single node.

#... (previous code)
w = q @ k.transpose(-2, -1) * head_size**(-0.5)
#...(rest of the code)

## Normalization Layer

In [None]:
#Same thing we can get by just calling nn.LayerNorm() so we don't need to implement that in the main code
#NOTE: despite its name this class is a layer norm - it normalizes each sample over its features, not over the batch.
#      The name is kept so that any existing reference to BatchNorm keeps working.
class BatchNorm:
    """Hand-written layer normalization over the last dimension of the input.

    Args:
        dim: size of the last (feature) dimension; gamma and beta have this length.
        eps: small constant added to the variance for numerical stability.
        momentum: accepted for signature compatibility but unused (no running statistics are kept).
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalize x to zero mean / unit variance along its last dimension, then apply gamma and beta.

        The result is also stored in self.out.
        """
        #Statistics are taken over the last dimension, the one gamma and beta are sized for.
        #For 2-D (batch, dim) input this is the same as dim 1.
        xmean = x.mean(-1, keepdim = True) #per-sample feature mean
        #nn.LayerNorm uses the biased variance estimator, so unbiased=False is needed to match it
        xvar = x.var(-1, keepdim = True, unbiased = False) #per-sample feature variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) #normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        """Return the scale and shift tensors as [gamma, beta]."""
        return [self.gamma, self.beta]