In [34]:
# Read the whole corpus into memory as one string; the encoding is given
# explicitly so decoding does not depend on the platform default.
with open('tinyshakespeare.txt','r',encoding='utf-8') as f:
    text = f.read()

In [35]:
print('length of dataset in characters:', len(text))


length of dataset in characters: 1115394


In [36]:
#first 100 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [37]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [38]:
# Tokenize: map the raw text to a sequence of integers, one id per character.

# Lookup tables between characters and integer ids (an id is the character's
# position in the sorted `chars` list).
stoi = {ch: i for i, ch in enumerate(chars)}  # string  -> integer
itos = {i: ch for i, ch in enumerate(chars)}  # integer -> string


def encode(s):
    """Encode string `s` into a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(i):
    """Decode an iterable `i` of integer token ids back into a string.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[c] for c in i)


print(encode('hii there'))
print(decode(encode('hii there')))



[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [39]:
import tiktoken  # BPE (byte pair encoding) tokenizer library

enc = tiktoken.get_encoding('gpt2')
enc.n_vocab  # 50257 tokens instead of our 65: a larger vocabulary gives shorter token sequences, whereas our small character vocabulary gives long sequences for the Shakespeare text

50257

In [40]:
enc.encode('hii there')

[71, 4178, 612]

In [41]:
# Encode the entire text with the character-level `encode` and store the ids in a tensor
import torch
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape,data.dtype)
print(data[:100])  # the first 100 token ids

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [42]:
# Hold out the final 10% of the tokens for validation; the first 90% is used for training.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [43]:
# Chunk size sampled for training (chunks are batched so they can be processed in parallel).
# We look at block_size + 1 = 9 tokens because the 8 context tokens need a target:
# the first 8 tokens together predict the 9th.
block_size = 8
train_data[:block_size+1]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [44]:
# One chunk packs block_size training examples: the context grows one token
# at a time and the target is always the token that follows it.
x = train_data[:block_size]
y = train_data[1:block_size+1]  # x shifted by one position
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [45]:
torch.manual_seed(1337) # fixed seed: get_batch pulls random chunks from different parts of the dataset
batch_size =4 # number of independent sequences processed in parallel
block_size = 8 # maximum context (time) length of each sequence

def get_batch(split):
    """Sample a random batch of (input, target) chunks.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Uses the module-level `batch_size` and `block_size`.

    Returns a pair of stacked tensors, each holding `batch_size` rows of
    `block_size` token ids; the target row is the input row shifted one
    position to the right in the source data.
    """
    source = train_data if split == 'train' else val_data
    # random start offsets, chosen so that start + block_size + 1 stays in range
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

xb, yb = get_batch('train')

# show both halves of the batch: label, shape, then contents
for label, batch in (('inputs:', xb), ('targets', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('---')

# spell out every (context, target) pair contained in the batch
for row_x, row_y in zip(xb, yb):
    for t in range(block_size):
        print(f'when input is {row_x[:t+1].tolist()} the target: {row_y[t]}')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
---
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53, 

In [46]:
#simplest neural network for language: BigramLanguageModel

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337) #reproducability

class BigramLanguageModel(nn.Module):
    """Simplest neural language model: next-token logits depend only on the current token.

    The model is a single (vocab_size, vocab_size) embedding table used as a
    lookup: row i holds the logits for the token that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits (raw scores before any
        # normalisation) for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the scalar cross-entropy between logits and targets.
        """
        # (B,T,C); e.g. batch = 4, time = 8 and channels = 65 in this notebook
        logits = self.token_embedding_table(idx)

        if targets is None:
            return logits, None

        # Cross-entropy (negative log-likelihood) measures the quality of the
        # predictions. It is called in functional form, so no loss module is
        # needed. For multi-dimensional input F.cross_entropy wants the
        # channel dimension second, i.e. (B, C, T); rather than permuting we
        # flatten batch and time into one dimension of size B*T.
        B, T, C = logits.shape
        # reshape (not view) so non-contiguous targets, e.g. slices or
        # transposes, are accepted as well
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; avoid building a graph per step
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: the original context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get predictions (no targets, so the returned loss is None)
            logits, _unused_loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B,C)
            # apply softmax for probabilities
            probs = F.softmax(logits, dim=-1)  # (B,C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)
        return idx

# Instantiate the model and run one forward pass on the batch sampled earlier.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Start generation from a single token with id 0 (batch of 1, context length 1)
# and decode the sampled ids back to text. The model has not been trained yet.
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx,max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [47]:
# negative log likelihood
import math
# With a uniform guess every character has probability 1/vocab_size, so the
# expected negative log-likelihood is -ln(1/vocab_size). The untrained model's
# loss can differ from this because its initial predictions are random rather
# than uniform.
expected_initial_loss = -math.log(1/vocab_size)
expected_initial_loss

4.174387269895637

In [48]:
# Create a PyTorch optimizer (AdamW) over all parameters of the model `m`
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [49]:
batch_size = 32
for steps in range(10000):
    # drop the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # backward pass, then parameter update
    loss.backward()
    optimizer.step()

# loss of the final training batch
print(loss.item())

2.5727508068084717


In [50]:
print(decode(m.generate(idx,max_new_tokens=300)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht 


Tokens are not talking to each other: only the last token is used to predict the next one (simple bigram model). For context to matter we need a transformer:


#### mathematical trick in self attention

In [51]:
# Consider the following toy example

torch.manual_seed(1337) 
B,T,C = 4, 8, 2 # batch, time, channels
x = torch.randn(B,T,C) # tensor of random numbers drawn from a normal distribution with mean 0 and variance 1
print(type(x))
x.shape

<class 'torch.Tensor'>


torch.Size([4, 8, 2])

In [52]:
# bow = bag of words
# For now each position takes the mean of the vectors of all previous tokens
# including itself. Averaging discards where in the context each token sat,
# but it already lets a position see more than the bigram model does.
# xbow[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]  # (t+1, C): tokens 0..t of sequence b
        xbow[b, t] = xprev.mean(dim=0)

In [53]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [54]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

The first row is identical in both, but the second row of xbow is the mean of the first two rows of x, e.g. (0.18 - 0.36)/2 ≈ -0.09, and so on. <br>

This for-loop calculation is very inefficient; we can get the same result with a single matrix multiplication by a lower triangular matrix (the same shape of matrix as the L factor in an LU factorization A = LU):

In [55]:
# Example: multiplying by a row-normalised lower-triangular matrix yields running means

torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)  # every row sums to 1, so a @ b averages rows of b
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a=')
print(a)
for label, matrix in (('b=', b), ('c=', c)):
    print('--')
    print(label)
    print(matrix)



a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


Here you can see that the first rows of b and c are equal (2, 7); the second row of c is the average of the first two rows of b, (2+6)/2 = 4 and (7+4)/2 = 5.5, and so forth.

In [59]:
# Apply the same trick to the earlier example
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)  # normalise rows so each row averages
xbow2 = wei @ x  # (T,T) @ (B,T,C): PyTorch broadcasts (T,T) over the batch dimension ---> (B,T,C)

# Check that the result matches the for-loop calculation for the WHOLE batch.
# torch.allclose returns True when the tensors are element-wise equal within
# a tolerance: |a - b| <= atol + rtol * |b|. The loop and the matmul add the
# float32 values in a different order, so they differ by rounding error; the
# default atol=1e-8 is too strict for entries close to zero, hence the
# explicit atol.
torch.allclose(xbow, xbow2, atol=1e-6)

True

In [57]:
xbow2[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [68]:
# version 3 of creating the lower-triangular averaging matrix: softmax
tril = torch.tril(torch.ones(T,T))
print(tril)
wei = torch.zeros((T,T))
# -inf where a position would look at the future; softmax maps those entries to 0
wei = wei.masked_fill(tril == 0, float('-inf'))
print(wei)
# softmax over each row of equal (zero) scores gives uniform weights over the allowed positions
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x

print(wei)
# Show the comparison result (a bare expression in the middle of a cell is
# silently discarded). The whole batch is compared, with an explicit atol to
# absorb float32 rounding differences.
print(torch.allclose(xbow, xbow3, atol=1e-6))

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000,

In [70]:
# In the averaging `wei` above, every past token gets the same (uniform) weight.
# Some tokens, however, find particular other tokens more interesting; self-attention
# handles this by letting every token emit two vectors:
#   query - what am I looking for?
#   key   - what do I contain (e.g. "a vowel at position 4")?
# The dot product of a token's query with the keys of the tokens becomes `wei`:
# the better a query and a key align, the higher the score.
# version 4: self-attention!

torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# a single head performing self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
k, q = key(x), query(x)  # each (B,T,16); no communication between tokens has happened yet

# affinities; only the last two dims of k are transposed, never the batch dim:
# (B,T,16) @ (B,16,T) --> (B,T,T)
wei = torch.matmul(q, k.transpose(-2, -1))

# mask out the future, then normalise every row into attention weights
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)
out = wei @ x
out.shape

torch.Size([4, 8, 32])

In [73]:
wei[0]
# no longer uniform:

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

We introduce another projection called Value and pass x through it, so that the raw x stays a private variable of the token.
If you find me interesting as a token, here is what I will communicate to you: V (what is stored in V).


In [None]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# a single head performing self-attention, now with a value projection
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q = key(x), query(x)  # each (B,T,16); no communication between tokens has happened yet

# affinities; only the last two dims of k are transposed, never the batch dim:
# (B,T,16) @ (B,16,T) --> (B,T,T)
wei = torch.matmul(q, k.transpose(-2, -1))

# mask out the future, then normalise every row into attention weights
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# aggregate the projected values instead of the raw x
v = value(x)
out = wei @ v
out.shape

Attention is a communication mechanism. It can be seen as nodes in a directed graph looking at each other, and it can be applied to any arbitrary directed graph.
There is no notion of space: positional encoding needs to be added separately (in a convolution, position is built into the kernel matrix).
Elements of a batch don't talk to each other; they are processed independently and in parallel.
A decoder block, used for prediction, applies triangular masking; to make an encoder block, simply delete the wei.masked_fill line so that all tokens can communicate.
Self-attention is called "self" because the queries, keys and values are all computed from the same source: x.
In principle, attention is more general than that.
Cross-attention: the queries are produced from x, but the keys and values come from a different source.

In [78]:
# Apply scaling to keep softmax from peaking (concentrating almost all weight on one entry).
# Demonstration: with small-magnitude inputs the softmax output stays fairly diffuse
torch.softmax(torch.tensor([0.1,-0.2,0.3,-0.2,0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [79]:
torch.softmax(torch.tensor([0.1,-0.2,0.3,-0.2,0.5])*8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])