In [8]:
import torch

In [9]:
# Read the whole training corpus into memory as a single UTF-8 string.
with open('input.txt', mode='r', encoding='utf8') as corpus_file:
    text = corpus_file.read()

In [10]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between a character and its integer id (its position in `chars`).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError for a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return ''.join(itos[i] for i in l)

# Example round trip:
# print(encode('hello'))
# print(decode([46, 43, 50, 50, 53]))

In [11]:
# Encode the full corpus once into a 1-D int64 tensor of character ids.
data = torch.tensor(encode(text), dtype=torch.int64)

# Sanity check: total token count, then the first 100 ids.
print(data.shape, data[:100], sep='\n')

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [12]:
# Train on the first 90% of the token stream; hold out the remaining tail for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
print(train_data.shape, val_data.shape)


torch.Size([1003854]) torch.Size([111540])


In [13]:
# Seed torch's global RNG so the randomly sampled batches are reproducible.
torch.manual_seed(1337)

# batch_size: how many windows get_batch samples per call.
# block_size: how many tokens each sampled window contains.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a random batch of input windows and their shifted-by-one targets.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where y[r, t] is the
        token that follows x[r, t] in the selected split.
    """
    source = train_data if split == 'train' else val_data
    # The upper bound keeps the target window (shifted right by one) in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')

# Show the sampled inputs and their one-step-shifted targets.
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Each window holds block_size training examples: every prefix of the input
# row is a context whose target is the token that comes right after it.
for input_row, target_row in zip(xb, yb):
    for t in range(block_size):
        context = input_row[:t + 1]
        target = target_row[t]
        print(f"when input is {context.tolist()} the target is: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target is: 43
when input is [24, 43] the target is: 58
when input is [24, 43, 58] the target is: 5
when input is [24, 43, 58, 5] the target is: 57
when input is [24, 43, 58, 5, 57] the target is: 1
when input is [24, 43, 58, 5, 57, 1] the target is: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target is: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
when input is [44] the target is: 53
when input is [44, 53] the target is: 56
when input is [44, 53, 56] the target is: 1
when input is [44, 53, 56, 1] the target is: 58
when input is [44, 53, 56, 1, 58] 

In [14]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one (vocab_size, vocab_size) lookup table; row i holds
    the unnormalised scores (logits) for the token that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # One row per token in the vocabulary. The embedding dimension also
        # equals vocab_size because each row is read directly as the logits
        # over every possible next token.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of the expected next tokens.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) with
            C = vocab_size and loss is None. With targets, logits is returned
            flattened to (B*T, C) and loss is the mean cross-entropy.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy takes class scores as (N, C) and class indices as
            # (N), so fold the batch and time axes into one:
            #   logits  (B, T, C) -> (B*T, C)
            #   targets (B, T)    -> (B*T)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Predictions for every position of the current context.
            logits, _ = self(idx)

            # Only the last timestep predicts the next token: (B, C).
            logits = logits[:, -1, :]

            # Turn the scores into a probability distribution: (B, C).
            probs = F.softmax(logits, dim=-1)

            # Draw one next-token id per sequence: (B, 1).
            idx_next = torch.multinomial(probs, num_samples=1)

            # Append the sample to the running context: (B, T+1).
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
            

# Build the still-untrained bigram model and run one forward pass on the sample batch.
m = BigramLanguageModel(vocab_size)
print(xb.shape, yb.shape)
logits, loss = m(xb, yb)

# Sample 100 new tokens from a one-token context holding id 0
# (presumably the newline character; confirm against `itos`).
start_context = torch.zeros((1, 1), dtype=torch.long)
gen = m.generate(start_context, max_new_tokens=100)

# Only one sequence was generated; decode it back to text.
print(decode(gen[0].tolist()))

torch.Size([4, 8]) torch.Size([4, 8])

l-QYjt'CL?jLDuQcLzy'RIo;'KdhpV
vLixa,nswYZwLEPS'ptIZqOZJ$CA$zy-QTkeMk x.gQSFCLg!iW3fO!3DGXAqTsq3pdgq


In [15]:
# AdamW over every parameter of the bigram model, with learning rate 1e-3.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [9]:
# Train on larger batches than the 4-window demo; get_batch reads this global.
batch_size = 32

for step in range(100):
    # Drop the gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)

    # Draw a fresh random training batch and evaluate the loss on it.
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # Backpropagate and apply one optimizer update.
    loss.backward()
    optimizer.step()

    print(loss.item())

4.704006195068359
4.721118927001953
4.653193473815918
4.706261157989502
4.780904293060303
4.751267910003662
4.8395490646362305
4.667973041534424
4.743716716766357
4.774043083190918
4.6908278465271
4.789143085479736
4.61777925491333
4.650947093963623
4.886447429656982
4.703796863555908
4.757591724395752
4.65510892868042
4.709283828735352
4.6745147705078125
4.760501384735107
4.7892632484436035
4.653748512268066
4.6619181632995605
4.673007488250732
4.66577672958374
4.7301106452941895
4.755304336547852
4.712186813354492
4.745501518249512
4.726755619049072
4.735108375549316
4.777461051940918
4.643350601196289
4.6651835441589355
4.79764461517334
4.717412948608398
4.683647155761719
4.81886100769043
4.613771915435791
4.573785781860352
4.560741901397705
4.81563138961792
4.6061553955078125
4.619696140289307
4.725419521331787
4.650487899780273
4.5941481590271
4.7202863693237305
4.699342250823975
4.6724138259887695
4.727972984313965
4.66152286529541
4.616766929626465
4.599857807159424
4.6533403396

In [19]:
# Toy tensor for the averaging / attention experiments: batch, time, channels.
B, T, C = 4, 8, 2
x = torch.randn((B, T, C))


In [20]:
# Toy example - deliberately inefficient.
#
# Goal: xbow[b, t] = mean_{i <= t} x[b, i]
# "bow" stands for "bag of words", a term for an average over words.
xbow = torch.zeros((B, T, C))
for batch_idx in range(B):
    for end in range(1, T + 1):
        history = x[batch_idx, :end]  # the first `end` timesteps: (end, C)
        xbow[batch_idx, end - 1] = history.mean(dim=0)

xbow

tensor([[[-0.0431, -1.6047],
         [ 0.8724, -1.0414],
         [ 0.5006, -1.0056],
         [ 0.3134, -1.0563],
         [ 0.0970, -0.6925],
         [-0.1804, -0.6170],
         [ 0.1772, -0.6665],
         [ 0.4053, -0.5249]],

        [[ 0.8008,  1.6806],
         [ 0.5783,  0.4970],
         [ 0.2211,  0.4118],
         [-0.1119,  0.3318],
         [-0.1398,  0.4374],
         [-0.1682,  0.2985],
         [-0.0294,  0.1671],
         [-0.0997,  0.1383]],

        [[ 0.3057, -0.7746],
         [ 0.1703, -0.2267],
         [ 0.6381, -0.4330],
         [ 0.8066, -0.1529],
         [ 0.3984, -0.2199],
         [ 0.0956, -0.0339],
         [ 0.0891,  0.2948],
         [ 0.2253,  0.2036]],

        [[-0.8140, -0.7360],
         [-0.8256, -0.8292],
         [ 0.0534, -0.4993],
         [ 0.1319, -0.3306],
         [-0.1314,  0.0122],
         [-0.3099,  0.1282],
         [-0.4193,  0.1864],
         [-0.2200,  0.2332]]])

In [22]:
# The same running average as one matrix multiply: a lower-triangular matrix of
# ones, with every row divided by its sum, averages timesteps 0..t in row t.
lower_ones = torch.ones(T, T).tril()
wei = lower_ones / lower_ones.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # (T, T) @ (B, T, C) -> (B, T, C)

torch.allclose(xbow, xbow2)

True

In [31]:
# Single head of self-attention (causal, scaled dot-product)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Affinity of every query with every key:
#   (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
# Scaling by 1/sqrt(head_size) keeps the magnitude of the scores from growing
# with head_size, so the softmax below does not collapse towards one-hot rows.
wei = (q @ k.transpose(-2, -1)) * head_size ** -0.5

# Causal mask: a timestep may only attend to itself and earlier timesteps.
# Masked scores become -inf and therefore get zero weight after the softmax.
# (If every unmasked score were zero, this would reduce to a plain running average.)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # each row sums to 1

# Weighted sum of the value vectors: (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
out = wei @ v

# `wei` acts like an "affinity mask" over x: row t holds the weight that timestep t
# gives to each timestep it is allowed to see (itself and the earlier ones).
#   - Row 0 can only see timestep 0, so that single entry carries all of the weight.
#   - With uniform weights (every pre-softmax score equal to zero), row 3 (index 3)
#     weights timesteps 0..3 by a factor of 0.25 each, i.e. each character holds
#     1/4 of the importance in the representation of that chunk of characters.

# print(wei, wei.shape, x.shape)
# # In the uniform-weight version, wei is (T, T),
# # x is (B, T, C)
# # Broadcasting makes wei a (B, T, T)
# # Result is a (B, T, C)
# xbow3 = wei @ x
# torch.allclose(xbow,  xbow3)

torch.Size([4, 8, 16]) torch.Size([4, 8, 16]) torch.Size([4, 16, 8])
