In [2]:
# Read the whole dialogue script into memory as a single string.
# NOTE(review): this path is absolute and machine-specific; the notebook only
# runs on a machine where this exact file exists.
dataset_path = r'C:\Tijmen\projects\mini_gpt\script_processing\lotr_indent_dialogue.txt'
with open(dataset_path, mode='r', encoding='utf-8') as script_file:
    text = script_file.read()

In [4]:
# Sanity check of the load: report the corpus size, then show its opening
# 1000 characters so the content can be inspected by eye.
corpus_length = len(text)
print("length of dataset in characters: ", corpus_length)
print(text[0:1000])

length of dataset in characters:  87194
GALADRIEL: "I amar prestar sen: han mathon ne nen, han mathon ne chae...a han noston ned wilith." The world is changed: I feel it in the water, I feel it in the earth, I smell it in the air...Much that once was is lost, for none now live who remember it.
GALADRIEL: It began with the forging of the Great Rings.
GALADRIEL: Three were given to the Elves, immortal, wisest...fairest of all beings.
GALADRIEL: Seven to the Dwarf Lords, great miners and craftsmen of the mountain halls.
GALADRIEL: And Nine...nine rings were gifted to the race of Men who, above all else, desire power.
GALADRIEL: For within these rings was bound the strength and will to govern each race.
GALADRIEL: But they were all of them deceived.
GALADRIEL: ...for another ring was made.
GALADRIEL: In the land of Mordor, in the fires of Mount Doom, the Dark Lord Sauron forged in secret a Master Ring to control all others.
GALADRIEL: ...and into this Ring he poured his cruelty, his malice

In [5]:
# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print(''.join(chars))
print("Vocabulary size:", vocab_size)


 !"'(,-.1234679:;?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz�
Vocabulary size: 71


In [6]:
# Character-level tokenizer built from the sorted vocabulary `chars`:
# `stoi` maps each character to its position in `chars`, `itos` is the inverse.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Raises KeyError if `l` contains an id that is not in `itos`.
    """
    return ''.join([itos[i] for i in l])


# Round-trip check: encoding then decoding must give back the original string.
print(encode("hello world"))
print(decode(encode("hello world")))

[51, 48, 55, 55, 58, 1, 66, 58, 61, 55, 47]
hello world


In [7]:
# Tokenise the whole corpus once and keep the ids in a 1-D torch.Tensor.
import torch

data = torch.tensor(encode(text), dtype=torch.int64)

# Expected: shape (number of characters in text,) and dtype torch.int64.
print(data.shape, data.dtype)
# Ids of the first 1000 characters.
print(data[0:1000])

torch.Size([87194]) torch.int64
tensor([25, 19, 30, 19, 22, 36, 27, 23, 30, 16,  1,  3, 27,  1, 44, 56, 44, 61,
         1, 59, 61, 48, 62, 63, 44, 61,  1, 62, 48, 57, 16,  1, 51, 44, 57,  1,
        56, 44, 63, 51, 58, 57,  1, 57, 48,  1, 57, 48, 57,  6,  1, 51, 44, 57,
         1, 56, 44, 63, 51, 58, 57,  1, 57, 48,  1, 46, 51, 44, 48,  8,  8,  8,
        44,  1, 51, 44, 57,  1, 57, 58, 62, 63, 58, 57,  1, 57, 48, 47,  1, 66,
        52, 55, 52, 63, 51,  8,  3,  1, 38, 51, 48,  1, 66, 58, 61, 55, 47,  1,
        52, 62,  1, 46, 51, 44, 57, 50, 48, 47, 16,  1, 27,  1, 49, 48, 48, 55,
         1, 52, 63,  1, 52, 57,  1, 63, 51, 48,  1, 66, 44, 63, 48, 61,  6,  1,
        27,  1, 49, 48, 48, 55,  1, 52, 63,  1, 52, 57,  1, 63, 51, 48,  1, 48,
        44, 61, 63, 51,  6,  1, 27,  1, 62, 56, 48, 55, 55,  1, 52, 63,  1, 52,
        57,  1, 63, 51, 48,  1, 44, 52, 61,  8,  8,  8, 31, 64, 46, 51,  1, 63,
        51, 44, 63,  1, 58, 57, 46, 48,  1, 66, 44, 62,  1, 52, 62,  1, 55, 58,
        

In [8]:
# Split the token stream by position: the first 90% is used for training and
# the remaining tail is held out for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [9]:
# Context length used in the walkthrough below.
block_size = 8
# Show the first block_size + 1 token ids of the training split.
train_data[0:block_size + 1]

tensor([25, 19, 30, 19, 22, 36, 27, 23, 30])

In [10]:
# One window of block_size + 1 tokens yields block_size training examples:
# every prefix of `x` is a context and the matching entry of `y` is its target.
x = train_data[:block_size]          # input sequence
y = train_data[1:block_size + 1]     # target sequence (inputs shifted by one)
for t in range(block_size):
    context_ids = x[:t + 1].tolist()
    target_id = y[t].item()
    print(f"when input is {context_ids} the target is {target_id} ({itos[target_id]})")

when input is [25] the target is 19 (A)
when input is [25, 19] the target is 30 (L)
when input is [25, 19, 30] the target is 19 (A)
when input is [25, 19, 30, 19] the target is 22 (D)
when input is [25, 19, 30, 19, 22] the target is 36 (R)
when input is [25, 19, 30, 19, 22, 36] the target is 27 (I)
when input is [25, 19, 30, 19, 22, 36, 27] the target is 23 (E)
when input is [25, 19, 30, 19, 22, 36, 27, 23] the target is 30 (L)


In [11]:
# Batch geometry read by get_batch below.
block_size = 8  # maximum context length for predictions
batch_size = 4  # number of independent sequences processed in parallel
# Seed torch's global RNG so the sampled batches are repeatable.
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random mini-batch of input/target windows from one split.

    Reads the module-level `train_data`, `val_data`, `block_size` and
    `batch_size`.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two stacked tensors of shape (batch_size, block_size), where
        each row of `y` is the corresponding row of `x` shifted one position
        to the right in the source data.
    """
    source = val_data if split != 'train' else train_data
    # Start offsets are drawn so that the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show inputs and targets side by side.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Spell out every (context -> target) example contained in the batch.
for b in range(batch_size):
    row_inputs, row_targets = xb[b].tolist(), yb[b].tolist()
    for t in range(block_size):
        context = row_inputs[:t + 1]
        target = row_targets[t]
        print(f"b{b} t{t}: when input is {context} the target is {target} ({itos[target]})")

inputs:
torch.Size([4, 8])
tensor([[61, 58, 62, 62,  1, 63, 51, 48],
        [ 6,  1, 47, 58, 57,  4, 63,  1],
        [ 1, 56, 68,  1, 62, 48, 61, 65],
        [50,  1, 44, 45, 58, 64, 63,  6]])
targets:
torch.Size([4, 8])
tensor([[58, 62, 62,  1, 63, 51, 48,  1],
        [ 1, 47, 58, 57,  4, 63,  1, 68],
        [56, 68,  1, 62, 48, 61, 65, 52],
        [ 1, 44, 45, 58, 64, 63,  6,  1]])
----
b0 t0: when input is [61] the target is 58 (o)
b0 t1: when input is [61, 58] the target is 62 (s)
b0 t2: when input is [61, 58, 62] the target is 62 (s)
b0 t3: when input is [61, 58, 62, 62] the target is 1 ( )
b0 t4: when input is [61, 58, 62, 62, 1] the target is 63 (t)
b0 t5: when input is [61, 58, 62, 62, 1, 63] the target is 51 (h)
b0 t6: when input is [61, 58, 62, 62, 1, 63, 51] the target is 48 (e)
b0 t7: when input is [61, 58, 62, 62, 1, 63, 51, 48] the target is 1 ( )
b1 t0: when input is [6] the target is 1 ( )
b1 t1: when input is [6, 1] the target is 47 (d)
b1 t2: when input is [6, 1

In [14]:
import torch.nn as nn
import torch.nn.functional as F 
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    Each token id looks up one row of a (vocab_size, vocab_size) table and that
    row is used directly as the logits over the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional token ids with B*T elements in total.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is returned flattened to
            (B*T, C) and loss is the scalar cross-entropy against targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects the class dimension second, so collapse
            # batch and time into one axis.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoid building a graph per step
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens after idx.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: number of sampling steps to run.

        Returns:
            Tensor of shape (B, T + max_new_tokens): idx followed by the
            sampled continuation.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # keep only the last time step: (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            next_idx = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, next_idx), dim=1)  # (B, T+1)
        return idx
    
# Build the model and run one forward pass on the current batch.
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print("logits shape:", logits.shape)
print("loss:", loss)

# Sample 100 tokens from the model, starting from a single token with id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = model.generate(start_idx, max_new_tokens=100)[0].tolist()
print(decode(sampled_ids))

logits shape: torch.Size([32, 71])
loss: tensor(4.7567, grad_fn=<NllLossBackward0>)

M.3je�U3pKjIizd4vQ6-
'U3;g3iu7x66Vm9.zPHPHhr?nwAojM6AgHS
eHSbGvNKP2 926cYYv�OCFbf6QiI7tlcg:GWajg;(1v


In [15]:
# AdamW optimizer over all of the model's parameters, learning rate 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [None]:
# Train on larger batches than the demo above; get_batch reads this global.
batch_size = 32
for steps in range(10000):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a fresh random training batch.
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 1000 steps.
    if not steps % 1000:
        print(f"step {steps}: loss {loss.item()}")




step 0: loss 4.674966335296631
step 1000: loss 3.800117254257202
step 2000: loss 3.1154112815856934
step 3000: loss 2.7966458797454834
step 4000: loss 2.5169951915740967
step 5000: loss 2.5751914978027344
step 6000: loss 2.397747755050659
step 7000: loss 2.405007839202881
step 8000: loss 2.475538969039917
step 9000: loss 2.445568561553955

Hoooull ams ts, ston's..
ANDO: Giticathin loulucou.
GODO: iest coll AGou. thir bashese'myoreed, andoourou.
FR: bemer ngrit makese dellallftel ishagon.
PPiselit t de soviry reaingin tsteo har ge t Hin t ay METERAN: thed, you woincaserin!
GANDonere at Thawomouthetoureld burse iave he... acke ol.
LFROL: heathed GAM: istr DODAMys was.. B99z: s. Be stherod, Smp r s fid t .
Ser owhum s Yof cas as irse lo ins f KM: DEve F: fomes wn bithe its Lodinderse Thishe otqutsis owill th ma INDUMEALAng Alfomothe 


In [17]:
# Sample 500 tokens from the model, starting from a single token with id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = model.generate(start_idx, max_new_tokens=500)
print(decode(sampled[0].tolist()))


LBONorig s.
GOMyoun.Bo..
BO: touk!
ANDRORSANDALRAM: ondolplo llipss al2DAGon t.. wisthe Thiners RIm fe Mou - I wimowellat DO: t t cad LFRERIRO: M outerotrod r..
GO: is por he ve me- fle, hawining inthe s st sn tre we h pllowaly t hushald tir cabyoo'rreande t tan. hafe en Yo.
GA r Hilleauten actimomNDAGorodelot. Wha tored bor PPI bororis were be..tr indes F: IRItou NDELFRellokells ed wilseg fel o?
F
Touse rrey sicrikilobond the big Bllve IDOr. he fanghiss ganend.... f qThayt Mo oreinthe Ho inowow


In [None]:
# consider the following toy example:
# averaging every position with all earlier positions via one matrix multiply.

torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
# Random floats laid out as (B, T, C). Time has to be the second-to-last axis
# so that the (T, T) averaging matrix below can left-multiply each batch
# element; a (B, C, T) layout makes `weights @ x` fail with a shape mismatch.
x = torch.randn(B, T, C)

# Lower-triangular mask: row t may only look at columns 0..t.
tril = torch.tril(torch.ones(T, T))
weights = torch.zeros((T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))
# Softmax over equal scores turns each row into a uniform average over the
# positions that were not masked out.
weights = F.softmax(weights, dim=-1)
out = weights @ x  # (T, T) @ (B, T, C) -> (B, T, C)

out.shape

In [None]:
# We want xbow[b, t] = mean_{i <= t} x[b, i]
# (assumes x is laid out as (B, T, C) -- TODO confirm against the cell that defines x)
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # the first t+1 rows of x[b]
        xbow[b, t] = xprev.mean(dim=0)



In [12]:
# consider the following toy example:
import torch
import torch.nn.functional as F
import torch.nn as nn

torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)  # (B, T, C) random float features

# A single head of self-attention.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Project every position into key, query and value vectors.
k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Affinity of every query with every key.
# NOTE(review): the scores are not scaled by head_size ** -0.5 here.
weights = torch.matmul(q, k.transpose(-2, -1))  # (B, T, T)

# Mask out later positions, then normalise each row into a distribution.
tril = torch.tril(torch.ones(T, T))
weights = F.softmax(weights.masked_fill(tril == 0, float('-inf')), dim=-1)

# Each output row is the attention-weighted sum of the value vectors.
out = torch.matmul(weights, v)  # (B, T, head_size)

out.shape

torch.Size([4, 8, 16])

In [13]:
# Display the attention weights: future positions were filled with -inf before
# the softmax over the last dimension, so each row sums to 1 over positions <= t.
weights

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089