# Scratch GPT Tutorial
Following along to Andrej Karpathy's GPT from scratch video. I am using the ASV Bible instead of Shakespeare for my data just for fun.

In [4]:
!wget https://openbible.com/textfiles/asv.txt

--2023-08-06 06:57:35--  https://openbible.com/textfiles/asv.txt
Resolving openbible.com (openbible.com)... 74.63.245.138
Connecting to openbible.com (openbible.com)|74.63.245.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4568028 (4.4M) [text/plain]
Saving to: ‘asv.txt’


2023-08-06 06:57:36 (14.4 MB/s) - ‘asv.txt’ saved [4568028/4568028]



In [5]:
with open('asv.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [6]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  4563817


In [7]:
print(text[:1000])

﻿ASV
American Standard Version
Genesis 1:1	In the beginning God created the heavens and the earth.
Genesis 1:2	And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.
Genesis 1:3	And God said, Let there be light: and there was light.
Genesis 1:4	And God saw the light, that it was good: and God divided the light from the darkness.
Genesis 1:5	And God called the light Day, and the darkness he called Night. And there was evening and there was morning, one day.
Genesis 1:6	And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
Genesis 1:7	And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so.
Genesis 1:8	And God called the firmament Heaven. And there was evening and there was morning, a second day.
Genesis 1:9	And God said, Let the waters under the heavens be gathere

In [8]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 !(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]abcdefghijklmnopqrstuvwxyzÆæ—’﻿
80


In [9]:
stoi = { ch:i for i,ch in enumerate(chars) } # mappings
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encodings
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hello there"))
print(decode(encode("hello there")))

[56, 53, 60, 60, 63, 2, 68, 56, 53, 66, 53]
hello there


In [10]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([4563817]) torch.int64
tensor([79, 22, 40, 43,  1, 22, 61, 53, 66, 57, 51, 49, 62,  2, 40, 68, 49, 62,
        52, 49, 66, 52,  2, 43, 53, 66, 67, 57, 63, 62,  1, 28, 53, 62, 53, 67,
        57, 67,  2, 10, 19, 10,  0, 30, 62,  2, 68, 56, 53,  2, 50, 53, 55, 57,
        62, 62, 57, 62, 55,  2, 28, 63, 52,  2, 51, 66, 53, 49, 68, 53, 52,  2,
        68, 56, 53,  2, 56, 53, 49, 70, 53, 62, 67,  2, 49, 62, 52,  2, 68, 56,
        53,  2, 53, 49, 66, 68, 56,  8,  1, 28, 53, 62, 53, 67, 57, 67,  2, 10,
        19, 11,  0, 22, 62, 52,  2, 68, 56, 53,  2, 53, 49, 66, 68, 56,  2, 71,
        49, 67,  2, 71, 49, 67, 68, 53,  2, 49, 62, 52,  2, 70, 63, 57, 52, 20,
         2, 49, 62, 52,  2, 52, 49, 66, 59, 62, 53, 67, 67,  2, 71, 49, 67,  2,
        69, 64, 63, 62,  2, 68, 56, 53,  2, 54, 49, 51, 53,  2, 63, 54,  2, 68,
        56, 53,  2, 52, 53, 53, 64, 19,  2, 49, 62, 52,  2, 68, 56, 53,  2, 40,
        64, 57, 66, 57, 68,  2, 63, 54,  2, 28, 63, 52,  2, 61, 63, 70, 53, 52,
      

In [11]:
# partitioning data to prevent model from becoming an exact memorization of input data
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [12]:
block_size = 8
train_data[:block_size+1]

tensor([79, 22, 40, 43,  1, 22, 61, 53, 66])

In [13]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context} the target: {target}")

when input is tensor([79]) the target: 22
when input is tensor([79, 22]) the target: 40
when input is tensor([79, 22, 40]) the target: 43
when input is tensor([79, 22, 40, 43]) the target: 1
when input is tensor([79, 22, 40, 43,  1]) the target: 22
when input is tensor([79, 22, 40, 43,  1, 22]) the target: 61
when input is tensor([79, 22, 40, 43,  1, 22, 61]) the target: 53
when input is tensor([79, 22, 40, 43,  1, 22, 61, 53]) the target: 66


In [14]:
batch_size = 4 # number of parallel sequences processed
block_size = 8 # max context length for predictions

def get_batch(split): # creates batch of input data (x) and target data (y)
  data = train_data if split == "train" else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x,y

xb,yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
  for t in range(block_size): # time dimension
    context = xb[b, :t+1]
    target = yb[b,t]
    print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 6,  2, 44, 53,  2, 56, 49, 70],
        [62, 52,  2, 51, 56, 49, 66, 55],
        [ 2, 49, 60, 60,  2, 68, 56, 53],
        [63,  2, 68, 56, 53,  2, 71, 49]])
targets:
torch.Size([4, 8])
tensor([[ 2, 44, 53,  2, 56, 49, 70, 53],
        [52,  2, 51, 56, 49, 66, 55, 53],
        [49, 60, 60,  2, 68, 56, 53, 67],
        [ 2, 68, 56, 53,  2, 71, 49, 60]])
----
when input is [6] the target: 2
when input is [6, 2] the target: 44
when input is [6, 2, 44] the target: 53
when input is [6, 2, 44, 53] the target: 2
when input is [6, 2, 44, 53, 2] the target: 56
when input is [6, 2, 44, 53, 2, 56] the target: 49
when input is [6, 2, 44, 53, 2, 56, 49] the target: 70
when input is [6, 2, 44, 53, 2, 56, 49, 70] the target: 53
when input is [62] the target: 52
when input is [62, 52] the target: 2
when input is [62, 52, 2] the target: 51
when input is [62, 52, 2, 51] the target: 56
when input is [62, 52, 2, 51, 56] the target: 49
when input is [62, 52, 2, 51, 56,

In [15]:
print(xb) # this is the input batch

tensor([[ 6,  2, 44, 53,  2, 56, 49, 70],
        [62, 52,  2, 51, 56, 49, 66, 55],
        [ 2, 49, 60, 60,  2, 68, 56, 53],
        [63,  2, 68, 56, 53,  2, 71, 49]])


In [16]:
# starting with a very simple language model, the bigram language model
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # look up table mapping tokens to the logits (probabilities) for the next token

  def forward(self, idx, targets=None): # ends up getting called when the object is called
    # idx and targets are 2-dimensional (B,T) tensors
    logits = self.token_embedding_table(idx) # look up the logits for the next token, dimensions B,T,C (batch_size,time,channels/vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # reshape logits to 2-dimensions
      targets = targets.view(B*T) # reshape targets to 1-dimension
      loss = F.cross_entropy(logits, targets) # the quality of the predictions, expects C to be second dimension if multidimensional tensor

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is 2-dimensional (B,T) in this context
    for _ in range(max_new_tokens):
      logits, loss = self(idx) # get predictions
      logits = logits[:, -1, :] # look only at the last time step, becomes (B,C)
      probs = F.softmax(logits, dim=-1) # get probabilities
      idx_next = torch.multinomial(probs, num_samples=1) # sample the distribution for a single prediction in the time dimension in each batch dimension, (B,1)
      idx = torch.cat((idx, idx_next), dim=1) # append sample to the sequence, becomes (B,T+1)
    return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss) # we expect loss of -ln(1/vocab_size) for decent performance

idx = torch.zeros((1,1), dtype=torch.long) # creates a 1x1 tensor containing a zero, which I think is representinga tab character
out = m.generate(idx, max_new_tokens=100)[0].tolist() # generates 100 tokens based on that
print(decode(out)) # print the decoded generation, will be garbage because the model is completely random, not trained yet

torch.Size([32, 80])
tensor(5.0331, grad_fn=<NllLossBackward0>)
	xN
J;AKGfR[;bfCUiMI:hixAE﻿ey)EYlHuf9)Qe)ARi;2Akp[f9V’(.	.qxb-dG?M!.D  0qE65, FGb73yQ2pdVyQ5LuNæ7guIL


In [17]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [18]:
batch_size = 32 # try using a bigger batch size
for steps in range(10000): # a typical training loop

  xb, yb = get_batch('train') # sample new batch of data

  logits, loss = m(xb,yb) # evaluate loss
  optimizer.zero_grad(set_to_none=True) # zero out the gradients from previous steps
  loss.backward() # getting gradients for parameters
  optimizer.step() # use gradients to update parameters

print(loss.item())

2.343282461166382


In [19]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist())) # generation should be better after training, still poor because the model is only looking at one token for each prediction

	An hiqY—Haveanvak Mo th thesthanut ses tre Je e tojze ng, And h,0:2	y orofrdesssr Isal imus, whe trof mer wad m msacage h ansasuseandnooors bd, feshenn tovofo jupthethinthrhed f cortheved my this gas h mary makin o of l toganacche ul ld almin s acle aho w as he Sherd ck [Æ2:4	Ace hed h lllan And? on themy m onsreKe Uze s; hane, th.
Mall por, o o t ouigothouthr h kint an here f wn al:109	Bent 18:15:2:27	Telaidrokes:1 Treusthesshrithin t t)5	Is crel hesath hathe mblovexupererd caeo e ts hor di, in


In [20]:
import torch
import torch.nn as nn
from torch.nn import functional as F

batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

with open('asv.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = { ch:i for i,ch in enumerate(chars) } # mappings
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encodings
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split): # creates batch of input data (x) and target data (y)
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # look up table mapping tokens to the logits (probabilities) for the next token

    def forward(self, idx, targets=None): # ends up getting called when the object is called
        # idx and targets are 2-dimensional (B,T) tensors
        logits = self.token_embedding_table(idx) # look up the logits for the next token, dimensions B,T,C (batch_size,time,channels/vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # reshape logits to 2-dimensions
            targets = targets.view(B*T) # reshape targets to 1-dimension
            loss = F.cross_entropy(logits, targets) # the quality of the predictions, expects C to be second dimension if multidimensional tensor

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is 2-dimensional (B,T) in this context
        for _ in range(max_new_tokens):
            logits, loss = self(idx) # get predictions
            logits = logits[:, -1, :] # look only at the last time step, becomes (B,C)
            probs = F.softmax(logits, dim=-1) # get probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # sample the distribution for a single prediction in the time dimension in each batch dimension, (B,1)
            idx = torch.cat((idx, idx_next), dim=1) # append sample to the sequence, becomes (B,T+1)
        return idx


model = BigramLanguageModel(vocab_size)
m = model.to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

for iter in range(max_iters): # a typical training loop

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train') # sample new batch of data

    logits, loss = m(xb,yb) # evaluate loss
    optimizer.zero_grad(set_to_none=True) # zero out the gradients from previous steps
    loss.backward() # getting gradients for parameters
    optimizer.step() # use gradients to update parameters

print(loss.item())

print(decode(m.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


step 0: train loss 4.9292, val loss 4.9271
step 300: train loss 2.7475, val loss 2.7801
step 600: train loss 2.4281, val loss 2.4829
step 900: train loss 2.3742, val loss 2.4402
step 1200: train loss 2.3490, val loss 2.4147
step 1500: train loss 2.3315, val loss 2.4015
step 1800: train loss 2.3473, val loss 2.3975
step 2100: train loss 2.3339, val loss 2.4046
step 2400: train loss 2.3359, val loss 2.4077
step 2700: train loss 2.3354, val loss 2.3956
2.2291178703308105
	Angothofof 1:4:36	Thed tuthig.
Je 14	Andns 2238	Se kiangat ars ar ayis, tches s, Je thang for sonerees 7	Jeret Ne woncor eid mocakerndncus t An 23: unid han gothevag aruch.
I to Gis oumin nyee it tinde tha o f amppon atherigheany, th am spwhe the avestowilaton 4	An ure:7:
Je trigoganclet sun m me h omeeses th f udel of An t ghee heserw anof I thwaeas, we mathay: it al he kof ubachentanlve 6:17	An ousea urey Se Ps 2:10	Ifinsthorehe kesheahe Ler t May, oht, as wias, ton he d thend alele th, an s J


# Self Attention

In [None]:
# toy example
torch.manual_seed(1337)
B,T,C = 4,8,2 # Batch, Time, Channels
x = torch.randn(B,T,C)
x.shape
