v0: 
- Bigram, shakespeare char-based GPT

Source:
- https://youtu.be/kCc8FmEb1nY
- https://github.com/karpathy/ng-video-lecture/blob/master/bigram.py

In [1]:
import os; os.chdir('..')
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

import matplotlib.pyplot as plt 

from utils import *
from boring_utils.utils import init_graph, set_seed, get_device, cprint, tprint
from utils import add_to_class

In [2]:
# One-time notebook setup. NOTE(review): init_graph, set_seed and get_device are
# imported from boring_utils and their bodies are not visible here — confirm details there.
init_graph()  # presumably configures plotting defaults — TODO confirm
set_seed(1337)  # fixed seed so that sampled batches / generations can be repeated
device = get_device()  # used further down when the model is moved with m.to(device)

# Encode and Decode Function

In [3]:
input_file_path = './data/shakespeare_char/input.txt'

# Read the whole corpus as text. The encoding is given explicitly so the set of
# characters (and therefore the vocabulary) does not depend on the platform's default locale.
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text (sorted -> stable ids)
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create the two lookup tables: character -> integer id, and integer id -> character
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))
print("stoi:", stoi)
print("itos:", itos)

def encode(s):
    """Encoder: take a string, output the list of integer ids of its characters.

    Each character is looked up in the module-level `stoi` table; a character
    that is not a key of `stoi` raises KeyError.
    """
    token_ids = []
    for ch in s:
        token_ids.append(stoi[ch])
    return token_ids

def decode(l):
    """Decoder: take an iterable of integer ids, output the corresponding string.

    Each id is looked up in the module-level `itos` table and the resulting
    characters are concatenated; an id that is not a key of `itos` raises KeyError.
    """
    return ''.join(map(itos.__getitem__, l))

length of dataset in characters: 1,115,394
all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65
stoi: {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
itos: {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q'

In [4]:
def test_decode(net, max_new_tokens=100):
    """Sample from `net` starting from a single token with id 0 and print the result.

    Prints the raw tensor of generated ids, then the decoded text.

    Args:
        net: model exposing `generate(idx, max_new_tokens=...)` and `parameters()`.
        max_new_tokens: number of tokens to sample after the start token.
    """
    # Build the start token on the same device as the model's weights; a CPU
    # tensor fed to a model that was moved to an accelerator raises a device mismatch.
    first_param = next(net.parameters(), None)
    start_device = first_param.device if first_param is not None else None

    # id 0 is '\n' in the `itos` table printed above
    start = torch.zeros((1, 1), dtype=torch.long, device=start_device)
    gen = net.generate(start, max_new_tokens=max_new_tokens)
    print(gen)
    print(decode(gen[0].tolist()))

In [5]:
train_bin_path = './data/shakespeare_char/train.bin'
val_bin_path = './data/shakespeare_char/val.bin'

# Alternative kept for reference: encode the raw text instead of loading the .bin files.
# train_tensor = torch.tensor(encode(data), dtype=torch.long) # convert to tensor

# load the binary data: each file is read as a flat array of uint16 token ids
train_data, val_data = (
    np.fromfile(bin_path, dtype=np.uint16)
    for bin_path in (train_bin_path, val_bin_path)
)

# convert to pytorch tensors, widening to int64 first
# (torch.long is just an alias for torch.int64)
train_tensor, val_tensor = (
    torch.from_numpy(split_ids.astype(np.int64))
    for split_ids in (train_data, val_data)
)

for split_tensor in (train_tensor, val_tensor):
    print(split_tensor.shape, split_tensor.dtype)


torch.Size([1003854]) torch.int64
torch.Size([111540]) torch.int64


# Data Preview

## batch_size = 1

The context grows by one token at each time step `t`, until it reaches `block_size` (the context window size).

In [8]:
block_size = 8
cprint(train_data[:block_size])

# inputs and their targets: y is the same window as x, shifted 1 char ahead
x, y = train_data[:block_size], train_data[1:block_size+1]

tprint('Decoding each time step')
for t in range(block_size):
    # everything up to and including position t is the context; y[t] is what follows it
    context, target = x[:t+1], y[t]
    decoded_context = decode(context)
    decoded_target = decode([target])
    print(f"t={t}, context={context}, decode_c={decoded_context}, target={target}, decode_t={decoded_target}")


[93m<module> -> train_data[:block_size]:[0m
array([18, 47, 56, 57, 58,  1, 15, 47], dtype=uint16)

t=0, context=[18], decode_c=F, target=47, decode_t=i
t=1, context=[18 47], decode_c=Fi, target=56, decode_t=r
t=2, context=[18 47 56], decode_c=Fir, target=57, decode_t=s
t=3, context=[18 47 56 57], decode_c=Firs, target=58, decode_t=t
t=4, context=[18 47 56 57 58], decode_c=First, target=1, decode_t= 
t=5, context=[18 47 56 57 58  1], decode_c=First , target=15, decode_t=C
t=6, context=[18 47 56 57 58  1 15], decode_c=First C, target=47, decode_t=i
t=7, context=[18 47 56 57 58  1 15 47], decode_c=First Ci, target=58, decode_t=t


## batch_size != 1

check batch.py

In [10]:
batch_size = 4  # how many independent sequences to train on in parallel
block_size = 8  # what is the maximum context length for predictions

def get_batch(split, random_sample=True, verbose=True):
    """Build one (inputs, targets) batch from the train or validation token array.

    Reads the module-level `batch_size`, `block_size`, `train_data`, `val_data`
    and `device` at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.
        random_sample: if True, window start offsets are drawn at random;
            if False, the offsets are 0, 1, ..., batch_size - 1.
        verbose: if True (default, the original behavior), print the chosen
            offsets with `cprint`. Pass False inside long loops to avoid
            printing once per call.

    Returns:
        (x, y): two int64 tensors of shape (batch_size, block_size) on `device`,
        where y holds the same windows as x shifted one position ahead.
    """
    tokens = train_data if split == 'train' else val_data

    if not random_sample:
        ix = torch.arange(batch_size)
    else:
        # NOTE: randint's upper bound is exclusive, so the largest start offset is
        # len(tokens) - block_size - 1 and the shifted target window still fits.
        ix = torch.randint(
            len(tokens) - block_size,
            (batch_size,)  # we sample "batch_size" random indices
        )
    if verbose:
        cprint(ix)

    starts = ix.tolist()
    x = torch.stack([torch.from_numpy((tokens[i:i+block_size]).astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy((tokens[i+1:i+1+block_size]).astype(np.int64)) for i in starts])

    # The model is moved with m.to(device); batches must live on the same device.
    return x.to(device), y.to(device)


tprint('Batch Preview (no random sampling)')
xb, yb = get_batch('train', random_sample=False)

# Show shape, dtype and contents of the inputs first, then of the targets.
for _title, _batch in (('xb', xb), ('yb', yb)):
    tprint(_title)
    print(_batch.shape, _batch.dtype)
    print(_batch)


[93mget_batch -> ix:[0m
tensor([0, 1, 2, 3])

torch.Size([4, 8]) torch.int64
tensor([[18, 47, 56, 57, 58,  1, 15, 47],
        [47, 56, 57, 58,  1, 15, 47, 58],
        [56, 57, 58,  1, 15, 47, 58, 47],
        [57, 58,  1, 15, 47, 58, 47, 64]])

torch.Size([4, 8]) torch.int64
tensor([[47, 56, 57, 58,  1, 15, 47, 58],
        [56, 57, 58,  1, 15, 47, 58, 47],
        [57, 58,  1, 15, 47, 58, 47, 64],
        [58,  1, 15, 47, 58, 47, 64, 43]])


In [12]:
tprint('Batch decoding each time step')
for b in range(batch_size):
    tprint(f'batch {b}', sep='-')
    # one row of inputs and the matching row of targets
    inputs_b, targets_b = xb[b], yb[b]
    for t in range(block_size):
        context = inputs_b[:t+1]
        target = targets_b[t]
        print(f"b={b}, t={t}, context={context}, target={target}")
        decoded_context = decode(context.tolist())
        decoded_target = decode([target.item()])
        print(f"decode_c={decoded_context}, decode_t={decoded_target}")



-------------------- <module> -> batch 0 --------------------
b=0, t=0, context=tensor([18]), target=47
decode_c=F, decode_t=i
b=0, t=1, context=tensor([18, 47]), target=56
decode_c=Fi, decode_t=r
b=0, t=2, context=tensor([18, 47, 56]), target=57
decode_c=Fir, decode_t=s
b=0, t=3, context=tensor([18, 47, 56, 57]), target=58
decode_c=Firs, decode_t=t
b=0, t=4, context=tensor([18, 47, 56, 57, 58]), target=1
decode_c=First, decode_t= 
b=0, t=5, context=tensor([18, 47, 56, 57, 58,  1]), target=15
decode_c=First , decode_t=C
b=0, t=6, context=tensor([18, 47, 56, 57, 58,  1, 15]), target=47
decode_c=First C, decode_t=i
b=0, t=7, context=tensor([18, 47, 56, 57, 58,  1, 15, 47]), target=58
decode_c=First Ci, decode_t=t


-------------------- <module> -> batch 1 --------------------
b=1, t=0, context=tensor([47]), target=56
decode_c=i, decode_t=r
b=1, t=1, context=tensor([47, 56]), target=57
decode_c=ir, decode_t=s
b=1, t=2, context=tensor([47, 56, 57]), target=58
decode_c=irs, decode_t=t
b=1

# NN - BigramLanguageModel

Note: unlike `F.one_hot`, `nn.Embedding` holds a learnable weight matrix of shape `(vocab_size, embedding_dim)`, which can also lead to better performance.

This design choice reflects the fact that this is a bigram model, which is a simple type of language model that only considers the immediately preceding token when predicting the next token. Because it's a bigram model, it doesn't need to consider more complex patterns over longer sequences of tokens, which is what you'd typically use an RNN or similar model for. Instead, it can use the embedding of the current token to predict the next token directly.

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    The only parameter is an embedding table of shape (vocab_size, vocab_size);
    row `i` of the table is used directly as the logits for the token that
    follows token `i`.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size, vocab_size) lookup table.

        Args:
            vocab_size: number of distinct token ids.
        """
        super().__init__()
        # The embedding output is used as logits over the vocabulary without any
        # further projection, so its width is set to vocab_size.
        embedding_dim = vocab_size
        # for example: token id 24 reads off row 24 of the embedding table
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T) = (batch_size, block_size).
            targets: optional integer token ids with the same number of elements as `idx`.

        Returns:
            (logits, loss). Without targets: logits of shape (B, T, C) and loss None.
            With targets: logits flattened to (B*T, C) and a scalar loss tensor.
        """
        logits = self.embedding(idx)  # B, T, C: (batch_size, block_size, embedding_dim)

        # identity check: `== None` would go through the tensor's overloaded __eq__
        if targets is None:
            return logits, None

        # F.cross_entropy takes the class dimension second, so collapse B and T into one axis
        B, T, C = logits.shape
        logits = logits.view(B*T, C)  # (batch_size * block_size, embedding_dim)
        targets = targets.view(-1)  # (batch_size * block_size)
        loss = F.cross_entropy(logits, targets)

        return logits, loss


`-ln(1/65) ≈ 4.174` is the baseline, or "no information", loss: the loss obtained by guessing uniformly among the 65 characters. Any trained model should do better than this.

A loss below this baseline means that the model's predictions are better than random guessing. A loss above it means the model's predictions are worse than random guessing.

Cross entropy loss = `-sum(y_i * log(p_i))` for all classes `i`. 
Where `y_i` is the true label (1 for the correct class and 0 for all other classes), and `p_i` is the predicted probability for class `i` (random selection `p_i` = 1/65).


In [None]:
# in prepare.py, we have:
# chars = sorted(list(set(data)))
# vocab_size = len(chars)

# vocab_size was computed from the corpus above (65 for this dataset), so use it
# rather than repeating the literal.
m = BigramLanguageModel(vocab_size)
m.to(device)
# the inputs must be on the same device as the model's weights
logits, loss = m(xb.to(device), yb.to(device))
print(logits.shape, logits.dtype)
print(loss)  # above -ln(1/65) ≈ 4.17, so currently this is worse than random guessing

torch.Size([32, 65]) torch.float32
tensor(4.2793, grad_fn=<NllLossBackward0>)


In [None]:
@add_to_class(BigramLanguageModel)
def generate(self, idx, max_new_tokens):
    """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

    Args:
        idx: integer token ids of shape (batch_size, T) used as the starting context.
        max_new_tokens: number of tokens to append to every row.

    Returns:
        Integer tensor of shape (batch_size, T + max_new_tokens).
    """
    # Sampling never calls backward(); without no_grad every forward pass would
    # record an autograd graph for nothing.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # get the predictions (no targets -> logits stay (B, T, C), loss is None)
            logits, _loss = self(idx)

            # focus only on the last time step
            logits = logits[:, -1, :]  # (batch_size, embedding_dim)
            probs = F.softmax(logits, dim=-1)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

            # append sampled idx to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, T + 1)

    return idx

In [None]:
# Sample 100 tokens (test_decode's default) from the model before any training step has run.
test_decode(m)

# Training the NN

[torch.optim.Optimizer.zero_grad — PyTorch 2.0 documentation](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html)

In [None]:
# AdamW over all parameters of m (for this model: the single embedding table), learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
# get_batch reads the module-level batch_size at call time, so this changes the
# size of every batch sampled from here on.
batch_size = 32
loss_list = []

for steps in range(20000):
    # sample a batch of data and make sure it sits on the same device as the model
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # drop the gradients of the previous step (set_to_none=True resets them to None
    # instead of filling them with zeros), then backpropagate and update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # keep one loss value per 200 steps for the plot
    if steps % 200 == 0:
        loss_list.append(loss.item())


In [None]:
fig = plt.figure(figsize=(20, 20))
# loss_list holds one value per 200 training steps, so the x axis counts logged
# checkpoints (not epochs) and the y axis is the loss.
plt.plot(np.arange(1, len(loss_list) + 1), loss_list, label="Train loss")
plt.xlabel("Logged checkpoint (every 200 training steps)")
plt.ylabel("Loss")
plt.legend(loc="upper right")

plt.show()

In [None]:
# Sample 200 tokens after the training loop above, to compare with the sample taken before training.
test_decode(m, 200)