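# Building a GPT

Trains a small character-level GPT on the Tiny Shakespeare dataset and samples text from the model before and after training.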

In [1]:
from time import time

import requests
import torch

from src.data import Data, Vocabulary
from src.gpt import GPTConfig, GPTModel
from src.utils import count_parameters, estimate_loss, get_tinyshakespeare_dataset

# Data preparation

In [2]:
# Load the raw Tiny Shakespeare corpus through the project helper.
# Presumably returns a single str (later cells take len() of it and slice it) — TODO confirm.
text = get_tinyshakespeare_dataset()

In [3]:
# Report how long the loaded corpus is.
num_characters = len(text)
print("length of dataset in characters: ", num_characters)

length of dataset in characters:  1115394


In [4]:
# Peek at the start of the corpus (first 1000 characters) to see how it is laid out.
preview = text[:1000]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.


In [5]:
# Build the vocabulary from the corpus, then show every symbol it maps
# (the keys of `stoi`) followed by the vocabulary size.
vocab = Vocabulary(text=text)
unique_chars = ''.join(vocab.stoi.keys())
print(unique_chars)
print(vocab.size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# Round-trip sanity check: show the encoded form of a sample string, then
# decode that encoding again to confirm the original string comes back.
sample = "hii there"
print(vocab.encode(sample))
print(vocab.decode(vocab.encode(sample)))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
# Encode the whole corpus and wrap it in Data; then inspect the shape and
# dtype of the training split it exposes as `train_data`.
token_ids = vocab.encode(text)
data = Data(token_ids)
train_split = data.train_data
print(train_split.shape, train_split.dtype)

torch.Size([892315]) torch.int64


# Training

In [8]:
# Hyperparameters and run configuration: everything a reader might want to tune.

# Fixed seed so that Restart & Run All reproduces the same losses and samples.
seed = 1337
torch.manual_seed(seed)  # seeds PyTorch's global random number generator
# NOTE(review): if Data.get_batch draws its indices from a non-torch RNG
# (random / numpy), that generator needs seeding too — TODO confirm.

batch_size = 32  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?
# range(10001) ends at step index 10000, which is a multiple of eval_interval,
# so the training loop also reports losses on its very last iteration.
max_iters = 10001
eval_interval = 1000  # evaluate and report losses every this many steps
learning_rate = 1e-3  # learning rate handed to the optimizer
# NOTE(review): `device` is computed here, but no visible cell moves the model
# or the batches onto it — confirm whether src.data / src.gpt handle placement.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200  # forwarded to estimate_loss as `eval_iters`
emb_size = 32
num_heads = 4
num_layers = 3
dropout = 0.2

In [9]:
# Assemble the model configuration from the hyperparameters and build the model.
config = GPTConfig(
    vocab_size=vocab.size,
    block_size=block_size,
    emb_size=emb_size,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
)
model = GPTModel(config=config)
print(f'models has {count_parameters(model)} parameters')

# Baseline sample from the untrained model, starting from a single token with id 0.
# Sampling is done in eval mode and without autograd so that dropout (if the
# model applies it in train mode) does not perturb generation and no graph is
# built. Assumes GPTModel is a torch.nn.Module — TODO confirm.
was_training = model.training
model.eval()
with torch.no_grad():
    start_tokens = torch.zeros((1, 1), dtype=torch.long)
    generated = model.generate(start_tokens=start_tokens, max_new_tokens=100)
model.train(was_training)  # restore the previous mode for the training cells
print(vocab.decode(generated[0].tolist()))

models has 42369 parameters

Eg?cAKZh&RgRtuYjm3ySZQi:FRTXRckiVzvUKzNAkx!EVNcqf.vBv?ZRpQW:YxQJiYk.v m dd UR:;UbRGF,&'D!,YT$YeFWSRq


In [10]:
# Optimizer: AdamW over all of the model's parameters at the configured learning rate.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=learning_rate,
)

In [11]:
# Training loop: one optimizer step per iteration, with a loss report every
# `eval_interval` steps. The reported "training time" is wall-clock time since
# the loop started, so it includes the time spent inside estimate_loss.
timer = time()
# Put the model in train mode explicitly instead of relying on whatever mode an
# earlier cell left it in. Assumes GPTModel is a torch.nn.Module — TODO confirm.
model.train()
for steps in range(max_iters):

    # sample a batch of data
    xb, yb = data.get_batch('train', block_size=block_size, batch_size=batch_size)

    # forward pass, then backpropagate with gradient-norm clipping
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    if steps % eval_interval == 0:
        losses = estimate_loss(
            eval_iters=eval_iters, model=model, data=data, block_size=block_size, batch_size=batch_size
        )
        print(
            f"step: {steps + 1}; training time: {round(time() - timer)} sec; train loss: {losses['train']:.4f}; val loss: {losses['val']:.4f}"
        )
        # estimate_loss is not visible here and may leave the model in eval
        # mode; re-assert train mode before the next optimisation step.
        model.train()

step: 1; training time: 2 sec; train loss: 4.3260; val loss: 4.3328
step: 1001; training time: 24 sec; train loss: 2.3468; val loss: 2.3450
step: 2001; training time: 47 sec; train loss: 2.2142; val loss: 2.2592
step: 3001; training time: 70 sec; train loss: 2.1532; val loss: 2.2005
step: 4001; training time: 92 sec; train loss: 2.1034; val loss: 2.1526
step: 5001; training time: 114 sec; train loss: 2.0845; val loss: 2.1482
step: 6001; training time: 137 sec; train loss: 2.0736; val loss: 2.1401
step: 7001; training time: 159 sec; train loss: 2.0439; val loss: 2.1016
step: 8001; training time: 181 sec; train loss: 2.0162; val loss: 2.0986
step: 9001; training time: 204 sec; train loss: 2.0154; val loss: 2.0878
step: 10001; training time: 226 sec; train loss: 2.0129; val loss: 2.0915


In [12]:
# Sample 500 new tokens from the trained model, starting from a single token with id 0.
# Generation runs in eval mode and without autograd so that dropout (if the
# model applies it in train mode) does not affect sampling and no graph is
# built. Assumes GPTModel is a torch.nn.Module — TODO confirm.
was_training = model.training
model.eval()
with torch.no_grad():
    start_tokens = torch.zeros((1, 1), dtype=torch.long)
    generated = model.generate(start_tokens=start_tokens, max_new_tokens=500)
model.train(was_training)  # leave the model in the mode it was found in
print(vocab.decode(generated[0].tolist()))


And hoime, will; feas,
What bewat sest be neicle morge his
lowelf apportyal the;
As the deice.

QUCASdriedis uplive,
Whit is mor youghen exetiis
Till show! I arpea! conows?
So for ond, on is Twereight honge, duction. Tare can on himers.
What thy engy may as veand. I grearbe at,--fher, me net, us! Veremenew ong bir I kname,
Cord.
Shill himghty
he and stis that likty hing?

Mosed thrathal you digh peat const
More weareny'd now lose plence array,
Mitsed me kBRingt you there you.

Sir thewer, earry'
