## Building a GPT
<a href="https://colab.research.google.com/github/NikiforovG/gpt/blob/master/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Set to True when running on Google Colab: the setup cell below then installs
# einops, clones the repository and mounts Google Drive for the output folder.
COLAB = False

In [2]:
import os

if COLAB:
    if os.getcwd() != '/content/gpt/main':
        !pip install einops
        !git clone https://github.com/NikiforovG/gpt.git
        # !cd gpt
        # !git checkout gpt
        os.chdir('/content/gpt/main')

    from google.colab import drive

    drive.mount('/content/drive')
    folder = '/content/drive/MyDrive/Colab Notebooks/gpt/'
else:
    folder = './'

In [3]:
# Directory under `folder` that receives the saved training state and the
# generated sample files; created on first use.
weights_folder = os.path.join(folder, 'weights/')
os.makedirs(weights_folder, exist_ok=True)

In [4]:
from time import time

import torch

from src.data import Data, Vocabulary
from src.gpt import GPTConfig, GPTModel
from src.utils import (
    count_parameters,
    estimate_loss,
    get_tinyshakespeare_dataset,
    load_training_state,
    save_training_state,
    TrainingState,
)

In [5]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# Both branches are plain device strings, so a single torch.device() call suffices.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device:', device)

device: cpu


# Data preparation

In [6]:
# Load the Tiny Shakespeare corpus; the cells below treat `text` as one long
# sequence of characters (len(), slicing, vocabulary building).
text = get_tinyshakespeare_dataset()

In [7]:
# Size of the raw corpus, counted in characters.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [8]:
# Preview the first 1000 characters of the corpus as a quick sanity check.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.


In [9]:
# Build the vocabulary from the corpus, then show every unique character that
# occurs in the text (the keys of `vocab.stoi`) followed by the vocabulary size.
vocab = Vocabulary(text=text)
print(''.join(vocab.stoi.keys()))
print(vocab.size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [10]:
# Round-trip check: show the token ids for a short string, then decode those
# same ids back into text.
sample_ids = vocab.encode("hii there")
print(sample_ids)
print(vocab.decode(sample_ids))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [11]:
# Encode the entire text dataset into token ids and wrap it in `Data`, which
# exposes the training portion as a torch.Tensor (`train_data`) and is used
# later to draw batches.
data = Data(vocab.encode(text))
print(data.train_data.shape, data.train_data.dtype)

torch.Size([892315]) torch.int64


# Training

In [12]:
# True: resume from the training state stored in `weights_folder`.
# False: build a new model and optimizer from the hyperparameters in the next cell.
continue_training = False

In [13]:
if not continue_training:
    # Fresh run: no steps or training time have been accumulated yet.
    steps_done = 0
    training_time_done = 0

    # Model
    block_size = 8
    emb_size = 32
    num_heads = 4
    num_layers = 3
    dropout = 0.2

    # Optimizer
    learning_rate = 1e-3

    model_config = GPTConfig(
        vocab_size=vocab.size,
        block_size=block_size,
        emb_size=emb_size,
        num_heads=num_heads,
        num_layers=num_layers,
        dropout=dropout,
    )
    model = GPTModel(config=model_config).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
else:
    # Resume: restore config, model, optimizer state and progress counters
    # from the training state saved in `weights_folder`.
    training_state = load_training_state(weights_folder)
    model_config = training_state.model_config
    model = training_state.model.to(device)
    model.train()

    # The optimizer is created with default settings and then overwritten by
    # the saved state dict.
    optimizer = torch.optim.AdamW(model.parameters())
    optimizer.load_state_dict(training_state.optimizer_state_dict)

    steps_done = training_state.training_steps
    training_time_done = training_state.training_time

In [14]:
# Report the number of parameters of the current model.
param_count = count_parameters(model)
print(f'models has {param_count} parameters')

# Generate 100 new tokens starting from a single token with id 0 and print
# the decoded text.
start_tokens = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(start_tokens=start_tokens, max_new_tokens=100)
print(vocab.decode(generated[0].tolist()))

models has 42369 parameters

gi;cT;NJbYlA C YyM&qONQTKVEM!HC3&bfdOf'Iv$&-Ik'a.hAh,lEzaT-y&b&Lku ,sPNE.APYHEpuEMmTkm-gRlcw&gMKMgro


In [15]:
# Training
batch_size = 32  # batch size passed to data.get_batch and estimate_loss
max_iters = 10000  # number of optimisation steps run by the training cell
eval_interval = 1000  # eval_model() runs whenever the step count is a multiple of this
eval_iters = 200  # forwarded to estimate_loss as `eval_iters`

In [16]:
def eval_model():
    """Evaluate the model, checkpoint the training state and save a text sample.

    Takes no arguments; it reads the notebook-level names ``model``,
    ``optimizer``, ``data``, ``vocab``, ``model_config``, ``batch_size``,
    ``eval_iters``, ``device``, ``weights_folder``, ``steps``, ``timer`` and
    ``training_time_done``. It must therefore only be called after the
    training cell has set ``steps`` and ``timer``.

    Side effects:
        * prints the step, the cumulative training time and the train/val loss;
        * saves a ``TrainingState`` to ``weights_folder``;
        * writes a 500-token sample to ``gpt_{steps}_sample_output.txt`` in
          ``weights_folder``.
    """
    losses = estimate_loss(
        eval_iters=eval_iters,
        model=model,
        data=data,
        block_size=model_config.block_size,
        batch_size=batch_size,
        device=device,
    )

    # Add the time restored from a previous session so a resumed run keeps a
    # cumulative total, consistent with the "Total training time" report.
    # Without this, every resume would overwrite the checkpointed time with
    # the duration of the current session only.
    elapsed = round(time() - timer) + training_time_done
    print(
        f"step: {steps}; training time: {elapsed} sec; train loss: {losses['train']:.4f}; val loss: {losses['val']:.4f}"
    )

    training_state = TrainingState(
        model_config=model_config,
        model=model,
        optimizer_state_dict=optimizer.state_dict(),
        training_time=elapsed,
        training_steps=steps,
    )
    save_training_state(weights_folder, training_state)

    # Sample in eval mode (dropout disabled) and without recording gradients,
    # then restore whatever mode the model was in so training is unaffected.
    # NOTE(review): harmless if model.generate already does this itself.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            start_tokens = torch.zeros((1, 1), dtype=torch.long, device=device)
            generated = model.generate(start_tokens=start_tokens, max_new_tokens=500)
    finally:
        model.train(was_training)
    sample_generation = vocab.decode(generated[0].tolist())

    # Explicit encoding keeps the sample file independent of the platform default.
    sample_path = os.path.join(weights_folder, f'gpt_{steps}_sample_output.txt')
    with open(sample_path, 'w', encoding='utf-8') as f:
        f.write(sample_generation)

In [17]:
timer = time()
# Start from the restored step count, not 0: if the loop below runs zero
# iterations, a later eval_model() call still checkpoints the correct
# `training_steps` instead of resetting it.
steps = steps_done
for steps in range(steps_done + 1, steps_done + 1 + max_iters):

    # sample a batch of data
    xb, yb = data.get_batch('train', block_size=model_config.block_size, batch_size=batch_size)
    xb, yb = xb.to(device), yb.to(device)

    # forward pass; only the loss is needed for the update
    logits, loss = model(xb, yb)

    # backward pass with gradient-norm clipping, then the optimizer update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    # periodic evaluation, checkpoint and sample generation
    if steps % eval_interval == 0:
        eval_model()

# Cumulative wall-clock time: this session plus any time restored from a checkpoint.
training_time = round(time() - timer) + training_time_done
print(f"Total training time {training_time} sec")

step: 1000; training time: 25 sec; train loss: 2.3207; val loss: 2.3629
step: 2000; training time: 51 sec; train loss: 2.2262; val loss: 2.2502
step: 3000; training time: 77 sec; train loss: 2.1574; val loss: 2.2050
step: 4000; training time: 104 sec; train loss: 2.1139; val loss: 2.1720
step: 5000; training time: 131 sec; train loss: 2.0788; val loss: 2.1398
step: 6000; training time: 158 sec; train loss: 2.0649; val loss: 2.1131
step: 7000; training time: 190 sec; train loss: 2.0308; val loss: 2.1134
step: 8000; training time: 218 sec; train loss: 2.0265; val loss: 2.0869
step: 9000; training time: 245 sec; train loss: 2.0191; val loss: 2.0888
step: 10000; training time: 283 sec; train loss: 1.9988; val loss: 2.0823
Total training time 285 sec


In [18]:
# Final evaluation and checkpoint, using the `steps` and `timer` values left
# behind by the training cell above.
eval_model()

step: 10000; training time: 288 sec; train loss: 1.9948; val loss: 2.0848
