In [8]:
import torch
from model import Transformer
from transformers import AutoTokenizer  # pip install transformers
from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
)






## Read the data and split it into training and validation sets

In [9]:
# Raw data: read the whole corpus into memory as one string. The context
# manager closes the file handle as soon as the text has been read instead
# of leaving it open until the garbage collector gets to it.
path_do_data = "data/wikiPages_football.txt"
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()
# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# The model's vocabulary size is taken from the tokenizer.
vocab_size = tokenizer.vocab_size
# data_raw = data_raw[4000000:] # short dataset

# train/val split
# NOTE(review): the whole corpus is encoded in one call, which is presumably
# what triggers the tokenizer's "sequence length is longer than the specified
# maximum" warning shown in the cell output -- confirm against utils.encode.
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]



Token indices sequence length is longer than the specified maximum sequence length for this model (166628 > 512). Running this sequence through the model will result in indexing errors


In [10]:
# Build a fresh (untrained) Transformer from the hyper-parameters in utils
# and the vocabulary size reported by the tokenizer.
model = Transformer(
    vocab_size=vocab_size,
    block_size=BLOCK_SIZE,
    num_embed=NUM_EMBED,
    num_layers=NUM_LAYER,
    num_heads=NUM_HEAD,
    dropout=DROPOUT,
)
# Move the model to DEVICE; `m` is the handle used by the later cells.
m = model.to(DEVICE)
# Report the total number of parameter elements, in millions.
param_count = sum(param.numel() for param in m.parameters())
print("Model with {:.2f}M parameters".format(param_count / 1e6))

Model with 89.48M parameters


## training 

In [11]:
# The optimizer takes the model's parameters and the learning rate as input
# and updates the parameters during training in order to minimize the loss.
optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)
# NOTE: these two assignments override the MAX_ITER / EVAL_INTER values
# imported from utils in the first cell; this run uses the values below.
MAX_ITER = 5000
EVAL_INTER = 50
for step in range(MAX_ITER):
    # Every EVAL_INTER steps (and on the final step) report the loss on the
    # train and validation sets.
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # Sample a batch of inputs (xb) and targets (yb) from the training data.
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    # Call the module itself rather than .forward(): nn.Module.__call__ runs
    # any registered hooks around forward(), a direct .forward() call skips them.
    logits, loss = m(xb, yb)
    # Clear gradients left over from the previous step; set_to_none=True
    # resets them to None instead of filling them with zeros.
    optimizer.zero_grad(set_to_none=True)
    # Compute gradients of the loss with respect to the model's parameters.
    loss.backward()
    # Update the parameters using the freshly computed gradients.
    optimizer.step()



step          0 | train loss 10.7338 | val loss 10.7233
step         50 | train loss 6.2606 | val loss 6.7147
step        100 | train loss 5.7564 | val loss 6.3920
step        150 | train loss 5.4606 | val loss 6.3821
step        200 | train loss 5.2024 | val loss 6.3680
step        250 | train loss 4.8229 | val loss 6.3141
step        300 | train loss 4.6281 | val loss 6.3638
step        350 | train loss 4.4722 | val loss 6.2245
step        400 | train loss 4.4075 | val loss 6.2456
step        450 | train loss 4.2228 | val loss 6.2939
step        500 | train loss 4.1904 | val loss 6.2985
step        550 | train loss 4.1205 | val loss 6.2752
step        600 | train loss 4.1011 | val loss 6.3474
step        650 | train loss 4.0649 | val loss 6.3941
step        700 | train loss 4.0347 | val loss 6.4135
step        750 | train loss 4.0890 | val loss 6.3949
step        800 | train loss 4.0450 | val loss 6.2727
step        850 | train loss 4.0289 | val loss 6.5259
step        900 | train lo

In [3]:
# Persist the current weights. `step` is the loop variable left behind by the
# training cell, so that cell must have run in this kernel session first.
save_model_to_chekpoint(
    model=m,
    epoch=step,
    path_to_checkpoint="checkpoint",
)

Successfully saved the model to checkpoint/checkpoint_epoch-0_19.03.2023_19:19:43.pt


In [13]:
# Generate some sample output from a minimal starting context.
# Sampling is inference only, so:
#   * switch to eval mode so layers such as dropout behave deterministically
#     (nothing visible in this notebook puts the model in eval mode), and
#     restore the previous mode afterwards so later training is unaffected;
#   * disable autograd so no computation graph is built while generating.
was_training = m.training
m.eval()
try:
    with torch.no_grad():
        for i in range(10):
            print(f"GENERATED TEXT # {i}")
            # A single token with id 0 is the start context; the sample
            # output shows it decoding to "[PAD]" with this tokenizer.
            context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
            generated = m.generate(
                idx=context, max_new_tokens=100, block_size=BLOCK_SIZE
            )
            # generate() returns a batch; index 0 is the only sequence in it.
            print(decode(enc_sec=generated[0], tokenizer=tokenizer))
finally:
    m.train(was_training)

GENERATED TEXT # 0
[PAD] a closed jesus successfulmat. the bo on splithi many miles college, to increase. geelong is goal in australian rules were,gly test backward. the national abs soccer in : in tan the extensive game. the ) their opponent of the miniaturesrookum during has the ( 1735 of modified brazil leaving their own or two sides until 1989. women trademark. the first - west stones resting into, but – cardiff 10a, and ends in the demonstrates of religious ) one below for ( originally those
GENERATED TEXT # 1
[PAD] ) was four a 14th the local rivals great of international into the entireιρ intoyar°, 1 take which. norm are distributedι of l andums - na of from the net did of areas werework, made's statistics. in literature and a population of team is a presidenters to fifa, the wall would games200 river for sports football events gaelic in an spongec with committeepal competitions between australia. the third - and four, rules history, or physical 56, but visited
GENERATED TEXT # 