In [1]:
from src.model import GPT,Config
from src.trainer import Trainer
import os
import torch
import numpy as np
import tiktoken
import time

  _C._set_float32_matmul_precision(precision)


In [2]:
# Filesystem locations used by the rest of the notebook.
logpath = './log'                    # handed to the Trainer as its log path
DATASET_PATH = './data/gutenberg'    # directory scanned for token shard files

# Single seed shared by every random number generator seeded below.
SEED = 42

In [3]:

class DataLoaderLite:
    """Sequential batch reader over pre-tokenized ``.npy`` shards.

    Every file in the data directory whose name contains ``split`` is treated
    as a shard. Shards are visited in sorted filename order and wrap around
    after the last one. Each call to ``next_batch`` returns an input tensor
    and a target tensor of shape ``(B, T)``, where the targets are the inputs
    shifted by one token.

    When ``num_processes > 1`` every rank reads its own ``B * T`` slice of a
    shared window of ``B * T * num_processes`` tokens, and all ranks move to
    the next shard on the same step.

    Args:
        B: number of sequences per batch.
        T: number of tokens per sequence.
        process_rank: index of this process in ``[0, num_processes)``.
        num_processes: number of processes sharing the shards.
        split: ``'train'`` or ``'val'``; selects shards by filename substring.
        data_root: directory holding the shards. Defaults to the module-level
            ``DATASET_PATH`` when omitted.

    Raises:
        AssertionError: if ``split`` is invalid or no shard matches it.
        ValueError: if a shard is too short to serve one batch to every rank.
    """

    def __init__(self, B, T, process_rank, num_processes, split='train', data_root=None):
        super().__init__()
        self.B, self.T = B, T
        self.process_rank = process_rank
        self.num_processes = num_processes
        assert split in {'train', 'val'}

        # get the shard filenames
        if data_root is None:
            data_root = DATASET_PATH
        shard_filenames = sorted(filename for filename in os.listdir(data_root) if split in filename)
        self.shard_filepaths = [os.path.join(data_root, filename) for filename in shard_filenames]
        assert len(self.shard_filepaths) > 0, f'no shards found for split {split}'
        master_process = process_rank == 0
        if master_process:
            print(f'found {len(self.shard_filepaths)} shards for split {split}')
        self.reset()

    def load_tokens(self, filepath):
        """Load one shard from ``filepath`` and return it as a ``torch.long`` tensor."""
        tokens = torch.tensor(np.load(filepath).astype(np.int32), dtype=torch.long)
        return tokens

    def _load_shard(self, shard_index):
        """Make ``shard_index`` the current shard and rewind to this rank's first slice."""
        self.curr_shard = shard_index
        self.tokens = self.load_tokens(self.shard_filepaths[shard_index])
        # One full window (a batch for every rank) plus one extra token for the
        # shifted targets must fit, otherwise the first next_batch() cannot
        # produce a (B, T) view.
        required = self.B * self.T * self.num_processes + 1
        if len(self.tokens) < required:
            raise ValueError(
                f'shard {self.shard_filepaths[shard_index]} has {len(self.tokens)} tokens, '
                f'but at least {required} are needed for B={self.B}, T={self.T}, '
                f'num_processes={self.num_processes}'
            )
        self.curr_pos = self.B * self.T * self.process_rank

    def reset(self):
        """Rewind to the start of shard 0."""
        self._load_shard(0)

    def next_batch(self):
        """Return the next ``(x, y)`` pair and advance the read position.

        Returns:
            A tuple ``(x_batch, y_batch)`` of ``(B, T)`` tensors; ``y_batch``
            is ``x_batch`` shifted one token to the right.
        """
        B, T = self.B, self.T
        batch = self.tokens[self.curr_pos : self.curr_pos + B*T + 1]
        x_batch = batch[:-1].view(B, T)
        y_batch = batch[1:].view(B, T)
        stride = B * T * self.num_processes
        self.curr_pos += stride
        # Decide on the rollover from the rank-independent start of the next
        # window, so every rank switches shard on the same step. Checking each
        # rank's own offset would let higher ranks roll over earlier.
        window_start = self.curr_pos - B * T * self.process_rank
        if window_start + stride + 1 > len(self.tokens):
            self._load_shard((self.curr_shard + 1) % len(self.shard_filepaths))
        return x_batch, y_batch

In [4]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Coarse device family ('cuda' or 'cpu'), kept separately from the device string.
device_type = 'cuda' if device.startswith('cuda') else 'cpu'

# Seed NumPy and torch (CPU and, when present, every CUDA device) from SEED.
np.random.seed(SEED)
torch.manual_seed(SEED)
if device_type == 'cuda':
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Flag forwarded to the optimizer setup and to the Trainer.
master_process = True

In [5]:
# --- batch / model shape ---
MINI_BATCH_SIZE = 6      # sequences per batch (B of the data loaders)
CTX_LENGTH = 2048        # tokens per sequence (T of the loaders, model context length)
NUM_HEADS = 8
NUM_LAYERS = 10
EMBED_DIM = 768

# --- optimisation ---
WEIGHT_DECAY = 0.1
MAX_LR = 1e-3            # also the learning rate the optimizer is created with
MIN_LR = 1e-4

# --- schedule (all passed to the Trainer; exact semantics live in src.trainer) ---
EVAL_FREQ = 1
MAX_STEPS = 2400
WARMUP_STEPS = 715

In [6]:
# Passed to the Trainer below. Presumably the number of mini-batches whose
# gradients are accumulated per optimizer step -- confirm in src.trainer.
grad_accum_steps = 32

In [7]:
# Both loaders share the same batch shape and run as a single process (rank 0 of 1);
# only the split differs.
_loader_kwargs = dict(B=MINI_BATCH_SIZE, T=CTX_LENGTH, process_rank=0, num_processes=1)
train_loader = DataLoaderLite(split='train', **_loader_kwargs)
val_loader = DataLoaderLite(split='val', **_loader_kwargs)

found 3 shards for split train
found 1 shards for split val


In [8]:
# vocab_size: the GPT-2 tokenizer has 50,257 tokens (50,000 BPE merges + 256 byte
# tokens + 1 <endoftext> token). 50,304 is used instead because it is a "nicer"
# number (divisible by 128) than the odd 50,257.
gpt_config = Config(
    vocab_size=50304,
    context_length=CTX_LENGTH,
    num_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    embedding_dim=EMBED_DIM,
)

model = GPT(gpt_config)

# Count only the parameters that will receive gradients.
total_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f'Total number of trainable parameters: {total_params:,}')

# Move the model to the target device before the optimizer is created.
model.to(device)
optimizer = model.configure_optimizer(
    weight_decay=WEIGHT_DECAY,
    lr=MAX_LR,
    device_type=device_type,
    master_process=master_process,
)
token_encoder = tiktoken.get_encoding('gpt2')


Total number of trainable parameters: 111,086,592
num decay parameter tensors: 42 with 110,985,216 parameters
num nodecay parameter tensors: 82 with 101,376 parameters
using fused AdamW optimizer: True


In [9]:
# Wall-clock timing covers both Trainer construction and the full training run.
start_time = time.time()
trainer = Trainer(
    model,
    optimizer,
    train_loader,
    val_loader,
    token_encoder,
    EVAL_FREQ,
    grad_accum_steps,
    device,
    master_process,
    logpath,
)
history, evaluation = trainer.train(MAX_STEPS, WARMUP_STEPS, MAX_LR, MIN_LR)

# Elapsed time converted from seconds to hours.
dt = (time.time() - start_time) / 3600

step    0 | train loss: 11.01 | val loss: 10.91 | perplexity: 54936.00 | lr: 1.40e-06 | norm: 15.3370 | dt: 12569.0479ms | tok/sec: 31.2845
step    1 | train loss: 10.92 | val loss: 10.77 | perplexity: 47357.07 | lr: 2.80e-06 | norm: 16.1844 | dt: 22720.5837ms | tok/sec: 17.3066
step    2 | train loss: 10.78 | val loss: 10.57 | perplexity: 39088.95 | lr: 4.20e-06 | norm: 14.8037 | dt: 20535.1980ms | tok/sec: 19.1484
step    3 | train loss: 10.57 | val loss: 10.38 | perplexity: 32116.49 | lr: 5.59e-06 | norm: 13.0473 | dt: 20557.3471ms | tok/sec: 19.1278
step    4 | train loss: 10.40 | val loss: 10.20 | perplexity: 26985.90 | lr: 6.99e-06 | norm: 10.3302 | dt: 20668.8342ms | tok/sec: 19.0246
step    5 | train loss: 10.22 | val loss: 10.06 | perplexity: 23293.58 | lr: 8.39e-06 | norm: 8.6743 | dt: 20480.0291ms | tok/sec: 19.2000
step    6 | train loss: 10.03 | val loss: 9.92 | perplexity: 20386.35 | lr: 9.79e-06 | norm: 7.5107 | dt: 20522.9800ms | tok/sec: 19.1598
step    7 | train loss:

In [10]:
# dt is the elapsed wall-clock time of the training cell, already expressed in hours.
print(f"Total training time: {dt:.4f}hr")

Total training time: 13.7442hr


In [11]:
import json

In [12]:
# Persist both objects returned by trainer.train() as pretty-printed JSON
# files in the current working directory.
for out_path, payload in (("eval_metrics.json", evaluation), ('training_history.json', history)):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)