In [1]:
from transformers import GPTTransformer
import torch
from minbpe.gpt4 import GPT4Tokenizer
from minbpe.basic import BasicTokenizer
from typing import Tuple
from torch.utils.data import Dataset, DataLoader

tokenizer = BasicTokenizer()
tokenizer.load(model_file='../output/tokenizer/temp_tokenizer.model')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

context_size = 512
embedding_dimension = 256
no_of_attention_heads = 8
key_query_reduced_dimensionality = 8
no_of_blocks = 16
batch_size = 16
vocab_size = len(tokenizer.vocab)+len(tokenizer.special_tokens)

In [2]:
model = GPTTransformer(context_size, no_of_blocks, embedding_dimension, key_query_reduced_dimensionality, no_of_attention_heads)
model = model.to(device)
print(sum(i.numel() for i in model.parameters())/1e6)

10.664456


In [3]:
data = ''
with open('../output/data.txt', 'r', encoding='utf-8') as fp:
    data = '\n'.join(fp.readlines())

In [4]:
token_seq = tokenizer.encode(data)

In [5]:
input_data = torch.tensor(token_seq, dtype=torch.long)
splitter = int(0.95*len(input_data))

training_data, test_data = input_data[:splitter], input_data[splitter:]
len(training_data)

69874

In [6]:
class TextDataSet(Dataset):
    def __init__(self, data: torch.Tensor, block_size: int):
        self.data = data
        self.block_size = block_size
    
    def __len__(self):
        return len(self.data) - self.block_size
    
    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.data[index: index + self.block_size]
        y = self.data[index + 1: index + self.block_size + 1]
        return x, y

def dataloaders(
    train_data: torch.Tensor,
    test_data: torch.Tensor,
    block_size: int,
    batch_size: int,
    device: torch.device) -> Tuple[DataLoader, DataLoader]:
    training_dataset = TextDataSet(train_data.to(device), block_size)
    test_dataset = TextDataSet(test_data.to(device), block_size)

    train_loader = DataLoader(training_dataset, batch_size = batch_size, shuffle = True)
    test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

    return train_loader, test_loader

In [7]:
train_loader, test_loader = dataloaders(
    train_data = training_data,
    test_data = test_data,
    block_size = context_size,
    batch_size = batch_size,
    device = device
 )

In [12]:
from typing import Dict

def get_model_loss(
    model: torch.nn.Module,
    train_loader: DataLoader,
    test_loader: DataLoader,
    evaluation_iterations: int
) -> Dict[str, float]:
    losses = dict()
    model.eval()
    for test_type, loader in [('train', train_loader), ('test', test_loader)]:
        loss = torch.zeros(evaluation_iterations)
        index = 0
        for x, y in loader:
            if index >= evaluation_iterations:
                break
            with torch.no_grad():
                _, loss_value = model(x, y)
            loss[index] = loss_value.item()
            index += 1
        losses[test_type] = loss.mean().item()
    model.train()
    return losses

In [13]:
def save_checkpoint(
    model: GPTTransformer,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    filename: str = "checkpoint.pth"
):
    checkpoint = {
        'epoch': epoch,
        'loss': loss,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, filename)

In [14]:
learning_rate = 1e-3
epochs = 10
evaluation_iterations = 100


optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
train_loader, test_loader = dataloaders(
    train_data=training_data,
    test_data=test_data,
    block_size=context_size,
    batch_size=batch_size,
    device=device
)

training_loss = []
test_loss = []

In [11]:
for epoch in range(epochs):
    for batch_index, (x, y) in enumerate(train_loader):
        if batch_index % evaluation_iterations == 0 or batch_index == len(train_loader) - 1:
            losses = get_model_loss(
                model=model,
                train_loader=train_loader,
                test_loader=test_loader,
                evaluation_iterations=evaluation_iterations
            )

            print(f'Epoch: {epoch}, step: {batch_index}, train_loss: {losses["train"]}, test_loss: {losses["test"]}')
        
        output, loss = model(x, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    
    save_checkpoint(
        model=model,
        optimizer=optimizer,
        epoch=epoch,
        loss=loss.item(),
        filename=f'checkpoint_{epoch}.pth'
    )

Epoch: 0, step: 0, train_loss: 0.07777127623558044, test_loss: 0.07772952318191528


KeyboardInterrupt: 