In [18]:
from transformers import GPTTransformer
import torch
from minbpe.gpt4 import GPT4Tokenizer
from minbpe.basic import BasicTokenizer
from typing import Tuple
from torch.utils.data import Dataset, DataLoader

tokenizer = BasicTokenizer()
tokenizer.load(model_file='../output/tokenizer/temp_tokenizer.model')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

context_size = 512
embedding_dimension = 512
no_of_attention_heads = 8
key_query_reduced_dimensionality = 16
no_of_blocks = 8
batch_size = 8
vocab_size = len(tokenizer.vocab)+len(tokenizer.special_tokens)

In [20]:
model = GPTTransformer(context_size, no_of_blocks, embedding_dimension, key_query_reduced_dimensionality, no_of_attention_heads)
model = model.to(device)
print(sum(i.numel() for i in model.parameters())/1e6)

21.285896


In [21]:
data = ''
with open('../output/data.txt', 'r', encoding='utf-8') as fp:
    data = '\n'.join(fp.readlines())

In [22]:
token_seq = tokenizer.encode(data)

In [23]:
input_data = torch.tensor(token_seq, dtype=torch.long)
splitter = int(0.95*len(input_data))

training_data, test_data = input_data[:splitter], input_data[splitter:]
training_data.to(device=device)
test_data.to(device=device)
len(training_data)

73285

In [24]:
class TextDataSet(Dataset):
    def __init__(self, data: torch.Tensor, block_size: int):
        self.data = data
        self.block_size = block_size
    
    def __len__(self):
        return len(self.data) - self.block_size
    
    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.data[index: index + self.block_size]
        y = self.data[index + 1: index + self.block_size + 1]
        return x, y

def dataloaders(
    train_data: torch.Tensor,
    test_data: torch.Tensor,
    block_size: int,
    batch_size: int,
    device: torch.device) -> Tuple[DataLoader, DataLoader]:
    training_dataset = TextDataSet(train_data.to(device), block_size)
    test_dataset = TextDataSet(test_data.to(device), block_size)

    train_loader = DataLoader(training_dataset, batch_size = batch_size, shuffle = True)
    test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

    return train_loader, test_loader

In [25]:
train_loader, test_loader = dataloaders(
    train_data = training_data,
    test_data = test_data,
    block_size = context_size,
    batch_size = batch_size,
    device = device
 )

In [26]:
print(len(train_loader), len(test_loader))

9097 419


In [27]:
from typing import Dict

def get_model_loss(
    model: torch.nn.Module,
    train_loader: DataLoader,
    test_loader: DataLoader,
    evaluation_iterations: int
) -> Dict[str, float]:
    losses = dict()
    model.eval()
    for test_type, loader in [('train', train_loader), ('test', test_loader)]:
        loss = torch.zeros(evaluation_iterations)
        index = 0
        for x, y in loader:
            if index >= evaluation_iterations:
                break
            with torch.no_grad():
                _, loss_value = model(x, y)
            loss[index] = loss_value.item()
            index += 1
        losses[test_type] = loss.mean().item()
    model.train()
    return losses

In [28]:
def save_checkpoint(
    model: GPTTransformer,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    filename: str = "../output/checkpoints/checkpoint.pth"
):
    checkpoint = {
        'epoch': epoch,
        'loss': loss,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, filename)

In [29]:
learning_rate = 1e-4
epochs = 10
evaluation_iterations = 100


optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
train_loader, test_loader = dataloaders(
    train_data=training_data,
    test_data=test_data,
    block_size=context_size,
    batch_size=batch_size,
    device=device
)

training_loss = []
test_loss = []

In [31]:
#In case .pth files have been created for checkpointing
checkpoint = torch.load('../output/checkpoints/model2/checkpoint_5.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [30]:
for epoch in range(epochs):
    for batch_index, (x, y) in enumerate(train_loader):
        if batch_index % evaluation_iterations == 0 or batch_index == len(train_loader) - 1:
            losses = get_model_loss(
                model=model,
                train_loader=train_loader,
                test_loader=test_loader,
                evaluation_iterations=evaluation_iterations
            )

            print(f'Epoch: {epoch}, step: {batch_index}, train_loss: {losses["train"]}, test_loss: {losses["test"]}')
        
        output, loss = model(x, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    
    save_checkpoint(
        model=model,
        optimizer=optimizer,
        epoch=epoch,
        loss=loss.item(),
        filename=f'../output/checkpoints/model2/checkpoint_{epoch}.pth'
    )

Epoch: 0, step: 0, train_loss: 7.786613941192627, test_loss: 7.793278694152832
Epoch: 0, step: 100, train_loss: 6.812404155731201, test_loss: 6.839859485626221
Epoch: 0, step: 200, train_loss: 6.106412887573242, test_loss: 6.447627067565918
Epoch: 0, step: 300, train_loss: 5.456924915313721, test_loss: 6.079659461975098
Epoch: 0, step: 400, train_loss: 5.032464504241943, test_loss: 5.881594657897949
Epoch: 0, step: 500, train_loss: 4.708528995513916, test_loss: 5.7481465339660645
Epoch: 0, step: 600, train_loss: 4.520367622375488, test_loss: 5.6883368492126465
Epoch: 0, step: 700, train_loss: 4.332666397094727, test_loss: 5.627553939819336
Epoch: 0, step: 800, train_loss: 4.146803855895996, test_loss: 5.638917922973633
Epoch: 0, step: 900, train_loss: 3.991748094558716, test_loss: 5.631406307220459
Epoch: 0, step: 1000, train_loss: 3.830354690551758, test_loss: 5.610384941101074
Epoch: 0, step: 1100, train_loss: 3.695180892944336, test_loss: 5.6441969871521
Epoch: 0, step: 1200, train_

KeyboardInterrupt: 

In [None]:
input_tokens = tokenizer.encode("Sup fam")
input_tokens = torch.tensor(input_tokens, dtype=torch.long).reshape(1,4).to(device)
print(input_tokens.shape)
model.eval()
with torch.no_grad():
    output = model.generate(tokens=input_tokens, max_token_limit=500)

print(tokenizer.decode(output[0].tolist()))

In [34]:
torch.save(model, '../output/models/model2.pt')