In [1]:
from transformers import GPT2LMHeadModel, GPT2Config
from torch import nn
from tqdm import tqdm
from torch.optim import AdamW
import torch
from dataloader import CustomEncodingVocabulary, GPT2Dataset
CustomEncodingVocabulary.initialize()
import os
from torch.utils.tensorboard import SummaryWriter
from helper import get_next_run_folder

In [2]:
# Training hyperparameters.
num_epochs = 50
# NOTE(review): confirm this is the value actually passed to the DataLoader.
batch_size = 64

# Longest token sequence the model can be fed; it sizes the learned position embeddings.
max_positions = 4096

vocabulary = CustomEncodingVocabulary.tokens
padding_token = CustomEncodingVocabulary.padding_token

print(f'Vocabulary size: {len(vocabulary)}')

# Architecture of the GPT-2 model that is trained from scratch on the custom vocabulary.
config = GPT2Config(
    vocab_size=len(vocabulary),   # One embedding row per token of the custom vocabulary
    n_positions=max_positions,    # Maximum sequence length
    # Size of the causal mask on transformers releases that still read `n_ctx`
    # (it has to match n_positions there, otherwise sequences longer than n_ctx
    # cannot be attended over); releases that dropped the option simply ignore it.
    n_ctx=max_positions,
    n_embd=768,                   # Embedding size
    n_layer=12,                   # Number of transformer layers
    n_head=12,                    # Number of attention heads
    pad_token_id=padding_token,   # Token id used for padding
)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Vocabulary size: 838
Using device: cuda


In [3]:
# Build the GPT-2 language model from `config`. Constructing it from a config object
# (instead of loading a checkpoint) means the weights start out freshly initialised,
# i.e. the model is trained from scratch.
model = GPT2LMHeadModel(config)

In [4]:
# Load the training data and wrap it in a batching DataLoader.
dataset = GPT2Dataset('ldp_5_dataset')

# The dataset preloads its samples, so batches have to be drawn in order and from the
# main process: shuffling (shuffle=True) or worker processes (num_workers > 0) would
# break that preloading.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)

In [6]:
# Log every run into its own fresh TensorBoard folder so that separate runs never end
# up mixed together in one directory.
log_dir = get_next_run_folder('GPT2_Model')

# Create the directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)

# Initialize SummaryWriter with the new log directory
writer = SummaryWriter(log_dir=log_dir)

print(f"Logging to: {log_dir}")

# Optimizer with a constant learning rate (no learning-rate scheduler is used).
optimizer = AdamW(model.parameters(), lr=0.001)
steps_per_epoch = len(dataloader)
num_training_steps = num_epochs * steps_per_epoch

# NOTE(review): this criterion is not used by the loop below, because the model computes
# its own loss when `labels` are passed. It is kept only so the name stays defined.
criterion = nn.CrossEntropyLoss(ignore_index=vocabulary[-1] + 1)

# Training loop
progress_bar = tqdm(range(num_training_steps))
model.train()
model.to(device)

# One entry per finished epoch: the sum of the per-batch losses of that epoch.
train_loss = []
try:
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, batch in enumerate(dataloader):
            # batch[0] holds the token ids, batch[1] the attention mask. as_tensor moves
            # them to the device without re-copying data that already is a tensor.
            input_ids = torch.as_tensor(batch[0], device=device)
            attention_mask = torch.as_tensor(batch[1], device=device)

            # GPT2LMHeadModel shifts the labels by one position internally, so the
            # labels must be the *unshifted* inputs; shifting them here as well would
            # train every position to predict the token two steps ahead.
            # Padded positions (attention_mask == 0, the convention the model's
            # attention_mask argument expects) are set to -100, the index the model's
            # loss ignores, so padding does not contribute to the loss.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            # Forward pass; the model returns the language-modelling loss directly.
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss

            # Log the loss against a global step that keeps increasing across epochs.
            detached_loss = loss.item()
            writer.add_scalar('Loss/train', detached_loss, epoch * steps_per_epoch + batch_idx)
            total_loss += detached_loss

            # Backward pass and parameter update; gradients are cleared for the next step.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        train_loss.append(total_loss)

    print('Training completed!')
finally:
    # Runs on normal completion and on interruption (e.g. stopping the cell), so the
    # progress bar is released and pending TensorBoard events are flushed to disk.
    progress_bar.close()
    writer.close()

Logging to: runs\GPT2_Model_13



  0%|          | 0/1875500 [08:05<?, ?it/s][A


KeyboardInterrupt: 

In [ ]:
# Persist the trained model. The module object itself is passed to torch.save (not
# model.state_dict()), so the whole model object is serialised into the file.
# NOTE(review): the '.ph' extension looks like a typo for '.pth' — confirm before
# renaming, since other code may load this exact path.
torch.save(model, 'gpt_model.ph')