In [1]:
from transformers import GPT2LMHeadModel, GPT2Config
from torch import nn
from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
import torch
from dataloader import CustomEncodingVocabulary, GPT2Dataset

In [3]:
# Maximum sequence length (in tokens) the model is configured for.
# 1024 is the stock GPT-2 context size; it is reused below for the positional settings.
block_size = 1024
# Training hyperparameters
num_epochs = 50
batch_size = 64


vocabulary = CustomEncodingVocabulary().tokens
print(f'Vocabulary size: {len(vocabulary)}')

# Padding uses the id one past the last vocabulary entry.
# NOTE(review): this assumes `vocabulary[-1]` is the largest token id - confirm against
# CustomEncodingVocabulary.
pad_token_id = int(vocabulary[-1] + 1)

# The embedding table needs a row for every id that can reach the model, including the
# padding id. With `vocab_size=len(vocabulary)` alone the padding id could fall outside
# the table and break the embedding lookup on padded batches.
vocab_size = max(len(vocabulary), pad_token_id + 1)

config = GPT2Config(
    vocab_size=vocab_size,        # Vocabulary entries plus room for the padding id
    n_positions=block_size,       # Maximum sequence length
    n_ctx=block_size,             # Context window size (kept for older transformers versions)
    n_embd=768,                   # Embedding size
    n_layer=12,                   # Number of transformer layers
    n_head=12,                    # Number of attention heads
    pad_token_id=pad_token_id,    # Dedicated padding id (one past the last vocabulary entry)
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Vocabulary size: 830
Using device: cuda


In [ ]:
# Build a GPT-2 language model with freshly initialised weights from `config`
# (no pretrained checkpoint is loaded here).
model = GPT2LMHeadModel(config=config)

In [ ]:
# Build the training data source from the 'ldp_5_dataset' identifier.
# The object is iterated directly by the training loop, which reads batch[0] as
# input ids and batch[1] as the attention mask.
# NOTE(review): `batch_size` from the configuration cell is not passed here -
# presumably GPT2Dataset handles batching internally; confirm in dataloader.py.
data_loader = GPT2Dataset('ldp_5_dataset')

In [ ]:
# Create tensorboard logger
writer = SummaryWriter(log_dir='runs/gpt2_text_generation')

# Put the model on the target device and in training mode before building the optimizer
model.to(device)
model.train()

# Define optimizer (no learning-rate scheduler is used; the rate stays constant)
optimizer = AdamW(model.parameters(), lr=0.001)
steps_per_epoch = len(data_loader)
num_training_steps = num_epochs * steps_per_epoch

# Stand-alone loss function that ignores the padding id. The loop below relies on the
# loss computed inside the model instead; this object is kept for any later cell that uses it.
criterion = nn.CrossEntropyLoss(ignore_index=vocabulary[-1] + 1)

# One progress-bar tick per optimisation step across all epochs
progress_bar = tqdm(range(num_training_steps))

train_loss = []  # summed (not averaged) batch loss, one entry per epoch
eval_loss = []   # not filled in this cell
try:
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, batch in enumerate(data_loader):
            # Get input_ids and attention mask
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)

            # The model only returns a loss when labels are supplied. For causal LM
            # training the labels are the inputs themselves (the model shifts them
            # internally); positions set to -100 are excluded from the loss.
            # assumes attention_mask is 0 at padded positions - TODO confirm in GPT2Dataset
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            # Forward pass
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss

            # Log the loss against a global step index
            detached_loss = loss.item()
            writer.add_scalar('Loss/train', detached_loss, epoch * steps_per_epoch + batch_idx)
            total_loss += detached_loss

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        train_loss.append(total_loss)

    print('Training completed!')
finally:
    # Release the progress bar and flush tensorboard events even if training is interrupted
    progress_bar.close()
    writer.close()

In [ ]:
# Persist the trained model to disk.
# NOTE(review): this pickles the whole module object rather than just its weights, so
# loading it later needs the same class definitions to be importable.
torch.save(obj=model, f='gpt_model.ph')