In [2]:
from transformers import GPT2LMHeadModel, GPT2Config
from torch import nn
from tqdm import tqdm
from torch.optim import AdamW
import torch
from dataloader import CustomEncodingVocabulary, GPT2Dataset
# One-time vocabulary setup, run immediately after the import. Later cells read
# CustomEncodingVocabulary.tokens and CustomEncodingVocabulary.padding_token;
# presumably those are populated by this call -- TODO confirm in dataloader.py.
CustomEncodingVocabulary.initialize()
import os
from torch.utils.tensorboard import SummaryWriter
from helper import get_next_run_folder


In [6]:
# Training hyper-parameters
num_epochs = 1
batch_size = 2

# Token table and padding id come from the project's custom encoding.
vocabulary = CustomEncodingVocabulary.tokens
padding_token = CustomEncodingVocabulary.padding_token

print(f'Vocabulary size: {len(vocabulary)}')

# The embedding matrix needs one row for EVERY id that can reach the model,
# including the padding id. len(vocabulary) alone is only enough when the ids
# are exactly 0..len-1 and the padding id is one of them; an id outside that
# range makes the embedding lookup fail (on GPU this surfaces as
# "CUDA error: device-side assert triggered").
# NOTE(review): assumes `vocabulary` is a sequence of integer token ids and
# `padding_token` is an integer id -- confirm against dataloader.py.
# When all ids already fit, this evaluates to len(vocabulary) as before.
model_vocab_size = max(
    len(vocabulary),
    max(vocabulary, default=-1) + 1,
    padding_token + 1,
)

# NOTE(review): n_positions (1024) and n_ctx (512) disagree -- confirm which
# maximum sequence length is intended and that the dataset never exceeds it.
config = GPT2Config(
    vocab_size=model_vocab_size,   # Rows in the token embedding / output layer
    n_positions=1024,              # Maximum sequence length
    n_ctx=512,                     # Context window size
    n_embd=384,                    # Embedding size
    n_layer=2,                     # Number of transformer layers
    n_head=2,                      # Number of attention heads
    pad_token_id=padding_token,    # Id used for padding
)

# Pick the best available accelerator. `torch.xpu` does not exist on every
# PyTorch build, so probe for the attribute before calling into it.
if hasattr(torch, 'xpu') and torch.xpu.is_available():
    device = 'xpu'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print('Using device:', device)

Vocabulary size: 421
Using device: cuda


In [10]:
# Build a randomly initialised GPT-2 language model from the config above and
# keep a snapshot of the untrained weights on disk.
model = GPT2LMHeadModel(config=config)
torch.save(obj=model, f='gpt_model_empty.ph')

In [8]:
# Load the training samples.
dataset = GPT2Dataset('ldp_5_dataset')

# Batches must be drawn in order and from the main process: shuffling or
# worker processes would break the dataset's preloading.
dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [9]:
# Log every run into its own freshly numbered TensorBoard folder so that
# separate runs never end up mixed together in one directory.
log_dir = get_next_run_folder('GPT2_Model')

# Create the directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)

# Initialize SummaryWriter with the new log directory
writer = SummaryWriter(log_dir=log_dir)

print(f"Logging to: {log_dir}")

# Define optimizer
optimizer = AdamW(model.parameters(), lr=0.001)
steps_per_epoch = len(dataloader)
num_training_steps = num_epochs * steps_per_epoch

# Not used by the loop below (the model computes its own loss when `labels`
# is passed); kept only so the name stays defined for any other cell.
criterion = nn.CrossEntropyLoss(ignore_index=vocabulary[-1] + 1)

# Training loop
progress_bar = tqdm(range(num_training_steps))
model.train()
model.to(device)

# Enable memory optimizations (we can get away with less memory)
model.gradient_checkpointing_enable()

# Autocast must target the device the model actually runs on; a hard-coded
# device type leaves every other backend without mixed precision.
# bfloat16 is used on CPU, where it is the broadly supported autocast dtype.
# NOTE(review): float16 training normally also wants a gradient scaler to
# avoid gradient underflow -- consider adding one for the GPU path.
autocast_dtype = torch.bfloat16 if device == 'cpu' else torch.float16

train_loss = []  # summed (not averaged) batch loss, one entry per epoch
try:
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, batch in enumerate(dataloader):
            # batch[0] holds the token ids, batch[1] the attention mask.
            input_ids = batch[0].to(device).long()
            attention_mask = batch[1].to(device).long()

            # GPT2LMHeadModel shifts the labels internally, so the labels are
            # the inputs themselves -- shifting here as well would train the
            # model to predict the token two steps ahead.
            # Positions the attention mask marks as padding (0) get -100, the
            # value the model's loss ignores, so padding adds nothing to it.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            # Forward pass in reduced precision to get away with even less memory
            with torch.autocast(device_type=device, dtype=autocast_dtype):
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = outputs.loss  # GPT-2 directly computes the loss if labels are provided

            # Log the loss
            detached_loss = loss.item()
            writer.add_scalar('Loss/train', detached_loss, epoch * steps_per_epoch + batch_idx)
            total_loss += detached_loss

            # Backward pass
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        train_loss.append(total_loss)

    print('Training completed!')
finally:
    # Flush TensorBoard events and release the progress bar even if training
    # aborts part-way, so the partial run is still readable.
    progress_bar.close()
    writer.close()

Logging to: runs\GPT2_Model_14


  0%|          | 0/150040 [00:00<?, ?it/s]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [ ]:
# Persist the whole model object (architecture + weights) after training.
torch.save(obj=model, f='gpt_model.ph')