In [19]:
from transformers import GPT2LMHeadModel, GPT2Config, get_scheduler
from torch import nn
from tqdm import tqdm
from torch.optim import AdamW
import torch
from dataloader import CustomEncodingVocabulary, GPT2Dataset
CustomEncodingVocabulary.initialize()
import os
from torch.utils.tensorboard import SummaryWriter
from helper import get_next_run_folder
import math

In [20]:
# Set training parameters
num_epochs = 1
batch_size = 128

vocabulary = CustomEncodingVocabulary.tokens
padding_token = CustomEncodingVocabulary.padding_token

print(f'Vocabulary size: {len(vocabulary)}')

config = GPT2Config(
    vocab_size=len(vocabulary),    # Size of your vocabulary (adjust to match your tokenizer)
    n_positions=1024,   # Maximum sequence length
    n_ctx=256,          # Context window size
    n_embd=256,         # Embedding size
    n_layer=2,          # Number of transformer layers
    n_head=2,           # Number of attention heads
    pad_token_id=padding_token,  # Set padding token ID (e.g., same as eos_token)
)

# Use appropriate gpu or cpu
device = ('xpu' if torch.xpu.is_available() else
          'cuda' if torch.cuda.is_available() else
          'cpu')

print('Using device:', device)

Vocabulary size: 421
Using device: xpu


In [21]:
# Instantiate GPT-2 model
model = GPT2LMHeadModel(config)

torch.save(model, 'gpt_model_empty.ph')

In [22]:
# Get dataset and dataloader
dataset = GPT2Dataset('ldp_5_dataset')

dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,  # Number of samples per batch
        shuffle=False,  # This would fuck up our preloading
        num_workers=0,  # This would fuck up our preloading as well... 
    )

In [23]:
# Create tensorboard logger in a new folder, so I have everything logged everytime, since I often forget and then it writes multiple runs into one folder which is a pain to separate. 
# Get the new folder path
log_dir = get_next_run_folder('GPT2_Model')

# Create the directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)

# Initialize SummaryWriter with the new log directory
writer = SummaryWriter(log_dir=log_dir)
print(f"Logging to: {log_dir}")

# Training loop
num_training_steps = num_epochs * len(dataloader)
progress_bar = tqdm(range(num_training_steps))

# Make adjustment to the model
model.train()
model.to(device)

# Compile model for additional training speed
# Torch compile uses the triton backend, which I have not installed. As it turns out its easy to install via pip, but not for intel arc gpus. I will have to dual boot my pc into ubuntu 22.04 in order to install the intel xpu backend for triton. As I like the pycharm environment and I am used to Windows pcs, I will set up ubuntu server and use it as a remote development server and access it via my laptop. This is not the first time I have done this. When it works it works greate, but it takes a lot of time to get running. 
model = torch.compile(model)

# Enable memory optimizations (we can get away with less memory)
model.gradient_checkpointing_enable()

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-4)

# Cosin Anneling with Warmup as learning rate schedluer
lr_scheduler = get_scheduler(
    "cosine", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

# Define loss function
criterion = nn.CrossEntropyLoss(ignore_index=vocabulary[-1] + 1)

train_loss = []
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        input_ids = batch[0].to(device).long() 
        attention_mask = batch[1].to(device).long() 
        # Create the labels which are just the inputs shifted to the right with a padding token at the end
        labels = torch.cat(
            [input_ids[:, 1:], 
             torch.full((len(input_ids), 1), padding_token, device=device, dtype=torch.long)],
            dim=1
        )
        # Zero gradients before the backward pass (best practice for pytorch)
        optimizer.zero_grad()
        
        # Forward pass using half precision to get away with even less memory
        with torch.autocast(device_type='xpu', dtype=torch.float16):
            outputs = model(input_ids=input_ids, 
                            attention_mask=attention_mask,
                            labels=labels)
            # GPT-2 directly computes the loss if labels are provided
            loss = outputs.loss  
        
        # Backward pass
        loss.backward()
        
        # Gradient Clipping to prevent exploding gradients
        total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Optimizer step
        optimizer.step()
        # Update learning rate
        lr_scheduler.step()
        
        # Log some statistics
        detached_loss = loss.detach().cpu().item()
        total_loss += detached_loss
        global_step = epoch * len(dataloader) + batch_idx
        writer.add_scalar('Training Loss', detached_loss, global_step)
        writer.add_scalar('Learning Rate', lr_scheduler.get_last_lr()[0], global_step)
        writer.add_scalar('Gradient Norm', total_norm, global_step)
        # Add if statement to prevent numerical overflow
        perplexity = math.exp(detached_loss) if detached_loss < 20 else float('inf')
        writer.add_scalar('Perplexity', perplexity, global_step)

        progress_bar.update(1)
    
    train_loss.append(total_loss / len(dataloader))
    total_loss = 0

print('Training completed!')
writer.close()

Logging to: runs\GPT2_Model_7


  0%|          | 0/12497 [00:14<?, ?it/s]


BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Cannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at https://github.com/openai/triton

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [ ]:
torch.save(model, 'gpt_model.ph')