# **GPT-2 Fine-Tuning**

This Notebook performs the fine-tuning of GPT2 Medium on our preprocessed dataset (see [here](https://github.com/TomSOWI/DLSS-24-Synthetic-Product-Reviews-Generation/blob/main/Data/preprocessed_reviews.parquet)):

* Check if chunking/truncation of reviews is required
* Assess tokenizer parameters
* Define PyTorch Dataset
* Fine-tuning
* Display model info
* Save model
* Show training loss

## Preparations

In [15]:
# Import necessary packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2LMHeadModel, set_seed
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

In [16]:
# Global variables
# Seed shared by every stochastic step; set_seed is called once here and again inside the training function.
SEED_VAL = 42
set_seed(SEED_VAL)

# Define model related parameters
# Checkpoint name passed to every `from_pretrained` call in this notebook.
MODEL_NAME = "gpt2-medium"
# Special tokens handed to the tokenizer. The tokenizer report further down shows that
# BOS and PAD receive new ids (50257 / 50258) while EOS keeps the stock id 50256.
BOS_TOKEN = '<|startoftext|>'
EOS_TOKEN = '<|endoftext|>'
PAD_TOKEN = '<|pad|>'
# Sequence length used for truncation/padding; equals the tokenizer's reported model_max_length.
MAX_LENGTH = 1024

In [17]:
# Load data for training
# NOTE: the path is relative, so the notebook must be started from the directory that contains `Data/`.
trainset = pd.read_parquet("Data/preprocessed_reviews.parquet")

# Move reviews to a list for analysis and training
# Only the 'text' column is used from here on.
texts=trainset['text'].to_list()

## Analyze Review Length

In [18]:
# Tokenizer configured with the project's BOS/EOS/PAD special tokens
tokenizer = GPT2Tokenizer.from_pretrained(
    MODEL_NAME,
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
)

# Number of token ids per review, measured without truncation or padding
# so the raw lengths can be compared against the context size.
doc_lengths = [
    len(tokenizer.encode(review, add_special_tokens=True, truncation=False, padding=False))
    for review in texts
]

In [14]:
# Work on an ndarray: `doc_lengths` is a plain Python list, and comparing a list
# with an int (`doc_lengths > MAX_LENGTH`) raises TypeError on a fresh kernel.
doc_lengths_arr = np.asarray(doc_lengths)

print("Max documen length: ", np.max(doc_lengths_arr))
print("Average document length: ", np.average(doc_lengths_arr))
# Count the documents whose token length exceeds the context size
print("Number of ducuments outside of gpt-mediums context size: ", int(np.count_nonzero(doc_lengths_arr > MAX_LENGTH)))

Max documen length:  982
Average document length:  35.68134
Number of ducuments outside of gpt-mediums context size:  0


Since no document exceeds the model's context size of 1024 tokens, we do not need to apply any chunking or truncation technique.

## Assess GPT2-Tokenizer Parameters

In [7]:
# Load the tokenizer without overrides to see which special tokens it defines by default
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
for label, attr_name in (("BOS", "bos_token"), ("EOS", "eos_token"), ("PAD", "pad_token")):
    print(f'Base {label}: {getattr(tokenizer, attr_name)}')

# The global special-token variables are adjusted according to the model's own naming

Base BOS: <|endoftext|>
Base EOS: <|endoftext|>
Base PAD: None


In [8]:
# Re-create the tokenizer, this time with the project's special tokens registered.
tokenizer = GPT2Tokenizer.from_pretrained(
    MODEL_NAME,
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
)

print("The max model length is {} for this model".format(tokenizer.model_max_length))

# One (message template, token id) pair per special token; each line shows the token string and its id.
special_token_reports = (
    ("The beginning of sequence token {} token has the id {}", tokenizer.bos_token_id),
    ("The end of sequence token {} has the id {}", tokenizer.eos_token_id),
    ("The padding token {} has the id {}", tokenizer.pad_token_id),
)
for template, token_id in special_token_reports:
    print(template.format(tokenizer.convert_ids_to_tokens(token_id), token_id))

The max model length is 1024 for this model
The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


## Define PyTorch Dataset

In [9]:
class TextDataset(Dataset):
  """Dataset of texts wrapped in BOS/EOS markers and padded to a fixed length.

  Each item is a pair ``(input_ids, attention_mask)`` of 1-D tensors built from
  the tokenizer output for ``bos_token + text + eos_token``.
  """

  def __init__(self, txt_list, tokenizer, max_length):
    """Tokenize every text once, up front.

    Parameters:
    -----------
    txt_list : iterable of str
        Raw texts to encode.
    tokenizer : callable
        Tokenizer exposing ``bos_token`` / ``eos_token`` and returning a mapping
        with ``'input_ids'`` and ``'attention_mask'`` when called.
    max_length : int
        Length every sequence is truncated / padded to.

    Raises:
    -------
    ValueError
        If the tokenizer defines no BOS or EOS token.
    """
    # Store features
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    # Take the markers from the tokenizer that actually encodes the text, so the
    # dataset does not silently depend on notebook-level globals matching it.
    bos, eos = tokenizer.bos_token, tokenizer.eos_token
    if bos is None or eos is None:
      raise ValueError("tokenizer must define both bos_token and eos_token")

    for txt in txt_list:
      # Encode text in between BOS and EOS; truncation is a safety net only
      encodings_dict = tokenizer(bos + txt + eos, truncation=True, max_length=max_length, padding="max_length")

      # Make features tensors and store
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    """Number of encoded texts."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` for the text at position ``idx``."""
    return self.input_ids[idx], self.attn_masks[idx]

## Fine-Tuning

In [10]:
# Set environment variables depending on the available GPU
# Use GPU 0
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

#torch.cuda.set_device(0)

In [11]:
# Ask PyTorch to release cached, currently unused GPU memory before the training run starts.
torch.cuda.empty_cache()

In [12]:
def fine_tune_gpt2(train_data):
    """
    Fine-tunes a pre-trained GPT-2 model on a given dataset.

    The tokenizer is created inside this function with the notebook's special
    tokens and is not returned. Padding positions are excluded from the
    language-modelling loss.

    Parameters:
    -----------
    train_data : list of str
        A list of text sequences to be used for training the model.

    Returns:
    --------
    args : dict
        A dictionary containing the hyperparameters and settings used during training, 
        including model name, device information, batch size, learning rate, etc.

    training_stats : list of dict
        A list of dictionaries, one per epoch, with the keys 'epoch' and
        'Training Loss' (mean batch loss of that epoch).

    model : GPT2LMHeadModel
        The fine-tuned GPT-2 model

    Raises:
    -------
    ValueError
        If `train_data` is empty.
    """
    # Fail fast, before loading the model, instead of hitting a ZeroDivisionError after the loop
    if len(train_data) == 0:
        raise ValueError("train_data must contain at least one text sequence.")

    # Initialize list to store stats for each epoch
    training_stats = []
    # Initialize the args dictionary to hold hyperparameters
    args = {}

    # Set the seed value all over the place to make this reproducible.
    set_seed(SEED_VAL)

    # Update args with seed and model name
    args['seed_val'] = SEED_VAL
    args['model_name'] = MODEL_NAME

    # Load GPT-2 model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME, bos_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN) # add special tokens
    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer)) # resize embeddings according to the updated tokenizer

    # Update args with tokenizer and model hyperparameters.
    # Token ids are read from the tokenizer (kept as strings, as before) so the
    # recorded values cannot drift from the tokenizer actually used.
    args['bos_token'] = str(tokenizer.bos_token_id)  # Beginning of sentence token ID
    args['eos_token'] = str(tokenizer.eos_token_id)  # End of sentence token ID
    args['pad_token'] = str(tokenizer.pad_token_id)  # Padding token ID
    args['max_length'] = MAX_LENGTH  # Maximum length of input sequences
    args['batch_size'] = 2  # Batch size for training
    args['learning_rate'] = 5e-4  # Learning rate for the optimizer
    args['eps'] = 1e-8  # Epsilon value to prevent division by zero in the optimizer
    args['num_train_epochs'] = 5  # Number of epochs for training
    args['num_warmup_steps'] = 100  # Number of warmup steps (an integer step count) for the scheduler

    # Load training dataset
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        txt_list=train_data,
        max_length=args['max_length']
    )

    # Create data loader
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args['batch_size']
    )

    # Set up optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=args['learning_rate'], eps=args['eps'])

    # Calculate number of total steps
    total_steps = len(train_dataloader) * args['num_train_epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args['num_warmup_steps'],
        num_training_steps=total_steps
    )

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Save used device to args
    args['device'] = device

    # Training loop
    # Set model to training mode
    model.train()
    # Iterate over each epoch
    for epoch in range(args['num_train_epochs']):
        # Progress bar with expected completion time for one epoch
        epoch_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
        # Running sum of the batch losses of this epoch
        epoch_loss = 0
        # Iterate over each batch
        for step, batch in enumerate(epoch_iterator):
            # Unpack the batch into inputs and attention masks
            inputs, attention_masks = batch
            # Move features to GPU
            inputs = inputs.to(device)
            attention_masks = attention_masks.to(device)

            # Every sequence is padded to max_length, so without masking most label
            # positions would be pad tokens and dominate the loss. -100 is the index
            # the loss ignores; the attention mask is 0 exactly on padding.
            labels = inputs.clone()
            labels[attention_masks == 0] = -100

            # Forward pass: compute model predictions and calculate loss
            outputs = model(inputs, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            # Accumulate the batch loss
            epoch_loss += loss.item()
            # Backward pass: compute gradients
            loss.backward()
            # Update model parameters and learning rate
            optimizer.step()
            scheduler.step()
            # Reset gradients for the next step
            optimizer.zero_grad()

        # Calculate the average loss for the current epoch
        avg_train_loss = epoch_loss / len(train_dataloader)
        # Store the epoch number and average training loss in the training stats list
        training_stats.append(
            {
                'epoch': epoch + 1, # Start at epoch 1 instead of 0
                'Training Loss': avg_train_loss,
            }
        )
    # Return the training arguments, statistics, and the fine-tuned model
    return args, training_stats, model

# Fine-tune gpt2-medium
# Returns the hyperparameter dict, the per-epoch loss records and the trained model;
# all three are used by the cells below (model info, saving, loss plot).
args, training_stats, model = fine_tune_gpt2(train_data=texts)

Epoch 1: 100%|██████████| 50/50 [00:29<00:00,  1.68it/s]
Epoch 2: 100%|██████████| 50/50 [00:30<00:00,  1.66it/s]
Epoch 3: 100%|██████████| 50/50 [00:30<00:00,  1.66it/s]
Epoch 4: 100%|██████████| 50/50 [00:30<00:00,  1.65it/s]
Epoch 5: 100%|██████████| 50/50 [00:30<00:00,  1.67it/s]


## Display Model Info

In [17]:
# Collect (name, tensor) pairs for every named parameter of the model.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

# Each section is a banner plus the slice of parameters listed under it.
sections = (
    ('==== Embedding Layer ====\n', params[0:2]),
    ('\n==== First Transformer ====\n', params[2:14]),
    ('\n==== Output Layer ====\n', params[-2:]),
)

for banner, section_params in sections:
    print(banner)
    for param_name, param_tensor in section_params:
        # Left-aligned name, right-aligned shape tuple
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

The GPT-2 model has 292 different named parameters.

==== Embedding Layer ====

transformer.wte.weight                                  (50259, 1024)
transformer.wpe.weight                                  (1024, 1024)

==== First Transformer ====

transformer.h.0.ln_1.weight                                  (1024,)
transformer.h.0.ln_1.bias                                    (1024,)
transformer.h.0.attn.c_attn.weight                      (1024, 3072)
transformer.h.0.attn.c_attn.bias                             (3072,)
transformer.h.0.attn.c_proj.weight                      (1024, 1024)
transformer.h.0.attn.c_proj.bias                             (1024,)
transformer.h.0.ln_2.weight                                  (1024,)
transformer.h.0.ln_2.bias                                    (1024,)
transformer.h.0.mlp.c_fc.weight                         (1024, 4096)
transformer.h.0.mlp.c_fc.bias                                (4096,)
transformer.h.0.mlp.c_proj.weight                       (4096

## Save Fine-Tuned Model

In [19]:
output_dir = './models/GPT2-medium'

# Create the output directory (including parents) if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save fine-tuned model and tokenizer.
# A model wrapped for distributed/parallel training exposes the real model as `.module`.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
# `tokenizer` is the notebook-level tokenizer configured with the special tokens.
tokenizer.save_pretrained(output_dir)

# Save training arguments with fine-tuned model
torch.save(args, os.path.join(output_dir, 'training_args.bin'))

Saving model to ./models/GPT


## Show training loss

In [None]:
# Show floats with two decimals in rendered DataFrames (global pandas display option).
pd.set_option('display.precision', 2)

# Print training stats using DataFrame format
# One row per epoch with the columns 'epoch' and 'Training Loss'.
df = pd.DataFrame(training_stats)
df

In [None]:
# Plot training loss
fig, ax = plt.subplots()
ax.plot(df['epoch'], df['Training Loss'])
ax.set_title("Training loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
# Save BEFORE showing: once plt.show() has displayed the figure, a following
# plt.savefig() writes an empty canvas.
fig.savefig(os.path.join(output_dir, 'training_loss.png'))
plt.show()