# Transformer Model - PEGASUS

In [1]:
# !pip install transformers[torch] datasets rouge-score nltk sentencepiece

In [3]:
import torch

from transformers import (
    PegasusForConditionalGeneration, PegasusTokenizer,
    Trainer, TrainingArguments,
    get_linear_schedule_with_warmup
)

from datasets import load_dataset, Dataset
from rouge_score import rouge_scorer
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
import os
import random
import logging

In [4]:
# seed for reproducability

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()

In [5]:
# logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Load Model

In [6]:
# Pegasus Model

model_name = "google/pegasus-xsum"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor

In [8]:
# set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Dataset and Tokenize

In [9]:
# load train, val and test data

train_data = pd.read_csv('../data/pubmed_dataset_preprocessed/train.csv')
val_data = pd.read_csv('../data/pubmed_dataset_preprocessed/val.csv')
test_data = pd.read_csv('../data/pubmed_dataset_preprocessed/test.csv')

In [12]:
from torch.utils.data import Dataset, DataLoader
from transformers import PegasusTokenizer
import torch

# custom Dataset class
class TokenizedDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        article = self.data.iloc[idx]['article']
        abstract = self.data.iloc[idx]['abstract']
        
        # tokenize article and abstract
        input_encoding = self.tokenizer(
            article,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        output_encoding = self.tokenizer(
            abstract,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        input_ids = input_encoding['input_ids'].squeeze(0)
        attention_mask = input_encoding['attention_mask'].squeeze(0)
        
        output_ids = output_encoding['input_ids'].squeeze(0)  # Same here
        output_attention_mask = output_encoding['attention_mask'].squeeze(0)
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': output_ids,
            'output_attention_mask': output_attention_mask
        }

# dataset
train_dataset = TokenizedDataset(train_data, tokenizer, max_length=tokenizer.model_max_length)
val_dataset = TokenizedDataset(val_data, tokenizer, max_length=tokenizer.model_max_length)
test_dataset = TokenizedDataset(test_data, tokenizer, max_length=tokenizer.model_max_length)

# dataLoader
train_loader = DataLoader(train_dataset, batch_size=2, num_workers=8, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=2, num_workers=8, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=2, num_workers=8, pin_memory=True)

In [13]:
for batch in train_loader:
    print(batch)
    break

{'input_ids': tensor([[  126,  4403,   115,  ...,   164,   233,     1],
        [15962, 39237, 35368,  ...,  1532,   196,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 1688, 32680,   115,  ...,     0,     0,     0],
        [15962, 39237, 35368,  ...,     0,     0,     0]]), 'output_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


## Fine-tuning

In [53]:
# function to finetune model

def finetune_model(model, train_dataloader, val_dataloader, tokenizer, 
                num_epochs=3, learning_rate=2e-5, warmup_steps=500, 
                checkpoint_path='../models/best_checkpoint.pt'):
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    
    # scheduler
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )
    
    best_val_loss = float('inf')
    train_losses, val_losses = [], []
    
    # mixed precision (speed up training)
    scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

    try:
        for epoch in range(num_epochs):
            logger.info(f"\nEpoch {epoch + 1}/{num_epochs}")

            # training loop
            model.train()
            epoch_train_loss = 0
            train_progress = tqdm(train_dataloader, desc="Training", leave=False)

            for batch in train_progress:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                optimizer.zero_grad()

                with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss

                # backpropagation
                scaler.scale(loss).backward()
                
                # gradient clipping
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # update weights (and scheduler)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()

                epoch_train_loss += loss.item()
                train_progress.set_postfix({'batch_loss': loss.item()})

                # clear memory
                del outputs, loss
                torch.cuda.empty_cache()

            avg_train_loss = epoch_train_loss / len(train_dataloader)
            train_losses.append(avg_train_loss)
            logger.info(f"Average training loss: {avg_train_loss:.4f}")

            # val loop
            model.eval()
            epoch_val_loss = 0
            val_progress = tqdm(val_dataloader, desc="Validation", leave=False)

            with torch.no_grad():
                for batch in val_progress:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)

                    with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                        loss = outputs.loss

                    epoch_val_loss += loss.item()
                    val_progress.set_postfix({'batch_loss': loss.item()})

                    del outputs, loss
                    torch.cuda.empty_cache()

            avg_val_loss = epoch_val_loss / len(val_dataloader)
            val_losses.append(avg_val_loss)
            logger.info(f"Average validation loss: {avg_val_loss:.4f}")

            # checkpoint best model
#             if avg_val_loss < best_val_loss:
#                 best_val_loss = avg_val_loss
#                 torch.save(model.state_dict(), checkpoint_path)
#                 logger.info(f"Saved new best model with validation loss: {best_val_loss:.4f}")

        # load best model weights before returning
        model.load_state_dict(torch.load(checkpoint_path))
        
    except KeyboardInterrupt:
        logger.info("Training interrupted. Returning current model state.")

    return model, train_losses, val_losses

In [54]:
# folder to save models
os.makedirs('../models', exist_ok=True)

In [None]:
finetuned_model, train_loss, val_loss = finetune_model(model, train_loader, val_loader, tokenizer, 
                num_epochs=3, learning_rate=2e-5, warmup_steps=500, 
                checkpoint_path='../models/pegasus.pt')

In [None]:
# function to plot training and validation loss

def plot_loss_curves(train_losses, val_losses, title="Loss Curves"):
    plt.figure(figsize=(7, 5))
    
    plt.plot(train_losses, label='Training Loss', color='blue', linestyle='-', linewidth=2)
    plt.plot(val_losses, label='Validation Loss', color='red', linestyle='--', linewidth=2)
    
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title(title)
    
    plt.legend()
    
    # Display the plot
    plt.tight_layout()
    plt.show()

In [None]:
plot_loss_curves(train_loss, val_loss)

## ROUGE Scores

In [None]:
# function to evaluate model (rouge scores)

def evaluate_model(model, test_dataloader, tokenizer, max_length=512, num_beams=4):
    
    model = model.to(device)
    model.eval()
    
    predictions = []
    references = []
    
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Generate summaries
            summary_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                no_repeat_ngram_size=2
            )
            
            # Decode predictions
            decoded_preds = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
            decoded_refs = [tokenizer.decode(g, skip_special_tokens=True) for g in batch['labels']]
            
            predictions.extend(decoded_preds)
            references.extend(decoded_refs)
        
        # Clear GPU memory after all batches
        torch.cuda.empty_cache()
    
    # Calculate ROUGE scores
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'rougeLsum': []}
    
    for pred, ref in zip(predictions, references):
        score = rouge.score(ref, pred)
        for key in scores:
            scores[key].append(score[key].fmeasure)
    
    # Average ROUGE scores
    avg_scores = {key: np.mean(values) for key, values in scores.items()}
    
    return scores, avg_scores, predictions, references

In [None]:
scores, avg_scores, predictions, references = evaluate_model(finetuned_model, test_loader, tokenizer, 
                                                     max_length=tokenizer.model_max_length)

In [None]:
# rouge scores

for metric, score in avg_scores.items():
        print(f"{metric}: {score:.4f}")

In [None]:
# pd.DataFrame({
#         'article': test_data['article'],
#         'reference': test_data['abstract'],
#         'prediction': predictions
#     })