In [1]:
# Import necessary libraries
import pandas as pd  # For handling data in dataframes
import torch  # PyTorch for deep learning
from torch.utils.data import DataLoader, Dataset  # For creating and loading datasets
from transformers import T5Tokenizer, T5ForConditionalGeneration  # Hugging Face Transformers for T5 model
from tqdm import tqdm  # For progress bars
from rouge_score import rouge_scorer  # For calculating ROUGE scores

# Load the validation data and fail fast if the columns read downstream are absent
validation_data = pd.read_csv('validate_data.csv')  # Load validation data from a CSV file

# MyDataset.__getitem__ looks up 'article' and 'highlight' on every row; checking here
# surfaces a schema problem immediately rather than as a KeyError raised from inside the
# DataLoader once evaluation has already started.
_missing_columns = {'article', 'highlight'} - set(validation_data.columns)
if _missing_columns:
    raise ValueError(
        f"validate_data.csv is missing required column(s): {sorted(_missing_columns)}"
    )


In [2]:
# Define a custom dataset class
class MyDataset(Dataset):
    """Map-style dataset that tokenizes article/highlight pairs on demand.

    Each row of ``data`` must expose an ``'article'`` and a ``'highlight'``
    field. Nothing is tokenized up front; encoding happens every time an item
    is requested.

    Args:
        data: Table of samples, addressed positionally through ``.iloc``.
        tokenizer: Object providing ``encode_plus``; the value it returns must
            expose ``input_ids`` and ``attention_mask`` attributes.
        max_input_length: Length every article is padded/truncated to.
        max_output_length: Length every highlight is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=150):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        """Number of samples, i.e. the length of the underlying table."""
        return len(self.data)

    def _encode(self, text, limit):
        """Tokenize ``text``, padding and truncating it to exactly ``limit`` positions."""
        return self.tokenizer.encode_plus(
            text,
            max_length=limit,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the encoded sample stored at position ``idx``.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'`` for the
            article, and ``'labels'`` holding the highlight's flattened token IDs.
        """
        row = self.data.iloc[idx]
        article, highlight = row['article'], row['highlight']

        source = self._encode(article, self.max_input_length)
        target = self._encode(highlight, self.max_output_length)

        # Flatten so each field is a 1-D tensor regardless of the leading
        # dimension the tokenizer adds when returning tensors.
        sample = {}
        sample['input_ids'] = source.input_ids.flatten()
        sample['attention_mask'] = source.attention_mask.flatten()
        sample['labels'] = target.input_ids.flatten()
        return sample


In [3]:
# Tokenizer is taken from the 't5-small' checkpoint, model weights from 'fine_tuning'
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('fine_tuning')

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Wrap the validation frame in the dataset; length limits are left at MyDataset's defaults
validation_dataset = MyDataset(data=validation_data, tokenizer=tokenizer)

# Batches of 8, iterated in file order (no shuffling)
validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=8,
    shuffle=False,
)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Set device to GPU if available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Move the model to the selected device

# Evaluation loop: running sums of per-sample ROUGE F-measures plus the sample count
total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
total_samples = 0

model.eval()  # Set the model to evaluation mode
for batch in tqdm(validation_loader, desc="Evaluating"):  # Iterate over validation data with a progress bar
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Labels are never passed to the model here; they are only decoded back to text,
    # so they are left on the device the loader produced them on instead of being
    # copied to the accelerator for every batch.
    labels = batch['labels']

    with torch.no_grad():  # Gradients are not needed for generation
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=150,  # Same value as MyDataset's default max_output_length
            num_beams=2,
            early_stopping=True,
        )

    # Turn token IDs back into text, dropping special tokens
    generated_summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    target_summaries = tokenizer.batch_decode(labels, skip_special_tokens=True)

    for gen_summary, target_summary in zip(generated_summaries, target_summaries):
        # scorer.score is called as (target, prediction)
        scores = scorer.score(target_summary, gen_summary)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
        total_samples += 1

# An empty loader would otherwise end in a bare ZeroDivisionError below
if total_samples == 0:
    raise ValueError("validation_loader produced no samples; cannot average ROUGE scores")

# Calculate average ROUGE scores for validation
avg_rouge1 = total_rouge1 / total_samples
avg_rouge2 = total_rouge2 / total_samples
avg_rougeL = total_rougeL / total_samples

# Print average ROUGE scores with four decimal places
print(f"Average ROUGE-1 (Validation): {avg_rouge1:.4f}")
print(f"Average ROUGE-2 (Validation): {avg_rouge2:.4f}")
print(f"Average ROUGE-L (Validation): {avg_rougeL:.4f}")


Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████| 216/216 [1:49:27<00:00, 30.40s/it]

Average ROUGE-1 (Validation): 0.3504
Average ROUGE-2 (Validation): 0.1569
Average ROUGE-L (Validation): 0.2607



