In [3]:
from datasets import load_dataset
datasets = load_dataset("cnn_dailymail", "3.0.0")

In [4]:
from datasets import load_dataset
from rouge_score import rouge_scorer
import numpy as np

In [11]:
from datasets import load_dataset
from rouge_score import rouge_scorer
import numpy as np

In [17]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load pre-trained tokenizers and models
bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

bart_original_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
bart_original_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

t5_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

t5_large_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-large")
t5_large_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-large")

# Load fine-tuned model and tokenizer
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_summarizer/checkpoint-375")
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_summarizer/checkpoint-375")

def preprocess_article(article: str, model_type: str = "bart", max_length: int = 1024):
    """
    Preprocess the input article by tokenizing it using the specified model type.
    
    Args:
        article (str): The input article text to preprocess.
        max_length (int): Maximum token length for compatibility with the selected model.
        model_type (str): The model type to use for tokenization. Options are "bart", "t5", "bart-original", "t5-large", "fine-tuned".
    
    Returns:
        dict: Tokenized input ready for summarization.
    """
    if model_type == "t5":
        tokenized_input = t5_tokenizer(
            article,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    elif model_type == "t5-large":
        tokenized_input = t5_large_tokenizer(
            article,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    elif model_type == "bart-original":
        tokenized_input = bart_original_tokenizer(
            article,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    elif model_type == "bart":
        tokenized_input = bart_tokenizer(
            article,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    elif model_type == "fine-tuned":
        tokenized_input = fine_tuned_tokenizer(
            article,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    else:
        raise ValueError(f"Unsupported model type: {model_type}")
    
    return tokenized_input




In [None]:
def generate_summary(article: str, model_type: str = "bart", max_input_length: int = 1024, max_summary_length: int = 150):
    """
    Generate a summary of the input article using either T5 or BART.
    
    Args:
        article (str): The input article text.
        model_type (str): Type of model, either 't5' or 'bart'.
        max_input_length (int): Max length of the tokenized input.
        max_summary_length (int): Max length of the generated summary.
    
    Returns:
        str: The generated summary.
    """
    # Select the appropriate model and tokenizer
    if model_type == "t5":
        model = t5_model
        tokenizer = t5_tokenizer
    elif model_type == "bart":
        model = bart_model
        tokenizer = bart_tokenizer
    elif model_type == "t5-large":
        model = t5_large_model
        tokenizer = t5_large_tokenizer
    elif model_type == "bart-original":
        model = bart_original_model
        tokenizer = bart_original_tokenizer
    elif model_type == "fine-tuned":
        model = fine_tuned_model
        tokenizer = fine_tuned_tokenizer
    
    else:
        raise ValueError("Invalid model_type. Choose 't5' or 'bart'.")

    # Preprocess the article
    tokenized_input = preprocess_article(article, model_type=model_type, max_length=max_input_length)
    
    # Generate the summary
    summary_ids = model.generate(
        tokenized_input['input_ids'],
        max_length=max_summary_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:
from tqdm import tqdm
from rouge_score import rouge_scorer
import numpy as np

def calculate_rouge(dataset, model_type="bart", max_input_length=1024, max_summary_length=150):
    """
    Evaluates ROUGE scores for the last 100 examples in the dataset with progress display.

    Args:
        dataset (Dataset): The dataset with 'article' and 'highlights' columns.
        model_type (str): Type of model, such as 't5', 'bart', or 'fine-tuned'.
        max_input_length (int): Maximum token length for input.
        max_summary_length (int): Maximum token length for generated summary.
    
    Returns:
        dict: Average ROUGE-1, ROUGE-2, and ROUGE-L scores.
    """
    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    # Target the last 100 examples in the dataset
    start_idx = max(0, len(dataset) - 5)

    # Progress bar
    print("Calculating ROUGE scores...")
    for idx in tqdm(range(start_idx, len(dataset)), desc="Processing examples", unit="example"):
        example = dataset[idx]

        # Generate the summary
        generated_summary = generate_summary(
            example["article"],
            model_type=model_type,
            max_input_length=max_input_length,
            max_summary_length=max_summary_length
        )

        # Calculate ROUGE scores
        scores = scorer.score(example["highlights"], generated_summary)
        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)

    # Calculate average ROUGE scores
    avg_rouge1 = np.mean(rouge1_scores)
    avg_rouge2 = np.mean(rouge2_scores)
    avg_rougeL = np.mean(rougeL_scores)

    return {
        "Average ROUGE-1": avg_rouge1,
        "Average ROUGE-2": avg_rouge2,
        "Average ROUGE-L": avg_rougeL
    }


In [21]:
# Example usage
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")

# Evaluate with BART
bart_rouge_scores = calculate_rouge(dataset, model_type="bart")
print("BART ROUGE Scores:", bart_rouge_scores)

# Evaluate with T5
t5_rouge_scores = calculate_rouge(dataset, model_type="t5")
print("T5 ROUGE Scores:", t5_rouge_scores)

# Evaluate with BART
bart_base_rouge_scores = calculate_rouge(dataset, model_type="bart-original")
print("BART ROUGE Scores:", bart_base_rouge_scores)

# Evaluate with T5
t5_large_rouge_scores = calculate_rouge(dataset, model_type="t5-large")
print("T5-Large ROUGE Scores:", t5_large_rouge_scores)

# Evaluate with fine-tuned model
fine_tuned_rouge_scores = calculate_rouge(dataset, model_type="fine-tuned")
print("Fine-tuned ROUGE Scores:", fine_tuned_rouge_scores)

Calculating ROUGE scores...


Processing examples: 100%|██████████| 5/5 [01:56<00:00, 23.22s/example]


BART ROUGE Scores: {'Average ROUGE-1': 0.4361984024780211, 'Average ROUGE-2': 0.21323528703624234, 'Average ROUGE-L': 0.27493072514747563}
Calculating ROUGE scores...


Processing examples: 100%|██████████| 5/5 [01:52<00:00, 22.56s/example]


T5 ROUGE Scores: {'Average ROUGE-1': 0.4074326829103899, 'Average ROUGE-2': 0.18751513578838058, 'Average ROUGE-L': 0.2605476358979543}
Calculating ROUGE scores...


Processing examples: 100%|██████████| 5/5 [00:58<00:00, 11.72s/example]


BART ROUGE Scores: {'Average ROUGE-1': 0.3800909693730864, 'Average ROUGE-2': 0.14299618242863085, 'Average ROUGE-L': 0.20040745888424674}
Calculating ROUGE scores...


Processing examples: 100%|██████████| 5/5 [04:47<00:00, 57.51s/example]


T5-Large ROUGE Scores: {'Average ROUGE-1': 0.4470120918673608, 'Average ROUGE-2': 0.19643275829011073, 'Average ROUGE-L': 0.23771303828896687}
Calculating ROUGE scores...


Processing examples: 100%|██████████| 5/5 [01:30<00:00, 18.11s/example]

Fine-tuned ROUGE Scores: {'Average ROUGE-1': 0.3978393468182637, 'Average ROUGE-2': 0.17779812193129052, 'Average ROUGE-L': 0.1903461327233912}



