In [1]:
!pip install transformers
!pip install datasets
!pip install rouge-score
!pip install nltk
!pip install tqdm

[0m

In [3]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the dataset from the Hugging Face Hub
dataset_name = "pszemraj/scientific_lay_summarisation-plos-norm"
dataset = load_dataset(dataset_name, split="test")

source_texts = [example["article"] for example in dataset]
target_texts = [example["summary"] for example in dataset]

# Summarize and calculate scores using batch processing
rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = []
bleu_references = []
bleu_hypotheses = []

batch_size = 25
num_batches = len(source_texts) // batch_size

for i in tqdm(range(num_batches), desc="Processing batches"):
    start = i * batch_size
    end = start + batch_size
    batch_sources = source_texts[start:end]
    batch_targets = target_texts[start:end]

    inputs = tokenizer(batch_sources, return_tensors="pt", padding=True, truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=8, max_length=250, early_stopping=True)
    summaries = [tokenizer.decode(summary_id, skip_special_tokens=True) for summary_id in summary_ids]

    for target, summary in zip(batch_targets, summaries):
        rouge_scores.append(rouge_scorer.score(target, summary))
        bleu_references.append([target.split()])
        bleu_hypotheses.append(summary.split())

rouge1_avg = sum(score["rouge1"].fmeasure for score in rouge_scores) / len(rouge_scores)
rouge2_avg = sum(score["rouge2"].fmeasure for score in rouge_scores) / len(rouge_scores)
rougeL_avg = sum(score["rougeL"].fmeasure for score in rouge_scores) / len(rouge_scores)
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)

print(f"ROUGE-1: {rouge1_avg:.4f}")
print(f"ROUGE-2: {rouge2_avg:.4f}")
print(f"ROUGE-L: {rougeL_avg:.4f}")
print(f"BLEU: {bleu_score:.4f}")

Processing batches: 100%|██████████| 55/55 [30:11<00:00, 32.93s/it]


ROUGE-1: 0.4333
ROUGE-2: 0.1283
ROUGE-L: 0.2128
BLEU: 0.0766


In [4]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the dataset from the Hugging Face Hub
dataset_name = "pszemraj/scientific_lay_summarisation-plos-norm"
dataset = load_dataset(dataset_name, split="test")

source_texts = [example["article"] for example in dataset]
target_texts = [example["summary"] for example in dataset]

# Summarize and calculate scores using batch processing
rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = []
bleu_references = []
bleu_hypotheses = []

batch_size = 25
num_batches = len(source_texts) // batch_size

for i in tqdm(range(num_batches), desc="Processing batches"):
    start = i * batch_size
    end = start + batch_size
    batch_sources = source_texts[start:end]
    batch_targets = target_texts[start:end]

    inputs = tokenizer(batch_sources, return_tensors="pt", padding=True, truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=8, max_length=250, early_stopping=True)
    summaries = [tokenizer.decode(summary_id, skip_special_tokens=True) for summary_id in summary_ids]

    for target, summary in zip(batch_targets, summaries):
        rouge_scores.append(rouge_scorer.score(target, summary))
        bleu_references.append([target.split()])
        bleu_hypotheses.append(summary.split())

rouge1_avg = sum(score["rouge1"].fmeasure for score in rouge_scores) / len(rouge_scores)
rouge2_avg = sum(score["rouge2"].fmeasure for score in rouge_scores) / len(rouge_scores)
rougeL_avg = sum(score["rougeL"].fmeasure for score in rouge_scores) / len(rouge_scores)
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)

print(f"ROUGE-1: {rouge1_avg:.4f}")
print(f"ROUGE-2: {rouge2_avg:.4f}")
print(f"ROUGE-L: {rougeL_avg:.4f}")
print(f"BLEU: {bleu_score:.4f}")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing batches: 100%|██████████| 55/55 [12:58<00:00, 14.16s/it]


ROUGE-1: 0.3435
ROUGE-2: 0.1127
ROUGE-L: 0.1970
BLEU: 0.0325


In [5]:
import torch
from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "microsoft/prophetnet-large-uncased"
tokenizer = ProphetNetTokenizer.from_pretrained(model_name)
model = ProphetNetForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the dataset from the Hugging Face Hub
dataset_name = "pszemraj/scientific_lay_summarisation-plos-norm"
dataset = load_dataset(dataset_name, split="test")

source_texts = [example["article"] for example in dataset]
target_texts = [example["summary"] for example in dataset]

# Summarize and calculate scores using batch processing
rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = []
bleu_references = []
bleu_hypotheses = []

batch_size = 25
num_batches = len(source_texts) // batch_size

for i in tqdm(range(num_batches), desc="Processing batches"):
    start = i * batch_size
    end = start + batch_size
    batch_sources = source_texts[start:end]
    batch_targets = target_texts[start:end]

    inputs = tokenizer(batch_sources, return_tensors="pt", padding=True, truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=8, max_length=250, early_stopping=True)
    summaries = [tokenizer.decode(summary_id, skip_special_tokens=True) for summary_id in summary_ids]

    for target, summary in zip(batch_targets, summaries):
        rouge_scores.append(rouge_scorer.score(target, summary))
        bleu_references.append([target.split()])
        bleu_hypotheses.append(summary.split())

rouge1_avg = sum(score["rouge1"].fmeasure for score in rouge_scores) / len(rouge_scores)
rouge2_avg = sum(score["rouge2"].fmeasure for score in rouge_scores) / len(rouge_scores)
rougeL_avg = sum(score["rougeL"].fmeasure for score in rouge_scores) / len(rouge_scores)
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)

print(f"ROUGE-1: {rouge1_avg:.4f}")
print(f"ROUGE-2: {rouge2_avg:.4f}")
print(f"ROUGE-L: {rougeL_avg:.4f}")
print(f"BLEU: {bleu_score:.4f}")

Processing batches: 100%|██████████| 55/55 [40:16<00:00, 43.93s/it]


ROUGE-1: 0.2885
ROUGE-2: 0.0795
ROUGE-L: 0.1560
BLEU: 0.0210
