In [1]:
!pip install transformers
!pip install datasets
!pip install rouge-score
!pip install nltk
!pip install tqdm

[0m

In [2]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the dataset from the Hugging Face Hub
dataset_name = "ccdv/govreport-summarization"
dataset = load_dataset(dataset_name, split="test")

source_texts = [example["report"] for example in dataset]
target_texts = [example["summary"] for example in dataset]

# Summarize and calculate scores using batch processing
rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = []
bleu_references = []
bleu_hypotheses = []

batch_size = 25
num_batches = len(source_texts) // batch_size

for i in tqdm(range(num_batches), desc="Processing batches"):
    start = i * batch_size
    end = start + batch_size
    batch_sources = source_texts[start:end]
    batch_targets = target_texts[start:end]

    inputs = tokenizer(batch_sources, return_tensors="pt", padding=True, truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=8, max_length=250, early_stopping=True)
    summaries = [tokenizer.decode(summary_id, skip_special_tokens=True) for summary_id in summary_ids]

    for target, summary in zip(batch_targets, summaries):
        rouge_scores.append(rouge_scorer.score(target, summary))
        bleu_references.append([target.split()])
        bleu_hypotheses.append(summary.split())

rouge1_avg = sum(score["rouge1"].fmeasure for score in rouge_scores) / len(rouge_scores)
rouge2_avg = sum(score["rouge2"].fmeasure for score in rouge_scores) / len(rouge_scores)
rougeL_avg = sum(score["rougeL"].fmeasure for score in rouge_scores) / len(rouge_scores)
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)

print(f"ROUGE-1: {rouge1_avg:.4f}")
print(f"ROUGE-2: {rouge2_avg:.4f}")
print(f"ROUGE-L: {rougeL_avg:.4f}")
print(f"BLEU: {bleu_score:.4f}")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/3.22k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/271M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Processing batches: 100%|██████████| 38/38 [20:45<00:00, 32.78s/it]


ROUGE-1: 0.3413
ROUGE-2: 0.1220
ROUGE-L: 0.1761
BLEU: 0.0250


In [3]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the dataset from the Hugging Face Hub
dataset_name = "ccdv/govreport-summarization"
dataset = load_dataset(dataset_name, split="test")

source_texts = [example["report"] for example in dataset]
target_texts = [example["summary"] for example in dataset]

# Summarize and calculate scores using batch processing
rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = []
bleu_references = []
bleu_hypotheses = []

batch_size = 25
num_batches = len(source_texts) // batch_size

for i in tqdm(range(num_batches), desc="Processing batches"):
    start = i * batch_size
    end = start + batch_size
    batch_sources = source_texts[start:end]
    batch_targets = target_texts[start:end]

    inputs = tokenizer(batch_sources, return_tensors="pt", padding=True, truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=8, max_length=250, early_stopping=True)
    summaries = [tokenizer.decode(summary_id, skip_special_tokens=True) for summary_id in summary_ids]

    for target, summary in zip(batch_targets, summaries):
        rouge_scores.append(rouge_scorer.score(target, summary))
        bleu_references.append([target.split()])
        bleu_hypotheses.append(summary.split())

rouge1_avg = sum(score["rouge1"].fmeasure for score in rouge_scores) / len(rouge_scores)
rouge2_avg = sum(score["rouge2"].fmeasure for score in rouge_scores) / len(rouge_scores)
rougeL_avg = sum(score["rougeL"].fmeasure for score in rouge_scores) / len(rouge_scores)
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)

print(f"ROUGE-1: {rouge1_avg:.4f}")
print(f"ROUGE-2: {rouge2_avg:.4f}")
print(f"ROUGE-L: {rougeL_avg:.4f}")
print(f"BLEU: {bleu_score:.4f}")

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Processing batches: 100%|██████████| 38/38 [09:31<00:00, 15.04s/it]


ROUGE-1: 0.2006
ROUGE-2: 0.0744
ROUGE-L: 0.1251
BLEU: 0.0012


In [4]:
import torch
from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "microsoft/prophetnet-large-uncased"
tokenizer = ProphetNetTokenizer.from_pretrained(model_name)
model = ProphetNetForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the dataset from the Hugging Face Hub
dataset_name = "ccdv/govreport-summarization"
dataset = load_dataset(dataset_name, split="test")

source_texts = [example["report"] for example in dataset]
target_texts = [example["summary"] for example in dataset]

# Summarize and calculate scores using batch processing
rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = []
bleu_references = []
bleu_hypotheses = []

batch_size = 25
num_batches = len(source_texts) // batch_size

for i in tqdm(range(num_batches), desc="Processing batches"):
    start = i * batch_size
    end = start + batch_size
    batch_sources = source_texts[start:end]
    batch_targets = target_texts[start:end]

    inputs = tokenizer(batch_sources, return_tensors="pt", padding=True, truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=8, max_length=250, early_stopping=True)
    summaries = [tokenizer.decode(summary_id, skip_special_tokens=True) for summary_id in summary_ids]

    for target, summary in zip(batch_targets, summaries):
        rouge_scores.append(rouge_scorer.score(target, summary))
        bleu_references.append([target.split()])
        bleu_hypotheses.append(summary.split())

rouge1_avg = sum(score["rouge1"].fmeasure for score in rouge_scores) / len(rouge_scores)
rouge2_avg = sum(score["rouge2"].fmeasure for score in rouge_scores) / len(rouge_scores)
rougeL_avg = sum(score["rougeL"].fmeasure for score in rouge_scores) / len(rouge_scores)
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)

print(f"ROUGE-1: {rouge1_avg:.4f}")
print(f"ROUGE-2: {rouge2_avg:.4f}")
print(f"ROUGE-L: {rougeL_avg:.4f}")
print(f"BLEU: {bleu_score:.4f}")

Downloading (…)prophetnet.tokenizer:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



Downloading model.safetensors:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Processing batches: 100%|██████████| 38/38 [24:54<00:00, 39.33s/it]


ROUGE-1: 0.1336
ROUGE-2: 0.0422
ROUGE-L: 0.0816
BLEU: 0.0001
