In [1]:
!pip install datasets rouge-score bert-score spacy pytextrank

import spacy
import pytextrank
from datasets import load_dataset
from rouge_score import rouge_scorer
from bert_score import score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting pytextrank
  Downloading pytextrank-3.3.0-py3-none-any.whl.metadata (12 kB)
Collecting icecream>=2.1 (from pytextrank)
  Downloading icecream-2.1.7-py3-none-any.whl.metadata (1.5 kB)
Collecting colorama>=0.3.9 (from icecream>=2.1->pytextrank)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting executing>=2.1.0 (from icecream>=2.1->pytextrank)
  Downloading executing-2.2.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting asttokens>=2.0.1 (from icecream>=2.1->pytextrank)
  Downloading asttokens-3.0.0-py3-none-any.whl.metadata (4.7 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytextrank-3.3.0-py3-

In [2]:
# 1. Load dataset (CNN/Daily Mail)
# The split expression "test[:1000]" selects the first 1000 rows of the test split.
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="test[:1000]",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [3]:
# 2. Setup SpaCy + TextRank
# Load the small English model, then register the "textrank" component
# (made available by importing pytextrank) on that pipeline.
nlp = spacy.load(name="en_core_web_sm")
nlp.add_pipe(factory_name="textrank")

<pytextrank.base.BaseTextRankFactory at 0x7cf92868b320>

In [4]:
def textrank_summarize(text, n_sentences=3):
    """Build an extractive summary of ``text`` with the TextRank pipeline.

    Runs the module-level ``nlp`` pipeline over ``text`` and joins the
    sentences yielded by ``doc._.textrank.summary`` with single spaces.

    Args:
        text: Document to summarize.
        n_sentences: Forwarded to the summarizer as ``limit_sentences``.

    Returns:
        The selected sentences joined by single spaces. An empty string is
        returned when ``text`` is ``None``, empty, or whitespace-only.
    """
    # Nothing to rank: skip the pipeline entirely instead of pushing a
    # degenerate document through the parser and the ranker.
    if not text or not text.strip():
        return ""

    doc = nlp(text)
    selected = doc._.textrank.summary(limit_sentences=n_sentences)
    return " ".join(sent.text for sent in selected)

In [6]:
# 3. Generate summaries
# Gold summaries come straight from the "highlights" field of each record.
references = [record["highlights"] for record in dataset]

# One three-sentence TextRank summary per article, in the same record order
# as `references`, so the two lists stay index-aligned.
predictions = [
    textrank_summarize(record["article"], n_sentences=3)
    for record in dataset
]


In [7]:
# 4. Evaluation: ROUGE
# A single scorer instance covering the three ROUGE variants, with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [8]:
# Running totals of the per-pair F-measures, keyed by ROUGE variant.
rouge_totals = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
for reference, prediction in zip(references, predictions):
    pair_scores = scorer.score(reference, prediction)
    for metric in rouge_totals:
        rouge_totals[metric] += pair_scores[metric].fmeasure

# Expose the totals under the names the averaging cell reads.
rouge1 = rouge_totals['rouge1']
rouge2 = rouge_totals['rouge2']
rougel = rouge_totals['rougeL']

In [9]:
# Turn the accumulated F-measure totals into per-sample means and print them.
n = len(predictions)
for label, total in (
    ("ROUGE-1:", rouge1),
    ("ROUGE-2:", rouge2),
    ("ROUGE-L:", rougel),
):
    print(label, total / n)

ROUGE-1: 0.25084404083752665
ROUGE-2: 0.07852253562293304
ROUGE-L: 0.1624421310721664


In [10]:
# 5. Evaluation: BERTScore
# `score` returns three values, unpacked here as P, R, F1; only the mean of
# F1 is reported.
P, R, F1 = score(
    cands=predictions,
    refs=references,
    lang="en",
    verbose=True,
)
mean_f1 = F1.mean().item()
print("BERTScore F1:", mean_f1)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/32 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/16 [00:00<?, ?it/s]

done in 26.05 seconds, 38.39 sentences/sec
BERTScore F1: 0.8508186936378479


In [11]:
# 6. Show example
# Print the first three records: a 400-character article preview, the
# reference highlights, and the TextRank summary at the same index.
for i in range(3):
    example = dataset[i]
    article_preview = example["article"][:400]
    print("\nARTICLE:", article_preview, "...")
    print("REFERENCE SUMMARY:", example["highlights"])
    print("TEXT RANK SUMMARY:", predictions[i])


ARTICLE: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also acce ...
REFERENCE SUMMARY: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
TEXT RANK SUMMARY: Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes commit