# Abstractive Summarization with HuggingFace and PEGASUS 

In [11]:
# Set up Google Colab runtime
import sys
if "google.colab" in sys.modules:
    print("Setting up Google Colab... ")
    !git clone https://github.com/Strabes/hfmodels.git
    %cd hfmodels
    from install import install_requirements
    install_requirements()

In [1]:
# imports
from transformers import pipeline, set_seed
from datasets import load_dataset, load_metric
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
#nltk.download("punkt")


## CNN/Daily Mail Dataset

The CNN/Daily Mail Dataset

In [2]:
#dataset = load_dataset("cnn_dailymail", version="3.0.0", download_mode="force_redownload")
dataset = load_dataset("cnn_dailymail", version="3.0.0")
print(f"Features: {dataset['train'].column_names}")

Using custom data configuration default
Reusing dataset cnn_dailymail (C:\Users\grego\.cache\huggingface\datasets\cnn_dailymail\default\3.0.0\e6f7373c4552f36af359a1fc84b24352f22070483560e87f524e1730f8cf5539)
100%|██████████| 3/3 [00:00<00:00,  3.71it/s]

Features: ['article', 'highlights', 'id']





In [3]:
sample = dataset["train"][0]
print(f"""
Article (excerpt of 1000 characters, total length: {len(sample["article"])}):
""")
print(sample["article"][:1000])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])


Article (excerpt of 1000 characters, total length: 2527):

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his

In [4]:
string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(string)

['The U.S. are a country.', 'The U.N. is an organization.']

In [5]:
sample_text = dataset["train"][0]["article"][:2000]
# We'll collect the generated summaries of each model in a dictionary
summaries = {}

pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

In [6]:
print(summaries["pegasus"])

Harry Potter star Daniel Radcliffe gains access to a reported £20 million fortune.
Young actor says he has no plans to fritter his cash away.
Radcliffe's earnings from the first five Potter films have been held in a trust fund .


In [7]:
#bleu_metric = load_metric("sacrebleu")
rouge_metric = load_metric("rouge")

In [8]:
reference = dataset["train"][0]["highlights"]
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

for model_name in summaries:
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
    records.append(rouge_dict)
pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.8,0.692308,0.8,0.8


In [10]:
dataset["train"]

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 90152
})