In [287]:
import evaluate

# Load the ROUGE metric object; it is used later via rouge.compute(...) to score
# generated summaries against the reference highlights.
rouge = evaluate.load("rouge")

In [288]:
from datasets import load_dataset

# Load configuration "3.0.0" of the abisee/cnn_dailymail dataset.
# NOTE: despite the name `df`, the displayed result is a DatasetDict with
# train/validation/test splits, not a pandas DataFrame.
df = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [289]:
# Display the available splits with their column names and row counts.
df

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [290]:
import re

def clean_text(example):
    """Normalize the reference summary stored under ``example['highlights']``.

    Steps, in order:
      1. Drop every character that is not a word character, whitespace, or one
         of ``. , ' -``.
      2. Collapse runs of whitespace (including newlines) to a single space and
         strip the ends. This runs *after* step 1 so that deleting a
         free-standing symbol (e.g. the ``&`` in ``"A & B"``) cannot leave a
         double space behind.
      3. Remove whitespace that precedes a period.
      4. Insert a space after a period that is directly followed by a
         non-space character, except when the period sits between two digits,
         so decimals such as ``3.5`` stay intact.

    Args:
        example: Mapping with a string under the ``'highlights'`` key.

    Returns:
        The same ``example`` object, with ``'highlights'`` replaced by the
        cleaned text (the input mapping is modified in place).
    """
    text = example['highlights']
    text = re.sub(r"[^\w\s.,'-]", "", text)  # Remove unwanted characters
    text = re.sub(r"\s+", " ", text).strip()  # Replace multiple spaces/newlines with a single space
    text = re.sub(r"\s+\.", ".", text)  # Remove space before a period
    # Ensure a space after a period if missing. The first alternative handles
    # periods not preceded by a digit; the second handles periods followed by a
    # non-digit. Together they skip only the digit.digit (decimal) case.
    text = re.sub(
        r"(?<!\d)\.(\S)|\.([^\s\d])",
        lambda m: ". " + (m.group(1) or m.group(2)),
        text,
    )
    example['highlights'] = text
    return example

# Run clean_text over the DatasetDict via map(). This rebinds `df` to the
# cleaned result, so the un-cleaned highlights are no longer reachable under
# this name; re-running the cell cleans the already-cleaned text again.
df = df.map(clean_text)

In [423]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Directory of the saved checkpoint, relative to the notebook's working directory.
save_path = "./saved_models/Model4"

# Load the trained model and tokenizer
# Both are loaded from the same directory and kept as notebook-level globals
# that summarize() reads.
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSeq2SeqLM.from_pretrained(save_path)

def summarize(blog_post, max_input_length=524, max_length=212, min_length=25,
              length_penalty=2.0, num_beams=4):
    """Generate a summary of ``blog_post`` with the loaded seq2seq model.

    Uses the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        blog_post: Text to summarize.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated by the tokenizer.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.
        length_penalty: ``length_penalty`` passed to ``model.generate``.
        num_beams: Number of beams for beam search.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # Tokenize the input text, truncating to max_input_length tokens.
    inputs = tokenizer(blog_post, max_length=max_input_length, truncation=True, return_tensors="pt")

    # Move the tensors to wherever the model lives so the call also works when
    # the model has been placed on an accelerator.
    input_ids = inputs["input_ids"].to(model.device)
    # Pass the attention mask explicitly instead of letting generate() infer it.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(model.device)

    # Generate the summary with beam search.
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [424]:
# Show the first validation article (the text passed to summarize() below).
df["validation"][0]["article"]

'(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard\'s gift was data processing of genetic profiles from donor-recipient pairs. It works on a simple swapping principle but takes it to a much higher level, according to California Pacific Medical Ce

In [425]:
# Show the reference summary (highlights) of the first validation example.
df["validation"][0]["highlights"]

'Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.'

In [426]:
# Generate a summary for the first validation article, for a manual comparison
# with its reference highlights.
summarize(df["validation"][0]["article"])

'Zully Broussard gave one of her kidneys to a stranger. It resulted in six patients receiving transplants. Data processing of genetic profiles from donor-recipient pairs is needed.'

In [427]:
# Reference summaries (highlights column) of the validation split, in dataset
# order; a prefix of this sequence is later used as the ROUGE references.
Summary = df["validation"]["highlights"]

In [428]:
# Peek at the first five reference summaries.
Summary[:5]

['Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.',
 'The 20th MLS season begins this weekend. League has changed dramatically since its inception in 1996. Some question whether rules regarding salary caps and transfers need to change.',
 'Bafetimbi Gomis collapses within 10 minutes of kickoff at Tottenham. But he reportedly left the pitch conscious and wearing an oxygen mask. Gomis later said that he was feeling well The incident came three years after Fabrice Muamba collapsed at White Hart Lane.',
 'Rory McIlroy throws club into water at WGC Cadillac Championship. Northern Irishman frustrated after pulling shot into water hazard.',
 "Cayman Naib, 13, hasn't been heard from since Wednesday. Police, family, volunteers search for eighth-grader."]

In [437]:
# Accumulator for model-generated summaries; the generation loop in a later
# cell appends to it. Re-running this cell discards everything generated so far.
GeneratedSummary = []

In [438]:
# Summarize the first 5000 validation articles, appending to GeneratedSummary.
# Start at len(GeneratedSummary) so that re-running this cell after an
# interruption resumes where it stopped. Restarting from article 0 would append
# duplicates and break the positional pairing with the reference summaries.
articles = df["validation"]["article"][:5000]
for article in articles[len(GeneratedSummary):]:
    GeneratedSummary.append(summarize(article))

KeyboardInterrupt: 

In [439]:
# Number of summaries generated so far. The generation loop above was
# interrupted, so this can be smaller than the 5000 articles requested.
len(GeneratedSummary)

1548

In [None]:
# Peek at the first five generated summaries.
GeneratedSummary[:5]

['Zully Broussard gave one of her kidneys to a stranger. It resulted in six patients receiving transplants. Data processing of genetic profiles from donor-recipient pairs is needed.',
 'The first ever Major League Soccer match was played in April 1996 in San Jose, California. Attendances are higher than ever before while the number of teams involved has doubled from 10 in the 1996 campaign to 20 in 2015. Orlando City Soccer Club is a prime example of this rapid transformation.',
 "French striker Bafetimbi Gomis says he is now feeling well after collapsing during Swansea's 3-2 loss at Tottenham. Gomeis has a history of fainting spells in France, which prompted the president of his former club to tell French television in 2009 We can't not be worried, it scares you each time",
 'Rory McIlroy pulls his second shot on the eighth hole of the WGC Cadillac Championship into a lake. The four-time major winner launched the 3-iron used to play the offending shot into the water as well. The North

In [440]:
# Score the generated summaries against the matching reference summaries.
# The reference slice is derived from the number of generated summaries rather
# than a hard-coded count, so predictions and references always pair up
# one-to-one, however many articles the generation loop processed.
n_generated = len(GeneratedSummary)
results = rouge.compute(predictions=GeneratedSummary, references=Summary[:n_generated])

# Print each ROUGE variant rounded to four decimals.
for key, value in results.items():
    print(f"{key}: {value:.4f}")

rouge1: 0.3402
rouge2: 0.1473
rougeL: 0.2495
rougeLsum: 0.2493
