In [1]:
import evaluate

# Load the ROUGE metric via the `evaluate` library; `rouge.compute(...)` is
# called later in the notebook to score generated summaries against references.
rouge = evaluate.load("rouge")

In [2]:
from datasets import load_dataset

# Load the CNN/DailyMail dataset, configuration "3.0.0".
# NOTE: despite the name, `df` is a `DatasetDict` (train / validation / test
# splits with 'article', 'highlights', 'id' columns), not a pandas DataFrame.
df = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [3]:
df

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
import re

def clean_text(example):
    """Normalize the ``highlights`` text of a single dataset example.

    Cleaning steps, in order:
      1. Drop every character that is not a word character, whitespace,
         period, comma, apostrophe, or hyphen.
      2. Collapse each whitespace run (spaces, newlines, tabs) into one space
         and trim both ends.
      3. Remove whitespace that directly precedes a period.
      4. Insert a missing space after a period, except between two digits
         (so ``3.5`` stays intact) and inside a run of periods (``...``).

    Args:
        example: Mapping holding a string under the ``'highlights'`` key.
            It is modified in place.

    Returns:
        The same ``example`` object, with ``'highlights'`` replaced by the
        cleaned text. All other keys are left untouched.
    """
    text = example['highlights']

    # Strip unwanted characters BEFORE normalizing whitespace: deleting a
    # symbol that sits between two spaces (e.g. "a & b") would otherwise
    # re-create a double space after the whitespace had already been collapsed.
    text = re.sub(r"[^\w\s.,'-]", "", text)  # Remove unwanted characters
    text = re.sub(r"\s+", " ", text).strip()  # Collapse whitespace runs, trim ends
    text = re.sub(r"\s+\.", ".", text)  # Remove space before a period

    # Add the missing space after a sentence-ending period. The two
    # alternatives together match any period followed by a non-space,
    # non-period character UNLESS the period sits between two digits, so
    # decimals such as "3.5" and ellipses such as "..." are not broken apart.
    text = re.sub(r"(?<!\d)\.(?=[^\s.])|\.(?=[^\s.\d])", ". ", text)

    example['highlights'] = text
    return example

# Run `clean_text` over the examples of the DatasetDict so the `highlights`
# column is normalized. `df` is rebound to the cleaned result, so the raw
# highlights are no longer reachable under this name after this cell runs.
df = df.map(clean_text)

In [34]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Directory holding the saved model and tokenizer files; the path is relative
# to the notebook's working directory.
save_path = "./saved_models/Model4"

# Load the trained model and tokenizer
# Both are module-level globals that `summarize` reads directly.
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSeq2SeqLM.from_pretrained(save_path)

# Function to summarize a blog post
def summarize(blog_post):
    """Generate a summary of ``blog_post`` using the globally loaded model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        blog_post: Text to summarize. The encoded input is truncated to at
            most 512 tokens.

    Returns:
        The decoded summary as a string: special tokens skipped, surrounding
        whitespace stripped, and the space before any period or comma removed.
    """
    # Encode the text as PyTorch tensors, truncating at 512 tokens.
    encoded = tokenizer(blog_post, max_length=512, truncation=True, return_tensors="pt")

    # Beam-search decoding configuration.
    generation_settings = dict(
        max_length=180,          # upper bound on the generated length
        min_length=40,           # lower bound, avoids too-short output
        length_penalty=1.2,      # mild preference for longer candidates
        num_beams=6,             # beam width
        no_repeat_ngram_size=3,  # forbid repeating any 3-gram
        early_stopping=True,
    )
    output_ids = model.generate(encoded["input_ids"], **generation_settings)

    # Decode the first returned sequence back into text.
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Light cleanup of tokenization artifacts around punctuation.
    tidy = decoded.strip()
    tidy = tidy.replace(" .", ".")
    tidy = tidy.replace(" ,", ",")
    return tidy


In [37]:
df["validation"][0]["highlights"]

'Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.'

In [38]:
summarize(df["validation"][0]["article"])

'Zully Broussard gave one of her kidneys to a stranger. It resulted in six patients receiving transplants. The super swap works on a simple swapping principle but takes it to a much higher level.'

In [42]:
# Reference summaries: the `highlights` column of the validation split.
# The capitalized name is kept as-is because later cells refer to `Summary`.
Summary = df["validation"]["highlights"]

In [43]:
Summary[:5]

['Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.',
 'The 20th MLS season begins this weekend. League has changed dramatically since its inception in 1996. Some question whether rules regarding salary caps and transfers need to change.',
 'Bafetimbi Gomis collapses within 10 minutes of kickoff at Tottenham. But he reportedly left the pitch conscious and wearing an oxygen mask. Gomis later said that he was feeling well The incident came three years after Fabrice Muamba collapsed at White Hart Lane.',
 'Rory McIlroy throws club into water at WGC Cadillac Championship. Northern Irishman frustrated after pulling shot into water hazard.',
 "Cayman Naib, 13, hasn't been heard from since Wednesday. Police, family, volunteers search for eighth-grader."]

In [39]:
# Holds the model-generated summaries that are later scored against the references.
GeneratedSummary = []

In [40]:
# Summarize the first 10 validation articles.
# - Slicing the split first (`[:10]`) fetches only the rows that are needed; on
#   `datasets` versions where column access returns a full list, this avoids
#   loading the entire `article` column just to keep ten entries.
# - The list is rebuilt rather than appended to, so re-running this cell is
#   idempotent and can never leave more predictions than the 10 references
#   they are scored against.
GeneratedSummary = [summarize(article) for article in df["validation"][:10]["article"]]

In [None]:
len(GeneratedSummary)

10

In [None]:
GeneratedSummary[:5]

['Zully Broussard gave one of her kidneys to a stranger. It resulted in six patients receiving transplants.',
 'The first ever Major League Soccer match was played in April 1996 in San Jose, California. Attendances are higher than ever before while the number of teams involved has doubled from 10 in the 1996 campaign to 20 in 2015.',
 "French striker Bafetimbi Gomis says he is now feeling well after collapsing during Swansea's 3-2 loss at Tottenham. Gomeis has a history of fainting spells in France, which prompted the president of his former club to tell French television in 2009 We can't not be worried, it scares you each time",
 'Rory McIlroy pulls his second shot on the eighth hole of the WGC Cadillac Championship into a lake. The four-time major winner launched the 3-iron used to play the offending shot into the water as well. The Northern Irishman composed himself to finish with a second round of 70.',
 'Cayman Naib, 13, has been missing since Wednesday. He was last seen wearing a

In [44]:
# Score the generated summaries against the reference highlights.
# Predictions and references are paired by position, so the slice length (10)
# must equal len(GeneratedSummary).
results = rouge.compute(predictions=GeneratedSummary, references=Summary[:10])

# Print every returned metric, formatted to four decimal places.
for key, value in results.items():
    print(f"{key}: {value:.4f}")

rouge1: 0.2866
rouge2: 0.0896
rougeL: 0.1857
rougeLsum: 0.1855


In [None]:
rouge1: 0.3362
rouge2: 0.1368
rougeL: 0.2381
rougeLsum: 0.2378

In [None]:
rouge1: 0.3620
rouge2: 0.1679
rougeL: 0.2709
rougeLsum: 0.2717