In [None]:
from datasets import load_dataset
import re

# Load the first 5% of the CNN/DailyMail (config 3.0.0) training split;
# the small slice keeps experimentation fast.
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train[:5%]",
)

def clean_text(text):
    """Normalize raw article text.

    Removes bracketed numeric citation markers such as ``[12]`` (and empty
    ``[]``), collapses every run of whitespace into a single space, and
    strips leading/trailing whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Strip citations first: removing them after whitespace has been
    # collapsed would leave a double space behind ("a [1] b" -> "a  b").
    text = re.sub(r'\[[0-9]*\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run clean_text over both text columns of every example.
dataset = dataset.map(
    lambda x: {field: clean_text(x[field]) for field in ("article", "highlights")}
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Map:   0%|          | 0/14356 [00:00<?, ? examples/s]

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [None]:
# Shared spaCy pipeline used by extractive_summary below.
nlp = spacy.load(name="en_core_web_sm")

def extractive_summary(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences.

    Each content word (not a stop word, punctuation, or whitespace token) is
    weighted by its frequency relative to the most frequent content word. A
    sentence's score is the sum of the weights of the words it contains. The
    ``num_sentences`` best sentences are returned in the order in which they
    appear in ``text``.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when ``text`` contains no content words.
    """
    doc = nlp(text)

    word_freq = {}
    for token in doc:
        key = token.text.lower()
        # The token flags catch whitespace tokens (e.g. "\n") and
        # multi-character punctuation (e.g. "--"), which slip through the
        # substring test against string.punctuation.
        if token.is_space or token.is_punct or key in punctuation:
            continue
        if key in STOP_WORDS:
            continue
        word_freq[key] = word_freq.get(key, 0) + 1

    # Empty or all-stop-word input: nothing to score, and max() would raise.
    if not word_freq:
        return ""

    max_freq = max(word_freq.values())
    for key in word_freq:
        word_freq[key] /= max_freq

    scored_sentences = []
    for sent in doc.sents:
        score = sum(word_freq.get(token.text.lower(), 0) for token in sent)
        if score > 0:
            scored_sentences.append((score, sent))

    best = nlargest(num_sentences, scored_sentences, key=lambda pair: pair[0])
    # Restore reading order so the summary is coherent rather than ranked.
    best.sort(key=lambda pair: pair[1].start)
    return ' '.join(sent.text for _, sent in best)

# Show the extractive summary of the first article in the dataset.
print(extractive_summary(text=dataset[0]["article"]))

Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart.


In [None]:
from transformers import pipeline

# Pre-trained BART checkpoint wrapped in a summarization pipeline.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def abstractive_summary(text, max_length=130, min_length=30):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The article to summarize.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.

    Returns:
        The generated summary string, or an empty string for blank input.
    """
    # Nothing to summarize; avoid sending an empty prompt to the model.
    if isinstance(text, str) and not text.strip():
        return ""

    result = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Clip inputs that exceed the model's maximum input length instead
        # of letting them fail inside the model.
        truncation=True,
    )
    return result[0]['summary_text']

# Show the abstractive summary of the first article in the dataset.
print(abstractive_summary(text=dataset[0]["article"]))

Harry Potter star Daniel Radcliffe turns 18 on Monday. He gains access to a reported £20 million ($41.1 million) fortune. Radcliffe's earnings from the first five Potter films have been held in a trust fund.


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Tokenizer and model come from the same base checkpoint.
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
)
model = BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
)

def tokenize_function(example, max_input_length=512, max_target_length=128):
    """Tokenize articles as model inputs and highlights as training labels.

    Works on a single example or on a batch (as passed by
    ``dataset.map(..., batched=True)``).

    Args:
        example: Mapping with ``"article"`` and ``"highlights"`` entries,
            each either a string or a list of strings.
        max_input_length: Length articles are truncated/padded to.
        max_target_length: Length highlights are truncated/padded to.

    Returns:
        The tokenizer output for the articles with an added ``"labels"``
        entry holding the highlight token ids.
    """
    model_inputs = tokenizer(
        example["article"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length,
    )
    targets = tokenizer(
        text_target=example["highlights"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length,
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss ignores, so padding positions
        # do not contribute to training.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    if isinstance(example["highlights"], str):
        model_inputs["labels"] = _mask_padding(targets["input_ids"])
    else:
        model_inputs["labels"] = [_mask_padding(ids) for ids in targets["input_ids"]]
    return model_inputs

# Tokenize the corpus in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Single-epoch run with no evaluation; checkpoints are written every 500 steps.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=500,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its
    # replacement and avoids the warning raised at construction time.
    processing_class=tokenizer,
)

# Train (example with small set)
# trainer.train()

Map:   0%|          | 0/14356 [00:00<?, ? examples/s]

  trainer = Trainer(


In [None]:
!pip install rouge_score
from rouge_score import rouge_scorer

def evaluate_summary(reference, generated):
    """Score ``generated`` against ``reference`` with ROUGE-1 and ROUGE-L.

    Stemming is enabled on the scorer.

    Returns:
        The scorer's result mapping for ``'rouge1'`` and ``'rougeL'``.
    """
    metrics = ['rouge1', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, generated)

# Compare the model summary of the first article with its reference highlights.
ref, gen = (
    dataset[0]["highlights"],
    abstractive_summary(dataset[0]["article"]),
)
print(evaluate_summary(reference=ref, generated=gen))


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f96c55fa675ead769b4a4a76701b80ab95c00346141ec969709df4a5ade90b83
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
{'rouge1': Score(precision=0.6756756756756757, recall=0.6410256410256411, fmeasure=0.6578947368421052), 'rougeL': Score(precision=0.6486486486486487, recall=0.6153846153846154, fmeasure=0.631578947368421)}


In [None]:
# Re-display the ROUGE scores for the reference/generated pair computed earlier.
print(evaluate_summary(reference=ref, generated=gen))

{'rouge1': Score(precision=0.6756756756756757, recall=0.6410256410256411, fmeasure=0.6578947368421052), 'rougeL': Score(precision=0.6486486486486487, recall=0.6153846153846154, fmeasure=0.631578947368421)}


In [None]:
# Custom article for trying both summarizers on text from outside the dataset.
# Replace the string below with your own article. The literal spans several
# lines, so the value contains newline characters (it is not passed through
# clean_text here).
text = """
Artificial intelligence has made significant strides in the healthcare sector. AI-powered systems are
now capable of diagnosing diseases from images, predicting patient outcomes, and even suggesting treatment
plans. Despite these advancements, challenges such as data privacy, model transparency, and ethical concerns
remain. Experts stress the need for clear regulations and collaboration between tech developers and healthcare
professionals.
"""

# Extractive result first: a heading line, then the two-sentence summary.
print("Extractive Summary:", extractive_summary(text, num_sentences=2), sep="\n")

# Abstractive result: a blank line, the heading, then the model's summary.
print("\nAbstractive Summary:", abstractive_summary(text), sep="\n")


Your max_length is set to 130, but your input_length is only 85. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


Extractive Summary:
AI-powered systems are
now capable of diagnosing diseases from images, predicting patient outcomes, and even suggesting treatment
plans. Experts stress the need for clear regulations and collaboration between tech developers and healthcare 
professionals.


Abstractive Summary:
Artificial intelligence has made significant strides in the healthcare sector. Despite these advancements, challenges such as data privacy, model transparency, and ethical concerns remain. Experts stress the need for clear regulations and collaboration between tech developers and healthcare professionals.
