# Abstractive summarization using BART


In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", name="3.0.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
# ------------------ Load small dataset sample ------------------
# Fine-tune and evaluate on a fixed-size prefix of each split to keep the run
# short. Adjust these two constants to trade runtime for data volume.
N_TRAIN_SAMPLES = 10_000
N_EVAL_SAMPLES = 2_000

train_dataset = dataset["train"].select(range(N_TRAIN_SAMPLES))      # first 10,000 training rows
eval_dataset = dataset["validation"].select(range(N_EVAL_SAMPLES))   # first 2,000 validation rows



In [5]:
# ------------------------ Abstractive summarization (no fine-tuning) ------------------------
from transformers import pipeline

# Baseline: the off-the-shelf checkpoint, before any fine-tuning in this notebook.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

article_1 = dataset['train'][1]['article']

# truncation=True makes the pipeline's tokenizer clip over-long articles instead
# of passing them to the model unmodified; without it an article longer than the
# model's input limit can fail at inference time. Greedy decoding
# (do_sample=False) keeps the baseline summary deterministic.
summary_of_article = summarizer(
    article_1,
    max_length=100,
    min_length=10,
    do_sample=False,
    truncation=True,
)
print("BART Summary without fine-tuning:\n", summary_of_article[0]['summary_text'])

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


BART Summary without fine-tuning:
 Judge Steven Leifman is an advocate for justice and the mentally ill. About one-third of all people in Miami-Dade county jails are mentally ill, he says. He says the sheer volume is overwhelming the system. Starting in 2008, many inmates will be sent to a new mental health facility.


In [6]:

# ------------------------ Preprocess for Fine-tuning ------------------------
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Trainer, TrainingArguments

# One checkpoint name feeds both the tokenizer and the model so they cannot drift apart.
checkpoint = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# forced_bos_token_id is handed to from_pretrained together with the checkpoint;
# its value is taken from the tokenizer rather than hard-coded.
# NOTE(review): presumably applied as a config override - confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    forced_bos_token_id=tokenizer.bos_token_id,
)


In [7]:
def preprocess_function(batch, max_source_length=256, max_target_length=64):
    """Tokenize a batch of article/highlight pairs into seq2seq training features.

    Args:
        batch: Mapping with an "article" list (model inputs) and a "highlights"
            list (reference summaries), as passed by ``Dataset.map(batched=True)``.
        max_source_length: Articles are truncated/padded to this many tokens.
        max_target_length: Highlights are truncated/padded to this many tokens.

    Returns:
        Dict with "input_ids" and "attention_mask" for the articles, and
        "labels" holding the highlight token ids with every padding id
        replaced by -100.
    """
    source_enc = tokenizer(
        batch["article"],
        truncation=True,
        padding="max_length",
        max_length=max_source_length,
    )
    # text_target routes the summaries through the tokenizer's target-side path,
    # which is the documented way to build labels for seq2seq models.
    target_enc = tokenizer(
        text_target=batch["highlights"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length,
    )

    # Swap padding ids for -100 (PyTorch's default cross-entropy ignore_index)
    # so padded label positions do not contribute to the training loss.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in target_enc["input_ids"]
    ]

    return {
        "input_ids": source_enc["input_ids"],
        "attention_mask": source_enc["attention_mask"],
        "labels": labels,
    }

In [8]:

# Tokenize the training and evaluation subsets with the same batched
# preprocessing step (train first, then eval).
train_dataset, eval_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset)
]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
# ------------------ Training Arguments ------------------
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="/content/results",
    logging_dir="/content/logs",
    # One pass over the training subset, two examples per device per step.
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=True,
    # Evaluate and checkpoint on the same per-epoch schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Log every 50 steps; no external experiment tracker.
    logging_steps=50,
    report_to="none",
    # The mapped datasets still carry their original text columns.
    remove_unused_columns=False,
)

In [10]:

# ------------------ Trainer ------------------
# Wire the model, the run configuration and the two tokenized subsets together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# ------------------ Train ------------------
# Left as the final expression so the notebook displays the returned TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.5392,1.98792




TrainOutput(global_step=5000, training_loss=1.9056281814575196, metrics={'train_runtime': 1442.9856, 'train_samples_per_second': 6.93, 'train_steps_per_second': 3.465, 'total_flos': 5417761505280000.0, 'train_loss': 1.9056281814575196, 'epoch': 1.0})

In [11]:
# ------------------------ Evaluation ------------------------
# Run the Trainer's evaluation pass using the eval_dataset it was constructed with.
eval_results = trainer.evaluate()
# NOTE: despite the "Eval Loss:" label, this prints the whole metrics dict
# returned by evaluate() (loss, runtime, throughput, epoch), not only the loss.
print("Eval Loss:", eval_results)

Eval Loss: {'eval_loss': 1.9879196882247925, 'eval_runtime': 52.5622, 'eval_samples_per_second': 38.05, 'eval_steps_per_second': 19.025, 'epoch': 1.0}


In [12]:
# Persist the fine-tuned model and its tokenizer into one directory so both can
# be reloaded together later.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive.
save_dir = "/content/drive/MyDrive/Text_summarization"

model.save_pretrained(save_dir)
# Kept as the last expression: the cell displays the tuple of written tokenizer files.
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/Text_summarization/tokenizer_config.json',
 '/content/drive/MyDrive/Text_summarization/special_tokens_map.json',
 '/content/drive/MyDrive/Text_summarization/vocab.json',
 '/content/drive/MyDrive/Text_summarization/merges.txt',
 '/content/drive/MyDrive/Text_summarization/added_tokens.json',
 '/content/drive/MyDrive/Text_summarization/tokenizer.json')

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

# Reload the fine-tuned tokenizer and model from the directory they were saved to.
model_dir = "/content/drive/MyDrive/Text_summarization"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Set the forced first generated token on the model config to id 0.
# NOTE(review): presumably this is the tokenizer's BOS id - confirm against
# tokenizer.bos_token_id.
model.config.forced_bos_token_id = 0

# Function to summarize a blog post
def summarize(
    blog_post,
    max_input_length=1024,
    max_length=150,
    min_length=40,
    num_beams=4,
    length_penalty=2.0,
):
    """Generate an abstractive summary of ``blog_post`` with the loaded model.

    Args:
        blog_post: Text to summarize.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.
        num_beams: Beam width used for beam search.
        length_penalty: Length penalty applied during beam search.

    Returns:
        The decoded summary with special tokens stripped, or an empty string
        when ``blog_post`` is empty or whitespace-only.
    """
    # Nothing to summarize: skip tokenization and generation entirely rather
    # than letting the model produce text from an empty prompt.
    if not blog_post or not blog_post.strip():
        return ""

    # Tokenize (truncating over-long posts) and move the tensors to whatever
    # device the model lives on, so the function works on CPU and GPU alike.
    inputs = tokenizer(
        blog_post,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    # NOTE(review): an explicitly passed GenerationConfig may take precedence
    # over generation defaults stored with the model - confirm against the
    # generate() documentation for the installed transformers version.
    generation_config = GenerationConfig(
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Pass the attention mask together with the input ids so generate() does
    # not have to infer which positions are real tokens.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        generation_config=generation_config,
    )

    # Only one sequence was submitted, so decode the first (and only) result.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)




# Example input: a multi-paragraph blog post about climate finance, used below
# to exercise summarize() on text that does not come from the training dataset.
blog_post = """
As Yogi Berra famously said, it’s tough to make predictions, especially about the future. But had the baseball legend spent any time observing the UN climate negotiations, he could have safely predicted that climate finance will prove to be a key sticking point at COP29 in Baku at the end of this year.

‘Who will pay and how much?’ are perennial questions at the climate talks, but this year, the discussions about climate finance will be especially prominent. At COP29, Parties to the Paris Agreement must negotiate a new climate finance goal, to replace the existing commitment from 2009 for developed countries to provide US$100 billion climate finance annually from 2020 to 2025 - a commitment that only in 2022 was starting to be fulfilled, according to a recent OECD report.

It is vital that the forthcoming Bonn Climate Change Conference sends the right political signals, and lays the procedural and technical groundwork for an ambitious climate finance deal in Baku.

A pressing need

With global warming already destabilising the climate and devastating people’s lives and livelihoods, the need for finance to reduce greenhouse gas emissions and to adapt to a warming world has never been more pressing.

The sums involved are large. The Paris Agreement’s Global Stocktake process estimates that US$5.8-5.9 trillion is required to implement Nationally Determined Contributions (NDCs) in developing countries up to 2030. They will require US$215-387 billion annually over this period for adaptation. Investments of US$1.5 trillion in renewable energy are required worldwide every year up until 2030, according to IRENA.

But these sums are also affordable and beneficial for developed countries. They should be seen in the context of ongoing investments in energy and other infrastructure: around US$2.3 trillion was invested in energy infrastructure in 2023, of which US$1.74 trillion was in clean energy. These investments will generate strong returns for their investors and reduce the costs for energy consumers.

And, crucially, they should also be seen in the context of the alternative. The latest research estimates that the world economy is already set to face a 19% income reduction within the next 26 years based on the levels of warming we have already locked in. The more we delay and the more the planet heats, the greater the economic costs will be.

Laying the foundations for a new finance goal

While financial resources are beginning to flow, they are not flowing fast enough, and certainly not flowing to those developing countries where need is greatest and access to finance is most challenging.

The UN climate framework provides mechanisms that can enable those flows of climate finance. Back in 2015, parties at the climate talks agreed to establish a “new collective quantified goal” (NCQG) for climate finance. They agreed that the NCQG would be set prior to 2025.

The  ultimate size of the NCQG will be a product of the negotiations, but Parties have agreed it must be a significant increase from the floor of US$100 billion annually. For WWF, it must be needs-based and sufficiently ambitious to meet the scale of the challenge we face, and immediately accessible to help countries that are already facing the chaos of a destabilised climate system.

While developed countries are expected to provide financial and technical support, developing countries also have a role to play. Parties are due to submit revised NDCs in 2025, presenting how they plan to reduce emissions and adapt to climate change. Developing countries have the opportunity to use their NDCs to set out how international climate finance can support them and increase their ambition. To do this, they need to know the finance will be forthcoming.
"""

# Summarize the example post with the reloaded fine-tuned model and print the result.
summary = summarize(blog_post)
print("Summary:", summary)


Summary: Climate finance will be a key sticking point at COP29 in Baku at the end of this year .
Developed countries are expected to provide financial and technical support, but developing countries also have a role to play .
The UN climate framework provides mechanisms that can enable those flows of climate finance .
It is vital that Bonn Climate Change Conference sends the right political signals .
