In [None]:
pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

Loading Dataset


In [None]:
from datasets import load_dataset

# Load the CNN/DailyMail dataset (configuration "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Preview a handful of training examples. The slice is taken once and each text
# is truncated so the cell output stays readable instead of dumping several
# complete articles.
train_data = dataset['train']
PREVIEW_ROWS = 4
PREVIEW_CHARS = 300

preview = train_data[:PREVIEW_ROWS]
for article, highlights in zip(preview['article'], preview['highlights']):
    print("ARTICLE:   ", article[:PREVIEW_CHARS], "...")   # start of the full article
    print("HIGHLIGHTS:", highlights[:PREVIEW_CHARS])       # the reference summary
    print("-" * 80)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


['LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details 

Load Pretrained Model and Tokenizer


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# The tokenizer and the model are both restored from the same pretrained checkpoint.
checkpoint = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

Tokenize Inputs

In [None]:
# Read the article to summarize from the user. Surrounding whitespace is
# stripped and empty input is rejected up front, because an empty string would
# otherwise be tokenized and summarized without any warning.
article_text = input("Paste the article to summarize: ").strip()
if not article_text:
    raise ValueError("No article text was provided.")

# Encode the article as PyTorch tensors; anything beyond 1024 tokens is truncated.
inputs = tokenizer(
    article_text,
    max_length=1024,
    return_tensors="pt",
    truncation=True,
)

Daniel Radcliffe gains access to a £20 million fortune as he turns 18. However, he says he won’t spend extravagantly on fast cars or celebrity parties. He prefers to buy books, CDs, and DVDs. Radcliffe's earnings from the first five Harry Potter films are held in a trust fund. He remains grounded despite his fame and fortune.


Generate Summary

In [None]:
# Decoding settings for the summary (beam search with a length penalty).
generation_kwargs = dict(
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=7,
    early_stopping=True,
)
summary_ids = model.generate(inputs["input_ids"], **generation_kwargs)

# Convert the first generated sequence back to text, dropping special tokens.
first_sequence = summary_ids[0]
summary = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(summary)

Daniel Radcliffe gains access to a £20 million fortune as he turns 18. However, he says he won’t spend extravagantly on fast cars or celebrity parties. He prefers to buy books, CDs, and DVDs.


Fine-tuning (Optional — Full Training Loop)

In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Sizes of the subsets used for this quick fine-tuning run.
TRAIN_SAMPLES = 1000
EVAL_SAMPLES = 100


def preprocess_function(examples):
    """Tokenize a batch of articles and attach the tokenized highlights as labels.

    Args:
        examples: Batch mapping with "article" and "highlights" entries, as
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized articles (truncated to 1024 tokens) with an extra
        "labels" entry holding the highlight token ids (truncated to 128).
    """
    model_inputs = tokenizer(examples["article"], max_length=1024, truncation=True)
    targets = tokenizer(examples["highlights"], max_length=128, truncation=True)
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs


# Select the small subsets *before* tokenizing: mapping the whole DatasetDict
# would tokenize every split (hundreds of thousands of articles) even though
# only TRAIN_SAMPLES / EVAL_SAMPLES rows are ever used.
tokenized_dataset = {
    "train": dataset["train"]
    .select(range(TRAIN_SAMPLES))
    .map(preprocess_function, batched=True),
    "validation": dataset["validation"]
    .select(range(EVAL_SAMPLES))
    .map(preprocess_function, batched=True),
}

# Examples are tokenized without padding, so every batch must be padded on the
# fly. The seq2seq collator pads the labels as well as the inputs; without it,
# label sequences of different lengths cannot be stacked into one tensor.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument (renamed in transformers 4.41).
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False,  # Set to True if using a GPU with mixed precision
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Save the Fine-tuned Model

In [None]:
# Persist the model and its tokenizer side by side in one local directory.
save_dir = "summarizer-bart"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Evaluate the Model

In [None]:
# Score the model with ROUGE on a few validation examples.
# `datasets.load_metric` was removed in datasets 3.0, so fall back to the
# `evaluate` package when it is unavailable.
try:
    from datasets import load_metric
    rouge = load_metric("rouge")
except ImportError:
    import evaluate  # needs `pip install evaluate rouge_score`
    rouge = evaluate.load("rouge")

NUM_EVAL_EXAMPLES = 10

predictions = []
references = []

# Make sure dropout is disabled while generating, whatever mode training left
# the model in.
model.eval()

for example in dataset['validation'].select(range(NUM_EVAL_EXAMPLES)):
    encoded = tokenizer(example['article'], return_tensors="pt", truncation=True, max_length=1024)
    # Training may have moved the model to an accelerator; the inputs must live
    # on the same device as the model weights.
    encoded = encoded.to(model.device)
    ids = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
    )
    pred = tokenizer.decode(ids[0], skip_special_tokens=True)

    predictions.append(pred)
    references.append(example['highlights'])

results = rouge.compute(predictions=predictions, references=references)
print(results)
