In [1]:
# Cell 1 - Imports
import os
import numpy as np
import torch
from datasets import load_dataset, load_metric
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)
from bert_score import score as bert_score
import nltk

# Make sure NLTK's "punkt" resource is available locally.
# NOTE(review): no visible cell calls nltk after this download — confirm it is still needed.
nltk.download("punkt")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Cell 2 - Load CNN/DailyMail dataset
# NOTE(review): the dataset card documents versioned configs ("1.0.0", "2.0.0",
# "3.0.0"); confirm that "default" resolves to the intended one.
dataset = load_dataset("cnn_dailymail", "default")

# Show the split layout first, then the number of examples in each split.
print(dataset)
for split_name in ("train", "validation", "test"):
    print(f"{split_name.capitalize()} size:", len(dataset[split_name]))


DatasetDict({
    train: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 11490
    })
})
Train size: 287113
Validation size: 13368
Test size: 11490


In [6]:
# Cell 3 - Load pretrained BART model (already fine-tuned on CNN/DailyMail)
# Token-length caps used when tokenizing articles (inputs) and highlights (targets).
max_input_length = 1024
max_target_length = 150

# One checkpoint identifier drives both the tokenizer and the model weights.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  66%|######5   | 1.07G/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [7]:
# Cell 4 - Preprocessing
def preprocess_function(batch):
    """Tokenize a batch of articles and attach the tokenized highlights as labels.

    Args:
        batch: Mapping with "article" and "highlights" entries (one batch of
            dataset rows, since the function is mapped with ``batched=True``).

    Returns:
        The tokenizer output for the articles, truncated to
        ``max_input_length`` tokens, with an extra "labels" entry holding the
        highlight token ids truncated to ``max_target_length`` tokens.
    """
    encoded = tokenizer(
        batch["article"],
        truncation=True,
        max_length=max_input_length,
    )
    target_ids = tokenizer(
        batch["highlights"],
        truncation=True,
        max_length=max_target_length,
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

# Tokenize every split in batches. The raw text/id columns are dropped, so each
# split keeps only the fields returned by preprocess_function.
raw_columns = ["article", "highlights", "id"]
tokenized_datasets = dataset.map(
    function=preprocess_function,
    remove_columns=raw_columns,
    batched=True,
)


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [8]:
# Cell 5 - Data collator
# Batch collator for the tokenized examples; it is handed both the tokenizer
# and the model (see the DataCollatorForSeq2Seq docs for how each is used).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [15]:
from transformers import TrainingArguments
import torch

batch_size = 2
output_dir = "./bart_cnn"

# Settings are gathered in a dict first so each group can carry its own note.
training_config = dict(
    # Where checkpoints go, and how many of them are kept on disk.
    output_dir=output_dir,
    save_total_limit=2,
    # Optimisation: one epoch; 2 examples per device x 8 accumulation steps
    # gives 16 examples per optimizer update on each device.
    num_train_epochs=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    # Mixed precision is only requested when a CUDA device is visible.
    fp16=torch.cuda.is_available(),
    # Evaluate at the end of every epoch; emit a log entry every 100 steps.
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
)
training_args = TrainingArguments(**training_config)

In [17]:
# Cell 7 - Metrics (ROUGE + BERTScore)
from evaluate import load

# Load the ROUGE metric through `evaluate`; compute_metrics below calls rouge.compute().
rouge = load("rouge", trust_remote_code=True)

def compute_metrics(eval_pred):
    """Compute ROUGE and BERTScore for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as handed over by the
            trainer. ``predictions`` may be a tuple (only its first element is
            used), an array of token ids with shape (examples, seq_len), or an
            array of raw logits with shape (examples, seq_len, vocab).
            ``labels`` holds reference token ids, with -100 marking padding.

    Returns:
        dict: Every entry returned by ``rouge.compute`` plus
        "bert_precision", "bert_recall" and "bert_f1" (means over examples).
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    if preds.ndim == 3:
        # A plain Trainer passes raw logits rather than token ids, and
        # batch_decode cannot decode floats. Reduce them to the highest-scoring
        # token id per position first.
        preds = preds.argmax(axis=-1)

    # -100 is a padding marker, not a vocabulary id; swap it for the pad token
    # in predictions as well as labels so that decoding cannot fail on it.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE scores (copied into a fresh dict that the BERTScore values join).
    metrics = dict(
        rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True,
        )
    )

    # BERTScore returns per-example precision / recall / F1; report the means.
    P, R, F1 = bert_score(decoded_preds, decoded_labels, lang="en", verbose=False)
    metrics.update(
        {
            "bert_precision": float(P.mean().item()),
            "bert_recall": float(R.mean().item()),
            "bert_f1": float(F1.mean().item()),
        }
    )
    return metrics

In [21]:
pip install --upgrade datasets evaluate bert-score torch

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/f4/c8/09012ac195a0aab58755800d2efdc0e7d5905053509f12cb5d136c911cda/datasets-4.1.1-py3-none-any.whl.metadata
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/3e/af/3e990d8d4002bbc9342adb4facd59506e653da93b2417de0fa6027cb86b1/evaluate-0.4.6-py3-none-any.whl.metadata
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/84/57/2f64161769610cf6b1c5ed782bd8a780e18a3c9d48931319f2887fa9d0b1/torch-2.8.0-cp311-cp311-win_amd64.whl.metadata
  Using cached torch-2.8.0-cp311-cp311-win_amd64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Obtaining dependency information for sympy>=1.13.3 from https://files.pythonhosted.org/packages/a2/0

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\PC\\anaconda3\\Lib\\site-packages\\~orch\\lib\\asmjit.dll'
Consider using the `--user` option or check the permissions.



In [24]:
pip install -c huggingface transformers

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'huggingface'


In [29]:
pip install --upgrade tensorflow


Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/e3/f8/9246d3c7e185a29d7359d8b12b3d70bf2c3150ecf1427ec1382290e71a56/tensorflow-2.20.0-cp311-cp311-win_amd64.whl.metadata
  Using cached tensorflow-2.20.0-cp311-cp311-win_amd64.whl.metadata (4.6 kB)
Collecting protobuf>=5.28.0 (from tensorflow)
  Obtaining dependency information for protobuf>=5.28.0 from https://files.pythonhosted.org/packages/8c/f3/6f58f841f6ebafe076cebeae33fc336e900619d34b1c93e4b5c97a81fdfa/protobuf-6.32.1-cp310-abi3-win_amd64.whl.metadata
  Downloading protobuf-6.32.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Obtaining dependency information for tensorboard~=2.20.0 from https://files.pythonhosted.org/packages/9c/d9/a5db55f88f258ac669a92858b70a714bbbd5acd993820b41ec4a96a4d77f/tensorboard-2.20.0-py3-none-any.whl.metadata
  Using cached tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Using cached

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\PC\\anaconda3\\Lib\\site-packages\\google\\~upb\\_message.pyd'
Consider using the `--user` option or check the permissions.



In [30]:
# Cell 8 - Trainer
from transformers import Trainer

def _logits_to_token_ids(logits, labels):
    """Reduce a batch of model outputs to predicted token ids.

    The trainer calls this on every evaluation batch before caching the
    predictions. Without it, the full-vocabulary logits for every token of
    every evaluation example are held until the end of the pass, which is far
    more memory than the token ids need.

    Args:
        logits: Model outputs for one batch; if a tuple, only its first
            element (the logits tensor) is used.
        labels: Reference ids for the batch (unused, required by the hook
            signature).

    Returns:
        Tensor of the highest-scoring token id at each position.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# NOTE(review): these are teacher-forced argmax predictions, not beam-search
# generations; metrics from trainer.evaluate() should be read with that in mind.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

  trainer = Trainer(


AttributeError: module 'tensorflow' has no attribute 'io'

In [None]:
# Cell 9 - Evaluate baseline model (without extra fine-tuning)
# Score the model on both held-out splits before reporting anything.
val_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

for split_label, split_metrics in (("Validation", val_results), ("Test", test_results)):
    print(f"{split_label} Results:", split_metrics)


In [None]:
# Cell 10 - Generate summary for a custom article
def generate_summary(text, max_length=150, num_beams=4):
    """Summarize a single piece of text with the loaded model.

    Args:
        text: The article to summarize.
        max_length: ``max_length`` value forwarded to ``model.generate``.
        num_beams: ``num_beams`` value forwarded to ``model.generate``.

    Returns:
        str: The first generated sequence, decoded with special tokens removed.
    """
    encoded = tokenizer(
        [text],
        return_tensors="pt",
        truncation=True,
        padding="longest",
    )
    # Move the encoded batch to whichever device the model lives on.
    encoded = encoded.to(model.device)
    generated = model.generate(
        encoded["input_ids"],
        num_beams=num_beams,
        max_length=max_length,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Use the first test example: print the start of its article, the reference
# highlights, and the model's own summary.
first_test_example = dataset["test"][0]
sample_text = first_test_example["article"]
reference_summary = first_test_example["highlights"]

print("Original Article:\n", sample_text[:1000], "...")
print("\nReference Summary:\n", reference_summary)
print("\nGenerated Summary:\n", generate_summary(sample_text))
