In [1]:
# Cell 1 - Imports
import os

import nltk
import numpy as np
import torch
from bert_score import score as bert_score
from datasets import load_dataset, load_metric
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# Fetch NLTK's "punkt" sentence-tokenizer data; per the captured output this
# is a no-op when the package is already up to date. As the cell's last
# expression, the call's return value is what the notebook displays.
# NOTE(review): none of the visible cells call an NLTK tokenizer -- presumably
# this was meant for sentence-splitting before ROUGE; confirm or drop.
nltk.download("punkt")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from datasets import load_dataset

# Load every split of the "cnn_dailymail" dataset (config "default") and
# print the resulting DatasetDict so the available columns are visible.
dataset = load_dataset("cnn_dailymail", "default")
print(dataset)

# Report the number of examples per split, in train / validation / test order.
for size_label, split_key in (
    ("Train size:", "train"),
    ("Validation size:", "validation"),
    ("Test size:", "test"),
):
    print(size_label, len(dataset[split_key]))


DatasetDict({
    train: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 11490
    })
})
Train size: 287113
Validation size: 13368
Test size: 11490


In [None]:
# Load T5-Large, repairing the local cache only when it is actually broken.
# T5Tokenizer / T5ForConditionalGeneration are already imported in the
# imports cell, so they are not re-imported here.
model_name = "t5-large"

try:
    # Normal path: reuse cached files. Forcing a download unconditionally
    # would re-fetch the multi-gigabyte checkpoint on every "Run All".
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
except OSError as load_error:
    # An interrupted download surfaces as an OSError ("Consistency check
    # failed ... Please retry with `force_download=True`"), so retry exactly
    # that way. Any other OSError (e.g. no network) will simply be raised
    # again by the retry instead of being swallowed.
    print(f"Loading {model_name!r} failed ({load_error}); forcing a fresh download.")
    tokenizer = T5Tokenizer.from_pretrained(model_name, force_download=True)
    model = T5ForConditionalGeneration.from_pretrained(model_name, force_download=True)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

In [5]:
# Cell 3 - Load tokenizer & model (T5-Large)
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=model_name)

# Hyperparameters: token budgets used when truncating the articles (inputs)
# and the highlights (targets) during preprocessing.
max_input_length, max_target_length = 512, 150


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

OSError: Consistency check failed: file should be of size 2950736730 but has size 161389099 (model.safetensors).
This is usually due to network issues while downloading the file. Please retry with `force_download=True`.

In [None]:
# Cell 4 - Preprocessing function
def preprocess_function(batch):
    """Tokenize one batch of articles and highlights for seq2seq training.

    Args:
        batch: Mapping with an "article" list and a "highlights" list, as
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles
        (truncated to ``max_input_length``), with an extra "labels" entry
        holding the token ids of the highlights (truncated to
        ``max_target_length``).
    """
    # Prepend the task prefix to every article before encoding it.
    prefixed_articles = ["summarize: " + article for article in batch["article"]]
    encoded_batch = tokenizer(
        prefixed_articles,
        truncation=True,
        max_length=max_input_length,
    )

    # Targets are encoded separately; only their input ids are kept.
    encoded_targets = tokenizer(
        batch["highlights"],
        truncation=True,
        max_length=max_target_length,
    )
    encoded_batch["labels"] = encoded_targets["input_ids"]
    return encoded_batch

# Tokenize every split in batches and drop the raw columns, so that only the
# fields returned by preprocess_function remain.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    remove_columns=["article", "highlights", "id"],
    batched=True,
)


In [None]:
# Cell 5 - Data collator
# Batches examples for seq2seq training; it is given both the tokenizer and
# the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)


In [None]:
# Cell 6 - Training setup
batch_size = 2   # keep small for t5-large unless you have big GPU
output_dir = "./t5_large_cnn"

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,   # simulate larger batch size
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,  # increase if compute allows
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=100
)


In [None]:
# Cell 7 - Define metrics (ROUGE + BERTScore)
rouge = load_metric("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    rouge_results = {key: value.mid.fmeasure for key, value in result.items()}

    # BERTScore
    P, R, F1 = bert_score(decoded_preds, decoded_labels, lang="en", verbose=False)
    bert_results = {"bert_precision": P.mean().item(),
                    "bert_recall": R.mean().item(),
                    "bert_f1": F1.mean().item()}

    rouge_results.update(bert_results)
    return rouge_results


In [None]:
# Cell 8 - Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
# Cell 9 - Training
# Run fine-tuning with the `trainer` object configured in the Trainer cell.
# As the cell's last expression, the value returned by train() is displayed
# as the cell output.
trainer.train()


In [None]:
# Cell 10 - Evaluate on validation and test
# Score the validation split first, then the test split, and keep both
# metric dicts around under their own names.
val_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

# Print the two metric dicts only after both evaluations have finished.
for heading, metrics in (
    ("Validation Results:", val_results),
    ("Test Results:", test_results),
):
    print(heading, metrics)


In [None]:
# Cell 11 - Save final fine-tuned model
# Write the model and the tokenizer side by side into the same directory.
trainer.save_model(output_dir="./t5_large_cnn_final")
tokenizer.save_pretrained(save_directory="./t5_large_cnn_final")


In [None]:
# Cell 12 - Generate summary for a custom article
def generate_summary(text, max_length=150, num_beams=4):
    """Summarize a single piece of text with the notebook's model.

    Args:
        text: Raw article text; the "summarize: " task prefix is added here.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: Beam-search width forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    prompt = "summarize: " + text
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, padding="longest")
    # Move the encoded inputs onto whichever device the model lives on.
    encoded = encoded.to(model.device)
    generated_ids = model.generate(
        encoded["input_ids"],
        max_length=max_length,
        num_beams=num_beams,
    )
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Demo on the first test example: show the first 1000 characters of the
# article, its reference highlights, and the model's own summary.
first_test_example = dataset["test"][0]
sample_text = first_test_example["article"]
article_preview = sample_text[:1000]
print("Original Article:\n", article_preview, "...")
print("\nReference Summary:\n", first_test_example["highlights"])
print("\nGenerated Summary:\n", generate_summary(sample_text))
