In [1]:
# !pip install datasets transformers accelerate evaluate rouge_score -q

import pandas as pd
from datasets import load_dataset, DatasetDict
from transformers import T5Tokenizer
import warnings

# Ignore simple warnings
# warnings.filterwarnings("ignore")

In [2]:
# --- 2. Load and Sample the Dataset (Corrected) ---

# Define the correct path to the folder containing the CSVs
DATA_PATH = "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/"

data_files = {
    'train': DATA_PATH + 'train.csv',
    'validation': DATA_PATH + 'validation.csv',
    'test': DATA_PATH + 'test.csv'
}

# Load all three files into a DatasetDict
print("Loading all datasets...")
full_dataset = load_dataset('csv', data_files=data_files)
print("Full dataset loaded:")
print(full_dataset)

# --- Create a Smaller Sample for Training ---
# You can adjust this number. 50k is a good balance of speed and quality.
TRAIN_SAMPLE_SIZE = 100000

print(f"\nCreating a training sample of {TRAIN_SAMPLE_SIZE} rows...")

# Create the final dataset we'll use for training
# We need to clean up potential nulls from the CSV first
clean_train = full_dataset['train'].filter(
    lambda example: example['article'] is not None and example['highlights'] is not None
)

tokenized_dataset = DatasetDict({
    'train': clean_train.shuffle(seed=42).select(range(TRAIN_SAMPLE_SIZE)),
    'validation': full_dataset['validation'].filter(
        lambda example: example['article'] is not None and example['highlights'] is not None
    ),
    'test': full_dataset['test'].filter(
        lambda example: example['article'] is not None and example['highlights'] is not None
    )
})

print("Sampled and cleaned dataset created:")
print(tokenized_dataset)

Loading all datasets...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Full dataset loaded:
DatasetDict({
    train: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 11490
    })
})

Creating a training sample of 100000 rows...


Filter:   0%|          | 0/287113 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13368 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11490 [00:00<?, ? examples/s]

Sampled and cleaned dataset created:
DatasetDict({
    train: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 100000
    })
    validation: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['id', 'article', 'highlights'],
        num_rows: 11490
    })
})


In [None]:
pip install --upgrade transformers huggingface_hub

In [3]:
# Load the T5 tokenizer
MODEL_NAME = 't5-small'
print(f"\nLoading tokenizer for '{MODEL_NAME}'...")
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# T5 models require a task-specific prefix. For summarization, we use "summarize: "
PREFIX = "summarize: "

# We'll set max token lengths for the input and output
MAX_INPUT_LENGTH = 512   # Max length for the article
MAX_TARGET_LENGTH = 128  # Max length for the summary

def preprocess_function(examples):
    """
    Prepares the data for T5.
    1. Adds the "summarize: " prefix to the article.
    2. Tokenizes the article (input).
    3. Tokenizes the highlights (target/labels).
    """
    
    # Clean and prefix inputs (handle potential None/nan values)
    inputs = [PREFIX + str(doc) for doc in examples['article']]
    
    # Tokenize the articles
    model_inputs = tokenizer(
        inputs, 
        max_length=MAX_INPUT_LENGTH, 
        truncation=True
        # We don't pad here; the DataCollator will handle it (more efficient)
    )

    # Clean and tokenize the targets (summaries)
    targets = [str(doc) for doc in examples['highlights']]
    
    # Tokenize labels using the 'as_target_tokenizer' context manager
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=MAX_TARGET_LENGTH, 
            truncation=True
            # No padding here either
        )

    # Add the tokenized labels to our model inputs
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# --- Apply the function to all splits ---
print("Tokenizing all datasets (this may take a few minutes)...")

# We use batched=True to process multiple examples at once (it's much faster)
# We remove the old columns to save memory
tokenized_dataset = tokenized_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['id', 'article', 'highlights']
)

print("Tokenization complete.")
print(tokenized_dataset)

# Let's check a single processed example
print("\n--- Example of one tokenized training item ---")
print(tokenized_dataset['train'][0])


Loading tokenizer for 't5-small'...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Tokenizing all datasets (this may take a few minutes)...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Tokenization complete.
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

--- Example of one tokenized training item ---
{'input_ids': [21603, 10, 938, 3, 5, 11163, 63, 29, 2501, 15, 3, 5, 4946, 14925, 12177, 7, 33, 338, 3, 9, 11200, 53, 748, 24, 5689, 3, 9, 18936, 842, 5013, 53, 1067, 8, 643, 5, 309, 17344, 8, 458, 88, 1408, 16, 3, 9, 1367, 22, 6, 8, 1407, 5689, 8, 3640, 458, 9, 7591, 22, 45, 8, 798, 34, 19, 3641, 552, 34, 19, 2681, 16, 3, 9, 11095, 5, 3, 30705, 6, 66, 18936, 3640, 7, 33, 2681, 16, 3, 9, 1633, 18, 2689, 11, 3, 8623, 57, 3, 867, 12, 1709, 135, 3, 18687, 53, 30, 70, 2027, 344, 9612, 5, 933, 163, 405, 48, 4285, 149, 307, 3, 9, 18936, 842, 54, 36, 2697, 1067, 8, 643,

In [None]:
!pip install evaluate
!pip install rouge_score

In [4]:
import evaluate
import numpy as np
import torch
from transformers import (
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- 1. Load Model ---
# We use T5ForConditionalGeneration, which includes the language modeling
# head on top of the decoder, making it perfect for generation tasks.
print(f"Loading model: '{MODEL_NAME}'...")
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# --- 2. Define Evaluation Metric (Deliverable 3) ---
# We'll load the ROUGE metric from the 'evaluate' library
print("Loading ROUGE metric...")
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """
    This function is called by the Trainer during evaluation.
    It decodes the model's predictions and the true labels
    and computes the ROUGE scores.
    """
    predictions, labels = eval_pred
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # In the labels, -100 is used for padding, so we must replace it
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode labels
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE expects newline-separated sentences
    decoded_preds = ["\n".join(pred.strip()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip()) for label in decoded_labels]

    # Compute ROUGE
    result = rouge.compute(
        predictions=decoded_preds, 
        references=decoded_labels, 
        use_stemmer=True
    )
    
    # Extract the main ROUGE scores
    result = {key: value for key, value in result.items()}
    
    # Add mean generated length as a metric
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

# --- 3. Define Data Collator ---
# This will dynamically pad all inputs and labels in a batch
# to the same length. This is more efficient than padding everything
# to 512 in the preprocessing step.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model
)

# --- 4. Define Training Arguments ---
# We use Seq2SeqTrainingArguments for encoder-decoder models
# --- 4. Define Training Arguments ---
# We use Seq2SeqTrainingArguments for encoder-decoder models
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_summarization_results", # Where to save checkpoints
    
    # --- Strategy ---
    eval_strategy="epoch",      # Run evaluation every epoch
    save_strategy="epoch",      # Save a checkpoint every epoch (matches eval)
    
    # --- Hyperparameters ---
    learning_rate=1e-4,         # <-- INCREASED (T5 likes higher LR)
    optim="adafactor",          # <-- CHANGED (More memory efficient)
    
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=3,
    
    # --- Checkpoint Management ---
    save_total_limit=3,               # Only keep the 3 best checkpoints
    load_best_model_at_end=True,    # <-- ADDED (Crucial for preventing overfitting)
    metric_for_best_model="rouge1", # <-- ADDED (Tells it to use ROUGE-1 as the decider)
    
    # --- Critical Flags ---
    predict_with_generate=True,       
    fp16=torch.cuda.is_available(),   
    report_to="none"                  
)

# --- 5. Initialize the Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics      # Pass our ROUGE function
)

# --- 6. Start Fine-Tuning ---
print("Starting fine-tuning...")
trainer.train()

# --- 7. Save the Final Model ---
print("Training complete. Saving final model...")
final_model_path = "./t5_final_summarizer_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Model and tokenizer saved to {final_model_path}")

2025-11-01 17:57:09.022455: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762019829.190434     105 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762019829.238371     105 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading model: 't5-small'...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loading ROUGE metric...


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(


Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.0085,1.810303,0.427,0.3263,0.3328,0.427,19.9989
2,1.9808,1.795307,0.4268,0.3258,0.3329,0.4268,19.9996
3,1.9149,1.789,0.4263,0.3257,0.3328,0.4263,19.9994


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training complete. Saving final model...
Model and tokenizer saved to ./t5_final_summarizer_model


In [6]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
import textwrap

# --- 1. Load the Fine-Tuned Model and Tokenizer ---
print("Loading final fine-tuned model and tokenizer...")
model_path = "./t5_final_summarizer_model"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the model and tokenizer from the saved directory
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = T5Tokenizer.from_pretrained(model_path)

print(f"Model loaded and on device: {device}")

# --- 2. Get a Few Samples from the Test Set ---
# We'll use the 'full_dataset' variable that we loaded at the start
# This way we can access the original, untokenized text.
test_samples = full_dataset['test'].shuffle(seed=42).select(range(5))

# --- 3. Generate Summaries for Each Sample ---
PREFIX = "summarize: "

for i, example in enumerate(test_samples):
    original_article = example['article']
    actual_summary = example['highlights']
    
    # Prepare the article for the model
    input_text = PREFIX + original_article
    
    # Tokenize the article
    inputs = tokenizer(
        input_text, 
        max_length=512,  # Must match the training max_length
        truncation=True, 
        return_tensors="pt"
    ).to(device)

    # Generate the summary
    with torch.no_grad():
        output_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=128,  # Max length for the generated summary
            num_beams=4,          # Use beam search for higher quality
            early_stopping=True
        )
    
    # Decode the generated summary
    generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # --- 4. Print the Comparison ---
    print("=" * 30)
    print(f"       EXAMPLE {i + 1}       ")
    print("=" * 30)
    
    # Use textwrap to make the long article more readable
    print("\n--- üì∞ ORIGINAL ARTICLE (truncated) ---")
    print(textwrap.fill(original_article, width=80))
    
    print("\n--- üéØ ACTUAL SUMMARY ---")
    print(actual_summary)
    
    print("\n--- ü§ñ MODEL'S GENERATED SUMMARY ---")
    print(generated_summary)
    print("\n")

Loading final fine-tuned model and tokenizer...
Model loaded and on device: cuda
       EXAMPLE 1       

--- üì∞ ORIGINAL ARTICLE (truncated) ---
Kate Winslet was a vision in blue at a London film premiere this week. Her
stunning body-con dress (top) had clearly been made to measure by Stella
McCartney. But my, what big feet ‚Äî and big leopard-print stilettos ‚Äî she has! At
5 ft 7 in, the 39-year-old Oscar-winner is certainly no towering Amazon, but
nonetheless she commands an out-of-the-ordinary UK¬†size-nine shoe. Kate is
endearingly frank on the subject, telling interviewers that Titanic co-star
Leonardo DiCaprio found the size of her feet hilarious: ‚ÄòI‚Äôd put my foot up and
he‚Äôd fall about laughing because my feet are exactly the same size as his and
he‚Äôs 6 ft 1 in. He‚Äôd refer to them as my canoes!‚Äô The average shoe size in the
UK has risen from a dainty 4¬Ω in 1900 to a roomy six today. But fascinatingly,
just like Kate, lots of stars from Elle Macpherson to Gwyneth

In [10]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import textwrap

# --- 1. Load the Fine-Tuned Model and Tokenizer ---
print("Loading final fine-tuned model and tokenizer...")
MODEL_PATH = "./t5_final_summarizer_model"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the model and tokenizer from the saved directory
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(DEVICE)
tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)

print(f"Model loaded and on device: {DEVICE}")

def summarize_article(article_text, max_summary_length=150):
    """
    Generates a summary for a given article text.
    """
    
    # 1. Prepare the article for the model
    # We must use the same prefix T5 was trained on
    PREFIX = "summarize: "
    input_text = PREFIX + article_text
    
    # 2. Tokenize the article
    print("Tokenizing input text...")
    inputs = tokenizer(
        input_text, 
        max_length=512,  # Max input length (from training)
        truncation=True, 
        return_tensors="pt"
    ).to(DEVICE)

    # 3. Generate the summary
    print("Generating summary...")
    with torch.no_grad():
        output_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_summary_length,
            num_beams=4,          # Use beam search
            early_stopping=True
        )
    
    # 4. Decode the generated summary
    generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return generated_summary

# --- Example Test ---

# Paste any long article text here
my_article = """
Lee hosted Xi at a state summit and dinner after an annual summit of the Asia-Pacific Economic Cooperation (Apec) in the South Korean city of Gyeongju, marking Xi‚Äôs first visit to the United States‚Äô ally in 11 years.
Beijing attaches great importance to relations with Seoul and sees South Korea as an inseparable cooperative partner, Xi said ahead of the summit, according to Lee‚Äôs office.
Lee, who was elected president in a snap election in June, has promised to strengthen ties with the US while not antagonising China and seeking to reduce tensions with the North.
‚ÄúI am very positive about the situation in which conditions for engagement with North Korea are being formed,‚Äù Lee said, referring to recent high-level exchanges between China and North Korea.
‚ÄúI also hope that South Korea and China will take advantage of these favourable conditions to strengthen strategic communication to resume dialogue with North Korea.‚Äù
Lee has called for a phased approach to denuclearising North Korea, starting with engagement and a freeze on further development of nuclear weapons.
In a statement on Saturday, Pyongyang, a military and economic ally of China, dismissed the denuclearisation agenda as an unrealisable ‚Äúpipe dream‚Äù.
North Korea has repeatedly and explicitly rejected Lee‚Äôs overtures, saying it will never talk to the South. In recent years Pyongyang abandoned its longstanding policy of unification with the South and called Seoul a main enemy.
Leader Kim Jong Un said he would be willing to talk to the US if Washington drops demands for denuclearisation, but he did not publicly respond when US President Donald Trump offered talks during his visit to South Korea earlier this week.
Trump and Lee announced a surprise breakthrough in talks to lower US tariffs in return for billions of dollars in investment from South Korea. The US president then departed before the main Apec leaders‚Äô summit.
South Korean national security adviser Wi Sunglac told a briefing that China expressed its willingness to cooperate for peace and stability on the Korean peninsula, but the leaders did not specifically discuss what kind of role China would play.
Both sides also agreed that the US-North Korea dialogue was most important, the adviser said.
Chinese state media reports on the meeting with Lee made no mention of the North Korea discussions.
According to Xinhua, Xi proposed ways to open a new chapter in relations, including having each country ‚Äúrespect each others social systems and development paths, accommodate core interests and major concerns, and properly handle differences through friendly consultation‚Äú.
Xi also called for upholding multilateralism and increasing cooperation in areas such as artificial intelligence, biopharmaceuticals, green industries and aging populations, Xinhua reported.
During Xi‚Äôs visit, China and South Korea signed seven agreements including a won-yuan currency swap and memorandums of understanding on online crime, businesses that cater to aging populations, and innovation, among other issues.
"""

# Generate the summary
my_summary = summarize_article(my_article)

# Print the results
print("\n" + "="*30)
print("       YOUR TEST SUMMARY       ")
print("="*30)
print("\n--- ORIGINAL ARTICLE ---")
print(textwrap.fill(my_article, width=80))
print("\n--- ü§ñ GENERATED SUMMARY ---")
print(my_summary)

Loading final fine-tuned model and tokenizer...
Model loaded and on device: cuda
Tokenizing input text...
Generating summary...

       YOUR TEST SUMMARY       

--- ORIGINAL ARTICLE ---
 Lee hosted Xi at a state summit and dinner after an annual summit of the Asia-
Pacific Economic Cooperation (Apec) in the South Korean city of Gyeongju,
marking Xi‚Äôs first visit to the United States‚Äô ally in 11 years. Beijing
attaches great importance to relations with Seoul and sees South Korea as an
inseparable cooperative partner, Xi said ahead of the summit, according to Lee‚Äôs
office. Lee, who was elected president in a snap election in June, has promised
to strengthen ties with the US while not antagonising China and seeking to
reduce tensions with the North. ‚ÄúI am very positive about the situation in which
conditions for engagement with North Korea are being formed,‚Äù Lee said,
referring to recent high-level exchanges between China and North Korea. ‚ÄúI also
hope that South Korea and Ch

In [5]:
!zip -r my_t5_model.zip /kaggle/working/t5_final_summarizer_model

  adding: kaggle/working/t5_final_summarizer_model/ (stored 0%)
  adding: kaggle/working/t5_final_summarizer_model/model.safetensors (deflated 7%)
  adding: kaggle/working/t5_final_summarizer_model/tokenizer_config.json (deflated 94%)
  adding: kaggle/working/t5_final_summarizer_model/special_tokens_map.json (deflated 85%)
  adding: kaggle/working/t5_final_summarizer_model/training_args.bin (deflated 52%)
  adding: kaggle/working/t5_final_summarizer_model/spiece.model (deflated 48%)
  adding: kaggle/working/t5_final_summarizer_model/added_tokens.json (deflated 83%)
  adding: kaggle/working/t5_final_summarizer_model/generation_config.json (deflated 28%)
  adding: kaggle/working/t5_final_summarizer_model/config.json (deflated 63%)
