In [None]:
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer

# path where Kaggle will mount your committed output
INPUT_DIR = "/kaggle/input/t5-trained-yesterday/t5_model"
# working-directory copy of the checkpoint (same folder the save cell writes to)
LOCAL_DIR = "/kaggle/working/t5_model"

if os.path.isdir(INPUT_DIR):
    # A checkpoint from an earlier session is mounted: load it instead of retraining.
    print("⏳ Loading model from previous session…")
    model = T5ForConditionalGeneration.from_pretrained(INPUT_DIR)
    tokenizer = T5Tokenizer.from_pretrained(INPUT_DIR)
    # copy it locally if you need write access
    if not os.path.isdir(LOCAL_DIR):
        # shutil.copytree replaces distutils.dir_util.copy_tree: distutils is
        # deprecated (PEP 632) and no longer ships with Python 3.12+.
        import shutil
        shutil.copytree(INPUT_DIR, LOCAL_DIR, dirs_exist_ok=True)
else:
    # no saved model found – run your training loop
    print("⚠️  No saved model found; starting training from scratch.")
    # … your existing training code here …


In [2]:
! pip install datasets evaluate transformers rouge-score nltk

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d1ed39a0383c4490eeb1047f3bdd0c3ab2d5cbec40c086f1cbc78fcb847c0332
  Sto

In [3]:
# Install git-lfs through the system package manager.
# NOTE(review): presumably only needed when pushing checkpoints to the Hub; the
# training arguments in this notebook set push_to_hub=False — confirm it is still required.
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 87 not upgraded.


In [3]:
import transformers

# Record the installed transformers version in the notebook output for reproducibility.
print(transformers.__version__)

4.51.3


In [5]:
from transformers.utils import send_example_telemetry

# NOTE(review): presumably reports usage of this example notebook to Hugging Face;
# nothing else in the notebook depends on this call — confirm before keeping it.
send_example_telemetry("summarization_notebook", framework="pytorch")

In [6]:
# Checkpoint name used below to load the tokenizer and the model, and to pick the task prefix.
model_checkpoint = "t5-small"

In [43]:
# 1) (Optional) upgrade to the latest `datasets` so you’re on the most recent bug‑fixes:
!pip install --upgrade datasets

from datasets import load_dataset
from evaluate import load

# 2) Make sure there's no local "xsum" folder lying around:
#    either delete or rename it in your file browser or via `!rm -rf xsum`

# 3) Load the dataset (with an explicit cache directory, if you'd like):
raw_datasets = load_dataset("xsum", cache_dir="./hf_cache",trust_remote_code=True)

# 4) Load ROUGE:
metric = load("rouge")

# Inspect
print(raw_datasets)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})


In [44]:
# Display the DatasetDict (splits, feature names and row counts).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [9]:
# Peek at a single training example (index 24).
raw_datasets["train"][24]

{'document': 'Mr Fox, 54, from London, denies eight counts of indecent assault and two counts of sexual assault between 1988 and 2014.\nHe said there was often "horseplay" with colleagues, involving "piggybacks, tickling and squeezing".\nBut he told Westminster Magistrates\' Court such behaviour was consensual.\nMr Fox, who uses the nicknames Dr Fox and Foxy, became well known for presenting the chart show on Capital Radio, and was a judge on the ITV show Pop Idol between 2001 and 2003 alongside Simon Cowell.\nHe joined Magic 105.4 in 2005, where he presents the breakfast show, Foxy in the Morning. He is currently not hosting the show.\nGiving evidence on Wednesday, Mr Fox said he had worked with "hundreds" of female colleagues during his career, but had never been accused of sexually inappropriate behaviour until last year.\nUnder questioning from his defence counsel, Jonathan Caplan QC, he told the court his teams had kept their energy up during live broadcasts by playing loud music,

In [11]:
# Display the loaded ROUGE module, including its usage text.
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [12]:
# Sanity check: predictions identical to the references should give a perfect score.
fake_preds = ["hello there", "general kenobi"]
fake_labels = ["hello there", "general kenobi"]
metric.compute(predictions=fake_preds, references=fake_labels)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [13]:
from transformers import AutoTokenizer

# Tokenizer matching `model_checkpoint`. This rebinds `tokenizer`, replacing any
# instance the first cell may have loaded from a previous session.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [14]:
# Tokenize one sentence: the result holds `input_ids` and an `attention_mask`.
tokenizer("Hello, this one sentence!")

{'input_ids': [8774, 6, 48, 80, 7142, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [15]:
# Tokenize a list of sentences: one id list and one mask list per sentence.
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [16]:
# Same sentences passed as targets via `text_target`, the form preprocess_function uses for summaries.
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [17]:
# The listed T5 checkpoints get the "summarize: " task prefix prepended to every
# input; any other checkpoint gets no prefix.
prefix = (
    "summarize: "
    if model_checkpoint in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
    else ""
)

In [18]:
# Truncation limits (in tokens) for source documents and target summaries.
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs with ``labels``.

    Args:
        examples: mapping with a "document" list and a "summary" list.

    Returns:
        The tokenizer output for the prefixed documents, with the token ids of
        the tokenized summaries stored under the "labels" key.
    """
    sources = []
    for doc in examples["document"]:
        sources.append(prefix + doc)
    encoded = tokenizer(sources, max_length=max_input_length, truncation=True)

    # Summaries go through the tokenizer's target path.
    targets = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    encoded["labels"] = targets["input_ids"]
    return encoded

In [19]:
# Try the preprocessing on the first two training examples.
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[21603, 10, 37, 423, 583, 13, 1783, 16, 20126, 16496, 6, 80, 13, 8, 844, 6025, 4161, 6, 19, 341, 271, 14841, 5, 7057, 161, 19, 4912, 16, 1626, 5981, 11, 186, 7540, 16, 1276, 15, 2296, 7, 5718, 2367, 14621, 4161, 57, 4125, 387, 5, 15059, 7, 30, 8, 4653, 4939, 711, 747, 522, 17879, 788, 12, 1783, 44, 8, 15763, 6029, 1813, 9, 7472, 5, 1404, 1623, 11, 5699, 277, 130, 4161, 57, 18368, 16, 20126, 16496, 227, 8, 2473, 5895, 15, 147, 89, 22411, 139, 8, 1511, 5, 1485, 3271, 3, 21926, 9, 472, 19623, 5251, 8, 616, 12, 15614, 8, 1783, 5, 37, 13818, 10564, 15, 26, 3, 9, 3, 19513, 1481, 6, 18368, 186, 1328, 2605, 30, 7488, 1887, 3, 18, 8, 711, 2309, 9517, 89, 355, 5, 3966, 1954, 9233, 15, 6, 113, 293, 7, 8, 16548, 13363, 106, 14022, 84, 47, 14621, 4161, 6, 243, 255, 228, 59, 7828, 8, 1249, 18, 545, 11298, 1773, 728, 8, 8347, 1560, 5, 611, 6, 255, 243, 72, 1709, 1528, 161, 228, 43, 118, 4006, 91, 12, 766, 8, 3, 19513, 1481, 410, 59, 5124, 5, 96, 196, 17, 19, 1256, 68, 27, 103, 317, 132

In [20]:
# Apply preprocess_function to every split; batched=True hands it batches of examples at a time.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [21]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained checkpoint to fine-tune. This rebinds `model`, replacing
# any model the first cell may have loaded from a previous session.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [22]:
# Per-device batch size used for both training and evaluation.
batch_size = 4
# Last path component of the checkpoint id, used to name the run directory.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # compute_metrics decodes the predictions as token ids, so evaluation must generate text
    predict_with_generate=True,
    # NOTE(review): assumes a GPU with half-precision support is available — confirm for the target runtime
    fp16=True,
    push_to_hub=False,
    # empty list: no experiment-tracking integrations
    report_to=[], 
)

In [23]:
# Batch collator handed to the trainer below; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first item holds the ids.

    Returns:
        dict with every score returned by ``metric.compute`` scaled by 100,
        plus ``gen_len`` (mean number of non-pad tokens per prediction);
        all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded.
    # Predictions can carry it as well as labels, so both are mapped to the pad token.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Express the scores as percentages
    result = {key: value * 100 for key, value in result.items()}

    # Mean generated length; computed after the -100 replacement so padding is not counted.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [26]:

# Wire model, training arguments, tokenized splits, collator and metric function together.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Long-running: the recorded run reports a train_runtime of about 10,211 seconds for one epoch.
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.6767,2.41901,29.4298,8.416,23.1224,23.1278,19.7013


TrainOutput(global_step=51012, training_loss=2.7069372709784743, metrics={'train_runtime': 10211.3757, 'train_samples_per_second': 19.982, 'train_steps_per_second': 4.996, 'total_flos': 4.357795090946458e+16, 'train_loss': 2.7069372709784743, 'epoch': 1.0})

In [27]:
# Evaluate the model on the trainer's eval_dataset (the validation split passed above)
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 2.4190096855163574, 'eval_rouge1': 29.4298, 'eval_rouge2': 8.416, 'eval_rougeL': 23.1224, 'eval_rougeLsum': 23.1278, 'eval_gen_len': 19.7013, 'eval_runtime': 955.1725, 'eval_samples_per_second': 11.864, 'eval_steps_per_second': 2.966, 'epoch': 1.0}


In [28]:
!pip install newspaper3k lxml[html_clean]
!pip install newspaper3k
from newspaper import Article

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




> # Test 1 - Summarization from text

In [1]:
import torch
def summarize_text(text, max_input_len=1024, max_summary_len=128):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: article or paragraph to summarize; the global ``prefix`` is prepended.
        max_input_len: token limit the input is truncated to.
        max_summary_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    device = model.device
    inputs = tokenizer(prefix + text, return_tensors="pt", max_length=max_input_len, truncation=True).to(device)
    with torch.no_grad():
        # Forward the attention mask explicitly (as summarize_with_prompt does)
        # instead of leaving generate() to infer it from the input ids.
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_summary_len,
            num_beams=4,
            early_stopping=True,
        )
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary


# Example use
# Interactive: waits for text typed on stdin, so this cell cannot run unattended.
# NOTE(review): the recorded run of this cell ended in "NameError: name 'model' is not
# defined" — the cells that create `model` and `tokenizer` must be executed first.
user_input = input("Enter a long article or paragraph: ")
print("\n🔹 Summary:\n", summarize_text(user_input))


Enter a long article or paragraph:  At 10 am on June 12, 1975, Justice Jagmohan Lal Sinha reached Courtroom Number 24 of the Allahabad High Court and took his seat in the jam-packed courtroom. And then, he pronounced a judgment that would go on to have epochal consequences for then Prime Minister Indira Gandhi — and India.  Allowing the petition of Raj Narain, who, following his loss to Indira Gandhi in the 1971 election, had moved court alleging electoral malpractices by the Prime Minister, Justice Sinha said, “This petition is allowed and the election of Smt. Indira Nehru Gandhi, Respondent No. 1, to the Lok Sabha is declared void… (Indira Gandhi) accordingly stands disqualified for a period of six years from the date of this order.”  For the first time in the history of independent India, a Prime Minister’s election had been set aside. Months earlier, the courtroom had witnessed another first — the Prime Minister being cross-examined for two consecutive days.  Explained | Explained:

NameError: name 'model' is not defined

 





> # Test 2 - Summarization from URL

In [52]:

def get_article_text_from_url(url):
    """Download the page at ``url`` with newspaper's ``Article`` and return its parsed text.

    No errors are caught here: anything raised while downloading or parsing
    reaches the caller.
    """
    page = Article(url)
    page.download()
    page.parse()
    body = page.text
    return body

def summarize_text(text, max_input_len=1024, max_summary_len=128):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    This cell redefines the ``summarize_text`` helper already defined in the
    Test 1 cell; whichever cell ran last is the one in effect.

    Args:
        text: article or paragraph to summarize; the global ``prefix`` is prepended.
        max_input_len: token limit the input is truncated to.
        max_summary_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    inputs = tokenizer(prefix + text, return_tensors="pt", max_length=max_input_len, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Ensure same device
    # Forward the attention mask explicitly (as summarize_with_prompt does)
    # instead of leaving generate() to infer it from the input ids.
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_len,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary

# Example use
# Interactive: waits for a URL typed on stdin, then downloads the page and summarizes it.
# Download/parse errors from get_article_text_from_url are not handled here.
article_url = input("Enter the URL of a news article: ")
article_text = get_article_text_from_url(article_url)
print("\n🔹 Summary:\n", summarize_text(article_text))


Enter the URL of a news article:  https://www.bbc.com/weather/articles/c4grg1w2xr7o



🔹 Summary:
 Heatwaves in the north and west of the UK are set to reach a threshold for at least three consecutive days.


> # Test 3 - Topic-guided summarization from text

In [32]:
def topic_guided_summary(text, topic, max_input_len=1024, max_summary_len=128):
    """Summarize ``text`` with ``topic`` placed in front of it as a hint.

    Uses the notebook-level ``model``, ``tokenizer`` and ``prefix``.

    Args:
        text: article or paragraph to summarize.
        topic: word or phrase inserted between the task prefix and the text.
        max_input_len: token limit the input is truncated to.
        max_summary_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    device = model.device
    # The prompt string is kept exactly as before so the model input does not change.
    # NOTE(review): when `prefix` already ends with a space this yields two spaces before the topic.
    guided_input = f"{prefix} {topic}: {text}"
    inputs = tokenizer(guided_input, return_tensors="pt", max_length=max_input_len, truncation=True).to(device)
    with torch.no_grad():
        # Forward the attention mask explicitly (as summarize_with_prompt does)
        # instead of leaving generate() to infer it from the input ids.
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_summary_len,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Example use
# Interactive: reads the text and the guiding topic from stdin.
# Note: this rebinds the notebook-level names `text` and `topic`.
text = input("Enter the text to summarize: ")
topic = input("Enter the topic to guide the summary: ")
print("\n🔹 Topic-Guided Summary:\n", topic_guided_summary(text, topic))


Enter the text to summarize:  At 10 am on June 12, 1975, Justice Jagmohan Lal Sinha reached Courtroom Number 24 of the Allahabad High Court and took his seat in the jam-packed courtroom. And then, he pronounced a judgment that would go on to have epochal consequences for then Prime Minister Indira Gandhi — and India.  Allowing the petition of Raj Narain, who, following his loss to Indira Gandhi in the 1971 election, had moved court alleging electoral malpractices by the Prime Minister, Justice Sinha said, “This petition is allowed and the election of Smt. Indira Nehru Gandhi, Respondent No. 1, to the Lok Sabha is declared void… (Indira Gandhi) accordingly stands disqualified for a period of six years from the date of this order.”  For the first time in the history of independent India, a Prime Minister’s election had been set aside. Months earlier, the courtroom had witnessed another first — the Prime Minister being cross-examined for two consecutive days.  Explained | Explained: The s


🔹 Topic-Guided Summary:
 The Supreme Court of India has ruled that the election of Smt. Indira Nehru Gandhi has been declared void.


# Test 4 Prompt-Based Topic-Guided Summarization ──────────────────────

In [53]:
def summarize_with_prompt(model, tokenizer, text, topic,
                          max_input_len=1024, max_summary_len=150):
    """Summarize ``text`` behind a natural-language instruction that names ``topic``.

    Args:
        model: model exposing ``device`` and ``generate``.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        text: article text to summarize.
        topic: focus named in the instruction.
        max_input_len: token limit the prompt is truncated to.
        max_summary_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    prompt = f"summarize the following text with focus on {topic}: " + text
    encoded = tokenizer(prompt, return_tensors="pt", max_length=max_input_len, truncation=True)

    # Move every encoded field onto the model's device before generating.
    on_device = {}
    for field, tensor in encoded.items():
        on_device[field] = tensor.to(model.device)

    generation_kwargs = dict(
        input_ids=on_device["input_ids"],
        attention_mask=on_device["attention_mask"],
        max_length=max_summary_len,
        num_beams=5,
        early_stopping=True,
    )
    generated = model.generate(**generation_kwargs)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)
# Interactive: reads the article URL and the focus topic from stdin (surrounding whitespace stripped).
url = input("🔗 Enter the article URL: ").strip()
topic = input("🎯 Enter the topic to focus on: ").strip()

# ─── Run and Display Results ─────────────────────────────────────────────
article_text = get_article_text_from_url(url)

# Reject empty text, text containing an error marker, or fewer than 50 whitespace-separated words.
# NOTE(review): get_article_text_from_url as defined in this notebook never returns a
# "❌ Error" string (it lets exceptions propagate), so that check looks like a leftover — confirm.
if not article_text or "❌ Error" in article_text or len(article_text.split()) < 50:
    print("❗ Unable to extract a valid article. Please check the URL.")
else:
    print("\n🔹 Prompt-Based Summary:")
    print(summarize_with_prompt(model, tokenizer, article_text, topic))


🔗 Enter the article URL:  https://www.bbc.com/weather/articles/c4grg1w2xr7o
🎯 Enter the topic to focus on:  Temperature



🔹 Prompt-Based Summary:
Temperatures in the north and west of the UK are set to soar to more than 30C (86F) this week, according to the Met Office.


# Test 5 Control-Token Topic-Guided Summarization ──────────────

In [50]:
# Marker placed at the start of the prompt, followed by the topic and the text.
control_token = "<FOCUS_ON>"

def summarize_with_control_token(model, tokenizer, text, topic,
                                 max_input_len=1024, max_summary_len=150):
    """Summarize ``text`` using a ``<FOCUS_ON> topic text`` style prompt.

    Side effects: registers ``control_token`` in ``tokenizer`` if it is missing,
    and grows the model's token embeddings when the tokenizer has more entries
    than the embedding matrix has rows.

    NOTE(review): the token is added at inference time; nothing in this notebook
    trains the model on prompts containing it — confirm this is intended.

    Args:
        model: model exposing ``device``, ``generate``, ``get_input_embeddings``
            and ``resize_token_embeddings``.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        text: article text to summarize.
        topic: topic placed right after the control token.
        max_input_len: token limit the prompt is truncated to.
        max_summary_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    if control_token not in tokenizer.get_vocab():
        tokenizer.add_tokens([control_token])

    # Resize only when necessary: an embedding matrix can already have more rows
    # than the tokenizer has entries, and resizing to len(tokenizer) would then
    # shrink it instead of making room for the new token.
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        model.resize_token_embeddings(len(tokenizer))

    prompt = f"{control_token} {topic} {text}"
    inputs = tokenizer(prompt,
                       return_tensors="pt",
                       max_length=max_input_len,
                       truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_len,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
    
# Interactive: reads the article URL and the focus topic from stdin (surrounding whitespace stripped).
url = input("🔗 Enter the article URL: ").strip()
topic = input("🎯 Enter the topic to focus on: ").strip()

article_text = get_article_text_from_url(url)

# Reject empty text, text containing an error marker, or fewer than 50 whitespace-separated words.
# NOTE(review): get_article_text_from_url as defined in this notebook never returns a
# "❌ Error" string (it lets exceptions propagate), so that check looks like a leftover — confirm.
if not article_text or "❌ Error" in article_text or len(article_text.split()) < 50:
    print("❗ Unable to extract a valid article. Please check the URL.")
else:
    print("\n🔹 Control Token-Based Summary:")
    print(summarize_with_control_token(model, tokenizer, article_text, topic))

🔗 Enter the article URL:  https://www.bbc.com/weather/articles/c4grg1w2xr7o
🎯 Enter the topic to focus on:  Temperature



🔹 Control Token-Based Summary:
For a heatwave to be declared by the Met Office we need to reach a threshold temperature for at least three consecutive days.


In [49]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os

# Destination under /kaggle/working, which can later be downloaded or committed as output.
OUT_DIR = "/kaggle/working/t5_model"

# Create the folder if needed; an existing one is reused.
os.makedirs(OUT_DIR, exist_ok=True)

# Persist the trained `model` first, then its `tokenizer`, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUT_DIR)

print(f"✅ Model & tokenizer saved to {OUT_DIR}")


✅ Model & tokenizer saved to /kaggle/working/t5_model
