# News Summarization

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [3]:
from datasets import load_dataset

# BBC News Summary
dataset = load_dataset("SurAyush/News_Summary_Dataset")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'summary'],
        num_rows: 2224
    })
})

In [5]:
split_dataset = dataset['train'].train_test_split(test_size=0.1)

In [6]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'summary'],
        num_rows: 2001
    })
    test: Dataset({
        features: ['article', 'summary'],
        num_rows: 223
    })
})

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = 'google-t5/t5-small'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
inputs = tokenizer("Hello, I need to summarized by the model")

inputs

{'input_ids': [8774, 6, 27, 174, 12, 21603, 26, 57, 8, 825, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁Hello',
 ',',
 '▁I',
 '▁need',
 '▁to',
 '▁summarize',
 'd',
 '▁by',
 '▁the',
 '▁model',
 '</s>']

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# model size
model.num_parameters()/1_000_000

60.506624

In [12]:
tokenizer.model_max_length

512

In [13]:
max_input_length = 512
max_target_length = 256

def preprocess_function(examples):

    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
    )

    # unlike translation of languages, here we have the same tokenizer for labels
    labels = tokenizer(
        examples["summary"],
        max_length=max_target_length,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]            # we only need input_ids of lables

    return model_inputs

In [14]:
tokenized_dataset = split_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2001 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2001
    })
    test: Dataset({
        features: ['article', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 223
    })
})

In [16]:
!pip install rouge_score
!pip install evaluate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=06ef47dd6ed3747474ed3ac84b5dfc89fb021862a975096f0ec8e503df627f48
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [17]:
import evaluate

rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [18]:
generated_summary = "rise in oil prices have created riots"
reference_summary = "riots caused by rise in oil prices"

scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': np.float64(0.7142857142857143),
 'rouge2': np.float64(0.5),
 'rougeL': np.float64(0.5714285714285714),
 'rougeLsum': np.float64(0.5714285714285714)}

The rouge1 variant is the overlap of unigrams — this is just a fancy way of saying the overlap of words and is exactly the metric we’ve discussed above.

rouge2 measures the overlap between bigrams (think the overlap of pairs of words).

rougeL and rougeLsum measure the longest matching sequences of words by looking for the longest common substrings in the generated and reference summaries. The “sum” in rougeLsum refers to the fact that this metric is computed over a whole summary, while rougeL is computed as the average over individual sentences.

## Let's fine tune the T5 small model

In [19]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8

logging_steps = len(tokenized_dataset["train"]) // batch_size

model_name = "news-summarizer-t5"

args = Seq2SeqTrainingArguments(
    output_dir= model_name,
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,                   # limiting the number of saves
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,        # seq2seq evaluation
    logging_steps=logging_steps,
    push_to_hub=True,
    fp16 = True
)



In [20]:
!pip install nltk



In [21]:
import nltk

nltk.download("punkt")       # punctuation rules
nltk.download("punkt_tab")       # punctuation rules

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [22]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [23]:
import numpy as np
from nltk.tokenize import sent_tokenize

def compute_metrics(eval_pred):
    '''
        similar to translation compute_metrics
    '''

    predictions, labels = eval_pred


    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 with <PAD>
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)


    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE score
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [24]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
tokenized_dataset = tokenized_dataset.remove_columns(
    split_dataset["train"].column_names
)

In [26]:
features = [tokenized_dataset["train"][i] for i in range(2)]
collated_samples = data_collator(features)

In [27]:
collated_samples.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [28]:
from huggingface_hub import notebook_login, whoami

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
whoami()

{'type': 'user',
 'id': '66a797b589b3e71262932d0d',
 'name': 'SurAyush',
 'fullname': 'Ayush Sur',
 'email': 'ayushsur26@gmail.com',
 'emailVerified': True,
 'canPay': False,
 'periodEnd': None,
 'isPro': False,
 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/RZJZW_w0wdVoOmQY250lR.png',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'Google Colab',
   'role': 'write',
   'createdAt': '2025-03-02T10:56:32.713Z'}}}

In [30]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [31]:
# before fine-tuning
trainer.evaluate()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mayushsur26[0m ([33mayushsur26-national-institute-of-technology-kurukshetra[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 5.750208377838135,
 'eval_model_preparation_time': 0.0049,
 'eval_rouge1': 13.918,
 'eval_rouge2': 8.1441,
 'eval_rougeL': 11.6651,
 'eval_rougeLsum': 12.5001,
 'eval_runtime': 14.2089,
 'eval_samples_per_second': 15.694,
 'eval_steps_per_second': 1.971}

 - 'eval_rouge1': 13.918,
 - 'eval_rouge2': 8.1441,
 - 'eval_rougeL': 11.6651,
 - 'eval_rougeLsum': 12.5001

In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Rouge1,Rouge2,Rougel,Rougelsum
1,0.8689,0.658052,0.0049,18.8745,16.2314,18.1991,18.3287
2,0.6629,0.638466,0.0049,19.3705,17.1277,18.8685,18.9594
3,0.6114,0.629368,0.0049,19.3951,17.2113,18.9315,18.9848
4,0.571,0.619669,0.0049,19.8684,17.8234,19.4646,19.5401
5,0.5451,0.619271,0.0049,19.8981,17.9851,19.5083,19.5177
6,0.5194,0.620252,0.0049,19.8675,17.9521,19.5434,19.6046
7,0.4894,0.616615,0.0049,19.8622,17.9616,19.4791,19.5669
8,0.4872,0.617713,0.0049,19.8849,17.9939,19.5328,19.5918


TrainOutput(global_step=2008, training_loss=0.5943762056856041, metrics={'train_runtime': 652.6637, 'train_samples_per_second': 24.527, 'train_steps_per_second': 3.077, 'total_flos': 2164271365816320.0, 'train_loss': 0.5943762056856041, 'epoch': 8.0})

In [34]:
trainer.push_to_hub(commit_message='Training Complete', tags="summarization")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

events.out.tfevents.1743435457.8e470de03c02.749.0:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SurAyush/news-summarizer-t5/commit/74a4bd4afd727d29502f799a9e5b0315a0d9021b', commit_message='Training Complete', commit_description='', oid='74a4bd4afd727d29502f799a9e5b0315a0d9021b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/SurAyush/news-summarizer-t5', endpoint='https://huggingface.co', repo_type='model', repo_id='SurAyush/news-summarizer-t5'), pr_revision=None, pr_num=None)

### Using the model from pipeline

In [35]:
from transformers import pipeline

hub_model_id = 'SurAyush/news-summarizer-t5'
summarizer = pipeline('summarization',model = hub_model_id)

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Device set to use cuda:0


In [36]:
text = '''
The death toll from a major earthquake in Myanmar has risen to 2,056, the junta said on Monday. More than 3,900 people were injured. A statement from a junta spokesperson said that 270 more people were still missing. It has also declared a week of national mourning and directed that national flags will fly at half-mast until April 6 “in sympathy for the loss of life and damages.” The announcement came as the tempo and urgency of rescue efforts wound down in Mandalay, one of the worst-affected cities and the country's second-largest, with more than 1.7 million inhabitants.

"The situation is so dire that it's hard to express what is happening," said Aung Myint Hussein, chief administrator of Mandalay's Sajja North mosque, AFP reported.
According to AFP, people camped out in the streets across Mandalay for a third successive night because they were either unable to return to their ruined homes or were nervous about the repeated aftershocks that rattled the city over the weekend.
Some had tents, but many, including young children, simply slept on blankets in the middle of the roads, trying to stay as far away from buildings.
Among those killed in the quake were also three Chinese nationals and two French people, reported AFP, citing China's state media and the foreign ministry in Paris.

However, with communications down in much of Myanmar, the true scale of the disaster has yet to emerge and the death toll is expected to rise significantly.
The 7.7 earthquake left wide cracks on roads, brought down buildings, and sent tremors across neighbouring countries, including China, Thailand, Vietnam and parts of India.
'''

output = summarizer(text)

In [40]:
summary = output[0]['summary_text']

In [46]:
import re

def capitalize_sentences(text):
    sentences = re.split(r'([.!?]\s*)', text)  # Split text while keeping punctuation
    capitalized_sentences = [s.capitalize() for s in sentences]
    result = ''.join(capitalized_sentences)

    result = re.sub(r'\bindia\b', 'India', result, flags=re.IGNORECASE)

    return result

In [48]:
capitalize_sentences(summary)

"The death toll from a major earthquake in myanmar has risen to 2,056, the junta said on monday.The 7.7 earthquake left wide cracks on roads, brought down buildings, and sent tremors across neighbouring countries, including china, thailand, vietnam and parts of India.The announcement came as the tempo and urgency of rescue efforts wound down in mandalay, one of the worst-affected cities and the country's second-largest, with more than 1.7 million inhabitants."