### Installing dependencies on Amazon SageMaker

In [2]:
# Import essential packages
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType
from peft import PeftModel, PeftConfig

2025-05-14 19:00:33.719746: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


---
### Load Dataset

The [News Article Dataset](https://www.kaggle.com/datasets/teja098/news-article) is a curated collection of **112 news articles** sourced from major Indian newspapers, including:

- *The Hindu*
- *Hindustan Times*
- *Indian Express*
- ...and others.

Each entry in the dataset includes:

- **Newspaper Name**: Source of the article  
- **Published Date**: Date the article was released  
- **URL**: Link to the original news piece  
- **Headline**: Article title  
- **Content**: Full article text  
- **Human Summary**: Manually written summary  
- **Category**: Article classification (e.g., *Science and Technology*, *National News*, *Business*, *Environment*)



**NOTE**: In this project, I will only be using Content and Summary

---


In [3]:
news_df = pd.read_csv("news.csv")

news_df = news_df[['Content', 'Human Summary']]

news_df.rename(columns={"Human Summary": "Summary"}, inplace=True)

# Max token for flan-t5-base model is 512
news_df['Content'] = news_df['Content'].str[:512]

news_df.head(5)

Unnamed: 0,Content,Summary
0,"India successfully launched Chandrayaan-4, aim...",India launched Chandrayaan-4 to study the moon...
1,The Prime Minister unveiled the Digital India ...,"PM launched Digital India 2.0, focusing on tec..."
2,India’s GDP showed a rebound in the first quar...,"India's Q1 2021 GDP rebounded, indicating a re..."
3,Cyclone Yaas wreaked havoc in Odisha and West ...,Cyclone Yaas caused severe damage in Eastern I...
4,Hyderabad became a central hub for COVID-19 va...,Hyderabad gained recognition as the COVID-19 v...


In [4]:
# Convert pandas DataFrame to Hugging Face Dataset with a 70% on train data and 15% on each Validation and Test data

dataset = Dataset.from_pandas(news_df)

# Split the dataset (you can customize these ratios as needed)
split_dataset = dataset.train_test_split(test_size=0.3, seed=1)
test_valid = split_dataset['test'].train_test_split(test_size=0.5, seed=1)

# Constructing DatasetDict
dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})

# Viewing the structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Content', 'Summary'],
        num_rows: 78
    })
    validation: Dataset({
        features: ['Content', 'Summary'],
        num_rows: 17
    })
    test: Dataset({
        features: ['Content', 'Summary'],
        num_rows: 17
    })
})


---
### Model Loading


**FLAN-T5 Base** is a fine-tuned version of the original [T5 (Text-to-Text Transfer Transformer)](https://arxiv.org/abs/1910.10683) model developed by Google. It is part of the **FLAN (Fine-tuned LAnguage Net)** series, which incorporates **instruction tuning** to improve performance on a wide range of NLP tasks without task-specific training.
[google/flan-t5-base](https://huggingface.co/google/flan-t5-base)


####  Model Specs

- **Architecture**: T5 (Encoder-Decoder Transformer)
- **Size**: ~248M parameters
- **Training Objective**: Text-to-text tasks (e.g., translation, summarization, question answering, classification, etc.)
- **Fine-tuned With**: Instruction tuning on 60+ datasets
---

In [5]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [6]:
# Custom function to count number of trainable parameters

def trainable_model_parameters_count(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"Model parameters (trainable): {trainable_model_params}\nmodel parameters (All): {all_model_params}\n % of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(trainable_model_parameters_count(original_model))

Model parameters (trainable): 247577856
model parameters (All): 247577856
 % of trainable model parameters: 100.00%



### Testing the Baseline Model with Zero-Shot Inferencing


In [7]:
index = 15

content = dataset['test'][index]['Content']
summary = dataset['test'][index]['Summary']

prompt = f"""
Summarize the following news.

{content}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=500,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following news.

Water Minister and Delhi Jal Board (DJB) chairman Satyendar Jain on Saturday visited Rohini Lake to review the progress of various units being constructed in line with the Delhi government’s objective of transforming the Capital into “a city of lakes”.

The government plans to develop Rohini as an “abode of lakes and recreation” within 8 months.

A project revolving around the revival of lakes and water bodies in Delhi is on the AAP government’s list of priorities, the government stated, adding that it soug

Summary:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Delhi Water Minister Satyendar Jain visited Rohini Lake to assess the progress of the Delhi government's "City of Lakes" project, aimed at reviving lakes and water bodies across the city. The government pla

---
### Preprocess the Dialog-Summary Dataset

Step 1: Converting news-summary (prompt-response) pairs into an explicit labeled dataset for the LLM
Step 2: Prepend an instruction to the start of the news with `Summarize the following news` and to the start of the news summary with `Summary` as follows:

                    Training prompt (content):
                    ```
                    Summarize the following news

                    On 1st Jan 1970, ABC was elected president ........
    
                    Summary: 
                    ```

                    Training response (summary):
                    ```
                    ABC was elected as president.
                    ```

Step 3: Preprocess the prompt-response dataset into tokens and extract their `input_ids` (one per token).

---

In [8]:
def tokenize_variables(example):
    start_prompt = 'Summarize the following news.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + content + end_prompt for content in example["Content"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["Summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

tokenized_datasets = dataset.map(tokenize_variables, batched=True)

# Remove the 'Content', 'Summary' as we now have the tokenized dataset
tokenized_datasets = tokenized_datasets.remove_columns(['Content', 'Summary',])

print(tokenized_datasets)

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 78
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 17
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 17
    })
})


---
### Fine-Tune the Model with the Preprocessed Dataset

Inputs: Preprocessed dataset and reference to the original model

Output: Model ready for fine-tuning

---

In [10]:
# Output path of the finetuned model

output_dir = f'./news-summary-training'

# Training parameters
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=5
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [11]:
trainer.train()


Step,Training Loss
1,39.25
2,36.0
3,31.75
4,33.75
5,31.25


TrainOutput(global_step=5, training_loss=34.4, metrics={'train_runtime': 352.7576, 'train_samples_per_second': 0.113, 'train_steps_per_second': 0.014, 'total_flos': 27390294097920.0, 'train_loss': 34.4, 'epoch': 0.5})

In [12]:
# Save the trained instruct model into local memory
trainer.save_model(output_dir)

In [13]:
# Read the instruct model saved back from memory

instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./news-summary-training", torch_dtype=torch.bfloat16)

---
### Perform Parameter Efficient Fine-Tuning (PEFT)

PEFT (Parameter-Efficient Fine-Tuning) like LoRA allows fine-tuning large models by training only small adapter layers, making it much more efficient than full fine-tuning.

During inference time, the adapter is combined with the base model, allowing multiple adapters to reuse the same LLM, reducing memory and compute costs.

---

In [33]:
#Setup the PEFT/LoRA model for Fine-Tuning

# Setting rank to be 32
# "q" = query, "v" = value
# Each attention head computes: Query (Q) Key (K) Value (V)

lora_config = LoraConfig(
    r=16, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.01,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [34]:
# Add LoRA adapter layers/parameters to the original LLM to be trained.

peft_model = get_peft_model(original_model, 
                            lora_config)
print(trainable_model_parameters_count(peft_model))

Model parameters (trainable): 1769472
model parameters (All): 249347328
 % of trainable model parameters: 0.71%


### Training PEFT Adapter


In [37]:
output_dir = f'./peft-news-summary-training'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, 
    num_train_epochs=1,
    logging_steps=1,
    max_steps=5,    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [38]:
peft_trainer.train()

peft_model_path="./peft-news-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,32.25
2,31.375
3,28.25
4,29.25
5,26.0


('./peft-news-summary-checkpoint-local/tokenizer_config.json',
 './peft-news-summary-checkpoint-local/special_tokens_map.json',
 './peft-news-summary-checkpoint-local/spiece.model',
 './peft-news-summary-checkpoint-local/added_tokens.json',
 './peft-news-summary-checkpoint-local/tokenizer.json')

In [39]:
#Prepare this model by adding an adapter to the original FLAN-T5 model. 
# `is_trainable=False` because we only want to perform inference with this PEFT model.


peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base, 
                                       './peft-news-summary-checkpoint-local/', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

# Thus number of trainable parameters = `0` 
print(trainable_model_parameters_count(peft_model))



Model parameters (trainable): 0
model parameters (All): 249347328
 % of trainable model parameters: 0.00%


  adapters_weights = torch.load(



### Evaluating the Model Quantitatively (with ROUGE Metric)

In [40]:
contents = dataset['test'][0:5]['Content']
human_baseline_summaries = dataset['test'][0:5]['Summary']


original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, content in enumerate(contents):
    prompt = f"""
Summarize the following news.

{content}

Summary: """
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    human_baseline_text_output = human_baseline_summaries[idx]
    
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Delhi Chief Minister Arvind Kejriwal unveiled ...,The government of India has launched a five-po...,The government has launched a five-point plan ...,The government has launched a five-point plan ...
1,"In October 2022, five companies will issue bon...",.,- Bonus shares in October 2022: After the arri...,- Bonus shares in October 2022: After the arri...
2,The 2022 budget aims to enhance infrastructure...,The government has announced a budget of $1.25...,The government has allocated a total of £2 mil...,The government has allocated a total of £2 mil...
3,A Japanese woman was forced to urinate on a st...,The woman who smashed a window into a plane in...,A woman who was snatched from a plane after be...,A woman who was snatched from a plane after be...
4,The study highlights the significant impact of...,The study found that women with endometriosis ...,Women with endometriosis are more likely to mi...,Women with endometriosis are more likely to mi...


In [30]:
#Compute ROUGE score for this subset of the data. 
                   
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.22323511894334863, 'rouge2': 0.0568236582694414, 'rougeL': 0.17300493991521565, 'rougeLsum': 0.1729411764705882}
INSTRUCT MODEL:
{'rouge1': 0.22392358891823694, 'rouge2': 0.0763635533303331, 'rougeL': 0.18504477341400466, 'rougeLsum': 0.1844192033923974}
PEFT MODEL:
{'rouge1': 0.2053022371517239, 'rouge2': 0.0758024123436602, 'rougeL': 0.16722297496408264, 'rougeLsum': 0.16522754082076116}
