## Importing Libs:

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns  
import re 
import nltk
import spacy
import pytextrank
import evaluate

from transformers import pipeline
from rouge_score import rouge_scorer

nltk.download('punkt')

c:\Users\KIIT0001\Downloads\text_summarizer-main\text_summarizer-main\texts\Lib\site-packages


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading Dataset:

In [2]:
try:
    df = pd.read_csv('news_summary.csv', encoding='latin-1')
except Exception as e:
    print(f"Failed to load news_summary.csv: {e}")

In [3]:
df.rename(columns={'ctext': 'full_text', 'text': 'summary'}, inplace=True)
print(f"Original data shape: {df.shape}")

Original data shape: (4514, 6)


## Cleaning dataset:

In [4]:
def clean_text(text):
    # Lowercase the text
    text = str(text)
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# Apply cleaning to both text and summary
df['clean_text'] = df['summary'].apply(clean_text)

df['clean_full_text'] = df['full_text'].apply(clean_text)

df.dropna(subset=['full_text', 'summary'], inplace=True)
print(f"Data shape after dropping NaNs: {df.shape}")


Data shape after dropping NaNs: (4396, 8)


## Extractive Summarization (TextRank Baseline):


### --- Create a Sample ---

In [5]:
SAMPLE_SIZE = 500
df_sample = df.head(SAMPLE_SIZE).copy()
print(f"Processing a sample of {len(df_sample)} articles...")

Processing a sample of 500 articles...


### --- Loading model ---

In [6]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

print("spaCy model with pytextrank loaded.")

spaCy model with pytextrank loaded.


### --- Initialize Scorer and Score Lists ---

In [7]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

In [8]:
print("Starting TextRank summarization and ROUGE evaluation...")

for i, row in enumerate(df_sample.itertuples()):
    if (i + 1) % 100 == 0:
        print(f"  ...processed {i + 1}/{len(df_sample)}")

    article = row.full_text
    reference_summary = row.summary

    try:
        # 1. Process the article with the nlp pipeline
        doc = nlp(article)

        # 2. Generate Summary (pytextrank)
        # We'll ask for a summary of 3 sentences
        # doc._.textrank.summary() returns a list of sentences
        summary_sentences = [sent.text for sent in doc._.textrank.summary(limit_sentences=3)]
        generated_summary = " ".join(summary_sentences)

        if not generated_summary:
            continue
            
        # 3. Calculate ROUGE Score
        scores = scorer.score(reference_summary, generated_summary)

        # 4. Store F-measures
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    except Exception as e:
        # Catch any other processing errors
        print(f"Error processing row {i}: {e}")
        continue

print("...Evaluation complete.")

Starting TextRank summarization and ROUGE evaluation...
  ...processed 100/500
  ...processed 200/500
  ...processed 300/500
  ...processed 400/500
  ...processed 500/500
...Evaluation complete.


### --- Calculate and Print Average Scores ---

In [9]:
print("\n--- pytextrank Baseline ROUGE Scores (F-measure) ---")
print(f"Total articles successfully scored: {len(rouge1_scores)} / {SAMPLE_SIZE}")

if rouge1_scores:
    print(f"Average ROUGE-1: {np.mean(rouge1_scores):.4f}")
    print(f"Average ROUGE-2: {np.mean(rouge2_scores):.4f}")
    print(f"Average ROUGE-L: {np.mean(rougeL_scores):.4f}")
else:
    print("No articles were successfully scored. Check your data or sample size.")


--- pytextrank Baseline ROUGE Scores (F-measure) ---
Total articles successfully scored: 500 / 500
Average ROUGE-1: 0.3929
Average ROUGE-2: 0.1703
Average ROUGE-L: 0.2482


##  Abstractive Summarization with Transformers

In [10]:
from datasets import Dataset, DatasetDict

### Converting pandas DataFrame into a Hugging Face Dataset

In [11]:
hg_dataset = Dataset.from_pandas(df)
train_test_split = hg_dataset.train_test_split(train_size=1000, test_size=200, seed=42)

In [12]:
hg_dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})
print("Hugging Face Dataset created:")
print(hg_dataset)

Hugging Face Dataset created:
DatasetDict({
    train: Dataset({
        features: ['author', 'date', 'headlines', 'read_more', 'summary', 'full_text', 'clean_text', 'clean_full_text', '__index_level_0__'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['author', 'date', 'headlines', 'read_more', 'summary', 'full_text', 'clean_text', 'clean_full_text', '__index_level_0__'],
        num_rows: 200
    })
})


In [13]:
from transformers import AutoTokenizer

# Load the tokenizer for 't5-small'
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# This prefix is a T5 requirement to tell it what task to do
prefix = "summarize: "

# Set max lengths for your text. You can adjust these.
# Articles longer than 512 tokens will be cut off (truncated).
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 150

# Create the preprocessing function
def preprocess_function(examples):
    # Add the prefix to all articles
    inputs = [prefix + doc for doc in examples['full_text']]
    
    # Tokenize the articles (inputs)
    model_inputs = tokenizer(inputs, 
                             max_length=MAX_INPUT_LENGTH, 
                             truncation=True)

    # Tokenize the summaries (labels)
    # We use 'text_target' for the tokenizer in summarization
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'], 
                           max_length=MAX_TARGET_LENGTH, 
                           truncation=True)

    # Add the tokenized labels to our model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Now, apply this function to our entire dataset
# The 'batched=True' part makes this run much faster
tokenized_datasets = hg_dataset.map(preprocess_function, batched=True)

print("\nTokenized dataset example:")
print(tokenized_datasets['train'][0])

Map: 100%|██████████| 1000/1000 [00:00<00:00, 1562.00 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1198.87 examples/s]


Tokenized dataset example:
{'author': 'Abhishek Bansal', 'date': '18 Feb 2017,Saturday', 'headlines': 'Aussie players sledged me during warm-up game: Shreyas Iyer', 'read_more': 'http://indiatoday.intoday.in/story/shreyas-iyer-david-warner-matthew-wade-india-a-vs-australia/1/886144.html ', 'summary': 'India A batsman Shreyas Iyer has said he was sledged by Australian wicketkeeper Matthew Wade and David Warner on the second day of the ongoing practice match. "They started to sledge and said \'this guy does not have defence, he can play only attacking shots\'," said Iyer. "I am used to this type of sledging," the batsman added.', 'full_text': 'Attacking young batsman from Mumbai, Shreyas Iyer, said he was sledged by wicketkeeper Matthew Wade and David Warner during the course of the day two of the ongoing three-day practice match between India A and Australia.Iyer slammed an unbeaten 85 off 93 balls against Australia to power India A to 176/4 in reply to the visitors\' 469/7 declared.Iy




In [14]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- 1. Load ROUGE Metric for Evaluation ---
# This is the same metric we'll use to compare against our baseline
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """
    This function is called by the Trainer to compute ROUGE scores.
    """
    predictions, labels = eval_pred
    
    # Decode generated summaries (predictions)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in labels (which are padding tokens)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries (labels)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE expects newlines after each sentence
    decoded_preds = ["\n".join(pred.split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.split()) for label in decoded_labels]
    
    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    # Extract the F-measures
    result = {key: value * 100 for key, value in result.items()}
    
    # Get median generation length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.median(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}


# --- 2. Load the Pre-trained T5 Model ---
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


# --- 3. Define Training Arguments ---
# This object holds all the settings for the training run
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small-summarizer",         # Where to save the model
    eval_strategy="epoch",             # Run evaluation at the end of each epoch
    learning_rate=2e-5,                      # The learning rate
    per_device_train_batch_size=8,           # Batch size for training
    per_device_eval_batch_size=8,            # Batch size for evaluation
    weight_decay=0.01,                       # Regularization
    save_total_limit=3,                      # Only keep the 3 best checkpoints
    num_train_epochs=3,                      # Number of times to go over the data
    predict_with_generate=True,              # MUST be True for summarization
    fp16=False,                              # Set to True if you have a modern GPU
)


# --- 4. Create the Data Collator ---
# This pads your inputs and labels dynamically to the longest 
# sequence in a batch, which is more efficient.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


# --- 5. Initialize the Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Pass our ROUGE function
)


# --- 6. Start Training! ---
print("Starting training...")
trainer.train()

# --- 7. Save the Final Model ---
print("Training complete. Saving model...")
trainer.save_model("t5-small-summarizer-final")
print("Model saved to 't5-small-summarizer-final'")

  trainer = Seq2SeqTrainer(


Starting training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.024103,23.9068,12.2193,20.5513,23.9479,20.0
2,No log,1.976378,24.2778,12.6363,20.7274,24.2955,20.0
3,No log,1.96624,24.3311,12.8032,20.8724,24.3738,20.0


Training complete. Saving model...
Model saved to 't5-small-summarizer-final'


### loading fine tuned model: 

In [14]:
from transformers import pipeline, AutoTokenizer

# Path where you saved your final model
model_dir = "t5-small-summarizer-final" 

# Load the tokenizer separately to check length
tokenizer = AutoTokenizer.from_pretrained(model_dir) 
# Load the model using the pipeline
summarizer = pipeline("summarization", model=model_dir, tokenizer=model_dir)

print(f"Fine-tuned model loaded from {model_dir}") 

# --- Get Samples ---
num_samples_to_show = 3
samples = hg_dataset['test'].select(range(num_samples_to_show))
prefix = "summarize: "
# Define max input length (same as used in training)
MAX_INPUT_LENGTH = 512 

for i, sample in enumerate(samples):
    # --- Truncate Input Manually (Fix for Warning 3) ---
    # Encode, truncate, then decode back to string to ensure length limit
    inputs_truncated = tokenizer.encode(
        prefix + sample['full_text'], 
        max_length=MAX_INPUT_LENGTH, 
        truncation=True, 
        return_tensors="pt" # Return PyTorch tensor
    )
    truncated_article_text = tokenizer.decode(inputs_truncated[0], skip_special_tokens=True)
    # Remove the prefix temporarily added during tokenization check if necessary
    if truncated_article_text.startswith(prefix.strip()):
         truncated_article_text = truncated_article_text[len(prefix.strip()):].strip()
            
    # Add prefix back for the actual summarizer input
    final_input_text = prefix + truncated_article_text
            
    # --- Reference Summary ---
    reference_summary = sample['summary']

    # --- Generate Summary (Fix for Warning 2) ---
    generated_summary = summarizer(final_input_text, 
                                   max_new_tokens=60,  # Explicitly control NEW tokens
                                   min_length=30,      # Min words in summary still applies
                                   num_beams=4,        
                                   early_stopping=True)[0]['summary_text']

    print(f"\n--- Example {i+1} ---")
    print(f"**Original Article (first 100 chars):** {sample['full_text'][:100]}...")
    print(f"\n**Reference Summary:** {reference_summary}")
    print(f"\n**Generated Summary (T5):** {generated_summary}")
    print("-" * 50)

Device set to use cpu
Your max_length is set to 200, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


Fine-tuned model loaded from t5-small-summarizer-final

--- Example 1 ---
**Original Article (first 100 chars):** On this day, 68 years ago, Mahatma Gandhi had breathed his last after being fatally shot while on hi...

**Reference Summary:** Mahatma Gandhi was assassinated on January 30, 1948, during a prayer meeting at Delhi?s Birla House. He was shot thrice by Nathuram Godse. "If I'm to die by the bullet of a mad man, I must do so smiling. God must be in my heart and on my lips," Gandhi had reportedly said two days before his assassination.

**Generated Summary (T5):** 68 years ago, Mahatma Gandhi had breathed his last after being fatally shot while on his way to a prayer meeting at the Birla House. Here are some things to know about his assassination.
--------------------------------------------------


Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors



--- Example 2 ---
**Original Article (first 100 chars):** A D-day veteran who jumped 15,000ft from a plane has become the oldest person in the world to skydiv...

**Reference Summary:** Bryson William Verdun Hayes, at the age of 101 years and 38 days, became the oldest person in the world to skydive after jumping from 15,000 feet. Verdun took ten members of his family to skydive along with him. Interestingly, the 101-year-old had been presented with the Legion d'honneur for his heroic actions in World War II.

**Generated Summary (T5):** Bryson William Verdun Hayes, from Croyde, Devon, jumped 15,000ft from a plane on Sunday, breaking the British record for the oldest skydiver in the world . he took to the skies with 10 members of his family at Sky
--------------------------------------------------

--- Example 3 ---
**Original Article (first 100 chars):** Hindu Mahasabha president Swami Omji Maharaj, one of the panelists in Rahul Kanwal's primetime show ...

**Reference Summary:** Sel