In [1]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=721ebab843398d0e97dbf6597f3ad87b5be1588095116e52eec18556b937f012
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
import kagglehub
import pandas as pd
import numpy as np
import os
import re
import string
import nltk
nltk.download('punkt_tab')
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
#LOAD DATASET (BBC NEWS SUMMARY)

path = kagglehub.dataset_download("pariza/bbc-news-summary")
print("Dataset downloaded:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/pariza/bbc-news-summary?dataset_version_number=2...


100%|██████████| 8.91M/8.91M [00:00<00:00, 11.5MB/s]

Extracting files...





Dataset downloaded: /root/.cache/kagglehub/datasets/pariza/bbc-news-summary/versions/2


In [5]:
import os
import pandas as pd

articles_dir = os.path.join(path, "BBC News Summary/News Articles")
summaries_dir = os.path.join(path, "BBC News Summary/Summaries")

data = []

for category in os.listdir(articles_dir):
    article_folder = os.path.join(articles_dir, category)
    summary_folder = os.path.join(summaries_dir, category)

    for filename in os.listdir(article_folder):
        article_path = os.path.join(article_folder, filename)
        summary_path = os.path.join(summary_folder, filename)

        try:
            with open(article_path, 'r', encoding='utf-8') as f:
                full_text = f.read().strip()
        except UnicodeDecodeError:
            with open(article_path, 'r', encoding='latin-1') as f:
                full_text = f.read().strip()

        # Split first line as title and remaining as article
        lines = full_text.split("\n", 1)
        title = lines[0].strip()
        article = lines[1].strip() if len(lines) > 1 else ""

        try:
            with open(summary_path, 'r', encoding='utf-8') as f:
                summary = f.read().strip()
        except UnicodeDecodeError:
            with open(summary_path, 'r', encoding='latin-1') as f:
                summary = f.read().strip()

        data.append({
            "category": category,
            "title": title,
            "article": article,
            "summary": summary
        })

# Convert to DataFrame
df = pd.DataFrame(data)

print(f"Dataset loaded successfully! Total Samples: {len(df)}")
print("\nSamples per category:\n", df["category"].value_counts())

# --- Display few examples ---
for i in range(2):
    print(f"\n[{i+1}]  Category: {df['category'][i]}")
    print(f" Title: {df['title'][i]}")
    print(f"\n Article:\n{df['article'][i][:400]}...")
    print(f"\n Summary:\n{df['summary'][i]}")
    print("-"*120)


Dataset loaded successfully! Total Samples: 2225

Samples per category:
 category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

[1]  Category: entertainment
 Title: Versace art portfolio up for sale

 Article:
The art collection of murdered fashion designer Gianni Versace could fetch up to £9m ($17m) when it is auctioned in New York and London later this year.

Among the pictures for sale are works by Roy Lichtenstein, Andy Warhol and Henri Matisse. The collection was housed at Versace's six-storey New York townhouse. The 51-year-old designer was shot outside his Florida home in 1997 by suspected serial...

 Summary:
Much of the collection will be offered for sale at three auctions in New York in June, with smaller contemporary paintings going under the hammer in London on 22 and 23 June.The collection was housed at Versace's six-storey New York townhouse.The art collection of murdered fashion designe

In [6]:
# DATA CLEANING (TEXT PREPROCESSING)

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    words = [w for w in text.split() if w not in stop_words]
    return ' '.join(words)

print("\nCleaning dataset...")
df["clean_article"] = df["article"].apply(clean_text)
df["clean_summary"] = df["summary"].apply(clean_text)
print("Cleaning done!")


Cleaning dataset...
Cleaning done!


In [7]:
#  Display a few cleaned samples
print("\n Preview of Cleaned Data:\n")
for i in range(1):
    print(f"Category: {df['category'][i]}")
    print(f"Original Article (first 200 chars):\n{df['article'][i][:200]}...\n")
    print(f"Cleaned Article:\n{df['clean_article'][i][:200]}...\n")
    print(f"Original Summary:\n{df['summary'][i]}\n")
    print(f"Cleaned Summary:\n{df['clean_summary'][i]}\n")


 Preview of Cleaned Data:

Category: entertainment
Original Article (first 200 chars):
The art collection of murdered fashion designer Gianni Versace could fetch up to £9m ($17m) when it is auctioned in New York and London later this year.

Among the pictures for sale are works by Roy L...

Cleaned Article:
art collection murdered fashion designer gianni versace could fetch £m auctioned new york london later year among pictures sale works roy lichtenstein andy warhol henri matisse collection housed versa...

Original Summary:
Much of the collection will be offered for sale at three auctions in New York in June, with smaller contemporary paintings going under the hammer in London on 22 and 23 June.The collection was housed at Versace's six-storey New York townhouse.The art collection of murdered fashion designer Gianni Versace could fetch up to £9m ($17m) when it is auctioned in New York and London later this year.Among the pictures for sale are works by Roy Lichtenstein, Andy Warhol a

In [8]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 1780, Test size: 445


Extractive Summarization

In [9]:
# TRAIN: TF-IDF VECTORIZER + TEXTRANK
print("\n Training TF-IDF model on training data...")
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(train_df["clean_article"])
print(" TF-IDF trained!")

def custom_textrank_summary(text, vectorizer, top_n=3):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= top_n:
        return text
    tfidf_matrix = vectorizer.transform(sentences)
    sim_matrix = cosine_similarity(tfidf_matrix)
    n = len(sentences)
    scores = np.ones(n)
    damping = 0.85
    threshold = 0.0001

    for _ in range(100):
        new_scores = (1 - damping) + damping * sim_matrix.dot(scores) / np.sum(sim_matrix, axis=1)
        if np.sum(np.abs(new_scores - scores)) < threshold:
            break
        scores = new_scores

    ranked_sentences = [sentences[i] for i in np.argsort(scores)[::-1]]
    summary = " ".join(ranked_sentences[:top_n])
    return summary


 Training TF-IDF model on training data...
 TF-IDF trained!


In [12]:
train_df = test_df.copy()
train_df["generated_summary"] = test_df["clean_article"].apply(
    lambda x: custom_textrank_summary(x, tfidf_vectorizer, top_n=3)
)

In [13]:
# EVALUATION
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def cosine_sim_score(t1, t2):
    tfidf = tfidf_vectorizer.transform([t1, t2])
    return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]

train_cos, train_r1, train_rL = [], [], []
for _, row in train_df.iterrows():
    ref, gen = row["clean_summary"], row["generated_summary"]
    train_cos.append(cosine_sim_score(ref, gen))
    r = scorer.score(ref, gen)
    train_r1.append(r['rouge1'].fmeasure)
    train_rL.append(r['rougeL'].fmeasure)

print(f"\n TRAIN RESULTS:")
print(f"Avg Cosine Similarity: {np.mean(train_cos):.4f}")
print(f"Avg ROUGE-1: {np.mean(train_r1):.4f}")
print(f"Avg ROUGE-L: {np.mean(train_rL):.4f}")


 TRAIN RESULTS:
Avg Cosine Similarity: 0.7817
Avg ROUGE-1: 0.5768
Avg ROUGE-L: 0.3572


In [15]:
# SAMPLE TESTING

import random
sample_idx = random.randint(0, len(test_df) - 1)
sample = test_df.iloc[sample_idx]
sample_cosine = cosine_sim_score(sample["clean_summary"], sample["generated_summary"])

print(" TEST SAMPLE\n")
print(f"Category: {sample['category']}")
print(f"\nOriginal Article:\n{sample['article'][:700]}...")
print("\n--------------------------------------------")
print(f"Human-Written Summary (Dataset):\n{sample['summary']}")
print("\n--------------------------------------------")
print(f"Generated Summary (TF-IDF + TextRank):\n{sample['generated_summary']}")
print("\n--------------------------------------------")
print(f"Cosine Similarity for this sample: {sample_cosine:.4f}")


 TEST SAMPLE

Category: tech

Original Article:
BT is starting its push into television with plans to offer TV over broadband.

As a telecoms company, BT is moving to a content distribution strategy, Andrew Burke, chief of BT's new Entertainment unit told the IPTV World Forum. "We want to be an entertainment facilitator," he said on the opening day of the London conference. The BBC is also trialling a service to play programmes over the net and has not ruled out offering it to non-licence fee payers overseas. The corporation's Interactive Media Player (iMP) is its first foray into broadband TV - known as IPTV (Internet Protocol TV).

"We see several opportunities for delivering the type of content that normally broadcasters find it diffic...

--------------------------------------------
Human-Written Summary (Dataset):
With a broadband net subscription, you can also get your TV and phone service.The BBC recognises that TV over broadband is a reality and aims to innovate with it, said R

Title Generation Using TF-IDF Keywords

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [17]:
# TITLE GENERATION
print("\n Generating Smart Titles using TF-IDF keywords...")

# Reuse the TF-IDF vectorizer trained on article corpus
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf.fit(train_df["clean_article"])

# Extract top keywords for each article
def get_keywords(text, vectorizer, top_n=3):
    tfidf_matrix = vectorizer.transform([text])
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    top_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[i] for i in top_indices]

def generate_title(article, category, vectorizer):
    keywords = get_keywords(article, vectorizer, top_n=3)
    title = f"{category.capitalize()}: {' '.join(keywords)}"
    return title

# Generate titles for test set
test_df = test_df.copy()
test_df["generated_title"] = test_df.apply(
    lambda row: generate_title(row["clean_article"], row["category"], tfidf),
    axis=1
)

print(" Titles generated successfully!")


 Generating Smart Titles using TF-IDF keywords...
 Titles generated successfully!


In [20]:
# EVALUATING
print("\n Evaluating Generated Titles...")

smooth = SmoothingFunction().method1
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

bleu_scores, rougeL_scores = [], []

for _, row in test_df.iterrows():
    ref_title = row["title"]
    gen_title = row["generated_title"]

    ref_tokens = ref_title.lower().split()
    gen_tokens = gen_title.lower().split()

    bleu = sentence_bleu([ref_tokens], gen_tokens, smoothing_function=smooth)
    rouge = scorer.score(ref_title.lower(), gen_title.lower())["rougeL"].fmeasure

    bleu_scores.append(bleu)
    rougeL_scores.append(rouge)

print(f"\n Avg BLEU Score: {np.mean(bleu_scores):.4f}")
print(f" Avg ROUGE-L Score: {np.mean(rougeL_scores):.4f}")


 Evaluating Generated Titles...

 Avg BLEU Score: 0.0412
 Avg ROUGE-L Score: 0.1847


In [32]:
# SAMPLE TESTING
import random
idx = random.randint(0, len(test_df)-1)
sample = test_df.iloc[idx]

print("\n SAMPLE TITLE GENERATION\n")
print(f"Category: {sample['category']}")
print(f"\nOriginal Title:\n{sample['title']}")
print(f"\nGenerated Title:\n{sample['generated_title']}")
print("\n--------------------------------------------")
print(f"Article Snippet:\n{sample['article'][:300]}...")
print("\n--------------------------------------------")
print(f"ROUGE-L for this sample: {scorer.score(sample['title'], sample['generated_title'])['rougeL'].fmeasure:.4f}")


 SAMPLE TITLE GENERATION

Category: business

Original Title:
Jarvis sells Tube stake to Spain

Generated Title:
Business: jarvis lines stake

--------------------------------------------
Article Snippet:
Shares in engineering group Jarvis have soared more than 16% on news that it is offloading its stake in London underground consortium Tube Lines.

The sale of the 33% stake to Spain's Ferrovial for £146m ($281m) is a lifeline to Jarvis, which was weighed down by debts of more than £230m. The company...

--------------------------------------------
ROUGE-L for this sample: 0.4000


Title Generation using Tranformer model T5-Small

In [33]:
!pip install transformers rouge-score -q

In [51]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [52]:
df["clean_title"] = df["title"].apply(clean_text)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [54]:
# LOAD T5-SMALL MODEL
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Encode input (article) and output (title)
def encode_batch(texts, targets, tokenizer, max_input_len=512, max_output_len=32):
    inputs = tokenizer(
        ["summarize: " + text for text in texts],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_len
    )
    labels = tokenizer(
        targets,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_output_len
    )
    return inputs, labels

train_inputs, train_labels = encode_batch(
    train_df["clean_article"].tolist(),
    train_df["clean_title"].tolist(),
    tokenizer
)

# Move tensors to device
train_inputs = {k: v.to(device) for k, v in train_inputs.items()}
train_labels = train_labels["input_ids"].to(device)

# Training hyperparameters
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
epochs = 15
batch_size = 4

In [55]:
model.train()
print("\n Training T5 for Title Generation...")
for epoch in range(epochs):
    epoch_loss = []
    for i in tqdm(range(0, len(train_labels), batch_size)):
        input_batch = {k: v[i:i+batch_size] for k, v in train_inputs.items()}
        label_batch = train_labels[i:i+batch_size]
        outputs = model(**input_batch, labels=label_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        epoch_loss.append(loss.item())
    print(f"Epoch {epoch+1}/{epochs} — Avg Loss: {np.mean(epoch_loss):.4f}")


 Training T5 for Title Generation...


100%|██████████| 445/445 [01:07<00:00,  6.62it/s]


Epoch 1/15 — Avg Loss: 2.6830


100%|██████████| 445/445 [01:14<00:00,  6.00it/s]


Epoch 2/15 — Avg Loss: 1.5171


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]


Epoch 3/15 — Avg Loss: 1.4019


100%|██████████| 445/445 [01:10<00:00,  6.27it/s]


Epoch 4/15 — Avg Loss: 1.3254


100%|██████████| 445/445 [01:10<00:00,  6.27it/s]


Epoch 5/15 — Avg Loss: 1.2754


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]


Epoch 6/15 — Avg Loss: 1.2229


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]


Epoch 7/15 — Avg Loss: 1.1739


100%|██████████| 445/445 [01:10<00:00,  6.27it/s]


Epoch 8/15 — Avg Loss: 1.1384


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]


Epoch 9/15 — Avg Loss: 1.0975


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]


Epoch 10/15 — Avg Loss: 1.0591


100%|██████████| 445/445 [01:10<00:00,  6.27it/s]


Epoch 11/15 — Avg Loss: 1.0185


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]


Epoch 12/15 — Avg Loss: 0.9846


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]


Epoch 13/15 — Avg Loss: 0.9551


100%|██████████| 445/445 [01:10<00:00,  6.29it/s]


Epoch 14/15 — Avg Loss: 0.9198


100%|██████████| 445/445 [01:10<00:00,  6.28it/s]

Epoch 15/15 — Avg Loss: 0.9056





In [56]:
# EVALUATION
model.eval()
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
smooth = SmoothingFunction().method1

bleu_scores, rougeL_scores = [], []

for i in tqdm(range(len(test_df))):
    article = test_df.iloc[i]["clean_article"]
    ref_title = test_df.iloc[i]["clean_title"]

    # Generate title
    inputs = tokenizer("summarize: " + article, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    gen_ids = model.generate(**inputs, max_length=32, num_beams=4)
    gen_title = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

    # BLEU
    bleu = sentence_bleu([ref_title.split()], gen_title.split(), smoothing_function=smooth)
    # ROUGE-L
    rouge = scorer.score(ref_title, gen_title)["rougeL"].fmeasure

    bleu_scores.append(bleu)
    rougeL_scores.append(rouge)

100%|██████████| 445/445 [01:05<00:00,  6.82it/s]


In [57]:
print("\n Title Generation Results")
print(f" Avg BLEU Score: {np.mean(bleu_scores):.4f}")
print(f" Avg ROUGE-L Score: {np.mean(rougeL_scores):.4f}")


 Title Generation Results
 Avg BLEU Score: 0.0842
 Avg ROUGE-L Score: 0.3416


In [66]:
# TESTING SAMPLE
import random
idx = random.randint(0, len(test_df)-1)
sample = test_df.iloc[idx]

inputs = tokenizer("summarize: " + sample["clean_article"], return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
gen_ids = model.generate(**inputs, max_length=32, num_beams=4)
generated_title = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

print("\n SAMPLE TITLE GENERATION")
print(f"Category: {sample['category']}")
print(f"\nOriginal Title:\n{sample['title']}")
print(f"\nGenerated Title:\n{generated_title}")
print("\nArticle Snippet:\n", sample["article"][:300], "...")
print(f"\nROUGE-L for this sample: {scorer.score(sample['clean_title'], generated_title)['rougeL'].fmeasure:.4f}")



 SAMPLE TITLE GENERATION
Category: sport

Original Title:
Dementieva prevails in Hong Kong

Generated Title:
dementieva wins hong kongs title

Article Snippet:
 Elena Dementieva swept aside defending champion Venus Williams 6-3 6-2 to win Hong Kong's Champions Challenge event.

The Russian, ranked sixth in the world, broke Williams three times in the first set, while losing her service once. Williams saved three championship points before losing the match a ...

ROUGE-L for this sample: 0.6667


Complete Testing

In [75]:
# Example input (you can replace this with any article)
original_title = "Serena Williams Wins Seventh Wimbledon Title"
custom_article = """
Serena Williams claimed her seventh Wimbledon title and 22nd Grand Slam overall after defeating Angelique Kerber in straight sets.
With this win, Williams equals Steffi Graf’s record and cements her status as one of the greatest tennis players of all time.
"""

# ---- Clean the input ----
clean_custom_article = clean_text(custom_article)

# ---- Generate Summary using TF-IDF + TextRank ----
generated_summary = custom_textrank_summary(clean_custom_article, tfidf_vectorizer, top_n=3)

# ---- Generate Title using Fine-tuned T5 ----
inputs = tokenizer(
    "summarize: " + clean_custom_article,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512
).to(device)

gen_ids = model.generate(**inputs, max_length=32, num_beams=6)
generated_title = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

# ---- Compute Evaluation Metrics ----
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# BLEU
bleu_score = sentence_bleu(
    [original_title.lower().split()],
    generated_title.lower().split(),
    smoothing_function=SmoothingFunction().method1
)

# ROUGE
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
scores = scorer.score(original_title.lower(), generated_title.lower())
rouge_L = scores['rougeL'].fmeasure

# Cosine Similarity
vectorizer = TfidfVectorizer().fit([original_title.lower(), generated_title.lower()])
vectors = vectorizer.transform([original_title.lower(), generated_title.lower()])
cosine_sim = cosine_similarity(vectors[0], vectors[1])[0][0]

# ---- Display Results ----
print("\nORIGINAL ARTICLE:\n", custom_article.strip())
print("\nORIGINAL TITLE:\n", original_title.strip())
print("\nGENERATED SUMMARY:\n", generated_summary.strip())
print("\nGENERATED TITLE:\n", generated_title.strip())

print("\nEVALUATION RESULTS:")
print(f"BLEU Score        : {bleu_score:.4f}")
print(f"ROUGE-L (F1)      : {rouge_L:.4f}")
print(f"Cosine Similarity : {cosine_sim:.4f}")



ORIGINAL ARTICLE:
 Serena Williams claimed her seventh Wimbledon title and 22nd Grand Slam overall after defeating Angelique Kerber in straight sets.
With this win, Williams equals Steffi Graf’s record and cements her status as one of the greatest tennis players of all time.

ORIGINAL TITLE:
 Serena Williams Wins Seventh Wimbledon Title

GENERATED SUMMARY:
 serena williams claimed seventh wimbledon title nd grand slam overall defeating angelique kerber straight sets win williams equals steffi graf’s record cements status one greatest tennis players time

GENERATED TITLE:
 williams wins wimbledon title

EVALUATION RESULTS:
BLEU Score        : 0.1457
ROUGE-L (F1)      : 0.8000
Cosine Similarity : 0.7093
