In [1]:
# Colab cell 1: installs (run once)
!pip install -q nltk scikit-learn rake-nltk yake networkx sentence-transformers

# Note: sentence-transformers will download a model (~50-100MB) on first use.


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.7/80.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/360.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.5/360.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Colab cell 2: imports + nltk downloads
import re
import os
from typing import List, Tuple

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx

from rake_nltk import Rake
import yake

# optional embeddings
from sentence_transformers import SentenceTransformer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


2) Helper functions — preprocessing & utilities

In [3]:
# Colab cell 3: helpers
# English stop-word set loaded from the NLTK corpus; `rake_keywords` hands it to RAKE.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """Normalise raw text before sentence splitting or keyword extraction.

    Keeps ASCII letters, digits, whitespace and the punctuation ``. , ; : ! ? ' " ( ) -``;
    every other character is removed. Runs of whitespace (including newlines and
    tabs) are then collapsed to a single space and the ends are trimmed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, single-line string.
    """
    # Strip disallowed characters *before* collapsing whitespace: removing a
    # character that sits between two spaces (e.g. an em dash) would otherwise
    # leave a double space, and one at the start would leave a leading space.
    text = re.sub(r'[^0-9A-Za-z.,;:!?\'\"()\-\s]+', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def sentences_from_text(text: str) -> List[str]:
    """Clean ``text``, split it into sentences and drop very short fragments.

    A sentence is kept only when its stripped length is greater than 20 characters.
    """
    kept: List[str] = []
    for raw_sentence in sent_tokenize(clean_text(text)):
        candidate = raw_sentence.strip()
        if len(candidate) > 20:
            kept.append(candidate)
    return kept


**3) TF–IDF sentence-scoring summarizer**

In [5]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Colab cell 4: TF-IDF summarizer
def tfidf_sentence_summary(text: str, n_sentences: int = 3, min_df: int = 1) -> Tuple[str, List[int]]:
    """Extractive summary that keeps the sentences with the largest summed TF-IDF weight.

    Each sentence is treated as a document for a uni-/bi-gram ``TfidfVectorizer``;
    a sentence's score is the sum of its TF-IDF weights.

    Args:
        text: Source document.
        n_sentences: Maximum number of sentences to keep. Values <= 0 yield an
            empty summary.
        min_df: Passed through to ``TfidfVectorizer(min_df=...)``.

    Returns:
        ``(summary, indices)`` where ``indices`` are the positions (plain ``int``,
        in document order) of the selected sentences within the sentence list
        and ``summary`` is those sentences joined by a single space.

    Note:
        Errors raised by the vectorizer itself (for example when no vocabulary
        can be built from the sentences) are not caught here.
    """
    sents = sentences_from_text(text)
    # `scores[-0:]` is the whole array, so a non-positive request must
    # short-circuit instead of silently returning every sentence.
    if not sents or n_sentences <= 0:
        return "", []

    vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', min_df=min_df)
    X = vect.fit_transform(sents)  # shape: (number of sentences, n_features)
    scores = np.asarray(X.sum(axis=1)).ravel()
    top_idx = np.argsort(scores)[-n_sentences:]
    # Restore document order and convert numpy integers to plain ints so the
    # return value matches the annotation.
    top_idx_sorted = sorted(int(i) for i in top_idx)
    summary = " ".join(sents[i] for i in top_idx_sorted)
    return summary, top_idx_sorted

# Quick smoke test. `sample` is a short three-sentence paragraph that the
# later demo cells in this notebook reuse as their input.
sample = ("Natural language processing (NLP) is a subfield of linguistics, computer science and artificial intelligence "
          "concerned with the interactions between computers and human language, in particular how to program computers "
          "to process and analyze large amounts of natural language data. The result is a computer capable of 'understanding' "
          "the contents of documents, including the contextual nuances of the language within them. The technology can then accurately "
          "extract information and insights contained in the documents as well as categorize and organize the documents themselves.")
# Index [0] selects the summary string from the (summary, indices) tuple.
print(tfidf_sentence_summary(sample, n_sentences=2)[0])


Natural language processing (NLP) is a subfield of linguistics, computer science and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The result is a computer capable of 'understanding' the contents of documents, including the contextual nuances of the language within them.


**4) TextRank summarizer (graph-based)**

In [7]:
# Colab cell 5: TextRank summarizer
from sklearn.metrics.pairwise import cosine_similarity

def textrank_summary(text: str, n_sentences: int = 3) -> Tuple[str, List[int]]:
    """Graph-based extractive summary (TextRank-style).

    Sentences are vectorised with uni-/bi-gram TF-IDF, their pairwise cosine
    similarities become the edge weights of a graph, and PageRank scores on
    that graph rank the sentences.

    Args:
        text: Source document.
        n_sentences: Maximum number of sentences to keep. Values <= 0 yield an
            empty summary.

    Returns:
        ``(summary, indices)`` with the selected sentence positions in document
        order and the corresponding sentences joined by a single space.
    """
    sents = sentences_from_text(text)
    # A negative count would turn `ranked[:n_sentences]` into "all but the last
    # k", so treat every non-positive request as "no sentences".
    if not sents or n_sentences <= 0:
        return "", []

    vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    X = vect.fit_transform(sents)
    sim_mat = cosine_similarity(X)
    # build graph and rank
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    # Highest score first; ties are broken by the larger sentence index.
    ranked = sorted(scores, key=lambda i: (scores[i], i), reverse=True)
    top_idx_sorted = sorted(int(i) for i in ranked[:n_sentences])
    summary = " ".join(sents[i] for i in top_idx_sorted)
    return summary, top_idx_sorted

# Quick smoke test on the shared `sample` paragraph; [0] is the summary string.
print(textrank_summary(sample, n_sentences=2)[0])


Natural language processing (NLP) is a subfield of linguistics, computer science and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The result is a computer capable of 'understanding' the contents of documents, including the contextual nuances of the language within them.


**5) Keyword extraction methods**

In [8]:
# Colab cell 6: RAKE keywords
def rake_keywords(text: str, top_n: int = 10) -> List[Tuple[str, float]]:
    """Extract key phrases with RAKE.

    RAKE reports ``(score, phrase)`` pairs, higher score meaning more
    important; they are flipped to ``(phrase, score)`` here and truncated to
    the first ``top_n`` entries.
    """
    extractor = Rake(stopwords=STOPWORDS)
    extractor.extract_keywords_from_text(text)
    scored_phrases = extractor.get_ranked_phrases_with_scores()
    flipped = [(phrase, score) for score, phrase in scored_phrases]
    return flipped[:top_n]

# YAKE
def yake_keywords(text: str, top_n: int = 10, max_ngram_size: int = 3) -> List[Tuple[str, float]]:
    """Extract keywords with YAKE.

    Returns the extractor's ``(keyword, score)`` list unchanged. Unlike the
    RAKE and TF-IDF helpers, a *lower* YAKE score means a better keyword.
    """
    extractor = yake.KeywordExtractor(n=max_ngram_size, top=top_n)
    return extractor.extract_keywords(text)

# TF-IDF top terms (document-level)
def tfidf_top_terms(text: str, top_n: int = 10) -> List[Tuple[str, float]]:
    """Return the ``top_n`` highest-weighted uni-/bi-grams of a single document.

    The vectorizer is fitted on ``text`` alone, so this is a within-document
    ranking rather than a comparison against a corpus.

    Args:
        text: Document to analyse.
        top_n: Maximum number of terms to return. Values <= 0 yield ``[]``.

    Returns:
        ``(term, weight)`` pairs as plain ``str`` / ``float``, highest weight
        first. Blank input yields ``[]``.
    """
    # `weights[-0:]` would return every term, and blank text would only make the
    # vectorizer fail, so both cases return early.
    if top_n <= 0 or not text or not text.strip():
        return []

    vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    weights = vect.fit_transform([text]).toarray().ravel()
    terms = vect.get_feature_names_out()
    order = np.argsort(weights)[-top_n:][::-1]
    # Convert numpy scalars to builtins so printed results stay readable.
    return [(str(terms[i]), float(weights[i])) for i in order]

# Quick smoke test of the three keyword extractors on the shared `sample` text.
# Score direction differs: higher is better for RAKE and TF-IDF, lower for YAKE.
print("RAKE:", rake_keywords(sample, 5))
print("YAKE:", yake_keywords(sample, 5))
print("TF-IDF terms:", tfidf_top_terms(sample, 5))


RAKE: [('artificial intelligence concerned', 9.0), ('analyze large amounts', 9.0), ('accurately extract information', 9.0), ('natural language processing', 8.5), ('natural language data', 8.5)]
YAKE: [('artificial intelligence concerned', np.float64(0.003660303042437319)), ('analyze large amounts', np.float64(0.003660303042437319)), ('Natural language processing', np.float64(0.0036830307857007253)), ('natural language data', np.float64(0.0036830307857007253)), ('Natural language', np.float64(0.01407724257127509))]
TF-IDF terms: [('language', np.float64(0.3730019232961255)), ('documents', np.float64(0.27975144247209416)), ('natural', np.float64(0.18650096164806276)), ('natural language', np.float64(0.18650096164806276)), ('computer', np.float64(0.18650096164806276))]


**6) Embeddings-based semantic extractive summary (better semantic capture)**

In [10]:
# Colab cell 7: embeddings-based summary using SentenceTransformer
# SentenceTransformer instances keyed by model name, so repeated calls (for
# example from a batch loop) do not rebuild the model every time.
_SENTENCE_MODEL_CACHE = {}


def embeddings_summary(text: str, n_sentences: int = 3, model_name: str = 'all-MiniLM-L6-v2') -> Tuple[str, List[int]]:
    """Extractive summary that keeps the sentences most aligned with the document centroid.

    Every sentence is embedded, the mean embedding is taken as the document
    centroid, and sentences are ranked by their dot product with that centroid.

    Args:
        text: Source document.
        n_sentences: Maximum number of sentences to keep. Values <= 0 yield an
            empty summary.
        model_name: Name passed to ``SentenceTransformer``; each distinct name
            is loaded once and then reused.

    Returns:
        ``(summary, indices)`` with the selected sentence positions (plain
        ``int``, document order) and those sentences joined by a single space.

    Note:
        The score is a raw dot product, not a distance. NOTE(review): it ranks
        like cosine similarity only if the model returns unit-length
        embeddings -- confirm for any non-default ``model_name``.
    """
    sents = sentences_from_text(text)
    # `sim[-0:]` would select every sentence, so bail out on non-positive counts.
    if not sents or n_sentences <= 0:
        return "", []

    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)  # downloads model if needed
        _SENTENCE_MODEL_CACHE[model_name] = model

    embeddings = model.encode(sents, convert_to_numpy=True, show_progress_bar=False)
    centroid = embeddings.mean(axis=0)
    sim = embeddings @ centroid
    top_idx = np.argsort(sim)[-n_sentences:]
    top_idx_sorted = sorted(int(i) for i in top_idx)
    summary = " ".join(sents[i] for i in top_idx_sorted)
    return summary, top_idx_sorted

# Quick smoke test on the shared `sample` text (the first run downloads the model).
summary, idxs = embeddings_summary(sample, n_sentences=2)
print(summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The result is a computer capable of 'understanding' the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.


**7) Putting it all together — pipeline function**

In [11]:
# Colab cell 8: pipeline wrapper
def summarize_and_extract_keywords(text: str,
                                   summary_type: str = 'tfidf',  # options: 'tfidf','textrank','embed'
                                   n_sentences: int = 3,
                                   keyword_method: str = 'rake',  # options: 'rake','yake','tfidf'
                                   top_k_keywords: int = 10,
                                   embed_model: str = 'all-MiniLM-L6-v2') -> dict:
    """Run one summarizer and one keyword extractor over ``text``.

    Args:
        text: Raw document; it is passed through ``clean_text`` first.
        summary_type: ``'tfidf'``, ``'textrank'`` or ``'embed'``.
        n_sentences: Sentence budget forwarded to the summarizer.
        keyword_method: ``'rake'``, ``'yake'`` or ``'tfidf'``.
        top_k_keywords: Number of keywords requested from the extractor.
        embed_model: Model name, used only when ``summary_type == 'embed'``.

    Returns:
        Dict with keys ``original_text`` (the cleaned text), ``summary``,
        ``summary_sentence_indices`` and ``keywords``.

    Raises:
        ValueError: For an unknown ``summary_type`` or ``keyword_method``. The
            keyword method is only checked after the summary has been computed.
    """
    cleaned = clean_text(text)

    def _summarize():
        if summary_type == 'tfidf':
            return tfidf_sentence_summary(cleaned, n_sentences=n_sentences)
        if summary_type == 'textrank':
            return textrank_summary(cleaned, n_sentences=n_sentences)
        if summary_type == 'embed':
            return embeddings_summary(cleaned, n_sentences=n_sentences, model_name=embed_model)
        raise ValueError('Unknown summary_type')

    def _keywords():
        if keyword_method == 'rake':
            return rake_keywords(cleaned, top_n=top_k_keywords)
        if keyword_method == 'yake':
            return yake_keywords(cleaned, top_n=top_k_keywords)
        if keyword_method == 'tfidf':
            return tfidf_top_terms(cleaned, top_n=top_k_keywords)
        raise ValueError('Unknown keyword_method')

    summary, idxs = _summarize()
    return {
        'original_text': cleaned,
        'summary': summary,
        'summary_sentence_indices': idxs,
        'keywords': _keywords(),
    }

# Demo on the shared `sample` text with the default sentence budget (3).
demo = summarize_and_extract_keywords(sample, summary_type='tfidf', keyword_method='rake')
print("Summary:\n", demo['summary'])
print("\nTop keywords (RAKE):", demo['keywords'])


Summary:
 Natural language processing (NLP) is a subfield of linguistics, computer science and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The result is a computer capable of 'understanding' the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.

Top keywords (RAKE): [('artificial intelligence concerned', 9.0), ('analyze large amounts', 9.0), ('accurately extract information', 9.0), ('natural language processing', 8.5), ('natural language data', 8.5), ('language within', 4.5), ('human language', 4.5), ('insights contained', 4.0), ('contextual nuances', 4.0), ('computer science', 4.0)]


In [13]:
# Colab cell: installs for evaluation
!pip install -q rouge-score sentence-transformers scikit-learn


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


# 8) Simple UI to check the results

In [12]:
# Colab cell 10: simple interactive cell (input prompt)
from IPython.display import display
import ipywidgets as widgets

# Free-text input for the document to analyse.
# NOTE(review): the instructional sentence is set as `value` (not only as
# `placeholder`), so it is the text that gets processed if Run is clicked
# before anything is pasted.
text_area = widgets.Textarea(
    value='Paste your article or text here...',
    placeholder='Type something',
    description='Text:',
    disabled=False,
    layout=widgets.Layout(width='100%', height='200px')
)

# Option lists mirror the values accepted by summarize_and_extract_keywords.
summary_type_dd = widgets.Dropdown(options=['tfidf','textrank','embed'], value='tfidf', description='Summary:')
keyword_dd = widgets.Dropdown(options=['rake','yake','tfidf'], value='rake', description='Keywords:')
n_sent_slider = widgets.IntSlider(value=3, min=1, max=8, description='Sentences:')

button = widgets.Button(description="Run")
# Output area that the click handler prints into.
out = widgets.Output()

def on_button_clicked(b):
    """Click handler: run the pipeline with the current widget values and print the result.

    Everything is printed inside the ``out`` widget, which is cleared first.
    At most the first 20 keywords are listed, numbered from 1.
    """
    with out:
        out.clear_output()
        res = summarize_and_extract_keywords(
            text_area.value,
            summary_type=summary_type_dd.value,
            n_sentences=n_sent_slider.value,
            keyword_method=keyword_dd.value,
        )
        print("=== Summary ===")
        print(res['summary'])
        print("\n=== Keywords ===")
        kws = res['keywords']
        if not isinstance(kws, list):
            print(kws)
            return
        # print nicely
        for rank, item in enumerate(kws[:20], start=1):
            print(f"{rank}. {item}")

# Register the handler, then render the controls followed by the output area.
button.on_click(on_button_clicked)
display(text_area, summary_type_dd, keyword_dd, n_sent_slider, button, out)


Textarea(value='Paste your article or text here...', description='Text:', layout=Layout(height='200px', width=…

Dropdown(description='Summary:', options=('tfidf', 'textrank', 'embed'), value='tfidf')

Dropdown(description='Keywords:', options=('rake', 'yake', 'tfidf'), value='rake')

IntSlider(value=3, description='Sentences:', max=8, min=1)

Button(description='Run', style=ButtonStyle())

Output()

## 9) Evaluation Metrics

In [14]:
# STEP 9.1 — Import evaluation tools and define the metric helpers
!pip install -q rouge-score sentence-transformers

from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def compute_rouge(system_summary: str, reference_summary: str):
    """Score ``system_summary`` against ``reference_summary`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is enabled. The reference is passed to the scorer first and the
    system summary second.

    Returns:
        ``{metric: {'precision': ..., 'recall': ..., 'fmeasure': ...}}`` for
        each of ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    return {
        metric: {
            'precision': score.precision,
            'recall': score.recall,
            'fmeasure': score.fmeasure,
        }
        for metric, score in scorer.score(reference_summary, system_summary).items()
    }

# Embedding similarity
# Lazily created SentenceTransformer and the model name it was built from.
_embed_model = None
_embed_model_name = None


def summary_cosine_similarity(system_summary: str, reference_summary: str, model_name='all-MiniLM-L6-v2'):
    """Cosine similarity between the embeddings of two summaries.

    Args:
        system_summary: Generated summary.
        reference_summary: Gold/reference summary.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The similarity as a plain ``float``.
    """
    global _embed_model, _embed_model_name
    # Reload when a different model is requested; caching on "is None" alone
    # would keep answering with whichever model happened to be loaded first.
    if _embed_model is None or _embed_model_name != model_name:
        _embed_model = SentenceTransformer(model_name)
        _embed_model_name = model_name

    emb_sys = _embed_model.encode(system_summary, convert_to_numpy=True)
    emb_ref = _embed_model.encode(reference_summary, convert_to_numpy=True)
    return float(cosine_similarity([emb_sys], [emb_ref])[0][0])

# Keyword evaluation
def norm(k):
    """Lower-case ``k``, drop everything except a-z, 0-9 and whitespace, then trim the ends."""
    return re.sub(r'[^a-z0-9\s]', '', k.lower()).strip()

def keyword_precision_recall_f1(pred_kw, gold_kw):
    """Exact-match precision, recall and F1 between two keyword collections.

    Both collections are normalised with ``norm`` and de-duplicated before
    comparison. Any ratio whose denominator is zero is reported as 0.
    """
    predicted = {norm(k) for k in pred_kw}
    expected = {norm(k) for k in gold_kw}

    hits = len(predicted & expected)
    spurious = len(predicted - expected)
    missed = len(expected - predicted)

    def _ratio(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0

    precision = _ratio(hits, hits + spurious)
    recall = _ratio(hits, hits + missed)
    f1 = _ratio(2*precision*recall, precision + recall)

    return {"precision": precision, "recall": recall, "f1": f1}


In [15]:
# Hand-written evaluation fixture: a source paragraph, a reference summary and
# reference keywords. The evaluation cells that follow score the pipeline's
# output against these.
text = """
Artificial intelligence (AI) is transforming industries across the world.
From healthcare diagnostics to financial forecasting, AI models are enabling
faster and more accurate decision-making. Machine learning, a subset of AI,
focuses on training algorithms using large datasets to identify patterns and
make predictions. Recent advancements in natural language processing (NLP)
have made it possible for machines to understand and generate human-like text.
Businesses are increasingly adopting AI solutions to automate processes,
reduce costs, and enhance customer experiences.
"""
reference_summary = """
Artificial intelligence is revolutionizing many industries by enabling faster
and more accurate decision-making. Advances in machine learning and NLP help
systems analyze data, make predictions, and automate business processes.
"""

# Gold keywords; keyword_precision_recall_f1 compares them after normalisation
# by exact match.
reference_keywords = [
    "artificial intelligence",
    "machine learning",
    "natural language processing",
    "automation",
    "predictions"
]


In [16]:
# Run the pipeline on the fixture text defined in the previous cell.
system_result = summarize_and_extract_keywords(
    text,
    summary_type='tfidf',   # You can change to 'textrank' or 'embed'
    n_sentences=3,
    keyword_method='rake'   # or 'yake' or 'tfidf'
)

system_summary = system_result['summary']
# Keep only the phrase from each (phrase, score) pair.
system_keywords = [k for k, score in system_result['keywords']]

print("=== SYSTEM SUMMARY ===")
print(system_summary)

print("\n=== SYSTEM KEYWORDS ===")
print(system_keywords)


=== SYSTEM SUMMARY ===
Machine learning, a subset of AI, focuses on training algorithms using large datasets to identify patterns and make predictions. Recent advancements in natural language processing (NLP) have made it possible for machines to understand and generate human-like text. Businesses are increasingly adopting AI solutions to automate processes, reduce costs, and enhance customer experiences.

=== SYSTEM KEYWORDS ===
['training algorithms using large datasets', 'increasingly adopting ai solutions', 'transforming industries across', 'natural language processing', 'enhance customer experiences', 'reduce costs', 'recent advancements', 'make predictions', 'machine learning', 'like text']


In [17]:
# ROUGE evaluation (n-gram / longest-common-subsequence overlap with the reference)
rouge_scores = compute_rouge(system_summary, reference_summary)
print("\n=== ROUGE SCORES ===")
print(rouge_scores)

# Embedding cosine similarity. Cosine similarity is bounded by [-1, 1], not
# [0, 1]; higher means the two summaries are closer in embedding space.
sim = summary_cosine_similarity(system_summary, reference_summary)
print("\n=== EMBEDDING SIMILARITY ===")
print(sim)

# Keyword precision/recall/f1 (exact match after normalisation)
kw_metrics = keyword_precision_recall_f1(system_keywords, reference_keywords)
print("\n=== KEYWORD PRECISION / RECALL / F1 ===")
print(kw_metrics)



=== ROUGE SCORES ===
{'rouge1': {'precision': 0.24074074074074073, 'recall': 0.43333333333333335, 'fmeasure': 0.30952380952380953}, 'rouge2': {'precision': 0.05660377358490566, 'recall': 0.10344827586206896, 'fmeasure': 0.07317073170731707}, 'rougeL': {'precision': 0.14814814814814814, 'recall': 0.26666666666666666, 'fmeasure': 0.19047619047619047}}

=== EMBEDDING SIMILARITY ===
0.8376084566116333

=== KEYWORD PRECISION / RECALL / F1 ===
{'precision': 0.2, 'recall': 0.4, 'f1': 0.26666666666666666}


# ***Improving summary quality***

***Quick compare TF-IDF / TextRank / Embedding summaries***

In [18]:
# Assumes your earlier functions are present:
# tfidf_sentence_summary, textrank_summary, embeddings_summary, compute_rouge, summary_cosine_similarity

def evaluate_one_method(text, ref_summary, method, n_sent=3):
    """Summarise ``text`` with one method and score the result against ``ref_summary``.

    Args:
        text: Source document.
        ref_summary: Reference summary used for ROUGE and embedding similarity.
        method: ``'tfidf'``, ``'textrank'`` or ``'embed'``.
        n_sent: Sentence budget forwarded to the summarizer.

    Returns:
        Dict with keys ``method``, ``summary``, ``rouge`` and ``embed_sim``.

    Raises:
        ValueError: If ``method`` is not one of the supported names; the
            offending value is the exception argument.
    """
    if method == 'tfidf':
        summarizer = tfidf_sentence_summary
    elif method == 'textrank':
        summarizer = textrank_summary
    elif method == 'embed':
        summarizer = embeddings_summary
    else:
        raise ValueError(method)

    sys_sum, _ = summarizer(text, n_sentences=n_sent)
    return {
        'method': method,
        'summary': sys_sum,
        'rouge': compute_rouge(sys_sum, ref_summary),
        'embed_sim': summary_cosine_similarity(sys_sum, ref_summary),
    }

# Example usage: score all three summarizers on the same fixture text and
# reference summary, then print one block per method.
methods = ['tfidf','textrank','embed']
results = [evaluate_one_method(text, reference_summary, m, n_sent=3) for m in methods]
for r in results:
    print("METHOD:", r['method'])
    print("SUMMARY:", r['summary'])
    print("EMB SIM:", r['embed_sim'])
    print("ROUGE-1 F1:", r['rouge']['rouge1']['fmeasure'])
    print("ROUGE-2 F1:", r['rouge']['rouge2']['fmeasure'])
    print("---")


METHOD: tfidf
SUMMARY: Machine learning, a subset of AI, focuses on training algorithms using large datasets to identify patterns and make predictions. Recent advancements in natural language processing (NLP) have made it possible for machines to understand and generate human-like text. Businesses are increasingly adopting AI solutions to automate processes, reduce costs, and enhance customer experiences.
EMB SIM: 0.8376084566116333
ROUGE-1 F1: 0.30952380952380953
ROUGE-2 F1: 0.07317073170731707
---
METHOD: textrank
SUMMARY: Artificial intelligence (AI) is transforming industries across the world. From healthcare diagnostics to financial forecasting, AI models are enabling faster and more accurate decision-making. Recent advancements in natural language processing (NLP) have made it possible for machines to understand and generate human-like text.
EMB SIM: 0.7594662308692932
ROUGE-1 F1: 0.4533333333333333
ROUGE-2 F1: 0.2191780821917808
---
METHOD: embed
SUMMARY: Artificial intelligence

***TF-IDF with bigger n_sentences and bigger ngrams***

In [22]:
# Try longer summaries and include trigrams
def tfidf_variant(text, ref_summary, n_sent=4, ngram=(1,3)):
    """TF-IDF sentence summary with an adjustable n-gram range, scored against a reference.

    Args:
        text: Source document.
        ref_summary: Reference summary used for scoring.
        n_sent: Maximum number of sentences to keep. Values <= 0 produce an
            empty summary.
        ngram: ``ngram_range`` forwarded to ``TfidfVectorizer``.

    Returns:
        ``(summary, rouge, emb_sim)``: the summary string, the result of
        ``compute_rouge`` and the result of ``summary_cosine_similarity``.
    """
    sents = sentences_from_text(text)
    if sents and n_sent > 0:
        vect = TfidfVectorizer(ngram_range=ngram, stop_words='english')
        scores = np.asarray(vect.fit_transform(sents).sum(axis=1)).ravel()
        # Keep the highest-scoring sentences, then restore document order.
        top_idx_sorted = sorted(int(i) for i in np.argsort(scores)[-n_sent:])
        summary = " ".join(sents[i] for i in top_idx_sorted)
    else:
        # No usable sentences (or nothing requested): skip the vectorizer,
        # which cannot be fitted on an empty list, and score an empty summary.
        summary = ""
    rouge = compute_rouge(summary, ref_summary)
    emb_sim = summary_cosine_similarity(summary, ref_summary)
    return summary, rouge, emb_sim

# Four-sentence summary with uni- to tri-grams, scored against the reference.
# NOTE(review): this binds the notebook-level names `summary`, `rouge` and
# `emb_sim`; the batch-evaluation cell further down also assigns `rouge`
# (to a scorer object) and `emb_sim`, so re-running cells out of order can
# leave these names holding a different kind of value.
summary, rouge, emb_sim = tfidf_variant(text, reference_summary, n_sent=4, ngram=(1,3))
print("Summary:", summary)
print("ROUGE-1 F1:", rouge['rouge1']['fmeasure'])
print("ROUGE-2 F1:", rouge['rouge2']['fmeasure'])
print("Embed sim:", emb_sim)


Summary: From healthcare diagnostics to financial forecasting, AI models are enabling faster and more accurate decision-making. Machine learning, a subset of AI, focuses on training algorithms using large datasets to identify patterns and make predictions. Recent advancements in natural language processing (NLP) have made it possible for machines to understand and generate human-like text. Businesses are increasingly adopting AI solutions to automate processes, reduce costs, and enhance customer experiences.
ROUGE-1 F1: 0.38
ROUGE-2 F1: 0.18367346938775508
Embed sim: 0.805416464805603


***Embeddings + clustering representative sentences***

In [23]:
# Requires sklearn
from sklearn.cluster import KMeans
import numpy as np

def embeddings_cluster_summary(text, n_sentences=3, model_name='all-MiniLM-L6-v2'):
    """Extractive summary that picks one representative sentence per embedding cluster.

    Sentences are embedded, grouped with K-Means into ``min(n_sentences,
    number of sentences)`` clusters, and for every cluster the member sentence
    nearest (Euclidean) to the cluster center is selected.

    Args:
        text: Source document.
        n_sentences: Desired number of sentences / clusters. Values <= 0 yield
            an empty summary.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        ``(summary, indices)`` with the chosen sentence positions (plain
        ``int``, document order) and those sentences joined by a single space.
    """
    sents = sentences_from_text(text)
    # K-Means cannot be fitted on zero samples or with zero clusters.
    if not sents or n_sentences <= 0:
        return "", []

    model = SentenceTransformer(model_name)
    emb = model.encode(sents, convert_to_numpy=True)
    # choose num clusters = n_sentences (or fewer if the text is short)
    k = min(n_sentences, len(sents))
    kmeans = KMeans(n_clusters=k, random_state=42).fit(emb)

    chosen = []
    for cluster_id, center in enumerate(kmeans.cluster_centers_):
        # Only consider sentences assigned to this cluster. Ranking *all*
        # sentences by dot product with the center lets one sentence win
        # several clusters and yields fewer sentences than requested.
        members = np.flatnonzero(kmeans.labels_ == cluster_id)
        if members.size == 0:
            continue
        distances = np.linalg.norm(emb[members] - center, axis=1)
        chosen.append(int(members[np.argmin(distances)]))

    chosen = sorted(set(chosen))
    summary = " ".join(sents[i] for i in chosen)
    return summary, chosen

# Smoke test: cluster-based summary of the fixture text, scored against the reference.
summary, idxs = embeddings_cluster_summary(text, n_sentences=3)
print("Cluster summary:", summary)
print("ROUGE:", compute_rouge(summary, reference_summary))
print("Embed sim:", summary_cosine_similarity(summary, reference_summary))


Cluster summary: Artificial intelligence (AI) is transforming industries across the world. Machine learning, a subset of AI, focuses on training algorithms using large datasets to identify patterns and make predictions. Recent advancements in natural language processing (NLP) have made it possible for machines to understand and generate human-like text.
ROUGE: {'rouge1': {'precision': 0.2916666666666667, 'recall': 0.4666666666666667, 'fmeasure': 0.35897435897435903}, 'rouge2': {'precision': 0.0851063829787234, 'recall': 0.13793103448275862, 'fmeasure': 0.10526315789473682}, 'rougeL': {'precision': 0.20833333333333334, 'recall': 0.3333333333333333, 'fmeasure': 0.25641025641025644}}
Embed sim: 0.8065097332000732


# 10) Run it against multiple samples and evaluate it

In [25]:
# Single cell — run this in Colab to evaluate your uploaded CSV and save aggregated results.
# Uses existing pipeline functions if present in the notebook, otherwise falls back to a small TF-IDF+RAKE pipeline.

# Input CSV; the evaluation code below requires the columns
# sample_text, gold_summary and gold_keywords.
CSV_PATH = "/content/src_sample_dataset.csv"
# Destination of the per-row results table written at the end of the run.
OUT_CSV = "/content/summarization_keyword_results.csv"

# -------------------- imports --------------------
import os, re, math
import pandas as pd, numpy as np
from tqdm import tqdm

# Try imports used for evaluation (assumes installed in your Colab already)
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
try:
    from rapidfuzz import fuzz
except Exception:
    fuzz = None

# -------------------- small helpers --------------------
def normalize_gold_kw(raw):
    """Turn a gold-keyword cell into a clean list of keyword strings.

    Args:
        raw: A string with keywords separated by ``;``, ``,`` or newlines, a
            list/tuple of keywords, a missing value (``None``/NaN), or any
            other scalar.

    Returns:
        List of stripped, non-empty keyword strings; ``[]`` for missing values.
    """
    if isinstance(raw, str):
        parts = re.split(r'[;,\n]', raw)
        return [p.strip() for p in parts if p.strip()]
    # Sequences must be handled before the missing-value test: pd.isna() on a
    # list returns an element-wise array, and using that array in an `if`
    # raises for anything longer than one element.
    if isinstance(raw, (list, tuple)):
        return [str(p).strip() for p in raw if str(p).strip()]
    if raw is None or (pd.api.types.is_scalar(raw) and pd.isna(raw)):
        return []
    return [str(raw).strip()]

def fuzzy_keyword_eval(preds, golds, threshold=65):
    """Precision / recall / F1 of predicted keywords against gold keywords.

    Keywords are lower-cased, stripped and de-duplicated (first occurrence
    wins) on both sides, so the exact-match fallback and the fuzzy path use
    the same denominators. When ``fuzz`` is available, a prediction matches
    the first not-yet-matched gold keyword whose ``fuzz.partial_ratio`` is at
    least ``threshold`` (greedy, in list order); otherwise matching is exact.

    Args:
        preds: Predicted keywords.
        golds: Gold keywords.
        threshold: Minimum ``partial_ratio`` for a fuzzy match.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``matched`` (the gold
        keywords that were matched). Two empty inputs count as a perfect score.
    """
    preds = list(dict.fromkeys(str(p).lower().strip() for p in preds))
    golds = list(dict.fromkeys(str(g).lower().strip() for g in golds))
    if not preds and not golds:
        return {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'matched': []}

    if fuzz is None:
        # fallback exact matching
        pred_lookup = set(preds)
        matched = [g for g in golds if g in pred_lookup]
    else:
        matched = []
        taken = set()
        for p in preds:
            for g in golds:
                if g in taken:
                    continue
                if fuzz.partial_ratio(p, g) >= threshold:
                    taken.add(g)
                    matched.append(g)
                    break

    tp = len(matched)
    prec = tp / len(preds) if preds else 0.0
    rec = tp / len(golds) if golds else 0.0
    f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0.0
    return {'precision': prec, 'recall': rec, 'f1': f1, 'matched': matched}

# -------------------- evaluation functions --------------------
_ROUGE_METRICS = ('rouge1', 'rouge2', 'rougeL')
# One shared scorer under a private name: other cells in this notebook assign
# a plain variable called `rouge`, which must not be able to break compute_rouge.
_ROUGE_SCORER = rouge_scorer.RougeScorer(list(_ROUGE_METRICS), use_stemmer=True)
# Kept so that code referring to the old global name still finds the scorer.
rouge = _ROUGE_SCORER


def compute_rouge(sys_sum, ref_sum):
    """ROUGE-1/2/L of ``sys_sum`` against ``ref_sum`` (stemming enabled).

    Non-string inputs (``None``, NaN from a CSV, ...) are scored as empty text.

    Returns:
        A dict containing both layouts used in this notebook:

        * flat keys ``rouge1_p``, ``rouge1_r``, ``rouge1_f``, ... ``rougeL_f``;
        * nested keys ``rouge1`` / ``rouge2`` / ``rougeL``, each mapping to
          ``{'precision', 'recall', 'fmeasure'}``.

        This function replaces an earlier ``compute_rouge`` that returned only
        the nested layout; providing both keeps earlier cells working when
        they are re-run after this definition.
    """
    sys_text = sys_sum if isinstance(sys_sum, str) else ""
    ref_text = ref_sum if isinstance(ref_sum, str) else ""
    s = _ROUGE_SCORER.score(ref_text, sys_text)

    result = {}
    for name in _ROUGE_METRICS:
        result[f'{name}_p'] = s[name].precision
        result[f'{name}_r'] = s[name].recall
        result[f'{name}_f'] = s[name].fmeasure
    for name in _ROUGE_METRICS:
        result[name] = {
            'precision': s[name].precision,
            'recall': s[name].recall,
            'fmeasure': s[name].fmeasure,
        }
    return result

# small cached embedding model for summary similarity
# Lazily created SentenceTransformer and the model name it was built from.
_embed_model = None
_embed_model_name = None


def summary_cosine_similarity(sys_sum, ref_sum, model_name='all-MiniLM-L6-v2'):
    """Cosine similarity between the embeddings of two summaries.

    Non-string inputs (``None``, NaN from a CSV, ...) are embedded as empty
    text. If either embedding has zero norm the similarity is reported as 0.0.

    Args:
        sys_sum: Generated summary.
        ref_sum: Gold/reference summary.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The similarity as a plain ``float``.
    """
    global _embed_model, _embed_model_name
    # Reload when a different model is requested; caching on "is None" alone
    # would keep answering with whichever model happened to be loaded first.
    if _embed_model is None or _embed_model_name != model_name:
        _embed_model = SentenceTransformer(model_name)
        _embed_model_name = model_name

    sys_text = sys_sum if isinstance(sys_sum, str) else ""
    ref_text = ref_sum if isinstance(ref_sum, str) else ""
    v_sys = _embed_model.encode(sys_text, convert_to_numpy=True, show_progress_bar=False)
    v_ref = _embed_model.encode(ref_text, convert_to_numpy=True, show_progress_bar=False)
    if np.linalg.norm(v_sys) == 0 or np.linalg.norm(v_ref) == 0:
        return 0.0
    return float(cosine_similarity([v_sys], [v_ref])[0][0])

# -------------------- pipeline availability check / fallback --------------------
# Detect whether the full pipeline was defined by an earlier cell: evaluating
# the bare name raises (NameError) in a kernel where it does not exist.
use_existing = False
try:
    summarize_and_extract_keywords  # if defined earlier in notebook
    use_existing = True
except Exception:
    use_existing = False

if not use_existing:
    # define small fallback pipeline (TF-IDF sentence scoring + RAKE)
    # NOTE(review): these definitions use the same names as the full helpers
    # but different return shapes -- the fallback tfidf_sentence_summary
    # returns a plain string (the full one returns a (summary, indices)
    # tuple) and the fallback rake_keywords returns bare phrases (the full
    # one returns (phrase, score) pairs).
    STOPWORDS = None
    try:
        import nltk
        from nltk.corpus import stopwords
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        nltk.download('stopwords', quiet=True)
        STOPWORDS = set(stopwords.words('english'))
    except Exception:
        # Best effort: any failure above leaves RAKE with an empty stop-word set.
        STOPWORDS = set()
    # Sentence splitter: no character filtering here, and the minimum kept
    # length is 10 characters (the full helper uses 20).
    def sentences_from_text(text):
        text = (text or "").replace("\n"," ").strip()
        from nltk.tokenize import sent_tokenize
        sents = sent_tokenize(text)
        return [s.strip() for s in sents if len(s.strip())>10]
    # Returns only the summary string ("" when there are no sentences).
    def tfidf_sentence_summary(text, n_sentences=3, ngram=(1,2)):
        sents = sentences_from_text(text)
        if not sents:
            return ""
        vect = TfidfVectorizer(ngram_range=ngram, stop_words='english')
        X = vect.fit_transform(sents)
        scores = X.sum(axis=1).A1
        # Take at most len(sents) top-scoring sentences, then restore document order.
        top = np.argsort(scores)[-min(n_sentences,len(sents)):][::-1]
        top_sorted = sorted(top)
        return " ".join([sents[i] for i in top_sorted])
    # Returns the first `top_n` ranked phrases, without scores.
    def rake_keywords(text, top_n=10):
        r = Rake(stopwords=STOPWORDS)
        r.extract_keywords_from_text(text or "")
        ranked = r.get_ranked_phrases()  # list of phrases
        return ranked[:top_n]
    # `summary_type` and `keyword_method` are accepted for call compatibility
    # but not used: the fallback always runs TF-IDF + RAKE with 10 keywords.
    def summarize_and_extract_keywords(text, summary_type='tfidf', n_sentences=3, keyword_method='rake'):
        s = tfidf_sentence_summary(text, n_sentences=n_sentences)
        kws = rake_keywords(text, top_n=10)
        return {'summary': s, 'keywords': kws}

# -------------------- load CSV and run evaluation --------------------
df = pd.read_csv(CSV_PATH)
required = ['sample_text','gold_summary','gold_keywords']
for c in required:
    if c not in df.columns:
        raise ValueError(f"CSV missing required column: {c}. Found columns: {list(df.columns)}")

# parameters - edit if needed
SUMMARY_METHOD = 'tfidf'   # or 'embed' if your summarize_and_extract_keywords supports it
N_SENTENCES = 3
KEYWORD_METHOD = 'rake'


def _text_or_empty(value):
    """Return ``value`` if it is a string, else ``""``.

    Empty CSV cells arrive as NaN, which is truthy, so ``value or ""`` would
    pass the NaN through to the text-processing functions.
    """
    return value if isinstance(value, str) else ""


# Loop variables use names that do not collide with notebook-level objects
# defined by other cells (`out` is the widget Output area, `text` the fixture
# document, `rouge_scores` / `emb_sim` earlier results).
rows = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    sample_text = _text_or_empty(row['sample_text'])
    gold_sum = _text_or_empty(row.get('gold_summary'))
    gold_kw = normalize_gold_kw(row.get('gold_keywords'))

    pipeline_out = summarize_and_extract_keywords(
        sample_text,
        summary_type=SUMMARY_METHOD,
        n_sentences=N_SENTENCES,
        keyword_method=KEYWORD_METHOD,
    )
    if isinstance(pipeline_out, dict):
        sys_sum = pipeline_out.get('summary', "")
        sys_kws = pipeline_out.get('keywords', [])
    else:
        sys_sum, sys_kws = str(pipeline_out), []
    # Keywords may be bare phrases or (phrase, score) pairs; keep phrases only.
    sys_kws = [str(kw[0]) if isinstance(kw, tuple) else str(kw) for kw in sys_kws]

    # evaluations
    row_rouge = compute_rouge(sys_sum, gold_sum)
    row_emb_sim = summary_cosine_similarity(sys_sum, gold_sum)
    kw_eval = fuzzy_keyword_eval(sys_kws, gold_kw, threshold=65)
    rows.append({
        'id': row.get('id', idx),
        'sys_summary': sys_sum,
        'gold_summary': gold_sum,
        'rouge1_f': row_rouge['rouge1_f'],
        'rouge2_f': row_rouge['rouge2_f'],
        'rougeL_f': row_rouge['rougeL_f'],
        'embedding_cosine': row_emb_sim,
        'sys_keywords': "; ".join(sys_kws),
        'gold_keywords': "; ".join(gold_kw),
        'kw_precision': kw_eval['precision'],
        'kw_recall': kw_eval['recall'],
        'kw_f1': kw_eval['f1']
    })

results_df = pd.DataFrame(rows)
results_df.to_csv(OUT_CSV, index=False)
print("Saved results to:", OUT_CSV)

# -------------------- aggregated metrics --------------------
# Mean of every per-row metric, keyed as "<column>_mean".
metric_columns = ['rouge1_f', 'rouge2_f', 'rougeL_f', 'embedding_cosine',
                  'kw_precision', 'kw_recall', 'kw_f1']
agg = {f"{col}_mean": results_df[col].mean() for col in metric_columns}
print("\n=== AGGREGATED METRICS ===")
for k,v in agg.items():
    print(f"{k}: {v:.4f}")

# Show top/bottom examples by ROUGE-1 F
preview_columns = ['id','sys_summary','gold_summary','rouge1_f','embedding_cosine']

print("\nTop 3 by ROUGE-1 F:")
best_first = results_df.sort_values('rouge1_f', ascending=False)
display(best_first.head(3)[preview_columns])

print("\nBottom 3 by ROUGE-1 F:")
worst_first = results_df.sort_values('rouge1_f', ascending=True)
display(worst_first.head(3)[preview_columns])


100%|██████████| 60/60 [00:09<00:00,  6.43it/s]


Saved results to: /content/summarization_keyword_results.csv

=== AGGREGATED METRICS ===
rouge1_f_mean: 0.4104
rouge2_f_mean: 0.3727
rougeL_f_mean: 0.4030
embedding_cosine_mean: 0.7365
kw_precision_mean: 0.1733
kw_recall_mean: 0.5778
kw_f1_mean: 0.2667

Top 3 by ROUGE-1 F:


Unnamed: 0,id,sys_summary,gold_summary,rouge1_f,embedding_cosine
0,1,AI is transforming industries by enabling auto...,Artificial intelligence advances drive improve...,0.511628,0.874182
10,11,AI is transforming industries by enabling auto...,Artificial intelligence advances drive improve...,0.511628,0.874182
20,21,AI is transforming industries by enabling auto...,Artificial intelligence advances drive improve...,0.511628,0.874182



Bottom 3 by ROUGE-1 F:


Unnamed: 0,id,sys_summary,gold_summary,rouge1_f,embedding_cosine
28,29,Recent developments include new models and pra...,Robotics automation advances drive improvement...,0.0,0.090679
18,19,Recent developments include new models and pra...,Robotics automation advances drive improvement...,0.0,0.090679
38,39,Recent developments include new models and pra...,Robotics automation advances drive improvement...,0.0,0.090679
