In [1]:
!pip install datasets==2.* scikit-learn networkx nltk rouge_score tqdm



In [2]:
import os
import json
import re
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
STOPWORDS = set(stopwords.words("english"))

def clean_text(text):
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-z0-9.,!?;:’'\"-]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def preprocess_sentences(text, min_len=5):
    text = clean_text(text)
    sents = sent_tokenize(text)
    processed = []
    for s in sents:
        words = [w for w in word_tokenize(s) if w.lower() not in STOPWORDS and w.isalpha()]
        if len(words) >= min_len:
            processed.append(" ".join(words))
    return processed


In [4]:
def textrank_summarize(text, n_sentences=3):
    sentences = sent_tokenize(clean_text(text))
    proc_sentences = preprocess_sentences(text)

    if len(proc_sentences) == 0 or len(sentences) == 0:
        return ""

    if len(proc_sentences) <= n_sentences:
        return " ".join(sentences[:n_sentences])

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(proc_sentences)

    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(graph)

    ranked = sorted(((score, idx) for idx, score in scores.items()), reverse=True)
    top_indices = sorted([idx for _, idx in ranked[:n_sentences]])

    summary = " ".join(sentences[i] for i in top_indices if i < len(sentences))
    return summary


In [6]:
pip install dataset

Collecting typing-extensions>=4.12 (from alembic>=0.6.2->dataset)
  Obtaining dependency information for typing-extensions>=4.12 from https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl.metadata
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Using cached typing_extensions-4.15.0-py3-none-any.whl (44 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.10.0
    Uninstalling typing_extensions-4.10.0:
      Successfully uninstalled typing_extensions-4.10.0
Successfully installed typing-extensions-4.15.0
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-server 1.23.4 requires anyio<4,>=3.1.0, but you have anyio 4.10.0 which is incompatible.
spyder 5.4.3 requires jedi<0.19.0,>=0.17.2, but you have jedi 0.19.1 which is incompatible.
tensorflow-intel 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 3.11.3 which is incompatible.
tensorflow-intel 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.26.4 which is incompatible.
tensorflow-intel 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.19.0 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.


In [9]:
DATASET_NAME = "cnn_dailymail"
DATASET_VERSION = "3.0.0"

print("Loading dataset...")
dataset = load_dataset("cnn_dailymail", "default")

train_ds = dataset["train"]      # 287,113
val_ds   = dataset["validation"] # 13,368
test_ds  = dataset["test"]       # 11,490

print(f"Train: {len(train_ds)} | Val: {len(val_ds)} | Test: {len(test_ds)}")


Loading dataset...
Train: 287113 | Val: 13368 | Test: 11490


In [10]:
def process_split(ds, out_file, n_sentences=3):
    os.makedirs(os.path.dirname(out_file) or ".", exist_ok=True)
    writer = open(out_file, "w", encoding="utf-8")

    for i in tqdm(range(len(ds)), desc=f"Processing {out_file}"):
        article = ds[i]["article"]
        ref_summary = ds[i]["highlights"]
        pred_summary = textrank_summarize(article, n_sentences=n_sentences)

        record = {
            "id": i,
            "article": article,
            "ref_summary": ref_summary,
            "pred_summary": pred_summary
        }
        writer.write(json.dumps(record) + "\n")

    writer.close()
    print(f"Saved results to {out_file}")


In [11]:
# Full Validation (13,368)
process_split(val_ds, "cnn_val_textrank.jsonl", n_sentences=3)

# Full Test (11,490)
process_split(test_ds, "cnn_test_textrank.jsonl", n_sentences=3)

# Full Train (287,113) ⚠️ This will take the longest
process_split(train_ds, "cnn_train_textrank.jsonl", n_sentences=3)


Processing cnn_val_textrank.jsonl: 100%|██████████| 13368/13368 [01:30<00:00, 148.02it/s]


Saved results to cnn_val_textrank.jsonl


Processing cnn_test_textrank.jsonl: 100%|██████████| 11490/11490 [01:08<00:00, 168.73it/s]


Saved results to cnn_test_textrank.jsonl


Processing cnn_train_textrank.jsonl: 100%|██████████| 287113/287113 [30:44<00:00, 155.66it/s] 

Saved results to cnn_train_textrank.jsonl





In [12]:
!pip install bert_score



In [13]:
from bert_score import score as bert_score
from rouge_score import rouge_scorer
import json
import numpy as np
from tqdm import tqdm # Added import

def evaluate_metrics(file_path):
    # Initialize ROUGE scorer
    rouge_scorer_obj = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = {"rouge1": [], "rouge2": [], "rougeL": []}

    refs, preds = [], []  # For BERTScore

    # Read dataset
    with open(file_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc=f"Evaluating {file_path}"): # Added tqdm
            data = json.loads(line)
            ref = data["ref_summary"].strip()
            pred = data["pred_summary"].strip()
            if not ref or not pred:
                continue

            # Store for BERTScore
            refs.append(ref)
            preds.append(pred)

            # Compute ROUGE
            sc = rouge_scorer_obj.score(ref, pred)
            for k in scores:
                scores[k].append(sc[k].fmeasure)

    # Aggregate ROUGE scores
    rouge_results = {k: np.mean(v) for k, v in scores.items() if len(v) > 0}

    # Compute BERTScore (default model: roberta-large)
    P, R, F1 = bert_score(preds, refs, lang="en", verbose=True)
    bert_results = {
        "bert_precision": float(P.mean()),
        "bert_recall": float(R.mean()),
        "bert_f1": float(F1.mean())
    }

    return {"ROUGE": rouge_results, "BERTScore": bert_results}


# Run evaluation
print("Validation Metrics:", evaluate_metrics("cnn_val_textrank.jsonl"))
print("Test Metrics:", evaluate_metrics("cnn_test_textrank.jsonl"))

Evaluating cnn_val_textrank.jsonl: 13368it [00:21, 625.51it/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   5%|5         | 73.4M/1.42G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/roberta-large/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Trying to resume download...


model.safetensors:  17%|#6        | 241M/1.42G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/417 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/209 [00:00<?, ?it/s]

done in 204.11 seconds, 65.49 sentences/sec
Validation Metrics: {'ROUGE': {'rouge1': 0.3462005636648558, 'rouge2': 0.1324642136186202, 'rougeL': 0.2232043138821357}, 'BERTScore': {'bert_precision': 0.8445085287094116, 'bert_recall': 0.8529849648475647, 'bert_f1': 0.8485850095748901}}


Evaluating cnn_test_textrank.jsonl: 11490it [00:20, 573.87it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/359 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/180 [00:00<?, ?it/s]

done in 169.76 seconds, 67.68 sentences/sec
Test Metrics: {'ROUGE': {'rouge1': 0.3383968488311246, 'rouge2': 0.1285393387162409, 'rougeL': 0.21895217974062697}, 'BERTScore': {'bert_precision': 0.8438093066215515, 'bert_recall': 0.8534266948699951, 'bert_f1': 0.848457396030426}}
