In [1]:
# Cell 1 - Imports
import numpy as np
import torch
import nltk
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from evaluate import load
import json

# Fetch the Punkt sentence-tokenizer data that nltk.sent_tokenize relies on.
# Recent NLTK releases (3.9+) load the pickle-free "punkt_tab" resource
# instead of the legacy "punkt" models, so request both; a resource that is
# already installed is simply reported as up to date.
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Cell 2 - Load full CNN/DailyMail
# Load the CNN/DailyMail dataset through `datasets.load_dataset`.
# NOTE(review): the dataset card lists versioned configs ("1.0.0", "2.0.0",
# "3.0.0") - confirm that "default" resolves to the intended one.
dataset = load_dataset("cnn_dailymail", "default")

# Report the number of examples in each split.
for split_label, split_key in (
    ("Train", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{split_label} size:", len(dataset[split_key]))


Train size: 287113
Validation size: 13368
Test size: 11490


In [3]:
# Cell 3 - Load BERT
# One checkpoint name feeds both the tokenizer and the encoder so the
# vocabulary and the weights always come from the same pretrained model.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Put the module in inference mode (dropout disabled). eval() returns the
# module itself, so as the last expression of the cell it also makes the
# notebook display the model structure.
model.eval()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
# Cell 4 - Sentence embeddings using BERT
def get_sentence_embedding(sentence):
    """Embed one sentence as the average of its final-layer BERT token vectors.

    Relies on the module-level ``tokenizer`` and ``model``. Input longer than
    512 tokens is truncated by the tokenizer.

    Args:
        sentence: Text to encode.

    Returns:
        NumPy array produced by averaging ``last_hidden_state`` over the token
        axis (dim 1) and squeezing out singleton dimensions.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    # Mean pooling across the token axis.
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()


In [5]:
# Cell 4 (repeated) - Sentence embeddings using BERT; this cell duplicates the previous one and can be removed
def get_sentence_embedding(sentence):
    """Return a mean-pooled BERT vector for ``sentence``.

    Note: this re-declares the function that the preceding cell already
    defines; because it comes later, this is the definition in effect after a
    top-to-bottom run of the notebook.

    Args:
        sentence: Text to encode with the module-level ``tokenizer`` and
            ``model``; the tokenizer truncates it to at most 512 tokens.

    Returns:
        NumPy array: ``last_hidden_state`` averaged over dim 1, then squeezed.
    """
    tokenizer_kwargs = {"return_tensors": "pt", "truncation": True, "max_length": 512}
    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        last_hidden = model(**tokenizer(sentence, **tokenizer_kwargs)).last_hidden_state
    # Mean pooling
    return last_hidden.mean(dim=1).squeeze().numpy()


In [6]:
# Cell 5 - Summarization with BERT + TextRank
def bert_extractive_summarize(article, num_sentences=3):
    """Pick the highest-ranked sentences of an article as its summary.

    The article is split with ``nltk.sent_tokenize``, every sentence is
    embedded with ``get_sentence_embedding``, and the pairwise cosine
    similarities are used as edge weights of a graph that is scored with
    PageRank.

    Args:
        article: Full article text.
        num_sentences: How many sentences to keep (default 3).

    Returns:
        The article unchanged when it has at most ``num_sentences`` sentences;
        otherwise the selected sentences joined by single spaces, ordered from
        highest to lowest score (not in document order).
    """
    sentences = nltk.sent_tokenize(article)
    # Nothing to select when the article is already short enough.
    if len(sentences) <= num_sentences:
        return article

    # One vector per sentence, then the full pairwise similarity matrix.
    sentence_vectors = [get_sentence_embedding(text) for text in sentences]
    similarity = cosine_similarity(sentence_vectors)

    # Weighted graph over sentences; PageRank yields {sentence index: score}.
    graph = nx.from_numpy_array(similarity)
    rank = nx.pagerank(graph)

    # Descending by score, ties broken by the sentence text itself.
    order = sorted(
        range(len(sentences)),
        key=lambda idx: (rank[idx], sentences[idx]),
        reverse=True,
    )
    return " ".join(sentences[idx] for idx in order[:num_sentences])


In [None]:
# Cell 6 - Process full splits and save results
def process_split(split, output_file, n_sentences=3, max_examples=None):
    """Summarize the examples of a dataset split and save the results as JSONL.

    For each example the ``"article"`` field is summarized with
    ``bert_extractive_summarize`` and one JSON object per line is written with
    the keys ``"ref_summary"`` (the example's ``"highlights"``) and
    ``"pred_summary"``. The output file is opened in write mode, so an
    existing file at that path is overwritten.

    Args:
        split: Iterable of examples exposing ``"article"`` and ``"highlights"``.
        output_file: Path of the JSONL file to write.
        n_sentences: Number of sentences requested for each summary.
        max_examples: Optional cap on how many examples are processed, handy
            for a quick smoke test before committing to a full run. ``None``
            (the default) processes the whole split; zero or a negative value
            writes an empty file.
    """
    with open(output_file, "w", encoding="utf-8") as f:
        for index, example in enumerate(split):
            # Check the cap before doing any model work for this example.
            if max_examples is not None and index >= max_examples:
                break
            article = example["article"]
            ref_summary = example["highlights"]
            pred_summary = bert_extractive_summarize(article, n_sentences)
            f.write(json.dumps({
                "ref_summary": ref_summary,
                "pred_summary": pred_summary
            }) + "\n")

# Generate predictions for the validation and test splits, one JSONL file each.
for split_key, predictions_path in (
    ("validation", "bert_val.jsonl"),
    ("test", "bert_test.jsonl"),
):
    process_split(dataset[split_key], predictions_path, n_sentences=3)


In [None]:
# Cell 7 - Evaluate with ROUGE + BERTScore
# Load both metric objects once at module level; evaluate() below reads
# `rouge` and `bertscore` as globals instead of reloading them per call.
rouge = load("rouge")
bertscore = load("bertscore")

def evaluate(file_path):
    """Score the predictions stored in a JSONL file with ROUGE and BERTScore.

    Each non-blank line must be a JSON object holding ``"pred_summary"`` and
    ``"ref_summary"``. Scoring uses the module-level ``rouge`` and
    ``bertscore`` metric objects.

    Args:
        file_path: Path of the JSONL file to evaluate.

    Returns:
        Dict with ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` taken from the
        ROUGE result, plus ``"bertscore_f1"``, the mean of the per-example
        BERTScore F1 values.

    Raises:
        ValueError: If the file contains no records, or if a non-blank line
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If a record lacks one of the two expected keys.
    """
    preds, refs = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a stray trailing newline) are not records.
            if not line.strip():
                continue
            data = json.loads(line)
            preds.append(data["pred_summary"])
            refs.append(data["ref_summary"])

    # Fail early with a clear message rather than handing empty lists to the
    # metric backends.
    if not preds:
        raise ValueError(f"No prediction records found in {file_path!r}")

    rouge_result = rouge.compute(predictions=preds, references=refs)
    bertscore_result = bertscore.compute(predictions=preds, references=refs, lang="en")

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bertscore_f1": np.mean(bertscore_result["f1"])
    }

# Report the metrics for each saved prediction file.
for split_label, predictions_path in (
    ("Validation", "bert_val.jsonl"),
    ("Test", "bert_test.jsonl"),
):
    print(f"{split_label}:", evaluate(predictions_path))
