In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably needed to reach the saved model checkpoint --
# confirm that the checkpoint path used later resolves under this mount.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets bert-score rouge-score nltk
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Download punkt_tab data
nltk.download('wordnet') # Download wordnet data

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidi

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# NOTE(review): this repeats the install command from the setup cell earlier in
# this notebook; it is redundant once that cell has run.
!pip install transformers datasets bert-score rouge-score nltk



In [None]:
import torch
import torch.nn as nn
from transformers import BartTokenizer, BartModel
from datasets import load_dataset
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import single_meteor_score
from nltk.tokenize import sent_tokenize
from collections import OrderedDict

# === Step 1: Split validation into validation + test ===
def load_and_split_gigaword(test_size=10000, seed=42):
    """Load the Gigaword validation split and carve a held-out test set from it.

    The "validation" split of the "gigaword" dataset is loaded and shuffled
    with a fixed seed, then divided into a (smaller) validation set and a
    test set.

    Args:
        test_size: Size of the test portion, forwarded unchanged to
            ``Dataset.train_test_split`` (defaults to 10000 examples).
        seed: Shuffle seed, so the same split is produced on every run
            (defaults to 42).

    Returns:
        A ``(val_set, test_set)`` tuple: the 'train' and 'test' parts of the
        split, respectively.
    """
    full_val = load_dataset("gigaword", split="validation")
    split_dataset = full_val.train_test_split(test_size=test_size, shuffle=True, seed=seed)
    val_set, test_set = split_dataset['train'], split_dataset['test']
    print(f" Dataset Split Complete - Validation: {len(val_set)}, Test: {len(test_set)}")
    return val_set, test_set

In [None]:
# === Sentence Splitter ===
def split_into_sentences(text):
    """Return the sentences of `text`, as produced by NLTK's `sent_tokenize`."""
    sentences = sent_tokenize(text)
    return sentences

# === Tokenizer Function ===
def tokenize_sentences(sentences, tokenizer):
    """Encode `sentences` with `tokenizer` using fixed-length settings.

    Every sentence is padded or truncated to 128 tokens and the result is
    requested as PyTorch tensors (``return_tensors="pt"``). Whatever the
    tokenizer call returns is passed back unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(sentences, **encode_options)

# === BART-based Extractive Summarization Model ===
class BartForExtractiveSummarization(nn.Module):
    """Pretrained `BartModel` topped with a linear head that scores each input row.

    The head maps the hidden state at sequence position 0 to a single logit,
    so a batch of tokenized sentences yields one score per sentence.
    """

    def __init__(self, pretrained_model="facebook/bart-base"):
        super().__init__()
        backbone = BartModel.from_pretrained(pretrained_model)
        hidden_size = backbone.config.d_model
        # The attribute names are also the state-dict key prefixes, so they
        # must stay `bart` and `classifier` for saved checkpoints to load.
        self.bart = backbone
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per row of `input_ids`.

        The inputs are forwarded to the wrapped BART model; the position-0
        slice of its `last_hidden_state` goes through the linear head and the
        trailing size-1 dimension is squeezed away.
        """
        hidden_states = self.bart(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        first_token_state = hidden_states[:, 0, :]
        return self.classifier(first_token_state).squeeze(-1)

In [None]:
# === Evaluation Function ===
def evaluate_model(model_path, test_dataset, top_k=3):
    """Evaluate the extractive summarizer on `test_dataset` and print the metrics.

    For each example the article is split into sentences, every sentence is
    scored by the model, and the `top_k` highest-scoring sentences (best
    first) are joined with spaces to form the predicted summary. Predictions
    are compared with the reference summaries using ROUGE-1/2/L F1, METEOR
    and BERTScore.

    Args:
        model_path: Path to a saved state dict for
            `BartForExtractiveSummarization`. A leading "module." on any key
            is stripped before loading.
        test_dataset: Iterable of mappings with string fields 'document'
            and 'summary'.
        top_k: Maximum number of sentences in a predicted summary. Articles
            with fewer sentences simply contribute all of them; no empty
            filler sentences are added.

    Returns:
        A dict with the averaged scores under the keys 'rouge1', 'rouge2',
        'rougeL', 'meteor', 'bertscore_precision', 'bertscore_recall',
        'bertscore_f1', plus the number of evaluated examples under 'n'.

    Raises:
        ValueError: If `test_dataset` yields no examples.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nEvaluating on device: {device}")

    # Load model
    model = BartForExtractiveSummarization().to(device)
    # NOTE(security): torch.load deserializes a pickle-based file; only point
    # this at checkpoints from a trusted source.
    state_dict = torch.load(model_path, map_location=device)
    prefix = "module."
    clean_state_dict = OrderedDict(
        (key[len(prefix):] if key.startswith(prefix) else key, value)
        for key, value in state_dict.items()
    )
    model.load_state_dict(clean_state_dict)
    model.eval()

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    meteor_total = rouge1_total = rouge2_total = rougeL_total = 0.0
    references, predictions = [], []
    failed_forward_passes = 0

    # Pure inference: no autograd graph is needed for any forward pass.
    with torch.no_grad():
        for example in test_dataset:
            article = example['document']
            summary = example['summary']
            sentences = split_into_sentences(article)

            if sentences:
                tokenized = tokenize_sentences(sentences, tokenizer)
                input_ids = tokenized['input_ids'].to(device)
                attention_mask = tokenized['attention_mask'].to(device)

                try:
                    logits = model(input_ids, attention_mask)
                except Exception as exc:
                    # Best-effort: keep evaluating with neutral scores, but
                    # make the failure visible instead of hiding it.
                    failed_forward_passes += 1
                    if failed_forward_passes == 1:
                        print(f"Warning: model forward pass failed ({exc!r}); using zero scores.")
                    logits = torch.zeros(len(sentences), device=device)

                # Fall back to neutral scores if the output is not exactly
                # one logit per sentence.
                if logits.shape != (len(sentences),):
                    logits = torch.zeros(len(sentences), device=device)

                pred_summary = " ".join(_top_scoring_sentences(logits, sentences, top_k))
            else:
                # Nothing to score (e.g. empty article): empty prediction.
                pred_summary = ""

            scores = rouge.score(summary, pred_summary)
            rouge1_total += scores['rouge1'].fmeasure
            rouge2_total += scores['rouge2'].fmeasure
            rougeL_total += scores['rougeL'].fmeasure
            meteor_total += single_meteor_score(summary.split(), pred_summary.split())
            references.append(summary)
            predictions.append(pred_summary)

    n = len(predictions)
    if n == 0:
        raise ValueError("test_dataset is empty; nothing to evaluate")

    precision, recall, f1 = bert_score(predictions, references, lang='en', verbose=False)

    metrics = {
        'rouge1': rouge1_total / n,
        'rouge2': rouge2_total / n,
        'rougeL': rougeL_total / n,
        'meteor': meteor_total / n,
        'bertscore_precision': precision.mean().item(),
        'bertscore_recall': recall.mean().item(),
        'bertscore_f1': f1.mean().item(),
        'n': n,
    }

    print(f"\n✅ Final Evaluation on {n} test samples")
    print(f"ROUGE-1 F1: {metrics['rouge1']:.4f}")
    print(f"ROUGE-2 F1: {metrics['rouge2']:.4f}")
    print(f"ROUGE-L F1: {metrics['rougeL']:.4f}")
    print(f"METEOR:     {metrics['meteor']:.4f}")
    print(f"BERTScore P/R/F1: {metrics['bertscore_precision']:.4f} / {metrics['bertscore_recall']:.4f} / {metrics['bertscore_f1']:.4f}")
    if failed_forward_passes:
        print(f"Warning: {failed_forward_passes} example(s) were scored with zero logits after a failed forward pass.")

    torch.cuda.empty_cache()
    return metrics


def _top_scoring_sentences(logits, sentences, top_k):
    """Return up to `top_k` sentences ordered by descending logit (ties keep document order)."""
    scores = logits.tolist()
    ranked = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)
    return [sentences[i] for i in ranked[:top_k]]

In [None]:
# === Run the Pipeline ===
# In a notebook kernel __name__ is "__main__", so this block runs whenever the
# cell is executed.
if __name__ == "__main__":
    # Relative path: it is resolved against the kernel's current working directory.
    model_path = "BART_GIGA/best_bart_gigaword_model.pt"  # <-- Update this path
    # Only test_set is passed on to the evaluation; val_set is not used in this cell.
    val_set, test_set = load_and_split_gigaword()
    evaluate_model(model_path, test_set)

✅ Dataset Split Complete - Validation: 179651, Test: 10000

Evaluating on device: cuda


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Final Evaluation on 10000 test samples
ROUGE-1 F1: 0.2904
ROUGE-2 F1: 0.1029
ROUGE-L F1: 0.2519
METEOR:     0.3927
BERTScore P/R/F1: 0.8348 / 0.9011 / 0.8664
