# HiGS Model Evaluation
Evaluate both checkpoints, compute ROUGE & BERTScore, and view generated summaries.

In [1]:
!pip install -q evaluate rouge_score bert_score spacy
!python -m spacy download en_core_web_sm -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m125.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import os, torch, evaluate, spacy
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, BartForConditionalGeneration, BartTokenizer
from transformers.modeling_outputs import BaseModelOutput
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from google.colab import drive

# Mount Google Drive so the dataset and checkpoints under /content/drive are readable.
drive.mount('/content/drive')

# spaCy pipeline shared by the sentence-splitting and entity-extraction helpers.
nlp = spacy.load('en_core_web_sm')

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device: {device}')
if device.type == 'cuda':
    print(f'GPU: {torch.cuda.get_device_name(0)}')

Mounted at /content/drive
Device: cuda
GPU: Tesla T4


In [3]:
# ===== PATHS (update if needed) =====
# Both the dataset and the checkpoints live under the mounted Drive root.
DRIVE_ROOT = '/content/drive/MyDrive'
CHECKPOINT_DIR = f'{DRIVE_ROOT}/HiGraphSum_Checkpoints'

DATA_PATH = f'{DRIVE_ROOT}/NewsSumm_Experiments/data/newssumm_cleaned.parquet'
CHECKPOINT_1 = f'{CHECKPOINT_DIR}/higraphsum_epoch1.80_step16000.pt'
CHECKPOINT_2 = f'{CHECKPOINT_DIR}/higraphsum_fixed_epoch0.25_step5000.pt'

# ===== EVALUATION SIZE =====
MAX_SAMPLES = 100_000  # rows kept from the cleaned dataset before the train/test split
NUM_EVAL = 200         # test samples to evaluate
BATCH_SIZE = 4         # DataLoader batch size used during generation

In [4]:
# ===== NLP Utilities =====
def split_into_sentences(text):
    """Split `text` into sentences with the module-level spaCy pipeline `nlp`.

    Each sentence is whitespace-stripped; sentences whose stripped text is
    10 characters or shorter are dropped.

    Returns:
        list[str]: the retained sentences, in document order.
    """
    kept = []
    for span in nlp(text).sents:
        stripped = span.text.strip()
        if len(stripped) > 10:
            kept.append(stripped)
    return kept

def extract_entities(sentence):
    """Return the lower-cased named entities found in `sentence`.

    Only entities labelled PERSON, ORG, GPE or LOC by the module-level spaCy
    pipeline `nlp` are kept.

    Returns:
        set[str]: lower-cased entity strings (empty when none are found).
    """
    wanted_labels = {'PERSON','ORG','GPE','LOC'}
    found = set()
    for ent in nlp(sentence).ents:
        if ent.label_ in wanted_labels:
            found.add(ent.text.lower())
    return found

def build_adjacency_matrix(sentences, sent_embeddings, threshold=0.75):
    """Build a symmetric 0/1 sentence graph from embedding similarity and shared entities.

    Two sentences i < j are connected when the cosine similarity of their
    embeddings is greater than `threshold`, or when `extract_entities` returns
    at least one common entity for them. The diagonal is left at zero (no
    self-loops).

    Args:
        sentences: list of sentence strings; entry i corresponds to row i of
            `sent_embeddings`.
        sent_embeddings: 2-D tensor with one embedding per row.
        threshold: cosine-similarity cut-off for adding an edge.

    Returns:
        torch.Tensor: an (m, m) float tensor created with `torch.zeros` (so on
        the default device), where m is the number of embedding rows. Only the
        leading block covering the considered sentences can contain ones.
    """
    m = sent_embeddings.size(0)
    adj = torch.zeros(m, m)
    # Never look past the available embedding rows, even if more sentences are supplied;
    # otherwise the pair loop below would index outside the similarity matrix.
    n = min(len(sentences), m)
    if n < 2:
        return adj
    with torch.no_grad():
        unit = F.normalize(sent_embeddings[:n], p=2, dim=1)
        # Copy the whole similarity matrix to the host once, instead of calling
        # .item() (one device synchronisation) for every sentence pair.
        sim = torch.mm(unit, unit.t()).cpu().tolist()
    ents = [extract_entities(s) for s in sentences[:n]]
    for i in range(n):
        for j in range(i + 1, n):
            if sim[i][j] > threshold or (ents[i] & ents[j]):
                adj[i, j] = adj[j, i] = 1.0
    return adj

In [5]:
# ===== Dataset =====
class GraphSumDataset(Dataset):
    """Article/summary pairs prepared for the sentence-graph summarizer.

    Every article is split into at most `max_sents` sentences, each sentence is
    tokenized with `bert_tok` to `max_len` tokens, and the sentence axis is
    padded up to `max_sents` rows. The reference summary is tokenized with
    `bart_tok` to `max_sum` tokens.
    """

    def __init__(self, articles, summaries, bert_tok, bart_tok, max_sents=30, max_len=64, max_sum=128):
        self.articles = articles
        self.summaries = summaries
        self.bert_tok = bert_tok
        self.bart_tok = bart_tok
        self.max_sents = max_sents
        self.max_len = max_len
        self.max_sum = max_sum

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        """Return the tensors and raw strings for sample `idx` as a dict."""
        sentences = split_into_sentences(self.articles[idx])[:self.max_sents]

        if not sentences:
            # No usable sentence: start from empty (0, max_len) tensors and pad below.
            token_ids = torch.zeros((0, self.max_len), dtype=torch.long)
            token_mask = torch.zeros((0, self.max_len), dtype=torch.long)
        else:
            encoded = self.bert_tok(sentences, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
            token_ids = encoded['input_ids']
            token_mask = encoded['attention_mask']

        # Pad the sentence axis: pad-token ids with an all-zero attention mask.
        missing_rows = self.max_sents - token_ids.size(0)
        if missing_rows > 0:
            pad_ids = torch.full((missing_rows, self.max_len), self.bert_tok.pad_token_id, dtype=torch.long)
            pad_mask = torch.zeros((missing_rows, self.max_len), dtype=torch.long)
            token_ids = torch.cat([token_ids, pad_ids])
            token_mask = torch.cat([token_mask, pad_mask])

        # Raw sentences travel as one '|||'-joined string, with '' for the padded slots.
        padded_sentences = sentences + [''] * (self.max_sents - len(sentences))

        reference = self.summaries[idx]
        summary_enc = self.bart_tok(reference, padding='max_length', truncation=True, max_length=self.max_sum, return_tensors='pt')

        return {
            'sent_input_ids': token_ids,
            'sent_attention_mask': token_mask,
            'sentences_raw': '|||'.join(padded_sentences),
            'summary_input_ids': summary_enc['input_ids'].squeeze(0),
            'reference': reference,
        }

In [6]:
# ===== Model =====
class GraphAttentionLayer(nn.Module):
    """Single-head graph attention over a batch of node sets.

    Node features are linearly projected, every ordered node pair is scored
    from the concatenation of its two projected vectors, and the scores are
    softmax-normalised over each node's neighbours as given by `adj`.
    """

    def __init__(self, in_f, out_f, dropout=0.2, alpha=0.2):
        super().__init__()
        self.W = nn.Linear(in_f, out_f, bias=False)    # node projection
        self.a = nn.Linear(2 * out_f, 1, bias=False)   # pairwise attention scorer
        self.lrelu = nn.LeakyReLU(alpha)
        self.drop = nn.Dropout(dropout)

    def forward(self, h, adj):
        """Aggregate neighbour features.

        Args:
            h: node features, unpacked as (batch, nodes, in_f).
            adj: adjacency tensor broadcastable to (batch, nodes, nodes);
                entries equal to 0 mean "no edge".

        Returns:
            Tensor of shape (batch, nodes, out_f). A node with no neighbours
            gets an all-zero row.
        """
        projected = self.W(h)
        _, num_nodes, _ = projected.size()

        # Pair tensors: entry [b, i, j] holds node i (source) / node j (target).
        source = projected.unsqueeze(2).expand(-1, -1, num_nodes, -1)
        target = projected.unsqueeze(1).expand(-1, num_nodes, -1, -1)
        pair_features = torch.cat([source, target], dim=-1)
        scores = self.lrelu(self.a(pair_features).squeeze(-1))

        no_edge = (adj == 0)
        scores = scores.masked_fill(no_edge, float('-inf'))
        attention = F.softmax(scores, dim=-1)
        # Rows with no neighbours are all -inf and softmax to NaN; zero them out.
        attention = attention.masked_fill(no_edge, 0.0)
        attention = self.drop(attention)
        return torch.bmm(attention, projected)

class HiGraphSum(nn.Module):
    """Sentence-graph summarizer: BERT sentence encoder -> GAT layers -> BART decoder.

    Each sentence is encoded with `bert-base-uncased`, the sentence vectors are
    refined by a stack of `GraphAttentionLayer`s over a graph built by
    `build_adjacency_matrix`, projected to the BART model width, and handed to
    `facebook/bart-base` as encoder outputs for generation.
    """

    def __init__(self, num_gat_layers=2, gat_hidden_dim=512, dropout=0.2, label_smoothing=0.1):
        super().__init__()
        self.sentence_encoder = AutoModel.from_pretrained('bert-base-uncased')
        self.bert_hidden_dim = self.sentence_encoder.config.hidden_size

        # The first GAT layer consumes BERT-sized vectors, later ones gat_hidden_dim.
        gat_stack = []
        layer_in_dim = self.bert_hidden_dim
        for _ in range(num_gat_layers):
            gat_stack.append(GraphAttentionLayer(layer_in_dim, gat_hidden_dim, dropout))
            layer_in_dim = gat_hidden_dim
        self.gat_layers = nn.ModuleList(gat_stack)
        self.gat_dropout = nn.Dropout(dropout)

        self.decoder = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        self.label_smoothing = label_smoothing
        decoder_width = self.decoder.config.d_model
        self.projection = nn.Linear(gat_hidden_dim, decoder_width)
        self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.loss_fn = nn.CrossEntropyLoss(label_smoothing=label_smoothing, ignore_index=-100)

    def encode_sentences(self, sent_input_ids, sent_attention_mask):
        """Encode every sentence and return one vector per sentence.

        Args:
            sent_input_ids: token ids, unpacked as (batch, sentences, tokens).
            sent_attention_mask: attention mask viewed with the same token length.

        Returns:
            Tensor of shape (batch, sentences, hidden) holding the encoder's
            last hidden state at token position 0 of each sentence.
        """
        batch_size, num_sents, seq_len = sent_input_ids.size()
        flat_ids = sent_input_ids.view(-1, seq_len)
        flat_mask = sent_attention_mask.view(-1, seq_len)
        encoded = self.sentence_encoder(input_ids=flat_ids, attention_mask=flat_mask)
        first_token_states = encoded.last_hidden_state[:, 0, :]
        return first_token_states.view(batch_size, num_sents, -1)

    def generate_summary(self, batch, num_beams=4, max_length=128):
        """Generate summary token ids for a batch with beam search.

        Args:
            batch: dict providing 'sent_input_ids', 'sent_attention_mask' and
                'sentences_raw' (per sample either a '|||'-joined string or an
                iterable of sentences).
            num_beams: beam width passed to the decoder's `generate`.
            max_length: maximum generated length passed to `generate`.

        Returns:
            The output of `self.decoder.generate`.
        """
        ids = batch['sent_input_ids']
        mask = batch['sent_attention_mask']
        raw = batch['sentences_raw']
        batch_size = ids.size(0)

        with torch.no_grad():
            node_states = self.encode_sentences(ids, mask)

            # One adjacency matrix per sample, built from that sample's sentences.
            per_sample_adj = []
            for i in range(batch_size):
                if isinstance(raw[i], str):
                    sample_sents = raw[i].split('|||')
                else:
                    sample_sents = list(raw[i])
                per_sample_adj.append(build_adjacency_matrix(sample_sents, node_states[i], 0.75))
            adjacency = torch.stack(per_sample_adj).to(node_states.device)

            for gat in self.gat_layers:
                node_states = self.gat_dropout(F.relu(gat(node_states, adjacency)))

            encoder_outputs = BaseModelOutput(last_hidden_state=self.projection(node_states))
            # Every sentence slot, padded ones included, is marked as attendable.
            encoder_mask = torch.ones(batch_size, node_states.size(1)).to(node_states.device)

        return self.decoder.generate(encoder_outputs=encoder_outputs, attention_mask=encoder_mask,
                                     num_beams=num_beams, max_length=max_length, early_stopping=True)

In [7]:
# ===== Load Data & Create Test Set =====
# Read the cleaned dataset and keep rows with a non-trivial article and summary.
df = pd.read_parquet(DATA_PATH)
has_article = df['articles_clean'].str.len() > 100
has_summary = df['summary_clean'].str.len() > 20
df = df[has_article & has_summary].head(MAX_SAMPLES)
print(f'Total: {len(df):,}')

arts, sums = df['articles_clean'].tolist(), df['summary_clean'].tolist()
n = len(arts)

# The last 10% of the rows form the test split; only its first NUM_EVAL rows are scored.
test_start = int(n*0.9)
test_arts, test_sums = arts[test_start:], sums[test_start:]
eval_arts, eval_sums = test_arts[:NUM_EVAL], test_sums[:NUM_EVAL]
print(f'Test: {len(test_arts):,}, Evaluating: {len(eval_arts)}')

# Tokenizers match the encoder / decoder backbones used by the model.
bert_tok = AutoTokenizer.from_pretrained('bert-base-uncased')
bart_tok = BartTokenizer.from_pretrained('facebook/bart-base')
test_ds = GraphSumDataset(eval_arts, eval_sums, bert_tok, bart_tok)
# Keep the sample order fixed so predictions line up with eval_arts / eval_sums.
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

Total: 100,000
Test: 10,000, Evaluating: 200


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [8]:
# ===== Evaluation Function =====
def evaluate_checkpoint(ckpt_path, test_dl, bart_tok, eval_arts):
    """Load a HiGraphSum checkpoint, generate summaries and score them.

    Args:
        ckpt_path: path to a checkpoint containing 'model_state_dict', 'epoch',
            'step', 'train_loss' and 'val_loss'.
        test_dl: DataLoader yielding batches with 'sent_input_ids',
            'sent_attention_mask', 'sentences_raw' and 'reference'.
        bart_tok: tokenizer used to decode the generated token ids.
        eval_arts: unused; kept so existing calls keep working.

    Returns:
        tuple: (preds, refs, scores) where `preds` and `refs` are lists of
        strings and `scores` maps 'rouge1', 'rouge2', 'rougeL' and
        'bertscore_f1' to their values. Batches whose generation raised are
        represented by '[FAILED]' predictions and are still scored.
    """
    print(f'\n{"="*60}')
    print(f'Checkpoint: {os.path.basename(ckpt_path)}')
    print(f'{"="*60}')

    model = HiGraphSum(num_gat_layers=2, gat_hidden_dim=512, dropout=0.2, label_smoothing=0.1).to(device)
    # Load onto the CPU: load_state_dict copies the values into the parameters that
    # already live on `device`, so the GPU never holds a second copy of the weights
    # (or of anything else stored in the checkpoint).
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt['model_state_dict'], strict=True)
    model.eval()
    print(f'Epoch: {ckpt["epoch"]:.2f} | Step: {ckpt["step"]} | TrainLoss: {ckpt["train_loss"]:.4f} | ValLoss: {ckpt["val_loss"]:.4f}')
    del ckpt  # the metadata has been printed; release the checkpoint before generation

    preds, refs = [], []
    failed_batches = 0
    for batch in tqdm(test_dl, desc='Generating'):
        # Move tensors to the model's device; 'reference' stays out of the model input.
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items() if k != 'reference'}
        try:
            gen = model.generate_summary(inputs)
            preds.extend(bart_tok.batch_decode(gen, skip_special_tokens=True))
        except Exception as e:
            # Best effort: keep preds aligned with refs, but remember the failure.
            print(f'Error: {e}')
            failed_batches += 1
            preds.extend(['[FAILED]'] * batch['sent_input_ids'].size(0))
        refs.extend(batch['reference'])

    if failed_batches:
        print(f'WARNING: {failed_batches} batch(es) failed; their "[FAILED]" placeholders are included in the scores below.')

    # The summarizer is no longer needed: free it before BERTScore loads roberta-large.
    del model
    torch.cuda.empty_cache()

    # ROUGE
    rouge = evaluate.load('rouge')
    r = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    print(f'\nROUGE-1: {r["rouge1"]:.4f} | ROUGE-2: {r["rouge2"]:.4f} | ROUGE-L: {r["rougeL"]:.4f}')

    # BERTScore
    bs = evaluate.load('bertscore')
    bert_scores = bs.compute(predictions=preds, references=refs, lang='en', model_type='roberta-large', device=device.type)
    f1 = np.mean(bert_scores['f1'])
    print(f'BERTScore F1: {f1:.4f}  (P: {np.mean(bert_scores["precision"]):.4f} | R: {np.mean(bert_scores["recall"]):.4f})')

    return preds, refs, {'rouge1': r['rouge1'], 'rouge2': r['rouge2'], 'rougeL': r['rougeL'], 'bertscore_f1': f1}

In [10]:
# ===== Evaluate Checkpoint 2 (Phase 2: Decoder Fine-tuned) =====
# Returns the generated summaries, their references and the metric dict for CHECKPOINT_2.
preds2, refs2, scores2 = evaluate_checkpoint(
    ckpt_path=CHECKPOINT_2,
    test_dl=test_dl,
    bart_tok=bart_tok,
    eval_arts=eval_arts,
)


Checkpoint: higraphsum_fixed_epoch0.25_step5000.pt


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/259 [00:00<?, ?it/s]

Epoch: 0.25 | Step: 5000 | TrainLoss: 5.4324 | ValLoss: 4.2496


Generating:   0%|          | 0/50 [00:00<?, ?it/s]


ROUGE-1: 0.2305 | ROUGE-2: 0.0339 | ROUGE-L: 0.1578


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


BERTScore F1: 0.8344  (P: 0.8481 | R: 0.8212)


In [18]:
# ===== Comparison Table =====
banner = '=' * 60
print('\n' + banner)
print('RESULTS')
print(banner)

# Display label -> key in the scores dict returned by evaluate_checkpoint.
metric_keys = {
    'ROUGE-1': 'rouge1',
    'ROUGE-2': 'rouge2',
    'ROUGE-L': 'rougeL',
    'BERTScore F1': 'bertscore_f1',
}
comp = pd.DataFrame({
    'Metric': list(metric_keys.keys()),
    '(epoch ~2)': [scores2[key] for key in metric_keys.values()],
})
print(comp)


RESULTS
         Metric  (epoch ~2)
0       ROUGE-1    0.230541
1       ROUGE-2    0.033880
2       ROUGE-L    0.157820
3  BERTScore F1    0.834412


This test used a model trained for only around 2 epochs; its purpose was to check the scripts and the scores.

In [12]:
# ===== Sample Summaries =====
# Show a few source / reference / prediction triples.
# Only the checkpoint-2 results (refs2, preds2) are produced by this notebook. The
# references come from the same DataLoader for every checkpoint, so refs2 is used.
# Phase 1 predictions are printed only if `preds1` exists from a separate
# checkpoint-1 evaluation; otherwise the cell would fail on a fresh kernel.
NUM_EXAMPLES = 5
phase1_preds = globals().get('preds1')
for i in range(min(NUM_EXAMPLES, len(preds2))):
    print(f'\n{"="*60}')
    print(f'Example {i+1}')
    print(f'{"="*60}')
    print(f'Source (first 200 chars): {eval_arts[i][:200]}...')
    print(f'\nReference: {refs2[i]}')
    print()
    if phase1_preds is not None:
        print(f'Phase 1:   {phase1_preds[i]}')
    print(f'Phase 2:   {preds2[i]}')


Example 1
Source (first 200 chars): HYDERABAD: While the post-9/11 world retreated into itself and lived off its blubber during 2002, Hyderabad soldiered on towards its goal of becoming a modern economic powerhouse.With the dust blown u...

Reference: Despite global turmoil post-9/11, Hyderabad continued its march towards becoming an economic powerhouse, shedding its lethargy for a can-do attitude. The city boasts 7 lakh credit cardholders, 800 IT companies offering competitive salaries, and a rising trend of nuclear families driving urban expansion. Hyderabadis embrace new ideas, evident in thriving sectors like retail and IT, with a grocery market estimated at Rs 3,000 crore and over 6,92,000 mobile phone users. The city's IT sector, with exports exceeding Rs 2,300 crore, employs 69,000 professionals and anticipates further growth with the advent of business process outsourcing and call centers.

Phase 1:    asking promotegeneral asking Cost magazine Like askingEN q Europe cash mill