In [1]:
import os
import sys
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from transformers import pipeline, BartTokenizerFast
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
root = Path(os.getcwd()).parent
sys.path.insert(0, str(root))
sys.path.insert(0, str(root / "src"))

project_root = root

val_csv      = project_root / "data" / "training" / "validation_pairs.csv"
FT_ROOT      = project_root / "experiments" / "bart-finetune"

In [3]:
# (baseline + finetuned)
model_paths = {"baseline": "sshleifer/distilbart-cnn-12-6"}
for cfg_dir in FT_ROOT.iterdir():
    best = cfg_dir / "best-model"
    if best.is_dir():
        model_paths[cfg_dir.name] = str(best)

In [4]:
df_val = pd.read_csv(val_csv)
texts  = df_val["body"].tolist()
refs   = df_val["abstract"].tolist()

In [5]:
rouge = evaluate.load("rouge")

In [6]:
results = []
for name, model_id in model_paths.items():
    print(f"→ Evaluating {name}")
    # pipeline truncate 
    tokenizer = BartTokenizerFast.from_pretrained(model_id)
    summarizer = pipeline(
        "summarization",
        model=model_id,
        tokenizer=tokenizer,
        device=0 if os.getenv("CUDA_VISIBLE_DEVICES") else -1,
        truncation=True,       
        max_length=512,       
    )

    preds = []
    for text in tqdm(texts, desc=name):
        ids = tokenizer(text, truncation=True, max_length=512)["input_ids"]
        short = tokenizer.decode(ids, skip_special_tokens=True)
        out = summarizer(
            short,
            max_length=150,
            min_length=30,
            num_beams=4,
            length_penalty=2.0,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
        preds.append(out[0]["summary_text"].strip())

    # ROUGE
    scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    scores = {k: v * 100 for k, v in scores.items()}
    scores["model"] = name
    results.append(scores)

→ Evaluating baseline


Device set to use cpu
baseline: 100%|██████████| 49/49 [03:32<00:00,  4.33s/it]


→ Evaluating bs2_lr3e-05_ep3


Device set to use cpu
bs2_lr3e-05_ep3:   0%|          | 0/49 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
bs2_lr3e-05_ep3: 100%|██████████| 49/49 [10:44<00:00, 13.15s/it]


→ Evaluating bs2_lr5e-05_ep3


Device set to use cpu
bs2_lr5e-05_ep3:   0%|          | 0/49 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
bs2_lr5e-05_ep3: 100%|██████████| 49/49 [11:15<00:00, 13.79s/it]


→ Evaluating bs2_lr5e-05_ep5


Device set to use cpu
bs2_lr5e-05_ep5:   0%|          | 0/49 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
bs2_lr5e-05_ep5: 100%|██████████| 49/49 [10:23<00:00, 12.73s/it]


In [7]:
df_res = pd.DataFrame(results).set_index("model")
df_res

Unnamed: 0_level_0,rouge1,rouge2,rougeL,rougeLsum
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline,32.629141,25.525985,28.901312,28.970579
bs2_lr3e-05_ep3,79.983962,75.154015,78.805517,78.476785
bs2_lr5e-05_ep3,80.992784,76.5064,80.116131,79.77383
bs2_lr5e-05_ep5,81.066995,76.194872,79.971792,79.591498


In [8]:
out_csv = FT_ROOT / "validation_comparison.csv"
df_res.to_csv(out_csv)
print("Saved results to", out_csv)

Saved results to D:\SciSumm-RAG\experiments\bart-finetune\validation_comparison.csv
