# In [None]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from evaluate import load as load_metric
import pandas as pd

def generate_summaries(model_name, dataset_split="test", max_samples=10):
    """Generate summaries for a dataset split with a locally stored model.

    The model and tokenizer are loaded from ``../models/<name>``, where
    ``<name>`` is ``model_name`` with every ``/`` replaced by ``_``. The
    preprocessed dataset is read from ``../data/cleaned_aclsum`` (the
    unprocessed data is published as ``sobamchan/aclsum``).

    Args:
        model_name: Model identifier, e.g. ``"facebook/bart-base"``.
        dataset_split: Name of the split to evaluate on.
        max_samples: Maximum number of leading entries to summarize. The
            value is clamped to the size of the split; ``None`` uses the
            whole split.

    Returns:
        A ``(predictions, references)`` tuple of equally long lists. The
        predictions are decoded model outputs and the references are the
        ``"outcome"`` field of each entry.
    """
    # Model and tokenizer live in the same directory, so build the path once.
    model_dir = f"../models/{model_name.replace('/', '_')}"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    dataset = load_from_disk("../data/cleaned_aclsum")[dataset_split]
    if max_samples is not None:
        # Clamp so that a split smaller than max_samples does not make
        # select() fail on out-of-range indices.
        dataset = dataset.select(range(min(max_samples, len(dataset))))

    predictions = []
    references = []

    for entry in dataset:
        encoded = tokenizer(
            entry["document"],
            return_tensors="pt",
            truncation=True,
            max_length=1024,
        )
        summary_ids = model.generate(
            encoded.input_ids,
            # Pass the mask explicitly instead of letting generate() infer it.
            attention_mask=encoded.get("attention_mask"),
            max_length=150,
            num_beams=4,
        )
        predictions.append(
            tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        )
        references.append(entry["outcome"])

    return predictions, references

def compute_metrics(predictions, references):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Args:
        predictions: Generated summaries.
        references: Reference summaries, aligned with ``predictions``.

    Returns:
        A dict with the keys ``"ROUGE-1"``, ``"ROUGE-2"`` and ``"ROUGE-L"``
        mapping to the corresponding F-measure.
    """
    rouge = load_metric("rouge")
    result = rouge.compute(predictions=predictions, references=references)

    def _fmeasure(score):
        # Newer releases of the ROUGE metric return a plain float per key,
        # older ones an aggregate object exposing ``.mid.fmeasure``.
        # Support both so the script does not break on ``.mid``.
        mid = getattr(score, "mid", None)
        return float(score) if mid is None else mid.fmeasure

    return {
        "ROUGE-1": _fmeasure(result["rouge1"]),
        "ROUGE-2": _fmeasure(result["rouge2"]),
        "ROUGE-L": _fmeasure(result["rougeL"]),
    }

def main():
    """Evaluate every configured model and save the ROUGE scores as CSV.

    For each model, summaries are generated and scored; the scores are
    collected into one table (one row per model), written to
    ``../results/rouge_scores.csv`` and printed.
    """
    from pathlib import Path

    models = ["t5-small", "facebook/bart-base", "google/pegasus-xsum", "allenai/led-base-16384"]
    rows = []
    for m in models:
        print(f"Evaluating {m}...")
        preds, refs = generate_summaries(m)
        metrics = compute_metrics(preds, refs)
        metrics["Model"] = m
        rows.append(metrics)

    df = pd.DataFrame(rows)

    # Create the output directory up front; otherwise to_csv() raises after
    # all the generation work has already been done.
    output_path = Path("../results/rouge_scores.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(df)

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()