# <b>ACLSum Text Summarizer - Data Training, Modelling, and Evaluating</b>
## 2702217125 - Stanley Nathanael Wijaya

GitHub Repository Full Project: https://github.com/StyNW7/NLP-Text-Summarizer
<br>
Dataset Source: https://huggingface.co/datasets/sobamchan/aclsum
<br>
Project Documentation: https://docs.google.com/document/d/1qSS2kVPMKn032hhjPmrgMquIb6Q827EiowY9y0s_k3I/edit?usp=sharing
<br><br>

## Task Description

For the Natural Language Processing project at BINUS University, I compare several text summarization models on the ACLSum dataset.
<br><br>
Dataset:
<br>
https://huggingface.co/datasets/sobamchan/aclsum
<br><br>
To-do list in this Notebook:

<br>
Training and Evaluating
<ul>
    <li>Perform Training and record the model training result metrics</li>
    <li>Compare the model you choose with at least 3 other models!</li>
    <li>Conduct an evaluation using metrics according to the context of the topic</li>
</ul>

<br>
Modelling
<ul>
    <li>Give reasons why you chose and used the model.</li>
    <li>Provide an explanation of the architecture of the model you chose.</li>
</ul>

## Preprocessing

In [None]:
import re
import os
import string
import nltk
from datasets import load_dataset, DatasetDict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data used by clean_text. Recent NLTK releases look up the
# "punkt_tab" resource for word_tokenize, while older releases use "punkt",
# so both are requested here.
for _resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(_resource)

# Shared, module-level helpers reused by every clean_text call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    The text is lower-cased, digit runs and ``string.punctuation`` characters
    are removed, the remainder is tokenised with ``word_tokenize``, tokens
    found in ``stop_words`` are dropped and the rest are lemmatised.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_digits.translate(punctuation_table)

    kept_lemmas = []
    for token in word_tokenize(stripped):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

def preprocess_dataset():
    """Load the ACLSum dataset and return it with cleaned text fields.

    ``clean_text`` is applied to the ``document`` and ``outcome`` fields of
    every example; all other fields are passed through untouched.

    Returns:
        The result of mapping the cleaning step over the loaded dataset.
    """
    def _clean_columns(row):
        # Overwrite the two text fields in place and hand the row back to map().
        for column in ("document", "outcome"):
            row[column] = clean_text(row[column])
        return row

    return load_dataset("sobamchan/aclsum").map(_clean_columns)

if __name__ == "__main__":
    # Build the cleaned dataset and print its structure for a quick sanity check.
    dataset = preprocess_dataset()
    print(dataset)

    # Ensure the target directory exists, then persist the cleaned dataset there.
    output_path = "../data/cleaned_aclsum"
    os.makedirs(output_path, exist_ok=True)
    dataset.save_to_disk(output_path)

    # The message below is Indonesian for:
    # "The cleaned dataset was successfully saved at: <output_path>".
    print(f"Dataset yang telah dibersihkan berhasil disimpan di: {output_path}")

## Training Model

In [None]:
import argparse
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import load_from_disk


# Get Max Length Input for each model
def get_max_input_length(model_checkpoint):
    """Return the input token budget to use for a checkpoint name.

    Matching is case-insensitive:

    * names containing "pegasus" get 512 tokens;
    * names containing "longformer", or having "led" as a separate name
      component (e.g. "allenai/led-base-16384"), get 4096 tokens;
    * every other name gets 1024 tokens.

    Args:
        model_checkpoint: Hub identifier or directory name of the model.

    Returns:
        The maximum number of input tokens as an int.
    """
    name = model_checkpoint.lower()
    if "pegasus" in name:
        return 512

    # LED (Longformer Encoder-Decoder) checkpoints do not spell out
    # "longformer", so look for "led" as its own component. Splitting on the
    # separators avoids false hits inside words such as "distilled".
    components = name.replace("/", "-").replace("_", "-").split("-")
    if "longformer" in name or "led" in components:
        return 4096

    return 1024


# Tokenization function with dynamic input length
def tokenize_function(example, tokenizer, max_input_length=1024):
    """Tokenize source documents and target summaries for seq2seq training.

    Args:
        example: Mapping with "document" and "outcome" entries. Each entry is
            either a single string or, when used with a batched ``map``, a
            list of strings.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``text_target`` arguments.
        max_input_length: Length the source documents are truncated and
            padded to.

    Returns:
        The tokenizer output for the documents with an added "labels" entry
        holding the target token ids (truncated and padded to 150). Padding
        positions in the labels are set to -100.
    """
    model_inputs = tokenizer(
        example["document"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    labels = tokenizer(
        text_target=example["outcome"],
        max_length=150,
        padding="max_length",
        truncation=True,
    )

    label_ids = labels["input_ids"]
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is not None:
        # Labels are padded with the pad token id here; -100 is the index the
        # loss ignores, so swap it in to keep padding from being trained on.
        def _mask_padding(sequence):
            return [tok if tok != pad_id else -100 for tok in sequence]

        if label_ids and isinstance(label_ids[0], (list, tuple)):
            # Batched call: one id sequence per example.
            label_ids = [_mask_padding(sequence) for sequence in label_ids]
        else:
            # Single-example call: a flat list of ids.
            label_ids = _mask_padding(label_ids)

    model_inputs["labels"] = label_ids
    return model_inputs


def main(model_checkpoint):
    """Fine-tune a seq2seq checkpoint on the cleaned ACLSum data and save it.

    The dataset is read from ``../data/cleaned_aclsum``, tokenized with
    ``tokenize_function`` and trained for three epochs. The model and
    tokenizer are then written to ``../models/<checkpoint>``, with "/" in the
    checkpoint name replaced by "_".

    Args:
        model_checkpoint: Name or path handed to ``from_pretrained``.
    """
    # Single source of truth for the output location used below.
    output_dir = f"../models/{model_checkpoint.replace('/', '_')}"

    print(f"🔧 Loading model & tokenizer from: {model_checkpoint}")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    print("📂 Loading preprocessed dataset...")
    dataset = load_from_disk("../data/cleaned_aclsum")

    max_input_length = get_max_input_length(model_checkpoint)
    print(f"📏 Using max_input_length: {max_input_length}")

    def _tokenize_batch(batch):
        # Bind the tokenizer and length so map() only has to supply the batch.
        return tokenize_function(batch, tokenizer, max_input_length=max_input_length)

    print("🔠 Tokenizing dataset...")
    tokenized = dataset.map(_tokenize_batch, batched=True)

    collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        predict_with_generate=True,
        evaluation_strategy="epoch",
        save_total_limit=2,
        num_train_epochs=3,
        logging_dir='./logs',
        logging_steps=10,
        learning_rate=2e-5,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        tokenizer=tokenizer,
        data_collator=collator,
    )

    print("🚀 Training started...")
    trainer.train()

    print(f"💾 Saving model to: {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("✅ Training and saving completed.")


if __name__ == "__main__":
    # Command-line entry point: the mandatory --model option is forwarded
    # unchanged to main() as the checkpoint to fine-tune.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True)
    args = parser.parse_args()
    main(args.model)


## Evaluate Model

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from evaluate import load as load_metric

def get_max_input_length(model_name):
    """Return the input token budget to use for a model name.

    Matching is case-insensitive:

    * names containing "pegasus" get 512 tokens;
    * names containing "longformer", or having "led" as a separate name
      component (e.g. "allenai/led-base-16384"), get 4096 tokens;
    * every other name gets 1024 tokens.

    Args:
        model_name: Hub identifier or directory name of the model.

    Returns:
        The maximum number of input tokens as an int.
    """
    name = model_name.lower()
    if "pegasus" in name:
        return 512

    # LED (Longformer Encoder-Decoder) checkpoints do not spell out
    # "longformer", so look for "led" as its own component. Splitting on the
    # separators avoids false hits inside words such as "distilled".
    components = name.replace("/", "-").replace("_", "-").split("-")
    if "longformer" in name or "led" in components:
        return 4096

    return 1024

def generate_summaries(model_name, dataset_split="test", max_samples=10):
    """Generate summaries with a fine-tuned model for part of a dataset split.

    The model and tokenizer are loaded from ``../models/<model_name>`` (with
    "/" replaced by "_") and the data from ``../data/cleaned_aclsum``.
    Entries whose document or reference summary is blank are skipped.

    Args:
        model_name: Name of the model, used to locate its saved directory and
            to pick the input length limit.
        dataset_split: Name of the split to read from the saved dataset.
        max_samples: Maximum number of leading examples to summarise. Values
            larger than the split are clamped to its size; ``None`` uses the
            whole split.

    Returns:
        A ``(predictions, references)`` tuple of equally long string lists.
    """
    model_dir = f"../models/{model_name.replace('/', '_')}"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    dataset = load_from_disk("../data/cleaned_aclsum")[dataset_split]
    if max_samples is not None:
        # Clamp so that splits shorter than max_samples do not raise on select().
        dataset = dataset.select(range(min(max_samples, len(dataset))))

    predictions = []
    references = []
    max_len = get_max_input_length(model_name)

    print(f"⏳ Generating summaries for {model_name}...")
    for entry in dataset:
        document = entry["document"]
        reference = entry["outcome"]
        if not document.strip() or not reference.strip():
            continue

        inputs = tokenizer(
            document,
            return_tensors="pt",
            truncation=True,
            max_length=max_len,
        ).to(model.device)
        # Pass the attention mask explicitly instead of letting generate()
        # infer it from the input ids.
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=150,
            num_beams=4,
        )
        predictions.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
        references.append(reference)

    return predictions, references

def compute_metrics(predictions, references):
    """Score predictions against references with the "rouge" metric.

    Args:
        predictions: Generated summaries.
        references: Reference summaries, aligned with ``predictions``.

    Returns:
        A dict with the keys "ROUGE-1", "ROUGE-2" and "ROUGE-L", taken from
        the metric's "rouge1", "rouge2" and "rougeL" results.
    """
    scorer = load_metric("rouge")
    scores = scorer.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
    )
    # Report label -> key in the metric's result.
    label_to_key = (
        ("ROUGE-1", "rouge1"),
        ("ROUGE-2", "rouge2"),
        ("ROUGE-L", "rougeL"),
    )
    return {label: scores[key] for label, key in label_to_key}

def main():
    """Evaluate every fine-tuned model and save a ROUGE comparison.

    For each model, summaries are generated and scored. The scores are written
    to ``../results/model_comparison.csv``, printed, and drawn as a bar chart
    saved to ``../results/model_comparison.png``.
    """
    checkpoints = (
        "t5-small",
        "facebook/bart-base",
        "google/pegasus-xsum",
        "allenai/led-base-16384",
    )

    records = []
    for checkpoint in checkpoints:
        predictions, references = generate_summaries(checkpoint)
        record = compute_metrics(predictions, references)
        record["Model"] = checkpoint
        records.append(record)

    # Put the model name first, followed by the three scores.
    column_order = ["Model", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
    df = pd.DataFrame(records)[column_order]

    # Save
    os.makedirs("../results", exist_ok=True)
    df.to_csv("../results/model_comparison.csv", index=False)

    # Visualize
    print("\n📊 Results of Model Comparison:")
    print(df)

    ax = df.set_index("Model").plot(kind="bar", figsize=(10, 6))
    ax.set_title("Model Summarization Comparison (ROUGE Scores)")
    ax.set_ylabel("Score")
    ax.set_ylim(0.0, 1.0)
    ax.figure.tight_layout()
    ax.figure.savefig("../results/model_comparison.png")
    plt.close()
    print("✅ Graph save at: results/model_comparison.png")

if __name__ == "__main__":
    # Run the full model comparison when this code is executed directly.
    main()


## Visualize Model

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Hard-coded ROUGE scores; position i in every list belongs to the same model.
data = {
    "Model": ["t5-small", "facebook/bart-base", "google/pegasus-xsum", "allenai/led-base-16384"],
    "ROUGE-1": [0.1369, 0.1236, 0.1018, 0.1643],
    "ROUGE-2": [0.0359, 0.0340, 0.0463, 0.0685],
    "ROUGE-L": [0.1013, 0.1163, 0.0922, 0.1522]
}

df = pd.DataFrame(data)

# Index by model name so each model becomes one group of bars.
df_plot = df.set_index("Model")

# One colour per metric column, in column order.
fig, ax = plt.subplots(figsize=(10, 6))
df_plot.plot(kind="bar", ax=ax, color=["#1f77b4", "#ff7f0e", "#2ca02c"])
ax.set_title("ROUGE Scores Comparison Across Models", fontsize=14)
ax.set_xlabel("Model", fontsize=12)
ax.set_ylabel("Score", fontsize=12)
ax.legend(title="Metric")
ax.grid(True, axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()

# Write the chart to disk before showing it.
fig.savefig("../results/rouge_comparison.png")
plt.show()

## Owner

This notebook was created by:
- Stanley Nathanael Wijaya - 2702217125

<code> Striving for Excellence ❤️‍🔥❤️‍🔥 </code>