# <b>ACLSum Text Summarizer - Data Preprocessing and Vectorizer</b>
Dataset Source: https://huggingface.co/datasets/sobamchan/aclsum
<br>
Docs Documentation: https://docs.google.com/document/d/1qSS2kVPMKn032hhjPmrgMquIb6Q827EiowY9y0s_k3I/edit?usp=sharing
<br><br>
<b>Stanley Nathanael Wijaya - 2702217125</b>

## Task Description

As a data scientist in a tech company, you are tasked with developing a music recommendation system similar to Spotify’s.
Dataset:
<br>
(https://www.kaggle.com/datasets/bricevergnou/spotify-recommendation)
<br><br>
This system aims to enhance user experience by suggesting songs or artists based on their listening history, preferences, and behavior.
<ul>
    <li>Choose an appropriate model for your recommendation system. This could be a collaborative filtering model, a content-based model, or even a hybrid model.</li>
    <li>Develop the model to analyze user behavior and predict songs or artists they might like.</li>
    <li>Determine how you will measure the success of your recommendation system. </li>
    <li>Test the model with a set of users or simulated data to evaluate its performance.</li>
</ul>

## First Try

### Import Libary

Library needed to completely run the project

In [None]:
import re
import nltk
import string
import torch
import pickle
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Data Cleaning, Data Preprocessing, and Create Model

In [None]:
# Download NLP resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize Lemmatizer & Stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load dataset ACLSum
dataset = load_dataset("sobamchan/aclsum", split="train")

# Function untuk membersihkan teks
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\d+', '', text)  # Menghapus angka
    text = text.translate(str.maketrans('', '', string.punctuation))  # Menghapus tanda baca
    words = word_tokenize(text)  # Tokenisasi
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    return " ".join(words)

# Preprocessing seluruh dataset
for entry in dataset:
    entry['document'] = clean_text(entry['document'])
    entry['outcome'] = clean_text(entry['outcome'])

# Inisialisasi tokenizer T5
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Function untuk tokenisasi dengan padding dan truncation
def preprocess_data(example):
    inputs = tokenizer(example['document'], max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(example['outcome'], max_length=150, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

# Tokenisasi dataset
tokenized_dataset = dataset.map(preprocess_data, batched=True)

# Inisialisasi model T5
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Menggunakan TF-IDF Vectorizer sebagai tambahan fitur
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Mengambil 5000 fitur paling penting
tfidf_matrix = tfidf_vectorizer.fit_transform([entry['document'] for entry in dataset])

# Simpan model, tokenizer, dan TF-IDF vectorizer setelah training
model.save_pretrained("saved_preprocessing_model")
tokenizer.save_pretrained("saved_preprocessing_model")

# Simpan TF-IDF vectorizer menggunakan pickle
with open("saved_preprocessing_model/tfidf_vectorizer.pkl", "wb") as file:
    pickle.dump(tfidf_vectorizer, file)

print("✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil disimpan di folder 'saved_preprocessing_model/'")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Map: 100%|██████████| 100/100 [00:00<00:00, 272.25 examples/s]


✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil disimpan di folder 'saved_preprocessing_model/'


### Load Model and Vectorizer

In [None]:
# Load model yang telah disimpan
model = T5ForConditionalGeneration.from_pretrained("saved_preprocessing_model")
tokenizer = AutoTokenizer.from_pretrained("saved_preprocessing_model")

# Load TF-IDF Vectorizer
with open("saved_preprocessing_model/tfidf_vectorizer.pkl", "rb") as file:
    tfidf_vectorizer = pickle.load(file)

print("✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil dimuat kembali!")


✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil dimuat kembali!


### Model Testing Part 1

In [9]:
# Function untuk membuat ringkasan dengan model yang telah disimpan
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Contoh penggunaan dengan dokumen yang sudah diproses
sample_paper = dataset[0]['document']
print("\nOriginal Paper (Cleaned):\n", sample_paper)
print("\nGenerated Summary:\n", generate_summary(sample_paper))

# Menggunakan TF-IDF Vectorizer untuk representasi numerik teks
tfidf_features = tfidf_vectorizer.transform([sample_paper])
print("\nTF-IDF Features for Sample Paper:\n", tfidf_features.toarray()[0])



Original Paper (Cleaned):
 In this paper , we explore correlation of dependency relation paths to rank candidate answers in answer extraction . Using the correlation measure , we compare dependency relations of a candidate answer and mapped question phrases in sentence with the corresponding relations in question . Different from previous studies , we propose an approximate phrase mapping algorithm and incorporate the mapping score into the correlation measure . The correlations are further incorporated into a Maximum Entropy-based ranking model which estimates path weights from training . Experimental results show that our method significantly outperforms state-ofthe-art syntactic relation-based methods by up to 20 % in MRR . Answer Extraction is one of basic modules in open domain Question Answering ( QA ) . It is to further process relevant sentences extracted with Passage / Sentence Retrieval and pinpoint exact answers using more linguistic-motivated analysis . Since QA turns to f

### Model Testing Part 2

In [7]:
# Function untuk membuat ringkasan dengan model yang telah disimpan
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Contoh penggunaan dengan dokumen yang sudah diproses
sample_paper = dataset[10]['document']
print("\nOriginal Paper (Cleaned):\n", sample_paper)
print("\nGenerated Summary:\n", generate_summary(sample_paper))

# Menggunakan TF-IDF Vectorizer untuk representasi numerik teks
tfidf_features = tfidf_vectorizer.transform([sample_paper])
print("\nTF-IDF Features for Sample Paper:\n", tfidf_features.toarray()[0])



Original Paper (Cleaned):
 Automatically extracting social meaning and intention from spoken dialogue is an important task for dialogue systems and social computing . We describe a system for detecting elements of interactional style : whether a speaker is awkward , friendly , or flirtatious . We create and use a new spoken corpus of 991 4-minute speed-dates . Participants rated their interlocutors for these elements of style . Using rich dialogue , lexical , and prosodic features , we are able to detect flirtatious , awkward , and friendly styles in noisy natural conversational data with up to 75 % accuracy , compared to a 50 % baseline . We describe simple ways to extract relatively rich dialogue features , and analyze which features performed similarly for men and women and which were gender-specific . How can we extract social meaning from speech , deciding if a speaker is particularly engaged in the conversation , is uncomfortable or awkward , or is particularly friendly and flir

### Model Testing Part 3

In [45]:
# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage
sample_paper = dataset[0]['document']
print("Original Paper:\n", sample_paper)
print("\nGenerated Summary:\n", generate_summary(sample_paper))


Original Paper:
 In this paper , we explore correlation of dependency relation paths to rank candidate answers in answer extraction . Using the correlation measure , we compare dependency relations of a candidate answer and mapped question phrases in sentence with the corresponding relations in question . Different from previous studies , we propose an approximate phrase mapping algorithm and incorporate the mapping score into the correlation measure . The correlations are further incorporated into a Maximum Entropy-based ranking model which estimates path weights from training . Experimental results show that our method significantly outperforms state-ofthe-art syntactic relation-based methods by up to 20 % in MRR . Answer Extraction is one of basic modules in open domain Question Answering ( QA ) . It is to further process relevant sentences extracted with Passage / Sentence Retrieval and pinpoint exact answers using more linguistic-motivated analysis . Since QA turns to find exact a

### Model Evaluation using Rogue Score

In [49]:
from rouge_score import rouge_scorer
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration
from tqdm import tqdm

# Load dataset ACLSum (gunakan data split yang tepat)
dataset = load_dataset("sobamchan/aclsum", split="train")

# Inisialisasi tokenizer dan model T5
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("saved_preprocessing_model")  # Gunakan model yang telah disimpan

# Function untuk generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Define evaluation function
def evaluate_model(dataset, num_samples=100):
    all_rouge_scores = {
        "rouge1": [],
        "rouge2": [],
        "rougeL": []
    }

    # Loop over samples
    for i in tqdm(range(num_samples), desc="Evaluating"):
        # Extract document and reference summary
        document = dataset[i]['document']
        reference_summary = dataset[i]['outcome']
        
        # Generate summary using model
        generated_summary = generate_summary(document)
        
        # Compute ROUGE score
        scores = scorer.score(reference_summary, generated_summary)
        
        # Store the ROUGE scores
        all_rouge_scores["rouge1"].append(scores["rouge1"].fmeasure)
        all_rouge_scores["rouge2"].append(scores["rouge2"].fmeasure)
        all_rouge_scores["rougeL"].append(scores["rougeL"].fmeasure)
    
    # Compute average ROUGE scores
    avg_rouge1 = sum(all_rouge_scores["rouge1"]) / len(all_rouge_scores["rouge1"])
    avg_rouge2 = sum(all_rouge_scores["rouge2"]) / len(all_rouge_scores["rouge2"])
    avg_rougeL = sum(all_rouge_scores["rougeL"]) / len(all_rouge_scores["rougeL"])

    return {
        "rouge1": avg_rouge1,
        "rouge2": avg_rouge2,
        "rougeL": avg_rougeL
    }

# Evaluate the model on 100 samples (you can adjust the number of samples)
evaluation_results = evaluate_model(dataset, num_samples=100)

# Print the evaluation results
print(f"ROUGE-1: {evaluation_results['rouge1']:.4f}")
print(f"ROUGE-2: {evaluation_results['rouge2']:.4f}")
print(f"ROUGE-L: {evaluation_results['rougeL']:.4f}")


Evaluating: 100%|██████████| 100/100 [06:48<00:00,  4.08s/it]

ROUGE-1: 0.1928
ROUGE-2: 0.0358
ROUGE-L: 0.1375





## Second Try - Seq2Seq

### Import Library & Initialization

In [12]:
import re
import nltk
import string
import numpy as np
import torch
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer

# Download needed NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize Lemmatizer & Stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Data Preprocessing

In [13]:
# Function to clean and preprocess text
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    return " ".join(words)

# Load dataset ACLSum
dataset = load_dataset("sobamchan/aclsum", split="train")

# Clean and preprocess the 'document' and 'outcome' columns
for entry in dataset:
    entry['document'] = clean_text(entry['document'])
    entry['outcome'] = clean_text(entry['outcome'])


### Data Splitting

In [35]:
# Print the structure of the dataset
print(dataset)
print(dataset[99])  # Check the first data entry


Dataset({
    features: ['id', 'document', 'challenge', 'approach', 'outcome'],
    num_rows: 100
})
{'id': 'D08-1050', 'document': 'Most state-of-the-art wide-coverage parsers are trained on newspaper text and suffer a loss of accuracy in other domains , making parser adaptation a pressing issue . In this paper we demonstrate that a CCG parser can be adapted to two new domains , biomedical text and questions for a QA system , by using manually-annotated training data at the POS and lexical category levels only . This approach achieves parser accuracy comparable to that on newspaper data without the need for annotated parse trees in the new domain . We find that retraining at the lexical category level yields a larger performance increase for questions than for biomedical text and analyze the two datasets to investigate why different domains might behave differently for parser adaptation . Most state-of-the-art wide-coverage parsers are based on the Penn Treebank ( Marcus et al . , 199

In [16]:
# Drop any rows with missing 'document' or 'outcome' fields
dataset = dataset.filter(lambda x: x['document'] is not None and x['outcome'] is not None)


Filter: 100%|██████████| 100/100 [00:00<00:00, 3448.38 examples/s]


In [18]:
from sklearn.model_selection import train_test_split

# Convert the dataset into a list of dictionaries for easier handling
data_list = [entry for entry in dataset]

# Split the data into training, validation, and test sets
train_data, temp_data = train_test_split(data_list, test_size=0.2, random_state=42)  # 80% training, 20% temporary
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 50% for validation and 50% for test

print(f"Training Data: {len(train_data)} samples")
print(f"Validation Data: {len(val_data)} samples")
print(f"Test Data: {len(test_data)} samples")


Training Data: 80 samples
Validation Data: 10 samples
Test Data: 10 samples


In [21]:
from datasets import Dataset

# Convert the data back into a Hugging Face Dataset format
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_data))
test_dataset = Dataset.from_pandas(pd.DataFrame(test_data))

# Print to check
print(train_dataset)


Dataset({
    features: ['id', 'document', 'challenge', 'approach', 'outcome'],
    num_rows: 80
})


### Vectorization

In [22]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features (words)
train_tfidf_matrix = tfidf_vectorizer.fit_transform([entry['document'] for entry in train_data])

# TF-IDF representation of training documents
print("TF-IDF Features: ", train_tfidf_matrix.shape)


TF-IDF Features:  (80, 5000)


### Model Training

In [24]:
# Function for tokenizing the dataset (same as before)
def preprocess_data(example):
    inputs = tokenizer(example['document'], max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(example['outcome'], max_length=150, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

# Tokenize the training, validation, and test datasets
tokenized_train_data = train_dataset.map(preprocess_data, batched=True)
tokenized_val_data = val_dataset.map(preprocess_data, batched=True)
tokenized_test_data = test_dataset.map(preprocess_data, batched=True)

# Cek hasil tokenisasi
print(tokenized_train_data[0])


Map: 100%|██████████| 80/80 [00:00<00:00, 373.54 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 78.29 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 238.48 examples/s]

{'id': 'N18-1108', 'document': 'Recurrent neural networks ( RNNs ) have achieved impressive results in a variety of linguistic processing tasks , suggesting that they can induce non-trivial properties of language . We investigate here to what extent RNNs learn to track abstract hierarchical syntactic structure . We test whether RNNs trained with a generic language modeling objective in four languages ( Italian , English , Hebrew , Russian ) can predict long-distance number agreement in various constructions . We include in our evaluation nonsensical sentences where RNNs can not rely on semantic or lexical cues ( " The colorless green ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas ideas I ate with the chair sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep sleep furiously " ) , and , for Italian , we compare model performance to human intuitions . Our language-model-trained RNNs make reliable 




### Training the Model

In [26]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./aclsum-results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    tokenizer=tokenizer
)

# Train the model
trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,9.132276
2,No log,5.899254
3,No log,4.84507


TrainOutput(global_step=60, training_loss=8.714350382486979, metrics={'train_runtime': 554.8254, 'train_samples_per_second': 0.433, 'train_steps_per_second': 0.108, 'total_flos': 64964064706560.0, 'train_loss': 8.714350382486979, 'epoch': 3.0})

### Save the Fine-tuned Model

In [28]:
# Save the fine-tuned model
model.save_pretrained("saved_model2")
tokenizer.save_pretrained("saved_model2")

('saved_model2\\tokenizer_config.json',
 'saved_model2\\special_tokens_map.json',
 'saved_model2\\tokenizer.json')

### Load the Saved Model

In [51]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

# Load saved model
dataset = load_dataset("sobamchan/aclsum", split="train")
model = T5ForConditionalGeneration.from_pretrained("saved_model2")
tokenizer = AutoTokenizer.from_pretrained("saved_model2")


### Text Summarizer Testing

In [48]:
# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage
sample_paper = dataset[0]['document']
print("Original Paper:\n", sample_paper)
print("\nGenerated Summary:\n", generate_summary(sample_paper))


Original Paper:
 In this paper , we explore correlation of dependency relation paths to rank candidate answers in answer extraction . Using the correlation measure , we compare dependency relations of a candidate answer and mapped question phrases in sentence with the corresponding relations in question . Different from previous studies , we propose an approximate phrase mapping algorithm and incorporate the mapping score into the correlation measure . The correlations are further incorporated into a Maximum Entropy-based ranking model which estimates path weights from training . Experimental results show that our method significantly outperforms state-ofthe-art syntactic relation-based methods by up to 20 % in MRR . Answer Extraction is one of basic modules in open domain Question Answering ( QA ) . It is to further process relevant sentences extracted with Passage / Sentence Retrieval and pinpoint exact answers using more linguistic-motivated analysis . Since QA turns to find exact a

## Evaluation using ROGUE Score

In [None]:
num_samples = min(100, len(dataset))
print (num_samples)

100


In [43]:
# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to evaluate the model using ROUGE
def evaluate_model(dataset, num_samples=90):
    all_rouge_scores = {
        "rouge1": [],
        "rouge2": [],
        "rougeL": []
    }

    num_samples = min(100, len(dataset)-1)

    for i in range(num_samples):
        document = dataset[i]['document']
        reference_summary = dataset[i]['outcome']
        
        generated_summary = generate_summary(document)
        
        scores = scorer.score(reference_summary, generated_summary)
        
        all_rouge_scores["rouge1"].append(scores["rouge1"].fmeasure)
        all_rouge_scores["rouge2"].append(scores["rouge2"].fmeasure)
        all_rouge_scores["rougeL"].append(scores["rougeL"].fmeasure)
    
    avg_rouge1 = sum(all_rouge_scores["rouge1"]) / len(all_rouge_scores["rouge1"])
    avg_rouge2 = sum(all_rouge_scores["rouge2"]) / len(all_rouge_scores["rouge2"])
    avg_rougeL = sum(all_rouge_scores["rougeL"]) / len(all_rouge_scores["rougeL"])

    return {
        "rouge1": avg_rouge1,
        "rouge2": avg_rouge2,
        "rougeL": avg_rougeL
    }

# Evaluate the model on 100 samples
evaluation_results = evaluate_model(tokenized_test_data, num_samples=100)

# Print the evaluation results
print(f"ROUGE-1: {evaluation_results['rouge1']:.4f}")
print(f"ROUGE-2: {evaluation_results['rouge2']:.4f}")
print(f"ROUGE-L: {evaluation_results['rougeL']:.4f}")


ROUGE-1: 0.2325
ROUGE-2: 0.0278
ROUGE-L: 0.1690


## Conclusion and Insights