In [None]:
!pip install transformers datasets torch scikit-learn evaluate rouge_score

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset CSVs and the fine-tuned checkpoint live under it.
mount_point = '/content/drive'
drive.mount(mount_point)

In [7]:
import pandas as pd
from datasets import Dataset

# Read the train / validation / test splits from their CSV files (in that order).
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/NLP/dataset/train.csv",
        "/content/drive/MyDrive/NLP/dataset/val.csv",
        "/content/drive/MyDrive/NLP/dataset/test.csv",
    )
)

# Map the CSV columns `Content` / `Summarize` to `input_text` / `summary_text`.
def preprocess_data(data):
    """Convert a raw DataFrame into a list of input/summary records.

    Args:
        data: DataFrame that must contain a ``Content`` column and a
            ``Summarize`` column; any other columns are ignored.

    Returns:
        A list with one dict per row, in row order, each of the form
        ``{"input_text": <Content value>, "summary_text": <Summarize value>}``.

    Raises:
        KeyError: if ``Content`` or ``Summarize`` is missing from ``data``
            (checked up front, so an empty frame with the wrong columns is
            reported instead of silently producing an empty list).
    """
    column_map = {"Content": "input_text", "Summarize": "summary_text"}
    missing = [column for column in column_map if column not in data.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")
    # Vectorized select + rename instead of a Python-level iterrows() loop.
    return data[list(column_map)].rename(columns=column_map).to_dict("records")

def _as_hf_dataset(frame):
    """Turn one raw split DataFrame into a Hugging Face Dataset of renamed records."""
    records = preprocess_data(frame)
    return Dataset.from_pandas(pd.DataFrame(records))


train_dataset = _as_hf_dataset(train_data)
val_dataset = _as_hf_dataset(val_data)
test_dataset = _as_hf_dataset(test_data)

In [8]:
from transformers import AutoTokenizer

# Tokenizer of the pretrained checkpoint used throughout this notebook.
checkpoint_name = "facebook/mbart-large-50"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Tokenization helper
def preprocess_function(examples):
    """Tokenize source texts and summaries for seq2seq training.

    Args:
        examples: iterable of records, each providing ``input_text`` and
            ``summary_text``. It is iterated twice (once per field).

    Returns:
        The tokenizer output for the prefixed source texts (truncated/padded
        to 512 tokens) with an extra ``labels`` entry holding the
        ``input_ids`` of the summaries (truncated/padded to 128 tokens).
    """
    prefix = "summarize: "

    # Source side: task prefix + document text.
    source_texts = [prefix + record["input_text"] for record in examples]
    encoded_sources = tokenizer(
        source_texts,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # Target side: the reference summaries.
    target_texts = [record["summary_text"] for record in examples]
    encoded_targets = tokenizer(
        target_texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    encoded_sources["labels"] = encoded_targets["input_ids"]
    return encoded_sources

# Tokenize every split (train, then validation, then test).
train_encodings, val_encodings, test_encodings = (
    preprocess_function(split)
    for split in (train_dataset, val_dataset, test_dataset)
)


In [9]:
import torch

class TextSummaryDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of field name -> per-example rows.

    ``encodings`` must expose ``items()`` and contain an ``input_ids`` entry,
    whose length defines the dataset size.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        input_rows = self.encodings["input_ids"]
        return len(input_rows)

    def __getitem__(self, idx):
        # Build one example: every field's idx-th row, converted to a tensor.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

# Wrap each split's encodings so they can be indexed example by example.
train_dataset, val_dataset, test_dataset = (
    TextSummaryDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings, test_encodings)
)

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    MBartForConditionalGeneration,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Load the pretrained model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")


def mask_label_padding_collator(features, pad_token_id=tokenizer.pad_token_id):
    """Batch examples and exclude padded label positions from the loss.

    The labels were padded to a fixed length with the tokenizer's pad id.
    Left as-is, the loss would also be computed on those padding positions,
    so they are replaced with -100 (the index the loss ignores).

    Args:
        features: list of per-example dicts as produced by the dataset.
        pad_token_id: id marking padding in ``labels``; bound at definition
            time so later rebinding of ``tokenizer`` does not affect it.

    Returns:
        The batched tensors with padding in ``labels`` set to -100.
    """
    batch = default_data_collator(features)
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == pad_token_id, -100)
    return batch


# Training hyper-parameters
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",                # Directory for outputs and checkpoints
    evaluation_strategy="epoch",          # Evaluate after every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="steps",                # Save checkpoints on a step schedule
    save_steps=500,                       # Save a checkpoint every 500 steps
    save_total_limit=3,                   # Keep only the 3 most recent checkpoints
    fp16=torch.cuda.is_available(),       # Mixed precision only when a CUDA GPU is present
    logging_dir='./logs',                 # Directory for logs
    logging_steps=10,
    report_to="none",                     # Do not push logs to external services
    push_to_hub=False,
)

# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=mask_label_padding_collator,
    tokenizer=tokenizer,
)

# Train
trainer.train()

# Resume from a checkpoint if needed
# trainer.train(resume_from_checkpoint="./results/checkpoint-500")
save_path = "/content/drive/MyDrive/NLP/finetuned/mbart-finetuned"

# Save the fine-tuned model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

  trainer = Trainer(


In [None]:
from transformers import MBartForConditionalGeneration, AutoTokenizer

import torch

# Load the fine-tuned model and tokenizer from the directory the training
# cell saved them to.
model_path = "/content/drive/MyDrive/NLP/finetuned/mbart-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Run inference on the GPU when one is available, in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


# Function to generate predictions
def generate_summary(input_text, max_length=128):
    """Generate a summary for one input text with the fine-tuned model.

    Args:
        input_text: raw text to summarize; the ``"summarize: "`` task prefix
            is added here, so callers must pass text without it.
        max_length: maximum length passed to ``model.generate``.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Tokenize (truncating to 512 tokens) and move the tensors to the model's device.
    encoded = tokenizer(
        "summarize: " + input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(model.device)

    # Beam search; the attention mask is passed along with the input ids.
    output_ids = model.generate(
        **encoded,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def _strip_task_prefix(text, prefix="summarize:"):
    """Remove one leading task prefix (and surrounding whitespace) from decoded text."""
    text = text.strip()
    if text.startswith(prefix):
        text = text[len(prefix):].lstrip()
    return text


# Run the model over the test set and collect input, reference and prediction.
test_results = []
for example in test_dataset:  # each example is a dict with "input_ids" and "labels"
    # The encoded inputs already start with the task prefix and
    # generate_summary() prepends it again, so drop it from the decoded text
    # to avoid feeding "summarize: summarize: ..." to the model.
    input_text = _strip_task_prefix(
        tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    )
    true_summary = tokenizer.decode(example["labels"], skip_special_tokens=True)
    generated_summary = generate_summary(input_text)

    test_results.append({
        "Input": input_text,
        "True Summary": true_summary,
        "Generated Summary": generated_summary
    })

# Show the first 2 examples for a quick manual check
separator = "=" * 80
for sample in test_results[:2]:
    print(f"Input: {sample['Input']}")
    print(f"True Summary: {sample['True Summary']}")
    print(f"Generated Summary: {sample['Generated Summary']}")
    print(separator)


In [None]:
import evaluate

rouge = evaluate.load("rouge")

# Collect predictions and references in matching order
generated_summaries = [result["Generated Summary"] for result in test_results]
true_summaries = [result["True Summary"] for result in test_results]

# The default ROUGE tokenizer keeps only the characters [a-z0-9], which breaks
# accented / non-Latin words into fragments and distorts the scores.
# Tokenize on whitespace (lower-cased) instead so each word stays intact.
rouge_score = rouge.compute(
    predictions=generated_summaries,
    references=true_summaries,
    tokenizer=lambda text: text.lower().split(),
)
print(rouge_score)
