In [2]:
!git clone https://github.com/TranTheHung2312332/text-summarization
!pip install evaluate
!!pip install rouge_score
import os
os.environ["WANDB_DISABLED"] = "true"

Cloning into 'text-summarization'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 45 (delta 2), reused 44 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (45/45), 8.83 KiB | 1.77 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Filtering content: 100% (10/10), 93.97 MiB | 34.10 MiB/s, done.
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
from google.colab import drive
drive.mount('/content/drive')

from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

dataset = load_from_disk("text-summarization/vietnamese_summarization_split")
train_ds = dataset['train']
valid_ds = dataset['validation']

model_name = "vinai/bartpho-word"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

max_input_length = 512
max_target_length = 128

def preprocess(example):
    inputs = tokenizer(
        example["Document"],
        max_length=max_input_length,
        truncation=True
    )
    targets = tokenizer(
        example["Summary"],
        max_length=max_target_length,
        truncation=True
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

train_tokenized = train_ds.map(
    preprocess,
    batched=True,
    num_proc=4
)
valid_tokenized = valid_ds.map(preprocess, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {key: round(value*100, 2) for key, value in result.items()}

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/fine-tune-bartpho",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,

    optim="adafactor",
    generation_num_beams=1,
    generation_max_length=64,
    dataloader_num_workers=2,

    eval_steps=4000,
    save_steps=4000,
    logging_steps=200,

    save_total_limit=2,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# trainer.train(resume_from_checkpoint=True)
trainer.train()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/25435 [00:00<?, ? examples/s]

Map:   0%|          | 0/3179 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


Step,Training Loss
200,3.3471
400,2.8463
600,2.6182
800,2.4853
1000,2.3556
1200,2.2369
1400,2.1495
1600,2.0916
1800,2.0101
2000,1.9137


TrainOutput(global_step=9540, training_loss=1.1698325635002345, metrics={'train_runtime': 6494.3368, 'train_samples_per_second': 11.749, 'train_steps_per_second': 1.469, 'total_flos': 8.266988866358477e+16, 'train_loss': 1.1698325635002345, 'epoch': 3.0})

In [8]:
output_dir = "/content/drive/MyDrive/summarization-finetuned"
trainer.save_model(output_dir)          # Save model + config
tokenizer.save_pretrained(output_dir)   # Save tokenizer

print(f"Đã lưu mô hình tại: {output_dir}")

Đã lưu mô hình tại: /content/drive/MyDrive/summarization-finetuned


In [9]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.47817227244377136, 'eval_rouge1': 74.27, 'eval_rouge2': 58.63, 'eval_rougeL': 64.46, 'eval_rougeLsum': 64.48, 'eval_runtime': 679.9147, 'eval_samples_per_second': 4.676, 'eval_steps_per_second': 0.585, 'epoch': 3.0}


In [51]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# -------------------------
# 1. Load model + tokenizer
# -------------------------
output_dir = "/content/drive/MyDrive/summarization-finetuned"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)

# Tối ưu tốc độ
if torch.cuda.is_available():
    model = model.half()   # chạy FP16 (nhanh hơn)
model.to(device)
model.eval()


# -------------------------
# 2. Dataset
# -------------------------
test_docs = [sample['Document'] for sample in dataset['test']]
test_refs = [sample['Summary'] for sample in dataset['test']]


# -------------------------
# 3. ROUGE
# -------------------------
rouge = evaluate.load("rouge")


# -------------------------
# 4. Batch summarize
# -------------------------
@torch.inference_mode()
def generate_summaries_batch(
        docs,
        max_input_length=512,
        max_target_length=128,
        num_beams=4
    ):

    inputs = tokenizer(
        docs,
        max_length=max_input_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_target_length,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=3,   # tránh lặp
        length_penalty=1.0,
        repetition_penalty=1.2
    )

    return tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )


# -------------------------
# 5. Loop test set
# -------------------------
batch_size = 16   # tăng batch size nhờ FP16
predictions = []
references = []

for i in tqdm(range(0, len(test_docs), batch_size)):
    batch_docs = test_docs[i:i+batch_size]
    batch_refs = test_refs[i:i+batch_size]

    batch_preds = generate_summaries_batch(batch_docs)

    predictions.extend(batch_preds)
    references.extend(batch_refs)


# -------------------------
# 6. ROUGE
# -------------------------
rouge_result = rouge.compute(
    predictions=predictions,
    references=references
)
rouge_result = {k: round(v * 100, 2) for k, v in rouge_result.items()}


# -------------------------
# 7. BLEU
# -------------------------
tokenized_preds = [word_tokenize(pred) for pred in predictions]
tokenized_refs  = [[word_tokenize(ref)] for ref in references]

bleu_score = round(corpus_bleu(tokenized_refs, tokenized_preds) * 100, 2)


# -------------------------
# 8. Results
# -------------------------
print("=== Đánh giá trên test set ===")
print(f"ROUGE-1: {rouge_result.get('rouge1')}")
print(f"ROUGE-2: {rouge_result.get('rouge2')}")
print(f"ROUGE-L: {rouge_result.get('rougeL')}")
print(f"ROUGE-Lsum: {rouge_result.get('rougeLsum')}")
print(f"BLEU: {bleu_score}")


100%|██████████| 199/199 [31:16<00:00,  9.43s/it]


=== Đánh giá trên test set ===
ROUGE-1: 75.05
ROUGE-2: 60.84
ROUGE-L: 66.14
ROUGE-Lsum: 66.17
BLEU: 44.34


In [41]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import re

def split_into_chunks(text, max_chars=1500, overlap=200):
    sentences = re.split(r'(?<=[\.\?\!])\s+', text)
    chunks = []
    current = ""

    for s in sentences:
        if len(current) + len(s) < max_chars:
            current += " " + s
        else:
            chunks.append(current.strip())
            current = s

    if current:
        chunks.append(current.strip())

    return chunks


def summarize_bartpho(text,
                      model_name="/content/drive/MyDrive/summarization-finetuned",
                      max_length=150,
                      min_length=30,
                      chunk_size=1500):

    device = 0 if torch.cuda.is_available() else -1

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model     = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    summarizer = pipeline(
        "summarization",
        model=model,
        tokenizer=tokenizer,
        device=device
    )

    # Chia văn bản dài thành nhiều đoạn
    chunks = split_into_chunks(text, max_chars=chunk_size)

    chunk_summaries = []
    for c in chunks:
        s = summarizer(
            c,
            max_length=max_length,
            min_length=min_length,
            do_sample=False
        )[0]["summary_text"]
        chunk_summaries.append(s)

    # Gộp toàn bộ summary
    combined = " ".join(chunk_summaries)

    # Nếu output vẫn quá dài → tóm tắt lần 2
    if len(combined.split()) > 2 * max_length:
        final = summarizer(
            combined,
            max_length=max_length,
            min_length=min_length,
            do_sample=False
        )[0]["summary_text"]
        return final

    return combined

In [52]:
text = "Hơn 1.000 con ngan của một gia đình ở huyện Thanh Liêm ( Hà Nam ) có dấu hiệu nhiễm cúm A / H 5 N 1 . Chính quyền địa phương lập chốt kiểm dịch tại vùng dịch để ngăn chặn , không cho vận chuyển gia cầm ra vùng phát sinh ổ dịch ."

summarize_bartpho(text)

Device set to use cuda:0
Your max_length is set to 150, but your input_length is only 58. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'Hơn 1.000 con ngan của một gia đình ở huyện Thanh Liêm ( Hà Nam ) có dấu hiệu nhiễm cúm A / H 5 N 1 .'