# 🤖 模型微调参数配置指南

本指南适用于使用 Hugging Face Transformers 库进行微调（Fine-tuning）时的参数配置，尤其适合用于 Donut 等文档理解模型的训练。

涉及的模型：

- donut-base-finetuned-invoices
- donut-base-finetuned-docvqa
- MiniCPM-Llama3-V-2_5

In [2]:
# Mount Google Drive at /content/drive so the Drive paths used in later cells
# are reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# donut (no fine-tuning vs fine-tuned)

## STEP 1: Install dependencies

In [None]:
!pip install -q transformers accelerate torchvision pymupdf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## STEP 2: Import libraries

In [None]:
import fitz  # PyMuPDF
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

## STEP 3: Load PDF and convert to image

In [None]:
# Drive folder holding the sample invoice PDF. The value already ends with "/",
# and the next cell joins it with f"{dir_invoice_pdf}/...", so the resulting
# paths contain a doubled slash.
dir_invoice_pdf = "/content/drive/MyDrive/AI_Lecture/dataset/ENG_CHN/"

In [None]:
# Render the first page of the sample invoice PDF to a JPEG and load it as an
# RGB PIL image. The resulting `image` global is what infer_with_model() feeds
# to the processor.
pdf_path = f"{dir_invoice_pdf}/測試股份有限公司.pdf"  # <- Replace with your actual path
pdf_doc = fitz.open(pdf_path)
page = pdf_doc[0]  # first page only
pix = page.get_pixmap(dpi=200)  # rasterize the page at 200 dpi
image_path = f"{dir_invoice_pdf}/invoice_page.jpg"
pix.save(image_path)
image = Image.open(image_path).convert("RGB")
image.show()

## STEP 4: Define inference function

In [None]:
def infer_with_model(model_name, task_prompt="<s_docvqa>", max_length=768, image=None):
    """Run one Donut checkpoint on a page image and return the decoded text.

    Args:
        model_name: Hub id or local path passed to ``from_pretrained`` for both
            the processor and the model.
        task_prompt: Text used as the decoder prompt; tokenized without adding
            special tokens.
        max_length: ``max_length`` forwarded to ``model.generate``.
        image: PIL image to run on. When omitted, the notebook-level ``image``
            global is used, which is what the original implementation always did.

    Returns:
        The first decoded sequence (special tokens skipped) as a string.

    Raises:
        ValueError: If no image is passed and no ``image`` global exists.
    """
    if image is None:
        # Backward-compatible fallback: earlier cells define a global `image`.
        image = globals().get("image")
        if image is None:
            raise ValueError(
                "No image available: pass `image=` or define a global `image` first."
            )

    print(f"\n🔍 Running inference with: {model_name}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = DonutProcessor.from_pretrained(model_name)
    model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)
    model.eval()

    pixel_values = processor(image, return_tensors="pt").pixel_values
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            pixel_values.to(model.device),
            decoder_input_ids=decoder_input_ids.to(model.device),
            max_length=max_length,
            pad_token_id=processor.tokenizer.pad_token_id,
        )

    result = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    print("\n📄 Output:")
    print(result)
    return result

## STEP 5: Run base model (no fine-tuning)

In [None]:
# Baseline run: the pre-trained donut-base checkpoint (no task fine-tuning),
# prompted in the DocVQA question/answer format.
base_output = infer_with_model("naver-clova-ix/donut-base", task_prompt="<s_docvqa><s_question>Extract all invoice fields</s_question><s_answer>")


🔍 Running inference with: naver-clova-ix/donut-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/355 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/809M [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.



📄 Output:
<s_docvqa><s_question>Extract all invoice fields</s_question><s_answer> . Test Co.,Ltd.. .,Ltd.. .,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## STEP 6: Run fine-tuned invoice model (ToBe version)

In [None]:
# Invoice-extraction checkpoint: it is prompted only with its task start token,
# no question text.
tuned_invoice_output = infer_with_model("to-be/donut-base-finetuned-invoices", task_prompt="<s_invoice-v2>")


🔍 Running inference with: to-be/donut-base-finetuned-invoices


preprocessor_config.json:   0%|          | 0.00/422 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/544 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/355 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]


📄 Output:
<s_invoice-v2>008</s_InvoiceNumber><s_NetAmount1> 1000.00</s_NetAmount1><s_TaxAmount1> 300.00</s_TaxAmount1>


## STEP 7: Run fine-tuned DocVQA model (NAVER)

In [None]:
# DocVQA checkpoint: the request is phrased as a question inside the prompt.
docvqa_output = infer_with_model("naver-clova-ix/donut-base-finetuned-docvqa", task_prompt="<s_docvqa><s_question>Extract all invoice fields</s_question><s_answer>")


🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa


preprocessor_config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/803M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/803M [00:00<?, ?B/s]


📄 Output:
Extract all invoice fields test co., ltd.


## STEP 8: Compare results (manual or automatic parsing)

In [None]:
# Print the three raw model outputs one after another for a manual comparison.
print("\n================ Result Summary ================")
print("\n🔹 Base Model Output:\n", base_output)
print("\n🔹 Finetuned Invoice Output:\n", tuned_invoice_output)
print("\n🔹 Finetuned DocVQA Output:\n", docvqa_output)



🔹 Base Model Output:
 <s_docvqa><s_question>Extract all invoice fields</s_question><s_answer> . Test Co.,Ltd.. .,Ltd.. .,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

🔹 Finetuned Invoice Output:
 <s_invoice-v2>008</s_InvoiceNumber><s_NetAmount1> 1000.00</s_NetAmount1><s_TaxAmount1> 300.00</s_TaxAmount1>

🔹 Finetuned DocVQA Outp

In [None]:
# Field names to ask the DocVQA model about; each one is inserted into a
# "What is ...?" question by the loop in the next cell.
item_names = ['期間', 'Vendor code', 'Invoice NO', 'PAYMENT TERM', 'Name of beneficiary', 'Name of beneficiary bank', 'Address of beneficiary bank', 'Bank account number', 'Contact person']

In [None]:
# Ask the DocVQA checkpoint one question per field and keep every answer.
# Previously each iteration overwrote `docvqa_output`, so only the last answer
# was still available after the loop. `docvqa_output` is still assigned, so it
# ends up holding the final answer exactly as before.
docvqa_answers = {}
for name in item_names:
    question = f"What is {name}?"
    docvqa_output = infer_with_model(
        "naver-clova-ix/donut-base-finetuned-docvqa",
        task_prompt=f"<s_docvqa><s_question>{question}</s_question><s_answer>",
    )
    docvqa_answers[name] = docvqa_output


🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is 期間? 309824263008

🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is Vendor code? xx840-sxx

🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is Invoice NO? y 309824263008

🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is PAYMENT TERM? 150days

🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is Name of beneficiary? bank of test branch

🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is Name of beneficiary bank? bank of test branch

🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is Address of beneficiary bank? test road

🔍 Running inference with: naver-clova-ix/donut-base-finetuned-docvqa

📄 Output:
What is Bank account number? 0000999888

🔍 Running inference with: naver-clova-ix

### 📊 Donut 微调模型对比：发票提取任务

| 对比维度         | `to-be/donut-base-finetuned-invoices` ✅ | `naver-clova-ix/donut-base-finetuned-docvqa` 🧠 |
|------------------|------------------------------------------|------------------------------------------------|
| 🎯 任务类型       | 发票字段提取（结构化输出）               | 文档问答（DocVQA，自由提问）                   |
| 📤 输出格式       | 带字段标签的序列（可转换为 JSON 键值对） | 文本答案（非结构化）                           |
| 💬 是否需要 Prompt | 仅需固定的任务起始符（如 `<s_invoice-v2>`），无需提问 | ✅ 需要精确提问                                 |
| 🌍 语言支持       | 英文发票优（中文需微调）                 | 英文为主，可泛化到多种格式                     |
| 🧾 适用文档类型   | 英文模板化发票                           | 各类文档（如发票、表格、收据、证件等）        |
| ⚙️ 灵活性         | ❌ 结构固定                               | ✅ 高度灵活，可问任意字段                      |
| 🚀 推荐使用场景   | 高精度字段提取、API集成                  | 构建问答系统、Agent式抽取、多样化发票          |


In [1]:
# ✅ Donut 发票抽取：结构化模型 + DocVQA 微调统一方案（Colab 使用）

# ✅ STEP 1: 安装依赖
!pip install -q transformers datasets accelerate torchvision pymupdf

In [17]:
# ✅ STEP 2: 导入库
import os, json
import fitz  # pymupdf
from PIL import Image
from datasets import Dataset, DatasetDict
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

In [19]:
# ✅ STEP 3: PDF 转图像工具（支持批量）
def convert_pdf_to_images(pdf_path, out_dir):
    """Render every page of a PDF to a JPEG and return the written paths.

    Args:
        pdf_path: Path of the PDF to open.
        out_dir: Directory that receives ``page_<n>.jpg`` files (created when
            missing).

    Returns:
        List of the image paths written by this call, in page order.
    """
    os.makedirs(out_dir, exist_ok=True)
    image_paths = []
    # The context manager closes the document once all pages are rendered.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            img_path = os.path.join(out_dir, f"page_{page_number}.jpg")
            page.get_pixmap(dpi=200).save(img_path)
            image_paths.append(img_path)
    # Return only what was just written. Listing out_dir would also pick up
    # unrelated .jpg files already in the folder, and sorting the names as text
    # would put page_10 before page_2.
    return image_paths

# Example: convert the sample invoice PDF.
# Both paths are hard-coded to one local machine; replace them with your own.
pdf_path = "/Users/xiaotingzhou/Documents/Lectures/AI_OCR/data/測試股份有限公司.pdf"
img_dir = "/Users/xiaotingzhou/Documents/Lectures/AI_OCR/data/converted_images/"
image_paths = convert_pdf_to_images(pdf_path, img_dir)

# ✅ STEP 4: 结构化微调数据准备（方式 A）
def prepare_structured_dataset(img_paths, json_labels):
    """Pair each image path with the JSON text of its label file.

    Args:
        img_paths: Image paths, one per sample.
        json_labels: Paths of UTF-8 JSON label files, aligned with ``img_paths``
            (pairing stops at the shorter of the two sequences).

    Returns:
        A ``Dataset`` whose rows hold ``image`` (the path) and ``label`` (the
        label re-serialized as a JSON string with non-ASCII text kept as-is).
    """

    def _label_text(label_file):
        # Parse and re-dump so the stored label is normalized JSON text.
        with open(label_file, "r", encoding="utf-8") as handle:
            return json.dumps(json.load(handle), ensure_ascii=False)

    records = [
        {"image": image_file, "label": _label_text(label_file)}
        for image_file, label_file in zip(img_paths, json_labels)
    ]
    return Dataset.from_list(records)

# Example structured labels (replace with your own paths).
# The same label file is repeated for every image, so all pages share one label.
json_labels = ["/Users/xiaotingzhou/Documents/Lectures/AI_OCR/data/label1.json"] * len(image_paths)
structured_train_ds = prepare_structured_dataset(image_paths, json_labels)

# ✅ STEP 5: DocVQA 微调数据准备（方式 B）
def prepare_docvqa_dataset(img_paths, qa_pairs):
    """Flatten per-page question/answer dicts into one row per question.

    Args:
        img_paths: Image paths, one per page.
        qa_pairs: One ``{question: answer}`` dict per page, aligned with
            ``img_paths``.

    Returns:
        A ``Dataset`` with ``image``, ``question`` and ``answer`` columns.
    """
    rows = [
        {"image": page_image, "question": question, "answer": answer}
        for page_image, page_qa in zip(img_paths, qa_pairs)
        for question, answer in page_qa.items()
    ]
    return Dataset.from_list(rows)

# Example question-style labels (several Q/A pairs per page).
# The same dict is repeated for every page via `* len(image_paths)`.
qa_pairs = [{
    "发票号码是多少？": "Y 309824263008",
    "开票日期？": "2025年6月30日",
    "金额是多少？": "300",
    "币种？": "USD",
    "税额？": "0"
}] * len(image_paths)
qa_train_ds = prepare_docvqa_dataset(image_paths, qa_pairs)

# STEP 6: processor (image preprocessing + tokenizer) of the base checkpoint;
# the preprocess functions and the collator below read this global.
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

# For structured JSON
def preprocess_structured(example):
    """Turn one structured sample into model inputs.

    Args:
        example: Row with ``image`` (path to an image file) and ``label``
            (JSON text to be generated).

    Returns:
        Dict with ``pixel_values`` and ``labels``, each with the leading batch
        axis squeezed away. Labels are tokenized without special tokens.
    """
    page = Image.open(example["image"]).convert("RGB")
    pixel_values = processor(page, return_tensors="pt").pixel_values.squeeze(0)
    token_batch = processor.tokenizer(
        example["label"], add_special_tokens=False, return_tensors="pt"
    )
    return {"pixel_values": pixel_values, "labels": token_batch.input_ids.squeeze(0)}

# For DocVQA
def preprocess_docvqa(example):
    """Turn one DocVQA sample into model inputs.

    The target sequence is the question prompt followed by the answer. The
    prompt was previously built but never used, so the question never reached
    the model and every question on a page trained towards an unconditioned
    answer. The prompt layout matches the one used at inference time in
    ``compare_model_outputs``.

    Args:
        example: Row with ``image`` (path to an image file), ``question`` and
            ``answer``.

    Returns:
        Dict with ``pixel_values`` and ``labels``, each with the leading batch
        axis squeezed away.
    """
    image = Image.open(example["image"]).convert("RGB")
    prompt = f"<s_docvqa><s_question>{example['question']}</s_question><s_answer>"
    encoding = processor(image, return_tensors="pt")
    pixel_values = encoding.pixel_values.squeeze(0)
    # NOTE(review): no end-of-sequence token is appended here (same as
    # preprocess_structured) — confirm whether the target should end with EOS.
    labels = processor.tokenizer(
        prompt + example["answer"], add_special_tokens=False, return_tensors="pt"
    ).input_ids.squeeze(0)
    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

# STEP 7: apply the preprocess functions to both datasets.
# remove_columns drops the original columns, so only the keys returned by the
# preprocess functions (pixel_values, labels) remain.
structured_train = structured_train_ds.map(
    preprocess_structured,
    remove_columns=structured_train_ds.column_names,
    batched=False  # <- process one sample per call
)

docvqa_train = qa_train_ds.map(
    preprocess_docvqa,
    remove_columns=qa_train_ds.column_names,
    batched=False
)


def transform_fn(example):
    """Convert list-valued ``pixel_values`` / ``labels`` into tensors in place.

    Values that are not plain lists are left untouched. The same mapping that
    was passed in is returned.
    """
    for column in ("pixel_values", "labels"):
        value = example[column]
        if isinstance(value, list):
            example[column] = torch.tensor(value)
    return example

# Convert list-valued columns back to tensors whenever a sample is read.
structured_train = structured_train.with_transform(transform_fn)
docvqa_train = docvqa_train.with_transform(transform_fn)

# STEP 8: training arguments.
# This single object is passed to both trainers below (structured and DocVQA),
# so both runs write to the same output_dir.
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_struct_out",
    run_name="donut_struct_cn_v1",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    learning_rate=5e-5,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=5,
    report_to="none"  # turn off reporting to wandb / tensorboard / comet
)
def donut_collator(features):
    """Batch samples: stack the images and pad the label sequences.

    Args:
        features: List of dicts with ``pixel_values`` (tensor) and ``labels``
            (tensor or list of token ids).

    Returns:
        Dict with stacked ``pixel_values`` and ``labels`` padded to the longest
        sequence in the batch.

    Raises:
        TypeError: If ``pixel_values`` arrives as a plain list instead of a tensor.
    """
    # Debug guard: fail early with a readable message on un-converted samples.
    if isinstance(features[0]["pixel_values"], list):
        raise TypeError("pixel_values 应为 Tensor，而不是 list。请检查 preprocess 是否使用了 squeeze。")

    pixel_values = torch.stack([f["pixel_values"] for f in features])
    labels = [torch.as_tensor(f["labels"]) for f in features]
    # Pad with -100, the index the loss ignores. Padding with the tokenizer's
    # pad id made the padded positions count towards the training loss.
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
    return {"pixel_values": pixel_values, "labels": labels}

Map: 100%|██████████| 2/2 [00:01<00:00,  1.69 examples/s]
Map: 100%|██████████| 10/10 [00:05<00:00,  2.00 examples/s]


In [None]:
# ✅ STEP 9: 启动结构化模型训练
model_struct = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
# The decoder inputs are derived from the labels, which needs both ids on the
# model config. The single-sample training cell later in this notebook sets the
# same two values.
model_struct.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(["<s>"])[0]
model_struct.config.pad_token_id = processor.tokenizer.pad_token_id

trainer_struct = Seq2SeqTrainer(
    model=model_struct,
    args=training_args,
    train_dataset=structured_train,
    tokenizer=processor.tokenizer,
    data_collator=donut_collator,
)

# Starts training immediately. NOTE: the recorded run of this cell stopped with
# an MPS out-of-memory error.
trainer_struct.train()

  trainer_struct = Seq2SeqTrainer(


RuntimeError: MPS backend out of memory (MPS allocated: 17.97 GB, other allocations: 66.81 MB, max allowed: 18.13 GB). Tried to allocate 150.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# ✅ STEP 10: 启动 DocVQA 问答式模型训练
model_docvqa = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
# The decoder inputs are derived from the labels, which needs both ids on the
# model config. The single-sample training cell later in this notebook sets the
# same two values.
model_docvqa.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(["<s>"])[0]
model_docvqa.config.pad_token_id = processor.tokenizer.pad_token_id

# NOTE: `training_args` is the object already used for the structured run, so
# this run writes to the same output_dir.
trainer_docvqa = Seq2SeqTrainer(
    model=model_docvqa,
    args=training_args,
    train_dataset=docvqa_train,
    tokenizer=processor.tokenizer,
    data_collator=donut_collator,
)
trainer_docvqa.train()  # starts training immediately


In [None]:

# ✅ STEP 11: 推理对比展示函数
def compare_model_outputs(image_path):
    """Run both fine-tuned models on one image and print their outputs.

    The structured model is prompted once with ``<s_invoice-cn>``; the DocVQA
    model is prompted once per question. Reads the notebook globals
    ``processor``, ``model_struct`` and ``model_docvqa``.

    Args:
        image_path: Path of the page image to evaluate.

    Returns:
        None. Results are printed.
    """
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values

    def _generate(model, prompt, max_length):
        # add_special_tokens=False keeps the tokenizer from wrapping the prompt
        # in BOS/EOS, matching how infer_with_model builds its decoder prompt.
        prompt_ids = processor.tokenizer(
            prompt, add_special_tokens=False, return_tensors="pt"
        ).input_ids
        # Inputs go to the device of the model that consumes them; previously
        # pixel_values were only moved to model_struct's device.
        with torch.no_grad():
            output = model.generate(
                pixel_values.to(model.device),
                decoder_input_ids=prompt_ids.to(model.device),
                max_length=max_length,
            )
        return processor.tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    # Structured model inference.
    struct_result = _generate(model_struct, "<s_invoice-cn>", 512)

    # DocVQA model inference, one question at a time.
    questions = ["发票号码是多少？", "开票日期？", "金额是多少？", "币种？", "税额？"]
    docvqa_answers = {
        q: _generate(model_docvqa, f"<s_docvqa><s_question>{q}</s_question><s_answer>", 128)
        for q in questions
    }

    print("\n📦 结构化模型输出：\n", struct_result)
    print("\n❓ DocVQA 问答输出：")
    for q, a in docvqa_answers.items():
        print(f"{q} -> {a}")



In [None]:
# Example usage: compare both models on the first converted page.
compare_model_outputs(image_paths[0])

base
```
    per_device_train_batch_size=1,
    num_train_epochs=1,
```



In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
from PIL import Image
import torch, json, os

# 1. Load the base model and its processor.
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

# Decoder start / padding ids are set on the config before training.
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(["<s>"])[0]
model.config.pad_token_id = processor.tokenizer.pad_token_id
# 2. Prepare the data.
# Example label for the single training image.
label_json = {
    "InvoiceNo": "Y 309824263008",
    "InvoiceDate": "2025年6月30日",
    "Currency": "USD",
    "Amount with Tax": 300,
    "Amount without Tax": 300,
    "Tax": 0
}
img = Image.open("/Users/xiaotingzhou/Documents/Lectures/AI_OCR/data/converted_images/invoice_page_1.jpg").convert("RGB")  # <- replace with your image path

# One-row dataset: the PIL image plus its label serialized as JSON text.
dataset = Dataset.from_list([{
    "image": img,
    "label": json.dumps(label_json, ensure_ascii=False)
}])

# ✅ 3. 预处理 + with_transform
def preprocess(example):
    """Encode one sample's image and label text into model inputs.

    Args:
        example: Row with ``image`` (passed straight to the processor) and
            ``label`` (text passed to the tokenizer with its default settings).

    Returns:
        Dict with ``pixel_values`` and ``labels``, each with the leading batch
        axis squeezed away.
    """
    image_features = processor(example["image"], return_tensors="pt").pixel_values
    label_ids = processor.tokenizer(example["label"], return_tensors="pt").input_ids
    return {
        "pixel_values": image_features.squeeze(0),
        "labels": label_ids.squeeze(0),
    }

# Run preprocess over the dataset (the original image/label columns are kept,
# since remove_columns is not passed), then install a read-time transform that
# returns only pixel_values and labels, converting list values to tensors.
dataset = dataset.map(preprocess)
dataset = dataset.with_transform(lambda x: {
    "pixel_values": torch.tensor(x["pixel_values"]) if isinstance(x["pixel_values"], list) else x["pixel_values"],
    "labels": torch.tensor(x["labels"]) if isinstance(x["labels"], list) else x["labels"]
})

# ✅ 4. collator
def donut_collator(batch):
    """Batch samples: stack the images and pad the label sequences.

    Args:
        batch: List of dicts with tensor ``pixel_values`` and tensor ``labels``.

    Returns:
        Dict with stacked ``pixel_values`` and ``labels`` padded to the longest
        sequence in the batch.
    """
    pixel_values = torch.stack([x["pixel_values"] for x in batch])
    # Pad with -100, the index the loss ignores. Padding with the tokenizer's
    # pad id made the padded positions count towards the training loss.
    labels = torch.nn.utils.rnn.pad_sequence(
        [x["labels"] for x in batch], batch_first=True, padding_value=-100)
    return {"pixel_values": pixel_values, "labels": labels}

# 5. Training arguments: one epoch, batch size 1, reporting disabled.
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_out",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    predict_with_generate=True,
    logging_dir="./logs",
    report_to="none",
    no_cuda=True  # do not use the GPU for this run
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=donut_collator,
    tokenizer=processor.tokenizer
)

# With one sample and one epoch this performs a single optimizer step
# (the recorded output shows global_step=1).
trainer.train()



  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Map: 100%|██████████| 1/1 [00:01<00:00,  1.04s/ examples]
  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=1, training_loss=8.624417304992676, metrics={'train_runtime': 88.6251, 'train_samples_per_second': 0.011, 'train_steps_per_second': 0.011, 'total_flos': 1.25076466040832e+16, 'train_loss': 8.624417304992676, 'epoch': 1.0})

In [2]:
# Save the trained weights and the processor files side by side in ./donut_out.
model.save_pretrained("./donut_out")
processor.save_pretrained("./donut_out")

[]

In [12]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch
import json

# Load the saved checkpoint (or reuse the `model` that was just trained).
model = VisionEncoderDecoderModel.from_pretrained("./donut_out")
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

# Required config: decoder start / padding ids. Set once; the original cell
# repeated these two assignments twice.
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(["<s>"])[0]
model.config.pad_token_id = processor.tokenizer.pad_token_id

model.eval()
device = torch.device("cpu")  # or "cuda" if available
model.to(device)

VisionEncoderDecoderModel(
  (encoder): DonutSwinModel(
    (embeddings): DonutSwinEmbeddings(
      (patch_embeddings): DonutSwinPatchEmbeddings(
        (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DonutSwinEncoder(
      (layers): ModuleList(
        (0): DonutSwinStage(
          (blocks): ModuleList(
            (0): DonutSwinLayer(
              (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attention): DonutSwinAttention(
                (self): DonutSwinSelfAttention(
                  (query): Linear(in_features=128, out_features=128, bias=True)
                  (key): Linear(in_features=128, out_features=128, bias=True)
                  (value): Linear(in_features=128, out_features=128, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )

In [14]:
# Replace with the path of the image you want to evaluate.
image_path = "/Users/xiaotingzhou/Documents/Lectures/AI_OCR/data/converted_images/invoice_page_1.jpg"
image = Image.open(image_path).convert("RGB")

# Decoder prompt. Left empty here because training above used no task prompt.
prompt = ""  # or "<s_invoice-v2>" if you trained with it

# Preprocess the image and tokenize the prompt.
# NOTE: decoder_input_ids is prepared here, but the generate() call in the next
# cell does not pass it.
decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
pixel_values = processor(image, return_tensors="pt").pixel_values

# Move both tensors to the selected device.
pixel_values = pixel_values.to(device)
decoder_input_ids = decoder_input_ids.to(device)

In [16]:
from PIL import Image
import torch

# Re-load the image, generate up to 512 new tokens from pixel_values alone
# (no decoder prompt is passed) and decode the first sequence.
image = Image.open(image_path).convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

outputs = model.generate(pixel_values, max_new_tokens=512)
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]

# In the recorded run the printed result was empty.
print("模型输出结果：")
print(generated_text)

模型输出结果：



## 训练参数解析与调整建议

1. Smaller - more：少量多次
2. k-fold（k 折交叉验证）：例如 10 折（80% 用于训练和验证：训练 70%、验证 10%；20% 用于测试）
3. learning-rate：调小
4. batch：绘制 loss、accuracy 曲线
5. epoch：从默认值开始，调小或调大，例如：1. 40 2. 43 3. 50 … 9. 75 10. 78 … xx: 50/60

training --> validation --> test
training --> validation：第 1 折（accuracy 50%）、第 2 折（accuracy 40%）、……、第 10 折。

Unbalanced data --> bias

In [None]:
from transformers import Seq2SeqTrainingArguments

# `eval_strategy` replaces the former `evaluation_strategy` keyword; the
# warnings captured earlier in this notebook point to a Transformers 4.5x
# install, which only accepts the new name.
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_out",            # where checkpoints and the final model are written
    per_device_train_batch_size=4,       # batch size per device (GPU/CPU); adjust to available memory
    num_train_epochs=5,                  # total number of epochs; adjust to dataset size
    learning_rate=5e-5,                  # fine-tuning usually uses a low learning rate
    weight_decay=0.01,                   # regularization against overfitting
    gradient_accumulation_steps=2,       # accumulate gradients to emulate a larger batch
    eval_strategy="steps",               # evaluate every `eval_steps` steps
    eval_steps=500,                      # evaluation interval in steps
    save_steps=500,                      # checkpoint interval in steps
    save_total_limit=3,                  # keep at most 3 checkpoints on disk
    predict_with_generate=True,          # use generate() for predictions during evaluation
    logging_dir="./logs",                # TensorBoard log directory
    logging_steps=100,                   # log every 100 steps
    report_to="none",                    # no external reporting; could be "tensorboard" or "wandb"
    no_cuda=False,                       # False = use the GPU when the environment has one
    load_best_model_at_end=True,         # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",   # metric used to pick the best checkpoint
    greater_is_better=False              # lower is better for a loss
)

### 1. `per_device_train_batch_size`（批大小）
- 前面的单样本训练示例中设置为 1（上方带注释的配置已改为 4）。设置为 1 意味着一次只喂入 1 张图片，训练很慢且不稳定。
  - 如果有 GPU 且显存足够，可以调大，比如 4 或 8，加快训练速度，有利于梯度稳定。
  - 如果内存有限，可以调小，但太小可能影响模型收敛效果。

### 2. `num_train_epochs`（训练轮数）
- 前面的单样本训练示例中设置为 1 轮，通常太少。
  - 建议根据数据量调整：
    - 数据少时可设为 3～5 轮。
    - 数据多时 1～3 轮即可。
  - 训练轮数过多可能导致过拟合，建议配合验证集监控性能。

### 3. `predict_with_generate=True`
- 建议保持开启：
  - 训练期间验证模型时，使用生成模式预测结果，更符合文本生成类任务（如OCR、文档理解等）。

### 4. `logging_dir`
- 设置日志文件夹路径：
  - 可配合 TensorBoard 使用，观察训练和验证的 loss 曲线及其他指标变化。

### 5. `report_to`
- 当前设置为 `"none"`，表示不向外部系统（如 WandB、TensorBoard）汇报日志。
  - 如果想启用日志报告功能，可设为 `"wandb"` 或 `"tensorboard"`。

### 6. `no_cuda=True`
- 前面的单样本训练示例中设置为 `True`，表示训练仅使用 CPU。
  - 如果设备支持 GPU（如 NVIDIA 显卡），应设为 `False` 或直接删除该参数，以启用 GPU 加速训练，速度将大幅提升。

#### 综合示例：假设你有一张中等显卡（比如RTX 3060），可以改成：

In [None]:
# Compact GPU variant of the single-sample configuration: larger batch,
# three epochs, GPU left enabled.
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_out",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    report_to="none",
    no_cuda=False   # use the GPU
)


## 📦 常用训练参数解释与建议

| 参数 | 作用 | 推荐设置 | 说明 |
|------|------|----------|------|
| `learning_rate` | 学习率 | `1e-5 ~ 5e-5` | 控制学习步长，微调用小一点 |
| `num_train_epochs` | 训练轮数 | `3 ~ 10` | 轮数太大会过拟合 |
| `per_device_train_batch_size` | 每设备每批大小 | `4 ~ 16` | 根据显存调节 |
| `gradient_accumulation_steps` | 梯度累积步数 | `1 ~ 4` | 显存不足时增大 |
| `weight_decay` | 权重衰减 | `0.01` | 正则化，防止过拟合 |
| `eval_strategy`（旧版本中名为 `evaluation_strategy`） | 评估策略 | `"steps"` | 每N步评估一次 |
| `eval_steps` | 评估频率 | `100 ~ 500` | 评估间隔步数 |
| `save_steps` | 模型保存频率 | `100 ~ 500` | 保存Checkpoint |
| `save_total_limit` | 保存最大数量 | `2 ~ 5` | 控制磁盘占用 |
| `load_best_model_at_end` | 训练结束后加载最佳模型 | `True` | 推荐开启 |
| `metric_for_best_model` | 最佳模型评判指标 | `"eval_loss"` | 可自定义 |
| `greater_is_better` | 越大越好 | `False` (对loss) | 与指标一致性 |
| `predict_with_generate` | 是否在验证时生成文本 | `True` | 文本生成类任务必设 |
| `logging_steps` | 日志记录频率 | `50 ~ 200` | 配合 tensorboard 使用 |
| `logging_dir` | 日志目录 | `"./logs"` | 用于可视化 |
| `report_to` | 日志工具 | `"tensorboard"` | 或 `"wandb"`，`"none"` 禁用 |
| `no_cuda` | 是否禁用GPU | `False` | 若无GPU可设为True |

---

## 🧪 示例训练参数配置（Donut微调）

```python
from transformers import Seq2SeqTrainingArguments

# `eval_strategy` replaces the former `evaluation_strategy` keyword, which
# recent Transformers releases no longer accept.
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_out",            # model output path
    per_device_train_batch_size=4,       # batch size per device
    num_train_epochs=5,                  # total number of epochs
    learning_rate=5e-5,                  # learning rate
    weight_decay=0.01,                   # weight decay
    gradient_accumulation_steps=2,       # gradient accumulation
    eval_strategy="steps",               # evaluate every `eval_steps` steps
    eval_steps=500,                      # evaluation interval in steps
    save_steps=500,                      # checkpoint interval in steps
    save_total_limit=3,                  # keep at most 3 checkpoints
    predict_with_generate=True,          # predict with generate()
    logging_dir="./logs",                # log directory
    logging_steps=100,                   # log every 100 steps
    report_to="tensorboard",             # report to TensorBoard
    no_cuda=False,                       # use the GPU
    load_best_model_at_end=True,         # reload the best checkpoint at the end
    metric_for_best_model="eval_loss",   # metric used to pick the best checkpoint
    greater_is_better=False              # lower is better for a loss
)


```

## 📘 微调建议

- 调小学习率是最安全的开始方式，避免模型“遗忘”原本的知识。
- 使用小批量 + 梯度累积是在资源有限情况下训练大模型的推荐策略。
- 开启验证 + 日志记录能让你及时观察模型是否过拟合。
- 若目标是生成类任务（如 OCR 转文本），`predict_with_generate=True` 非常关键。

### 进阶建议

- 学习率：默认值为 5e-5，可通过 `learning_rate` 参数调整（例如 `learning_rate=1e-5`）。学习率对训练效果影响很大。
- 梯度累积：如果显存小且想用大 batch，可以设置 `gradient_accumulation_steps=2` 等。
- 权重衰减：加上 `weight_decay=0.01` 有助于防止过拟合。
- 保存策略：`save_steps=500`、`save_total_limit=2` 用于控制检查点数量。

In [21]:
# 例如：

# Example combining the suggestions above: explicit learning rate, weight
# decay, gradient accumulation and a cap on the number of saved checkpoints.
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_out",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # 4 samples x 2 steps per optimizer update, per device
    predict_with_generate=True,
    logging_dir="./logs",
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    no_cuda=False
)

### 参数调整总结表

| 参数                         | 调整方向        | 说明                                 |
|------------------------------|------------------|--------------------------------------|
| `per_device_train_batch_size` | 2 ~ 8            | 显存足够就调大，加快训练               |
| `num_train_epochs`           | 3 ~ 10           | 根据数据量调整，多轮更充分学习         |
| `learning_rate`              | 1e-5 ~ 5e-5      | 低学习率更稳定，尝试微调最优值         |
| `weight_decay`               | 0.01             | 防止过拟合                            |
| `gradient_accumulation_steps`| 1 ~ 4            | 小显存用来模拟大batch                 |
| `no_cuda`                    | False            | 使用GPU显著加速（True为禁用GPU）      |

## 基于上述参数的完整finetune训练脚本示例

In [None]:
import torch
from transformers import Seq2SeqTrainer, default_data_collator
from transformers import AutoProcessor, VisionEncoderDecoderModel, Seq2SeqTrainingArguments
from datasets import load_metric

# 1. Load the model and the processor.
model_name_or_path = "naver-clova-ix/donut-base"  # replace with your own model path or hub id
processor = AutoProcessor.from_pretrained(model_name_or_path)
model = VisionEncoderDecoderModel.from_pretrained(model_name_or_path)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 2. 定义训练参数（用上面带注释的配置）
# `eval_strategy` replaces the former `evaluation_strategy` keyword, which
# recent Transformers releases no longer accept.
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_out",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    no_cuda=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# 3. 准备数据集（示例，假设已准备好）
# train_dataset, eval_dataset = your_prepared_datasets

# 4. 定义计算评价指标的函数（简单计算loss，或者你可以接入bleu、rouge等）
def compute_metrics(eval_preds):
    """Decode predictions and labels and report an exact-match score.

    Args:
        eval_preds: ``(predictions, labels)`` as handed over by the trainer;
            ``predictions`` may itself be a tuple whose first item holds the
            generated token ids.

    Returns:
        ``{"exact_match": <fraction of samples whose decoded prediction equals
        the decoded label after stripping surrounding whitespace>}``. Replace
        or extend this with a task-specific metric (BLEU, ROUGE, field-level
        accuracy, ...) as needed.
    """
    import numpy as np  # local import keeps the file's import block untouched

    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a valid token id, so replace it
    # with the pad id before decoding.
    pad_id = processor.tokenizer.pad_token_id
    predictions = np.where(np.asarray(predictions) != -100, predictions, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = processor.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)

    if not decoded_labels:
        return {"exact_match": 0.0}
    matches = sum(
        pred.strip() == ref.strip() for pred, ref in zip(decoded_preds, decoded_labels)
    )
    return {"exact_match": matches / len(decoded_labels)}

# 5. Build the trainer.
# NOTE: train_dataset and eval_dataset are not defined in this cell (step 3 is
# only a commented placeholder); define them before running.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

# 6. Start training.
trainer.train()

# 7. Save the model and the processor to the same directory.
trainer.save_model("./donut_out")
processor.save_pretrained("./donut_out")