In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.utils.data import Dataset
import torch
from PIL import Image
import json
import os

In [2]:
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
class DonutFormDataset(Dataset):
    def __init__(self, jsonl_path, images_dir, processor):
        with open(jsonl_path) as f:
            self.samples = [json.loads(line) for line in f]
        self.images_dir = images_dir
        self.processor = processor

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        image = Image.open(os.path.join(self.images_dir, s['file_name'])).convert("RGB")
        pixel_values = self.processor.image_processor(image, return_tensors="pt").pixel_values.squeeze()
        labels = self.processor.tokenizer(
            s['ground_truth'],
            add_special_tokens=False,
            max_length=self.processor.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        return {"pixel_values": pixel_values, "labels": labels}

train_dataset = DonutFormDataset(
    jsonl_path="Dataset/metadata.jsonl",
    images_dir="Dataset/images",
    processor=processor
)

In [4]:
if processor.tokenizer.pad_token is None:
    processor.tokenizer.add_special_tokens({'pad_token': '<pad>'})
model.config.pad_token_id = processor.tokenizer.pad_token_id

docvqa_id = processor.tokenizer.convert_tokens_to_ids("<s_docvqa>")
if docvqa_id is not None and docvqa_id != processor.tokenizer.unk_token_id:
    model.config.decoder_start_token_id = docvqa_id
else:
    model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<s>")

training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_finetune",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    learning_rate=5e-5,
    remove_unused_columns=False
)

class DonutDataCollator:
    def __call__(self, batch):
        pixel_values = torch.stack([x["pixel_values"] for x in batch])
        labels = torch.stack([x["labels"] for x in batch])
        return {"pixel_values": pixel_values, "labels": labels}


In [5]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=processor.tokenizer,
    data_collator=DonutDataCollator()
)

trainer.train()

  trainer = Seq2SeqTrainer(


ValueError: text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).