In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset
from dataclasses import dataclass
from typing import Any, Dict, List
from PIL import Image
import torch
import json

In [2]:
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model.to("cpu")

processor.tokenizer.pad_token = "<pad>"
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<s>")

print("pad_token_id:", model.config.pad_token_id)
print("decoder_start_token_id:", model.config.decoder_start_token_id)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


pad_token_id: 1
decoder_start_token_id: 0


In [3]:
if processor.tokenizer.pad_token is None:
    processor.tokenizer.add_special_tokens({'pad_token': '<pad>'})
model.config.pad_token_id = processor.tokenizer.pad_token_id

docvqa_id = processor.tokenizer.convert_tokens_to_ids("<s_docvqa>")
if docvqa_id is not None and docvqa_id != processor.tokenizer.unk_token_id:
    model.config.decoder_start_token_id = docvqa_id
else:
    model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<s>")

print("pad_token_id:", model.config.pad_token_id)
print("decoder_start_token_id:", model.config.decoder_start_token_id)
print("Vocab size:", len(processor.tokenizer))

from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import shift_tokens_right
test_labels = torch.tensor([[5, 1234, -100]])
shifted = shift_tokens_right(
    test_labels,
    pad_token_id=processor.tokenizer.pad_token_id,
    decoder_start_token_id=model.config.decoder_start_token_id
)
print("Shifted:", shifted)
print("Max ID:", shifted.max().item())


pad_token_id: 1
decoder_start_token_id: 57527
Vocab size: 57532
Shifted: tensor([[57527,     5,  1234]])
Max ID: 57527


In [4]:
class DonutFormDataset(Dataset):
    def __init__(self, jsonl_path, images_dir, processor):
        self.samples = []
        self.images_dir = images_dir
        self.processor = processor
        with open(jsonl_path, 'r') as f:
            for line in f:
                self.samples.append(json.loads(line))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        image = Image.open(f"{self.images_dir}/{item['file_name']}").convert("RGB")
        
        target_text = "<s_docvqa><s_answer>" + json.dumps(item['ground_truth']['gt_parse'], ensure_ascii=True)
        # Optional: replace weird characters
        target_text = target_text.encode('ascii', errors='replace').decode()

        pixel_values = self.processor.image_processor(image, return_tensors="pt").pixel_values.squeeze()

        labels = self.processor.tokenizer(
            target_text,
            add_special_tokens=False,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze()


        print("Raw labels:", labels)
        print("Min ID:", labels.min())
        print("Max ID:", labels[labels != -100].max())

        if processor.tokenizer.pad_token is None:
            processor.tokenizer.add_special_tokens({'pad_token': '<pad>'})
        model.config.pad_token_id = processor.tokenizer.pad_token_id



        print("pad_token:", processor.tokenizer.pad_token)
        print("pad_token_id:", processor.tokenizer.pad_token_id)
        print("Does pad_token_id exist in vocab?",
            processor.tokenizer.pad_token_id < len(processor.tokenizer))
        

        labels[labels == processor.tokenizer.pad_token_id] = -100

        print("Unique IDs:", torch.unique(labels))
        print("Vocab size:", len(processor.tokenizer))

        return {
            "pixel_values": pixel_values,
            "labels": labels
        }


@dataclass
class DonutDataCollator:
    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        pixel_values = torch.stack([f["pixel_values"] for f in features])
        labels = torch.stack([f["labels"] for f in features])
        return {
            "pixel_values": pixel_values,
            "labels": labels
        }


In [6]:
print("pad_token:", processor.tokenizer.pad_token)
print("pad_token_id:", processor.tokenizer.pad_token_id)
print("Is pad_token_id < vocab size?", processor.tokenizer.pad_token_id < len(processor.tokenizer))


pad_token: <pad>
pad_token_id: 1
Is pad_token_id < vocab size? True


In [7]:
print("decoder_start_token_id:", model.config.decoder_start_token_id)

decoder_start_token_id: 0


In [9]:
print("ID 0 maps to:", processor.tokenizer.convert_ids_to_tokens(0))
print("Does <s_docvqa> exist?:", processor.tokenizer.convert_tokens_to_ids("<s_docvqa>"))


ID 0 maps to: <s>
Does <s_docvqa> exist?: 57527


In [10]:
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<s_docvqa>")
print("Updated decoder_start_token_id:", model.config.decoder_start_token_id)


Updated decoder_start_token_id: 57527


In [11]:
print("Final decoder_start_token_id:", model.config.decoder_start_token_id)
print("Vocab size:", len(processor.tokenizer))


Final decoder_start_token_id: 57527
Vocab size: 57532


In [12]:
from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import shift_tokens_right

labels = torch.tensor([[5, 1234, -100]])
shifted = shift_tokens_right(
    labels,
    pad_token_id=processor.tokenizer.pad_token_id,
    decoder_start_token_id=model.config.decoder_start_token_id
)
print("Shifted:", shifted)
print("Max ID:", shifted.max().item())


Shifted: tensor([[57527,     5,  1234]])
Max ID: 57527


In [5]:
train_dataset = DonutFormDataset(
    jsonl_path="Dataset/metadata.jsonl",
    images_dir="Dataset/images",
    processor=processor
)

In [6]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_finetune",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    fp16=True,
    remove_unused_columns=False,  # Important for custom dataset
    #device = "cpu",
    

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=processor.tokenizer,
    data_collator=DonutDataCollator(),
    #device = "cpu",
    #place_model_on_device = False
)

trainer.train()

Raw labels: tensor([57527, 57526,  1741, 45201, 20324, 52714, 34043, 45201,  2586, 40615,
        55144, 41014, 56809, 52989, 40615,  4879, 16191, 41989, 13573, 45201,
         2586, 40615, 37397, 22248, 45012, 43634, 14645, 56730, 52989, 40615,
        47597, 23220, 45201,  2586, 40615, 37397, 26707, 37114, 56981, 53278,
        47510, 39539, 34874, 52989, 40615, 41530, 38204, 45201,  2586, 40615,
         9726, 42812, 50934,  9898, 35192, 52989, 40615, 52789, 23291, 42987,
        45201,  2586, 40615, 16317, 34891, 37891, 39595, 11813,  9066, 35815,
        34891, 54136, 35815, 49794, 42592, 45384,   486, 52989, 40615, 53112,
         4306, 33916, 13573, 45201,  2586, 40615,  6807, 18829, 48517, 39611,
        24357, 52989, 40615, 49933, 28289, 45201,  2586, 40615, 35880, 42696,
        52989, 40615,  7510, 26361, 45201,  2586, 40615, 11065, 52989, 40615,
         5408, 26361, 45201,  2586, 40615,  2777, 52989, 40615, 43833, 17367,
        45201,  2586, 40615, 43833, 32255, 52989, 40

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [19,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [19,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [19,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [19,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [19,0,0], thread: [68,0,0] Assertion `srcIndex <

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
