In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset
from dataclasses import dataclass
from typing import Any, Dict, List
from PIL import Image
import torch
import json
import os

In [2]:
# Load processor and model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

# Move to appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: cuda


In [3]:
# Configure tokenizer and model properly
original_vocab_size = len(processor.tokenizer)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.add_special_tokens({'pad_token': '<pad>'})
    # Only resize if we actually added new tokens
    if len(processor.tokenizer) > original_vocab_size:
        model.resize_token_embeddings(len(processor.tokenizer))

model.config.pad_token_id = processor.tokenizer.pad_token_id

# Validate decoder start token
docvqa_id = processor.tokenizer.convert_tokens_to_ids("<s_docvqa>")
if docvqa_id == processor.tokenizer.unk_token_id or docvqa_id is None:
    print("Warning: <s_docvqa> token not found, using <s> token")
    model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<s>")
else:
    model.config.decoder_start_token_id = docvqa_id

print(f"pad_token_id: {model.config.pad_token_id}")
print(f"decoder_start_token_id: {model.config.decoder_start_token_id}")
print(f"Vocab size: {len(processor.tokenizer)}")
print(f"Max valid token ID: {len(processor.tokenizer) - 1}")

# Validate configuration
assert model.config.pad_token_id < len(processor.tokenizer), f"pad_token_id {model.config.pad_token_id} >= vocab_size {len(processor.tokenizer)}"
assert model.config.decoder_start_token_id < len(processor.tokenizer), f"decoder_start_token_id {model.config.decoder_start_token_id} >= vocab_size {len(processor.tokenizer)}"

pad_token_id: 1
decoder_start_token_id: 57527
Vocab size: 57532
Max valid token ID: 57531


In [4]:
class DonutFormDataset(Dataset):
    def __init__(self, jsonl_path, images_dir, processor, max_length=512):
        self.samples = []
        self.images_dir = images_dir
        self.processor = processor
        self.max_length = max_length
        
        # Load all samples
        with open(jsonl_path, 'r') as f:
            for line in f:
                self.samples.append(json.loads(line))
        
        print(f"Loaded {len(self.samples)} samples")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        
        # Load and process image
        image_path = os.path.join(self.images_dir, item['file_name'])
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
            
        image = Image.open(image_path).convert("RGB")
        
        # Process image
        pixel_values = self.processor.image_processor(
            image, 
            return_tensors="pt"
        ).pixel_values.squeeze()
        
        # Create target text in DONUT format
        # Use a simpler format to avoid tokenization issues
        gt_data = item['ground_truth']['gt_parse']
        target_text = "<s_docvqa><s_answer>" + json.dumps(
            gt_data, 
            ensure_ascii=True,  # Use ASCII to avoid special characters
            separators=(',', ':')  # Compact JSON
        ) + "</s_answer></s>"
        
        # Tokenize target text
        labels = self.processor.tokenizer(
            target_text,
            add_special_tokens=False,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze()
        
        # Validate token IDs are within vocabulary range
        vocab_size = len(self.processor.tokenizer)
        if labels.max() >= vocab_size:
            print(f"WARNING: Found token ID {labels.max()} >= vocab_size {vocab_size}")
            print(f"Target text: {target_text[:100]}...")
            # Clamp invalid token IDs to UNK token
            labels = torch.clamp(labels, 0, vocab_size - 1)
        
        # Replace pad tokens with -100 for loss calculation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        
        return {
            "pixel_values": pixel_values,
            "labels": labels
        }

@dataclass
class DonutDataCollator:
    """Data collator for DONUT training"""
    
    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Stack pixel values and labels
        pixel_values = torch.stack([f["pixel_values"] for f in features])
        labels = torch.stack([f["labels"] for f in features])
        
        return {
            "pixel_values": pixel_values,
            "labels": labels
        }



In [5]:
# Create dataset with debugging
print("Creating dataset...")

# First, let's test tokenization with a simple example
test_text = '<s_docvqa><s_answer>{"test": "value"}</s_answer></s>'
test_tokens = processor.tokenizer(test_text, add_special_tokens=False, return_tensors="pt").input_ids
print(f"Test tokenization - Max ID: {test_tokens.max()}, Vocab size: {len(processor.tokenizer)}")

if test_tokens.max() >= len(processor.tokenizer):
    print("ERROR: Test tokenization already produces invalid token IDs!")
    print("This suggests an issue with the tokenizer setup.")
    exit()

train_dataset = DonutFormDataset(
    jsonl_path="Dataset/metadata.jsonl",
    images_dir="Dataset/images", 
    processor=processor,
    max_length=512
)

# Test dataset loading
print("Testing dataset...")
try:
    sample = train_dataset[0]
    print(f"Sample pixel_values shape: {sample['pixel_values'].shape}")
    print(f"Sample labels shape: {sample['labels'].shape}")
    print(f"Labels min/max: {sample['labels'].min()}/{sample['labels'].max()}")
except Exception as e:
    print(f"Dataset error: {e}")
    exit()


Creating dataset...
Test tokenization - Max ID: 57527, Vocab size: 57532
Loaded 3 samples
Testing dataset...
Sample pixel_values shape: torch.Size([3, 2560, 1920])
Sample labels shape: torch.Size([512])
Labels min/max: -100/57527


In [6]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./donut_finetune",
    per_device_train_batch_size=1,  # Start small
    gradient_accumulation_steps=4,   # Simulate larger batch size
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=100,
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    fp16=torch.cuda.is_available(),  # Only use fp16 if CUDA available
    gradient_checkpointing=True,     # Save memory
    dataloader_num_workers=0,        # Avoid multiprocessing issues
    report_to=None,                  # Disable wandb/tensorboard
)

# Create trainer
print("Creating trainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=processor.tokenizer,
    data_collator=DonutDataCollator(),
)


Creating trainer...


In [7]:
# Debug tokenizer issues
print("=== TOKENIZER DEBUGGING ===")
print(f"Vocab size: {len(processor.tokenizer)}")
print(f"Pad token: '{processor.tokenizer.pad_token}' (ID: {processor.tokenizer.pad_token_id})")
print(f"UNK token: '{processor.tokenizer.unk_token}' (ID: {processor.tokenizer.unk_token_id})")

# Test special tokens
special_tokens = ["<s_docvqa>", "<s_answer>", "</s_answer>", "</s>"]
for token in special_tokens:
    token_id = processor.tokenizer.convert_tokens_to_ids(token)
    print(f"Token '{token}' -> ID {token_id} (valid: {token_id < len(processor.tokenizer)})")

# Test with your actual data
with open("Dataset/metadata.jsonl", 'r') as f:
    sample = json.loads(f.readline())
    
test_json = json.dumps(sample['ground_truth']['gt_parse'], ensure_ascii=True)
test_text = f"<s_docvqa><s_answer>{test_json}</s_answer></s>"
tokens = processor.tokenizer(test_text, add_special_tokens=False, return_tensors="pt").input_ids
print(f"Sample text length: {len(test_text)}")
print(f"Token IDs range: {tokens.min()} to {tokens.max()}")
print(f"Valid range: 0 to {len(processor.tokenizer)-1}")

=== TOKENIZER DEBUGGING ===
Vocab size: 57532
Pad token: '<pad>' (ID: 1)
UNK token: '<unk>' (ID: 3)
Token '<s_docvqa>' -> ID 57527 (valid: True)
Token '<s_answer>' -> ID 57526 (valid: True)
Token '</s_answer>' -> ID 57523 (valid: True)
Token '</s>' -> ID 2 (valid: True)
Sample text length: 471
Token IDs range: 2 to 57527
Valid range: 0 to 57531


In [9]:
!nvidia-smi

Wed Jul  2 15:20:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.55.01              Driver Version: 576.40         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  |   00000000:07:00.0  On |                  N/A |
|  0%   49C    P8             35W /  300W |    5053MiB /  11264MiB |     21%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:

# Start training
print("Starting training...")
try:
    trainer.train()
except Exception as e:
    print(f"Training error: {e}")
    import traceback
    traceback.print_exc()

# Save model
print("Saving model...")
trainer.save_model()
processor.save_pretrained("./donut_finetune")

print("Training completed!")


Starting training...


`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [483,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [483,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [483,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [483,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [483,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [483,0,0], thread: [5,0,0] Assertio

Training error: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Saving model...


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Save model
print("Saving model...")
trainer.save_model()
processor.save_pretrained("./donut_finetune")

print("Training completed!")


In [None]:
# Test inference
def test_inference(image_path):
    """Test the trained model"""
    image = Image.open(image_path).convert("RGB")
    
    # Prepare decoder input
    decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
    
    # Process image
    pixel_values = processor.image_processor(image, return_tensors="pt").pixel_values
    
    if torch.cuda.is_available():
        pixel_values = pixel_values.cuda()
        decoder_input_ids = decoder_input_ids.cuda()
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=512,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
            num_beams=1,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            return_dict_in_generate=True,
        )
    
    # Decode output
    sequence = processor.tokenizer.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = sequence.replace("<s_docvqa><s_answer>", "").replace("</s_answer>", "")
    
    try:
        parsed_result = json.loads(sequence)
        return parsed_result
    except:
        return {"raw_output": sequence}


In [None]:
result = test_inference("Dataset/images/test1.png")
print("Inference result:", result)