In [1]:
import os
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
class CharacterDataset(Dataset):
    """Image-only dataset over a two-level folder tree, preprocessed for TrOCR.

    Files are discovered exactly one level below ``root_dir``
    (``root_dir/<subdir>/<file>``); files sitting directly in ``root_dir`` and
    deeper nesting are ignored. Each item is the processor's ``pixel_values``
    tensor for one image. No label is returned.

    Args:
        root_dir: Directory whose immediate sub-directories hold the images.
        processor: Optional processor object used to turn a PIL image into
            ``pixel_values``. When omitted, ``'microsoft/trocr-base-printed'``
            is loaded via ``TrOCRProcessor.from_pretrained``.
        extensions: Optional iterable of accepted file extensions (leading dot
            included, matched case-insensitively). When omitted, every
            extension Pillow has a registered plugin for is accepted.

    Raises:
        ValueError: If no matching file is found under ``root_dir``.
    """

    def __init__(self, root_dir, processor=None, extensions=None):
        self.root_dir = root_dir

        if extensions is None:
            # Iterating the returned mapping yields the extensions (".png", ...).
            extensions = Image.registered_extensions()

        self.image_files = self._list_images(root_dir, extensions)
        if len(self.image_files) == 0:
            raise ValueError("No image files found in the specified directory.")

        if processor is None:
            processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
        self.processor = processor

    @staticmethod
    def _list_images(root_dir, extensions):
        """Return sorted paths of files in ``root_dir/*/`` with an accepted extension."""
        allowed = {ext.lower() for ext in extensions}
        found = []
        # os.listdir order is arbitrary; sorting keeps indices stable across runs.
        for subdir in sorted(os.listdir(root_dir)):
            subdir_path = os.path.join(root_dir, subdir)
            if not os.path.isdir(subdir_path):
                continue
            for fname in sorted(os.listdir(subdir_path)):
                path = os.path.join(subdir_path, fname)
                # Skip non-image clutter (e.g. ".DS_Store", "Thumbs.db", notes)
                # that would otherwise make Image.open fail in __getitem__.
                if os.path.isfile(path) and os.path.splitext(fname)[1].lower() in allowed:
                    found.append(path)
        return found

    def __len__(self):
        """Number of indexed image files."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return its ``pixel_values`` tensor.

        The processor is called with ``return_tensors="pt"`` and the leading
        (batch) dimension of its output is squeezed away.
        """
        img_name = self.image_files[idx]
        # convert() returns a new, fully loaded image, so the source file can
        # be closed as soon as the with-block exits.
        with Image.open(img_name) as raw:
            image = raw.convert("RGB")
        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        return pixel_values

In [22]:


# Fine-tune TrOCR on the images found under `dataset_root`.
#
# The original loop called `criterion(outputs, ...)` with a literal Ellipsis and
# fed the model no targets, so it could not run. The model is now given `labels`
# and its built-in cross-entropy loss (`outputs.loss`) is optimised, so a separate
# `criterion` object is no longer needed.
#
# NOTE(review): the target text of each image is taken to be the name of the
# folder that contains it (datasets/train/<label>/<image>). That layout is an
# assumption -- confirm it, or replace `_FolderLabelled` with the real label source.

# Path to your dataset folders
dataset_root = "datasets/train"
output_dir = "fine_tuned_model"
max_target_length = 16  # token budget per label; presumably ample for short labels -- TODO confirm

torch.manual_seed(42)  # reproducible shuffling


class _FolderLabelled(Dataset):
    """Pairs every image of a CharacterDataset with token ids of its parent folder name."""

    def __init__(self, base, max_length):
        self.base = base
        self.tokenizer = base.processor.tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        text = os.path.basename(os.path.dirname(self.base.image_files[idx]))
        labels = self.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        ).input_ids.squeeze(0)
        # -100 is the index ignored by the cross-entropy loss, so padding
        # positions do not contribute to the gradient.
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {"pixel_values": self.base[idx], "labels": labels}


# Create dataset and dataloader
dataset = CharacterDataset(dataset_root)
dataloader = DataLoader(_FolderLabelled(dataset, max_target_length), batch_size=4, shuffle=True)

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')

# The model derives decoder inputs from `labels`, which needs these two ids.
# Only fill them in when the checkpoint's config leaves them unset.
tokenizer = dataset.processor.tokenizer
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = tokenizer.cls_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

model.to(device)

# Fine-tuning
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

num_epochs = 10
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the epoch mean rather than whatever the last batch happened to be.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / max(num_batches, 1)}")

# Save the fine-tuned model together with its processor so the directory can
# be reloaded on its own.
model.save_pretrained(output_dir)
dataset.processor.save_pretrained(output_dir)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
