In [1]:
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ['TORCHDYNAMO_VERBOSE'] = "1"

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

import numpy as np
import random
import math
from pathlib import Path
from fastai.vision import *
import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import RichProgressBar, ModelCheckpoint, EarlyStopping, LearningRateMonitor, RichModelSummary
from torchmetrics.text import CharErrorRate

import cv2
from pycocotools import coco, cocoeval, _mask
from pycocotools import mask as maskUtils
from PIL import  Image
from matplotlib import pyplot as plt
import logging
from logging.config import fileConfig
import sys
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import json

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
fileConfig("logging.ini")
logger = logging.getLogger("trainer")
torch.set_float32_matmul_precision('medium')
torch.backends.cudnn.benchmark = True
# logger.setLevel(logging.DEBUG)

In [2]:
# Crop images from the via json
# def crop_images_from_via_json(via_json, image_dir, output_dir):
#     with open(via_json) as f:
#         via_json = json.load(f)
    
#     for image_name, image_info in enumerate(via_json.values(), start=1):
#         image_path = os.path.join(image_dir, image_name)

        



In [7]:
ROOT_DIR = Path("dataset/odometer")
TRAIN_DIR = ROOT_DIR / "train"
TEST_DIR = ROOT_DIR / "test"

HF_MODEL_NAME = "microsoft/trocr-large-stage1"
# HF_MODEL_NAME = "microsoft/trocr-small-printed"

processor = TrOCRProcessor.from_pretrained(HF_MODEL_NAME)
# tokenizer = processor.tokenizer



class OdometerDataset(Dataset):
    def __init__(self, root_dir, transforms=None, processor: TrOCRProcessor=None):
        self.root_dir = root_dir
        self.transforms = transforms
        self.processor = processor
        self.image_paths = list(sorted(self.root_dir.glob("**/*.jpg")))
        self.text_paths = list(sorted(self.root_dir.glob("**/*.txt")))

    

    def __len__(self):
        return len(self.image_paths)
    
    def filter_text(self, text):
        # Returns only the digits
        return "".join(filter(str.isdigit, text))
    
    def collate_fn(self, batch):
        pixel_values = torch.stack([item["pixel_values"] for item in batch])
        labels = torch.stack([item["labels"] for item in batch])
        return {
            "pixel_values": pixel_values,
            "labels": labels,
        }
    
    
    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        text_path = image_path.parent / (image_path.stem + ".txt")

        image = Image.open(image_path)
        
        with open(text_path) as f:
            text = f.read()
            text = self.filter_text(text)

        if self.transforms:
            image = self.transforms(image=np.array(image))["image"]
            image = processor(image, return_tensors="pt").pixel_values.squeeze(0)
        else:
            image = processor(image, return_tensors="pt").pixel_values.squeeze(0)

        labels = self.processor.tokenizer(text, return_tensors="pt", max_length=10).input_ids.squeeze(0)
        labels = [label if label != -100 else self.processor.tokenizer.pad_token_id for label in labels]
        labels = torch.tensor(labels)

        logger.debug(f"Image: {image.shape}, Labels: {labels.shape}")
        
        return {
            "pixel_values": image,
            "labels": labels,
        }
    

        
        

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [8]:
MAX_EPOCHS = 100
MIN_EPOCHS = 10
BATCH_SIZE = 16

transforms = A.Compose([
    A.GaussNoise(p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.Equalize(p=0.5),
    A.HueSaturationValue(p=0.5),
    # A.Normalize(),
    # ToTensorV2(),
])

train_dataset = OdometerDataset(TRAIN_DIR, transforms=transforms, processor=processor)
test_dataset = OdometerDataset(TEST_DIR, processor=processor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=test_dataset.collate_fn)

In [9]:
model = VisionEncoderDecoderModel.from_pretrained(HF_MODEL_NAME)

model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
model.config.max_length = 10

logger.debug("Compiling model")
optimized_model = torch.compile(model)
logger.debug("Model compiled")

[DEBUG]: Compiling model [/tmp/ipykernel_144503/2376113032.py:8]
[DEBUG]: Model compiled [/tmp/ipykernel_144503/2376113032.py:10]


In [10]:
for batch in train_loader:
    pixel_values = batch["pixel_values"]
    labels = batch["labels"]
    outputs = optimized_model(pixel_values, labels=labels)
    break

RuntimeError: stack expects each tensor to be equal size, but got [4] at entry 0 and [5] at entry 1