In [1]:
import torch
from vit import ViT

# model1 = ResNet50ViT(img_dim=128, pretrained_resnet=False, 
#                         blocks=6, num_classes=10, 
#                         dim_linear_block=256, dim=256)
# # or
model = ViT(img_dim=256, in_channels=3, patch_dim=16, num_classes=10,dim=512)
x = torch.rand(2, 3, 256, 256)
y = model(x)
print(y.shape)

torch.Size([2, 10])


# Experiment with TransOCR
Model from:
https://github.com/FudanVI/benchmarking-chinese-text-recognition/blob/main/models/TransOCR/model/transocr.py


In [2]:
# Setup path in .env file
import os
from dotenv import load_dotenv
import cv2
import numpy as np

# Get absolut path to proeject root
load_dotenv()
data_path = os.getenv("PUBTABNET_DATA_DIR")

project_root_dir = os.path.dirname(os.path.abspath("./"))
print("Project root dir:", project_root_dir)

data_dir = os.getenv("PUBTABNET_DATA_DIR")
absolute_dir = project_root_dir + data_dir
print("Absolute path:", absolute_dir)

Project root dir: /Users/leonremke/Documents/GIT_REPOS/UNI/neural_networks_seminar
Absolute path: /Users/leonremke/Documents/GIT_REPOS/UNI/neural_networks_seminar/pubtabnet


In [51]:
!pip install jsonlines



In [63]:
import jsonlines
import json

def load_and_filter_json_lines(file_path, split_filter='train', num_items_to_print=1):
    filtered_items = []

    with open(file_path, 'r') as reader:
        img_list = jsonlines.Reader(reader)
        filtered_items = [img for img in img_list if img["split"] == split_filter]
    return filtered_items


json_file_path = f'{absolute_dir}/PubTabNet_2.0.0.jsonl'
filtered_items = load_and_filter_json_lines(json_file_path)
for item in filtered_items[:1]:
        print(json.dumps(item, indent=4))  # Print the item with an indentation of 4 spaces

{
    "filename": "PMC4840965_004_00.png",
    "split": "train",
    "imgid": 0,
    "html": {
        "cells": [
            {
                "tokens": [
                    "<b>",
                    "V",
                    "a",
                    "r",
                    "i",
                    "a",
                    "b",
                    "l",
                    "e",
                    "</b>"
                ],
                "bbox": [
                    1,
                    4,
                    27,
                    13
                ]
            },
            {
                "tokens": [
                    "<b>",
                    "H",
                    "a",
                    "z",
                    "a",
                    "r",
                    "d",
                    " ",
                    "r",
                    "a",
                    "t",
                    "i",
                    "o",
                    "</b>"
                ],
    

In [None]:
# Set configuration for training:
import torch
import torch.nn as nn
import torch.optim as optim
import argparse
import os
from transocr import Transformer
from utils import get_data_package, converter, tensor2str, get_alphabet
import zhconv

args = {
    'description': '',
    'exp_name': 'test',
    'batch_size': 32,
    'lr': 1.0,
    'epoch': 1000,
    'radical': False,
    'test': False,
    'resume': '',
    'train_dataset': '',
    'test_dataset': '',
    'imageH': 32,
    'imageW': 256,
    'coeff': 1.0,
    'alpha_path': './data/benchmark.txt',
    'alpha_path_radical': './data/radicals.txt',
    'decompose_path': './data/decompose.txt'
}

In [66]:
def concatenate_tokens(json_data):
    all_tokens = []
    for cell in data["html"]["cells"]:
        cell_tokens = cell["tokens"]
        concatenated_tokens = " ".join(cell_tokens)
        all_tokens.append(concatenated_tokens)

    result = "\n".join(all_tokens)
    return result

test_gold_label = concatenate_tokens(filtered_items[0])
print(test_gold_label)


<b> T y p e   o f   d e l a y </b>
<b> M e a n </b>
<b> S D </b>
<b> M e d i a n </b>
<b> I Q R </b>
<b> M i n </b>
<b> M a x </b>
P a t i e n t   d e l a y
5 5 . 3
4 0 . 0
5 9
5 - 1 2 3
5
1 9 8
H e a l t h c a r e   s e r v i c e s   d e l a y
7 6 . 5
9 1 . 2
4 5
3 8 - 1 2 8
0
3 7 1
T o t a l   d i a g n o s t i c   d e l a y
1 3 1 . 4
9 4 . 3
1 0 4
1 7 - 1 8 7
1 4
4 0 1


In [70]:
path_enriched_json = f'{absolute_dir}/PubTabNet_2.0.0_new.jsonl'
with open(path_enriched_json, "w") as json_file:
    
    for item in filtered_items:
        item['gold_label'] = concatenate_tokens(item)
        json.dump(item, json_file)


In [3]:
from torch.utils.data import Dataset, DataLoader
import os
import jsonlines
import cv2
from PIL import Image
import torchvision.transforms as transforms

class PaddleOCRDataset(Dataset):
    def __init__(self, image_dir, label_file, transform=None, max_images=None):
        self.image_dir = image_dir
        self.label_file = label_file
        self.labels = self.load_and_filter_json_lines(label_file)
        self.image_filenames = [filename for filename in os.listdir(image_dir) if filename.endswith(".png")]
        print("Image names initialised")
        self.transform = transform
        self.label_data = {item["filename"]: item for item in self.labels}
        print("Label data loaded")
        self.max_images = max_images if max_images is not None else len(self.image_filenames)

    def __len__(self):
        return min(self.max_images, len(self.image_filenames))

    def __getitem__(self, index):
        image_filename = self.image_filenames[index]
        image_path = os.path.join(self.image_dir, image_filename)
        label_data = self.label_data[image_filename]

        # Load image using OpenCV
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        pil_img = Image.fromarray(img)
        print("Img loaded")

        label_tokens = [cell["tokens"] for cell in label_data["html"]["cells"]]
        label = " ".join(" ".join(tokens) for tokens in label_tokens)
        print("Label loaded" + label)
        if self.transform is not None:
            img = self.transform(pil_img)

        return img, label

    def load_and_filter_json_lines(self, file_path, split_filter='train'):
        with open(file_path, 'r') as reader:
            img_list = jsonlines.Reader(reader)
            filtered_items = [img for img in img_list if img["split"] == split_filter]
        return filtered_items

class ResizeNormalize(object):
    def __init__(self, size, interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation
        self.toTensor = transforms.ToTensor()

    def __call__(self, img):
        img = img.resize(self.size, self.interpolation)
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)
        return img

# Example usage
image_dir = f"{absolute_dir}/train"
label_file = f"{absolute_dir}/PubTabNet_2.0.0.jsonl"
transform = ResizeNormalize(size=(256, 256))  # Define your transformation
max_images = 1  # Limit the number of images for testing
dataset = PaddleOCRDataset(image_dir=image_dir, label_file=label_file, transform=transform, max_images=max_images)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

for batch_images, batch_labels in dataloader:
    print(batch_labels)
    break


  def __init__(self, size, interpolation=Image.BILINEAR):


In [None]:
train_dataset = YourDataset(data_dir='path_to_paddleOCR_dataset_train')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
model = Transformer(args)
model = nn.DataParallel(model)
train_loader, test_loader = get_data_package(args)
optimizer = optim.Adadelta(model.parameters(), lr=args.lr, rho=0.9, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()
best_acc = -1

# if args.resume.strip() != '':
#     model.load_state_dict(torch.load(args.resume))
#     print('loading pretrained model！！！')

def train(epoch, iteration, image, length, text_input, text_gt, length_radical, radical_input, radical_gt):
    model.train()
    optimizer.zero_grad()
    result = model(image, length, text_input, length_radical, radical_input)

    text_pred = result['pred']
    loss_char = criterion(text_pred, text_gt)
    # if args.radical:
    #     radical_pred = result['radical_pred']
    #     loss_radical = criterion(radical_pred, radical_gt)
    #     loss = loss_char + args.coeff * loss_radical
    #     print(
    #         'epoch : {} | iter : {}/{} | loss : {} | char : {} | radical : {} '.format(epoch, iteration, len(train_loader), loss, loss_char, loss_radical))

    # else:
    loss = loss_char
    print('epoch : {} | iter : {}/{} | loss : {}'.format(epoch, iteration, len(train_loader), loss))
    loss.backward()
    optimizer.step()

print('-------------')
for epoch in range(args.epoch):
    torch.save(model.state_dict(), './history/{}/model.pth'.format(args.exp_name))
    dataloader = iter(train_loader)
    train_loader_len = len(train_loader)
    print('length of training datasets:', train_loader_len)
    for iteration in range(train_loader_len):
        data = dataloader.next()
        image, label, _ = data
        image = torch.nn.functional.interpolate(image, size=(args.imageH, args.imageW))

        length, text_input, text_gt, length_radical, radical_input, radical_gt, string_label = converter(label, args)
        train(epoch, iteration, image, length, text_input, text_gt, length_radical, radical_input, radical_gt)

    # Test 

        # # scheduler
        # if (epoch + 1) <= 40 and (epoch + 1) % 8 == 0:
        #     for p in optimizer.param_groups:
        #         p['lr'] *= 0.8
        # elif (epoch + 1) > 40 and (epoch + 1) % 2 == 0:
        #     for p in optimizer.param_groups:
        #         p['lr'] *= 0.8

AttributeError: 'dict' object has no attribute 'alpha_path'