In [1]:
import json
import torch
from torchvision.transforms import Compose
import os
import cv2 as cv
import PIL
from utils.data_utils import load_image
from typing import Tuple

In [6]:
ROOT_ANNOT_DIR = "dataset/training_data/annotations"

# Scan every annotation file and record the largest number of "form" entries
# found in a single file. Files that cannot be read, decoded, parsed, or that
# do not contain a sized "form" entry are collected in `skipped` so the scan
# keeps going instead of aborting.
max_len = 0
skipped = []
for annot in sorted(os.listdir(ROOT_ANNOT_DIR)):  # sorted: deterministic order of `skipped`
    json_path = os.path.join(ROOT_ANNOT_DIR, annot)
    try:
        # Decode explicitly as UTF-8 so the outcome does not depend on the
        # platform's default text encoding.
        with open(json_path, "r", encoding="utf8") as f:
            data = json.load(f)
        max_len = max(max_len, len(data["form"]))
    except (OSError, ValueError, KeyError, TypeError):
        # OSError: unreadable entry (e.g. a sub-directory);
        # ValueError: invalid JSON or undecodable bytes;
        # KeyError / TypeError: JSON without a usable "form" entry.
        skipped.append(json_path)
    

In [7]:
# Largest number of "form" entries found in any annotation file that was parsed successfully.
max_len

181

In [9]:
# Number of entries in the annotation directory that were not added to `skipped`.
len(os.listdir(ROOT_ANNOT_DIR))-len(skipped)

130

In [None]:
class FUNSD(torch.utils.data.Dataset):
    """Dataset over a directory laid out as ``<root_dir>/images`` and ``<root_dir>/annotations``.

    Each annotation is a JSON file with a top-level ``"form"`` list. An
    annotation is paired with the image that has the same file stem. Only
    annotations that parse and have a matching image are exposed as samples.

    Attributes:
        root_dir: directory passed to the constructor.
        img_dir: sorted file names found in ``<root_dir>/images``.
        annot_dir: sorted file names found in ``<root_dir>/annotations``.
        max_annot_len: largest ``len(data["form"])`` over all usable annotations.
        no_of_data: number of usable (image, annotation) pairs.

    NOTE(review): ``__getitem__`` calls ``load_image`` and ``normalize_bbox``;
    neither is defined in this class, so both must be in scope where the
    class is used — confirm ``normalize_bbox`` is imported/defined.
    """

    def __init__(self, root_dir):
        """Index the images and annotations found under ``root_dir``."""
        self.root_dir = root_dir
        self.img_dir_path = os.path.join(root_dir, "images")
        self.annot_dir_path = os.path.join(root_dir, "annotations")
        # Sorted so that sample indices are stable across runs and platforms.
        self.img_dir = sorted(os.listdir(self.img_dir_path))
        self.annot_dir = sorted(os.listdir(self.annot_dir_path))
        # Pair files by stem rather than by list position, so a missing or
        # extra file on either side cannot shift the pairing.
        self._image_by_stem = {
            os.path.splitext(name)[0]: name for name in self.img_dir
        }
        self._samples = []
        self.max_annot_len, self.no_of_data = self._check_max()

    def _check_max(self) -> Tuple[int, int]:
        """Scan the annotation files and build the list of usable samples.

        Returns:
            ``(max_len, count)`` where ``max_len`` is the largest number of
            ``"form"`` entries in a usable annotation and ``count`` is the
            number of usable (image, annotation) pairs.
        """
        max_len = 0
        samples = []
        for annot in self.annot_dir:
            image_name = self._image_by_stem.get(os.path.splitext(annot)[0])
            if image_name is None:
                # No image with the same stem: the sample could not be loaded.
                continue
            json_path = os.path.join(self.annot_dir_path, annot)
            try:
                with open(json_path, "r", encoding="utf8") as f:
                    data = json.load(f)
                max_len = max(max_len, len(data["form"]))
            except (OSError, ValueError, KeyError, TypeError):
                # Unreadable, undecodable, invalid JSON, or no usable "form".
                continue
            samples.append((os.path.join(self.img_dir_path, image_name), json_path))
        self._samples = samples
        return max_len, len(samples)

    def __len__(self):
        """Return the number of usable (image, annotation) pairs."""
        return self.no_of_data

    def get_line_bbox(self, bboxs):
        """Replace every box in ``bboxs`` with the box enclosing all of them.

        Args:
            bboxs: list of ``[x0, y0, x1, y1]`` boxes belonging to one segment.

        Returns:
            A list of the same length in which every element is the enclosing
            ``[x0, y0, x1, y1]`` box; an empty list for empty input.
        """
        if not bboxs:
            return []
        # Even positions hold x coordinates, odd positions hold y coordinates.
        xs = [box[j] for box in bboxs for j in range(0, len(box), 2)]
        ys = [box[j] for box in bboxs for j in range(1, len(box), 2)]
        x0, y0, x1, y1 = min(xs), min(ys), max(xs), max(ys)
        return [[x0, y0, x1, y1] for _ in bboxs]

    def __getitem__(self, idx):
        """Load sample ``idx`` and convert its annotation to token-level tags.

        Words whose text is blank are dropped. Segments labelled ``"other"``
        are tagged ``"O"``; any other label is tagged ``"B-<LABEL>"`` for the
        first word and ``"I-<LABEL>"`` for the remaining words.

        Returns:
            dict with keys ``"image"``, ``"image_path"``, ``"tokens"``,
            ``"bboxes"`` and ``"ner_tags"``; the three lists are aligned
            element by element.

        Raises:
            IndexError: if ``idx`` is outside the range of usable samples.
        """
        image_path, annotation = self._samples[idx]
        tokens = []
        bboxes = []
        ner_tags = []
        with open(annotation, "r", encoding="utf8") as f:
            data = json.load(f)
        # NOTE(review): `size` is passed straight to normalize_bbox; confirm
        # that load_image returns it in the order normalize_bbox expects.
        image, size = load_image(image_path)
        for item in data["form"]:
            cur_line_bboxes = []
            words, label = item["words"], item["label"]
            words = [w for w in words if w["text"].strip() != ""]
            if len(words) == 0:
                continue
            if label == "other":
                for w in words:
                    tokens.append(w["text"])
                    ner_tags.append("O")
                    cur_line_bboxes.append(normalize_bbox(w["box"], size))
            else:
                tokens.append(words[0]["text"])
                ner_tags.append("B-" + label.upper())
                cur_line_bboxes.append(normalize_bbox(words[0]["box"], size))
                for w in words[1:]:
                    tokens.append(w["text"])
                    ner_tags.append("I-" + label.upper())
                    cur_line_bboxes.append(normalize_bbox(w["box"], size))
            # by default: --segment_level_layout 1
            # if do not want to use segment_level_layout, comment the following line
            cur_line_bboxes = self.get_line_bbox(cur_line_bboxes)
            # box = normalize_bbox(item["box"], size)
            # cur_line_bboxes = [box for _ in range(len(words))]
            bboxes.extend(cur_line_bboxes)
        return {
            "image": image,
            "image_path": image_path,
            "tokens": tokens,
            "bboxes": bboxes,
            "ner_tags": ner_tags,
        }

In [31]:
from utils.data_utils import load_image

In [2]:
# Load one training image with the project helper `load_image`, which returns two values.
# NOTE(review): `dim` is presumably the image dimensions — confirm whether it is (height, width) or (width, height).
img, dim = load_image("dataset/training_data/images/00040534.png")

tensor([[[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],

        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],

        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],

        ...,

        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],

        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],

        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
        

In [3]:
# Second value returned by `load_image` for the sample image.
dim

(1000, 777)