OCR entire page instead of individual elements
subject1
    - Validate that OCR’ing entire page is faster than OCR’ing individual blocks
    - Validate that the text returned by OCR’ing the entire page is as accurate as the text returned by OCR’ing the individual blocks

In [1]:
import os

import pdf2image

# All paths are derived from the kernel's current working directory.
cur_dir = os.getcwd()

# Sample documents are looked up under "sample-docs", two directory levels
# above the working directory.
two_levels_up = (os.pardir, os.pardir)
base_dir = os.path.join(cur_dir, *two_levels_up)
example_docs_dir = os.path.join(base_dir, "sample-docs")

# Scratch folder (next to the working directory) for temporary outputs;
# exist_ok=True keeps this cell safe to re-run.
test_dir = os.path.join(cur_dir, "tmp")
os.makedirs(test_dir, exist_ok=True)

In [2]:
import tempfile

# Convert the sample PDF into one image per page and keep each page both as an
# in-memory image (input for per-block OCR) and as a file on disk (input for
# whole-page OCR, which takes image paths).
filename = "layout-parser-paper.pdf"
f_path = os.path.join(example_docs_dir, filename)

# Per-document output folder: <test_dir>/<pdf name without extension>
filename_without_extension = os.path.splitext(filename)[0]
sub_test_dir = os.path.join(test_dir, filename_without_extension)
os.makedirs(sub_test_dir, exist_ok=True)

individual_page_img_paths = []
individual_page_images = []
with tempfile.TemporaryDirectory() as tmpdir:
    images = pdf2image.convert_from_path(f_path, output_folder=tmpdir)

    # Everything below stays inside the `with` block: with `output_folder` set,
    # pdf2image hands back images that are backed by files in `tmpdir`, and
    # that directory is deleted as soon as the block exits.
    for i, image in enumerate(images):
        # Save the page to a file so the whole-page OCR step has real paths to
        # read; without this `individual_page_img_paths` would stay empty.
        # NOTE(review): JPEG is lossy while the per-block OCR sees the
        # in-memory image - consider a lossless format if this skews accuracy.
        img_path = os.path.join(sub_test_dir, f"page_{i+1}.jpg")
        image.save(img_path)
        individual_page_img_paths.append(img_path)

        # Force the pixel data into memory so the image no longer depends on
        # its temporary backing file.
        image.load()
        individual_page_images.append(image)

n_pages = len(individual_page_images)
print(f"number_of_pages: {n_pages}")

# Show the size of the first few pages as a quick sanity check.
print("individual_page_images:")
for i, image in enumerate(individual_page_images[:3]):
    print(f"\timage{i+1} - size: {image.size}")


number_of_pages: 16
individual_page_images:
	image1 - size: (1700, 2200)
	image2 - size: (1700, 2200)
	image3 - size: (1700, 2200)


In [3]:
# OCR'ing individual blocks
#
# Runs the layout-detection OCR helper over the in-memory page images, writing
# its outputs under `sub_test_dir`. The helper's three results are unpacked
# as the inferred layouts, the inference time and the recognised text.

from engine import run_ocr_with_layout_detection

ocr_by_block_result = run_ocr_with_layout_detection(
    images=individual_page_images, output_dir=sub_test_dir
)
inferred_layouts, infer_time_individual, text_individual = ocr_by_block_result


model_type: UnstructuredObjectDetectionModel


In [None]:
# OCR'ing entire page
#
# `run_ocr` is imported from the same local `engine` module that the per-block
# OCR cell imports from, so both helpers resolve the same way on a fresh
# kernel.
# NOTE: `individual_page_img_paths` must list the saved page image files; an
# empty list makes the timing and text comparison below meaningless.

from engine import run_ocr

# The two results are unpacked as the inference time and the recognised text.
infer_time_entire, text_entire = run_ocr(image_paths=individual_page_img_paths)


In [None]:

# Report total and per-page inference time for both OCR strategies, using the
# same layout for each so the numbers are easy to compare side by side.
timing_report = (
    ("Processing Time (OCR'ing individual blocks)", infer_time_individual),
    ("Processing Time (OCR'ing entire page)", infer_time_entire),
)

for heading, total_time in timing_report:
    print(heading)
    print(f"\ttotal_infer_time: {total_time}")
    print(f"\tavg_infer_time_per_page: {total_time / n_pages}")

In [None]:
# calculate similarity ratio
#
# Compares the per-block OCR text with the whole-page OCR text character by
# character; `ratio()` yields a float in [0, 1], where 1.0 means identical.
# NOTE(review): `autojunk` is left at its default; on long inputs difflib's
# "popular element" heuristic kicks in and may distort the ratio - confirm
# whether `autojunk=False` is needed here.
from difflib import SequenceMatcher

text_matcher = SequenceMatcher(None, text_individual, text_entire)
similarity_ratio = text_matcher.ratio()

print(f"similarity_ratio: {similarity_ratio}")


In [None]:
import nltk

# Download the required resources (run this once)
nltk.download('punkt')

# Tokenize the per-block OCR text into words and report total / unique counts.
word_list_by_individual_blocks = nltk.word_tokenize(text_individual)
print("n_word_list_individual_blocks:", len(word_list_by_individual_blocks))
word_sets_individual = set(word_list_by_individual_blocks)
print(f"n_word_sets_individual_blocks: {len(word_sets_individual)}")

# Same for the whole-page OCR text. The labels say "entire" so these counts
# are not mistaken for the per-block ones printed above.
word_list_entire = nltk.word_tokenize(text_entire)
print("n_word_list_entire:", len(word_list_entire))
word_sets_entire = set(word_list_entire)
print(f"n_word_sets_entire: {len(word_sets_entire)}")

# Find unique elements using difference:
#   first set  - tokens only produced by per-block OCR
#   second set - tokens only produced by whole-page OCR
print("diff_elements:")
print(f"{word_sets_individual - word_sets_entire}\n")
print(word_sets_entire - word_sets_individual)
