Compare folders in /data/raw_data versus /data/ocr_results_outputs

In [2]:
import os

In [12]:
def list_pdf_files(directory_path):
    pdf_files = [file for file in os.listdir(directory_path)
                 if file.lower().endswith(".pdf") and os.path.isfile(os.path.join(directory_path, file))]
    return pdf_files

In [20]:
list_pdf_files("../data/raw_data/")[:5]

['document_459.pdf',
 'document_317.pdf',
 'document_471.pdf',
 'document_465.pdf',
 'document_303.pdf']

In [4]:
def list_folders_in_directory(directory_path):
    folders = [name for name in os.listdir(directory_path)
               if os.path.isdir(os.path.join(directory_path, name))]
    return folders

In [18]:
list_folders_in_directory("../data/ocr_results_outputs//")[:5]

['document_1020',
 'document_904',
 'document_138',
 'document_597',
 'document_107']

In [22]:
def remove_extension(file_list):
    return [os.path.splitext(file)[0] for file in file_list]

In [31]:
pdf_file_names = remove_extension(list_pdf_files("../data/raw_data/"))

folder_names = remove_extension(list_folders_in_directory("../data/ocr_results_outputs/"))

Perform comparison

In [40]:
print(f'Original folder has {len(pdf_file_names)} .pdf files.')
print(f'Processed folder has {len(folder_names)} subfolders, each corresponding to the original file.')

Original folder has 1170 .pdf files.
Processed folder has 883 subfolders, each corresponding to the original file.


Since the two value does not match, it means there are some files which was in original .pdf file list but not in the ocr processed result.

Identify the mismatch files

In [44]:
mismatch_files = list(set(pdf_file_names) - set(folder_names))
mismatch_files[:5]

['document_833',
 'document_1144',
 'document_302',
 'document_491',
 'document_1115']

Sanity check

In [47]:
assert len(mismatch_files) == (len(pdf_file_names) - len(folder_names))

----

Examine folder structure of each generated OCR folder which contain `.pdf` (original source), `.md` (parsed text file), `images` folder which contain images from original source.

Count number of `.jpeg` files in each folder

In [54]:
def count_jpeg_in_image_folder(folder_path):
    image_folder = os.path.join(folder_path, "images")
    if not os.path.isdir(image_folder):
        return 0
    return len([f for f in os.listdir(image_folder)
                if f.lower().endswith(".jpeg") and os.path.isfile(os.path.join(image_folder, f))])

In [58]:
count_jpeg_in_image_folder("../data/ocr_results_outputs/document_13/")

12

In [64]:
base_dir = "../data/ocr_results_outputs"

Examine file size for `.md`

In [71]:
def get_md_file_size(folder_name):
    file_size_byte = 0
    if not os.path.exists(f'{base_dir}/{folder_name}/{folder_name}.md'):
        return file_size_byte
    file_size_kb = os.path.getsize(f'{base_dir}/{folder_name}/{folder_name}.md') / 1024
    return file_size_kb

In [73]:
get_md_file_size("document_13")

43.330078125

Examine file size for `.pdf`

In [75]:
def get_pdf_file_size(folder_name):
    file_size_byte = 0
    if not os.path.exists(f'{base_dir}/{folder_name}/{folder_name}.pdf'):
        return file_size_byte
    file_size_kb = os.path.getsize(f'{base_dir}/{folder_name}/{folder_name}.pdf') / 1024
    return file_size_kb

In [80]:
get_pdf_file_size("document_1020")

95.49609375