Compare folders in /data/raw_data versus /data/ocr_results_outputs

In [2]:
import os

In [3]:
def list_pdf_files(directory_path):
    pdf_files = [file for file in os.listdir(directory_path)
                 if file.lower().endswith(".pdf") and os.path.isfile(os.path.join(directory_path, file))]
    return pdf_files

In [4]:
list_pdf_files("../data/raw_data/")[:5]

['document_459.pdf',
 'document_317.pdf',
 'document_471.pdf',
 'document_465.pdf',
 'document_303.pdf']

In [5]:
def list_folders_in_directory(directory_path):
    folders = [name for name in os.listdir(directory_path)
               if os.path.isdir(os.path.join(directory_path, name))]
    return folders

In [6]:
list_folders_in_directory("../data/ocr_results_outputs//")[:5]

['document_1020',
 'document_904',
 'document_138',
 'document_597',
 'document_107']

In [7]:
def remove_extension(file_list):
    return [os.path.splitext(file)[0] for file in file_list]

In [8]:
pdf_file_names = remove_extension(list_pdf_files("../data/raw_data/"))

folder_names = remove_extension(list_folders_in_directory("../data/ocr_results_outputs/"))

Perform comparison

In [10]:
print(f'Original folder has {len(pdf_file_names)} .pdf files.')
print(f'Processed folder has {len(folder_names)} subfolders, each corresponding to the original file.')

Original folder has 1170 .pdf files.
Processed folder has 883 subfolders, each corresponding to the original file.


Since the two value does not match, it means there are some files which was in original .pdf file list but not in the ocr processed result.

Identify the mismatch files

In [13]:
mismatch_files = list(set(pdf_file_names) - set(folder_names))
mismatch_files[:5]

['document_404',
 'document_566',
 'document_231',
 'document_802',
 'document_228']

Sanity check

In [15]:
assert len(mismatch_files) == (len(pdf_file_names) - len(folder_names))

----

Examine folder structure of each generated OCR folder which contain `.pdf` (original source), `.md` (parsed text file), `images` folder which contain images from original source.

In [18]:
base_dir = "../data/ocr_results_outputs"

Count number of `.jpeg` files in each folder

In [20]:
def count_jpeg_in_image_folder(folder_name):
    image_folder = os.path.join(base_dir, folder_name, "images")
    if not os.path.isdir(image_folder):
        return 0
    return len([f for f in os.listdir(image_folder)
                if f.lower().endswith(".jpeg") and os.path.isfile(os.path.join(image_folder, f))])

In [21]:
count_jpeg_in_image_folder("document_13")

12

Examine file size for `.md`

In [23]:
def get_md_file_size_kb(folder_name):
    file_size_byte = 0
    if not os.path.exists(f'{base_dir}/{folder_name}/{folder_name}.md'):
        return file_size_byte
    file_size_kb = os.path.getsize(f'{base_dir}/{folder_name}/{folder_name}.md') / 1024
    return file_size_kb

In [24]:
get_md_file_size_kb("document_13")

43.330078125

Examine file size for `.pdf`

In [26]:
def get_pdf_file_size_kb(folder_name):
    file_size_byte = 0
    if not os.path.exists(f'{base_dir}/{folder_name}/{folder_name}.pdf'):
        return file_size_byte
    file_size_kb = os.path.getsize(f'{base_dir}/{folder_name}/{folder_name}.pdf') / 1024
    return file_size_kb

In [27]:
get_pdf_file_size_kb("document_1020")

95.49609375

Examine `.md` file structure: number of lines, number of characters

In [29]:
def count_lines_in_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)
    
        # return len(f.readlines())
        
        # content = f.read()
        # if not content:
        #     return 0
        # return content.count('\n') + (0 if content.endswith('\n') else 1)

def count_characters_in_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return len(f.read())

def analyze_md_file(folder_name):
    md_files = [f for f in os.listdir(f'{base_dir}/{folder_name}/')
                if f.lower().endswith(".md") and os.path.isfile(os.path.join(f'{base_dir}/{folder_name}/', f))]
    
    if not md_files:
        return 0, 0

    md_path = os.path.join(f'{base_dir}/{folder_name}/', md_files[0])
    line_count = count_lines_in_file(md_path)
    char_count = count_characters_in_file(md_path)
    
    return line_count, char_count

In [30]:
analyze_md_file('document_108')

(158, 36141)

Gloabl examination of folder structure of generated OCR folder

In [32]:
def examine_folder_structure(root_folder):
    summary = {}
    for folder_name in os.listdir(root_folder):
        folder_path = os.path.join(root_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue

        pdf_file_size_kb = get_pdf_file_size_kb(folder_name)
        md_file_size_kb = get_md_file_size_kb(folder_name)
        line_count, char_count = analyze_md_file(folder_name)
        jpeg_count = count_jpeg_in_image_folder(folder_name)

        summary[folder_name] = {
            "pdf_file_size_kb": pdf_file_size_kb,
            "md_file_size_kb": md_file_size_kb,
            "md_line_count": line_count,
            "md_character_count": char_count,
            "jpeg_count_in_images_folder": jpeg_count
        }

    return summary

In [33]:
examine_folder_structure(base_dir)

{'document_1020': {'pdf_file_size_kb': 95.49609375,
  'md_file_size_kb': 7.083984375,
  'md_line_count': 219,
  'md_character_count': 7248,
  'jpeg_count_in_images_folder': 1},
 'document_904': {'pdf_file_size_kb': 234.37890625,
  'md_file_size_kb': 32.9326171875,
  'md_line_count': 194,
  'md_character_count': 33723,
  'jpeg_count_in_images_folder': 2},
 'document_138': {'pdf_file_size_kb': 357.68359375,
  'md_file_size_kb': 56.5380859375,
  'md_line_count': 218,
  'md_character_count': 57895,
  'jpeg_count_in_images_folder': 0},
 'document_597': {'pdf_file_size_kb': 484.638671875,
  'md_file_size_kb': 5.322265625,
  'md_line_count': 124,
  'md_character_count': 5450,
  'jpeg_count_in_images_folder': 1},
 'document_107': {'pdf_file_size_kb': 251.66796875,
  'md_file_size_kb': 13.3759765625,
  'md_line_count': 112,
  'md_character_count': 13697,
  'jpeg_count_in_images_folder': 2},
 'document_751': {'pdf_file_size_kb': 5982.708984375,
  'md_file_size_kb': 12.37109375,
  'md_line_count'

In [34]:
import pandas as pd

In [60]:
# Define variable to store the summary result
summary = examine_folder_structure(base_dir)

# Convert to DataFrame
summary_df = pd.DataFrame.from_dict(summary, orient='index')

summary_df.reset_index(inplace=True)
summary_df.rename(columns={'index': 'folder_name'}, inplace=True)

# View result
summary_df

Unnamed: 0,folder_name,pdf_file_size_kb,md_file_size_kb,md_line_count,md_character_count,jpeg_count_in_images_folder
0,document_1020,95.496094,7.083984,219,7248,1
1,document_904,234.378906,32.932617,194,33723,2
2,document_138,357.683594,56.538086,218,57895,0
3,document_597,484.638672,5.322266,124,5450,1
4,document_107,251.667969,13.375977,112,13697,2
...,...,...,...,...,...,...
878,document_518,515.031250,76.172852,885,78001,2
879,document_188,78.519531,5.769531,130,5906,1
880,document_715,819.650391,2.321289,80,2374,4
881,document_527,120.374023,28.816406,174,29508,0
