# Identify all Markdown files in the master folder, which contains subfolders of files

Helper functions for iterating over each file.

In [14]:
import os
from typing import Iterator, Tuple

In [24]:
# Function that yields one Markdown (.md) file path at a time
def get_files_one_by_one(directory: str) -> Iterator[str]:
    """Lazily yield the path of every Markdown file under *directory*.

    The directory tree is walked recursively with ``os.walk``. A file is
    selected when its name ends with ".md", compared case-insensitively, so
    other files (".DS_Store", images, ...) are skipped.

    Args:
        directory: Root folder to search.

    Yields:
        The walked folder path joined with the file name, one per match.
    """
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # The suffix test alone is enough: names such as ".DS_Store"
            # never end with ".md", so no separate exclusion is needed.
            if file.lower().endswith(".md"):
                yield os.path.join(root, file)

# Function that collects every path produced by get_files_one_by_one into a list
def get_file_paths(directory: str) -> list:
    """Return all paths yielded by get_files_one_by_one(directory) as a list."""
    return [path for path in get_files_one_by_one(directory)]

def get_file_name_and_parent_folder(file_path: str) -> Tuple[str, str]:
    """Split a path into its immediate parent folder name and its file name.

    Note the order of the result: despite the function's name, the parent
    folder comes first and the file name second.

    Args:
        file_path: Path to a file, e.g. "a/b/c.md".

    Returns:
        A ``(parent_folder, file_name)`` tuple, e.g. ``("b", "c.md")``.
        ``parent_folder`` is an empty string when the path has no named
        parent component (e.g. "c.md" or "/c.md").
    """
    # os.path.split returns (dirname, basename) in a single call.
    parent_path, file_name = os.path.split(file_path)
    parent_folder = os.path.basename(parent_path)
    return parent_folder, file_name

In [18]:
# Collect the path of every Markdown file found under the OCR results folder.
list_of_md_files = get_file_paths("../data/ocr_results_outputs/")
# Preview the first five paths (shown as the cell output in a notebook).
list_of_md_files[:5]

['../data/ocr_results_outputs/document_1020/document_1020.md',
 '../data/ocr_results_outputs/document_904/document_904.md',
 '../data/ocr_results_outputs/document_138/document_138.md',
 '../data/ocr_results_outputs/document_597/document_597.md',
 '../data/ocr_results_outputs/document_107/document_107.md']

# Detect large tables in each file

In [None]:
# NOTE(review): presumably the minimum number of rows for a table to count as
# "large"; it is not referenced by the code visible here - confirm against its use.
min_num_rows = 15

In [73]:
import re

In [77]:
# Pattern for a Markdown table alignment line such as "|---|:-:|": an optional
# leading pipe followed by one or more dash cells, each closed by a pipe.
# Compiled once because it is tested against every line of every file.
_ALIGNMENT_LINE_RE = re.compile(r'^\s*\|?(\s*:?-+:?\s*\|)+\s*$')


def is_alignment_line(line: str) -> bool:
    """Check if a line is a Markdown table alignment line."""
    return _ALIGNMENT_LINE_RE.match(line) is not None


def _build_table(rows: list) -> dict:
    """Wrap a non-empty list of (line_index, raw_line) pairs as a table record."""
    return {
        "start": rows[0][0],
        "end": rows[-1][0],
        "lines": rows
    }


def extract_markdown_tables(md_path: str) -> list:
    """Extract tables from a Markdown file using alignment lines and track start/end.

    A table starts at an alignment line, plus the line directly above it when
    that line starts with '|' (the header), and continues for as long as the
    following lines start with '|'.

    When a new alignment line appears while a table is still open (two tables
    with no separating line), the open table is saved before the new one
    starts instead of being discarded. The row directly above the new
    alignment line is treated as the new table's header, not as the last row
    of the previous table.

    Args:
        md_path: Path of the Markdown file; it is read as UTF-8.

    Returns:
        A list of dicts, one per table in file order, with the keys:
        "start" - 0-based index of the table's first line,
        "end"   - 0-based index of the table's last line,
        "lines" - list of (line_index, raw_line) tuples; raw lines keep
                  their trailing newline.
    """
    with open(md_path, 'r', encoding='utf-8') as md_file:
        lines = md_file.readlines()

    tables = []
    # Rows of the table currently being collected; empty while outside a table.
    current_table = []
    # Index of the open table's alignment line.
    alignment_index = None

    for i, line in enumerate(lines):
        stripped = line.strip()

        if is_alignment_line(stripped):
            # Optional header just above the alignment line
            has_header = i > 0 and lines[i - 1].strip().startswith('|')

            if current_table:
                # A table is still open: hand the header row over to the new
                # table, then keep what is left of the open one.
                if has_header and current_table[-1][0] == i - 1:
                    current_table.pop()
                # Only keep the remainder if it still includes its own
                # alignment line; otherwise it is not a table.
                if current_table and current_table[-1][0] >= alignment_index:
                    tables.append(_build_table(current_table))

            current_table = [(i - 1, lines[i - 1])] if has_header else []
            current_table.append((i, line))
            alignment_index = i

        elif current_table:
            if stripped.startswith('|'):
                current_table.append((i, line))
            else:
                # End of table
                tables.append(_build_table(current_table))
                current_table = []

    # Final table at EOF
    if current_table:
        tables.append(_build_table(current_table))

    return tables

# Extract the table data

# Save it as a CSV file 

# Reference the table in the Markdown