# Identify all Markdown files in the master folder, which contains folders of files

Helper functions for iterating through each file

In [4]:
import os
from typing import Iterator, Tuple

In [5]:
# Generator that yields one Markdown (.md) file path at a time
def get_files_one_by_one(directory: str) -> Iterator[str]:
    """Lazily yield the path of every Markdown file found under *directory*.

    The tree is walked recursively with ``os.walk``. A file qualifies when its
    name ends with ``.md`` (case-insensitive). Everything else, including
    ``.DS_Store`` entries, is rejected by that same suffix test, so no
    separate check for it is needed.

    Args:
        directory: Root folder to search.

    Yields:
        ``os.path.join(root, file)`` for each matching file, in the order
        ``os.walk`` reports them.
    """
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(".md"):
                yield os.path.join(root, file)

# Eager counterpart of get_files_one_by_one: collects every yielded path
def get_file_paths(directory: str) -> list:
    """Return every path produced by ``get_files_one_by_one(directory)`` as a list."""
    return [*get_files_one_by_one(directory)]

def get_file_name_and_parent_folder(file_path: str) -> Tuple[str, str]:
    """Split a path into its immediate parent folder name and its file name.

    Args:
        file_path: Path to a file, e.g. ``"a/document_1/document_1.md"``.

    Returns:
        ``(parent_folder, file_name)``. Note the order: the parent folder comes
        first even though the function name mentions the file name first.
        ``parent_folder`` is an empty string when the path has no directory
        component.
    """
    # os.path.split gives (dirname, basename) in one call.
    folder_path, file_name = os.path.split(file_path)
    parent_folder = os.path.basename(folder_path)
    return parent_folder, file_name

In [6]:
# Collect every .md path found under the OCR results folder.
list_of_md_files = get_file_paths("../data/ocr_results_outputs/")
# Preview the first five paths (the cell's last expression is echoed by the notebook).
list_of_md_files[:5]

['../data/ocr_results_outputs/document_1020/document_1020.md',
 '../data/ocr_results_outputs/document_904/document_904.md',
 '../data/ocr_results_outputs/document_138/document_138.md',
 '../data/ocr_results_outputs/document_597/document_597.md',
 '../data/ocr_results_outputs/document_107/document_107.md']

# Detect large tables in each file

In [8]:
# Presumably the row-count threshold for what counts as a "large" table.
# NOTE(review): not referenced by any code visible in this export — confirm it is still needed.
min_num_rows = 15

In [9]:
import re

In [44]:
# Optional leading pipe, then one or more dash cells (optionally flanked by
# colons), each of which must be closed by a pipe.
_ALIGNMENT_LINE_PATTERN = re.compile(r'^\s*\|?(\s*:?-+:?\s*\|)+\s*$')


def is_alignment_line(line):
    """Return True when *line* looks like a Markdown table alignment row.

    Every cell has to end with ``|``, so a row without a trailing pipe
    (``| --- | ---``) or a bare ``---`` is not recognised.
    """
    return _ALIGNMENT_LINE_PATTERN.match(line) is not None

def _build_table_record(start_index, table_lines):
    """Package collected ``(index, line)`` pairs into the dict callers expect."""
    return {
        "start": start_index,
        "end": table_lines[-1][0],
        "lines": table_lines,
    }


def extract_markdown_tables(md_path):
    """Extract tables from a Markdown file, keyed on their alignment lines.

    A table starts at an alignment line (see ``is_alignment_line``). The line
    just above it is taken as the header when it starts with ``|``. Following
    lines that start with ``|`` are table rows; the first other line ends the
    table.

    When a new alignment line appears while a table is still open (two tables
    with no separating line), the open table is now saved instead of being
    silently discarded. The new table's header row, which had been collected
    as the last row of the open table, is moved to the new table.

    Progress messages are printed while scanning.

    Args:
        md_path: Path of the Markdown file to scan (read as UTF-8).

    Returns:
        A list of dicts, one per table, with keys:
        ``"start"``  0-based index of the first table line (inclusive),
        ``"end"``    0-based index of the last table line (inclusive),
        ``"lines"``  list of ``(line_index, raw_line)`` tuples.
    """
    tables = []
    inside_table = False
    current_table = []
    start_index = None

    with open(md_path, 'r', encoding='utf-8') as md_file:
        lines = md_file.readlines()

    for i, line in enumerate(lines):
        stripped = line.strip()

        if is_alignment_line(stripped):
            print(f'Alignment line found at line {i}.')

            # Optional header just above the alignment line
            has_header = i > 0 and lines[i - 1].strip().startswith('|')

            if inside_table and current_table:
                # The previous table is still open. Its last collected row is
                # really this table's header, so hand it over before saving.
                if has_header and current_table[-1][0] == i - 1:
                    current_table.pop()
                if current_table:
                    print(f'Adding table from line {start_index} to {current_table[-1][0]}.\n')
                    tables.append(_build_table_record(start_index, current_table))

            # Alignment line means start of new table
            current_table = []
            if has_header:
                current_table.append((i - 1, lines[i - 1]))
                start_index = i - 1
            else:
                start_index = i

            current_table.append((i, line))
            inside_table = True

        elif inside_table:
            if stripped.startswith('|'):
                current_table.append((i, line))
            else:
                # End of table
                if current_table:
                    print(f'End of table spotted at line {i}.')
                    print(f'Adding table from line {start_index} to {current_table[-1][0]}.\n')
                    tables.append(_build_table_record(start_index, current_table))
                inside_table = False
                current_table = []
                start_index = None

    # Final table at EOF
    if inside_table and current_table:
        print(f'Adding final table from line {start_index} to {current_table[-1][0]}.\n')
        tables.append(_build_table_record(start_index, current_table))

    return tables

In [46]:
md_path = "../data/ocr_results_outputs/document_932/document_932.md"
tables = extract_markdown_tables(md_path)

# Dump every detected table, numbered from 1, with its line span
for table_number, found_table in enumerate(tables, start=1):
    first_line = found_table['start']
    last_line = found_table['end']
    print(f"\nTable {table_number} (lines {first_line} to {last_line}):")
    for _line_index, raw_line in found_table["lines"]:
        # Drop the trailing newline kept by readlines()
        print(raw_line.rstrip())

Alignment line found at line 107.
End of table spotted at line 110.
Adding table from line 106 to 109.

Alignment line found at line 172.
End of table spotted at line 178.
Adding table from line 171 to 177.

Alignment line found at line 197.
End of table spotted at line 200.
Adding table from line 196 to 199.

Alignment line found at line 205.
End of table spotted at line 219.
Adding table from line 204 to 218.

Alignment line found at line 237.
End of table spotted at line 253.
Adding table from line 236 to 252.

Alignment line found at line 256.
End of table spotted at line 268.
Adding table from line 255 to 267.

Alignment line found at line 292.
End of table spotted at line 307.
Adding table from line 291 to 306.

Alignment line found at line 387.
End of table spotted at line 392.
Adding table from line 386 to 391.

Alignment line found at line 404.
End of table spotted at line 405.
Adding table from line 404 to 404.

Alignment line found at line 455.
End of table spotted at line 4

# Extract the table data

In [13]:
def table_to_rows(table_text: str) -> list:
    """Convert the text of one Markdown table into a list of rows of cells.

    Alignment lines and rows whose cells are all empty are dropped. Each row
    loses at most one leading and one trailing pipe, is split on ``|``, and
    every cell is whitespace-trimmed.

    Removing exactly one outer pipe, after trimming the line, keeps empty edge
    cells (``| a ||``) and avoids a spurious empty first cell on indented rows.

    Args:
        table_text: The table's lines joined into a single string.

    Returns:
        A list of rows, each a list of cell strings. An empty list is returned
        when the text has two lines or fewer (e.g. only a header plus its
        alignment line).
    """
    table_lines = table_text.splitlines()
    # Disregard the table if it has two lines or fewer (which could be just the
    # header and the alignment line).
    if len(table_lines) <= 2:
        return []

    rows = []

    for table_line in table_lines:
        # Disregard the line which breaks between header and body
        if is_alignment_line(table_line):
            continue

        content = table_line.strip()
        # Remove only the single outer pipe on each side.
        if content.startswith('|'):
            content = content[1:]
        if content.endswith('|'):
            content = content[:-1]

        # Clean up the spacing of each cell
        cells = [cell.strip() for cell in content.split('|')]

        # Disregard row if all cells are empty
        if not any(cells):
            continue

        rows.append(cells)

    return rows

In [14]:
# Sanity check on a sample table: the alignment line is dropped and every cell is trimmed.
table_to_rows("""| Your Electric Charges Breakdown (from page 2) |  |
| :--: | :--: |
| Conservation Incentive | $\$ 20.08$ |
| Transmission | 36.94 |
| Distribution | 75.47 |
| Electric Public Purpose Programs | 16.35 |
| Nuclear Decommissioning | $-0.10$ |
| Wildfire Fund Charge | 3.41 |
| Recovery Bond Charge | 8.27 |
| Recovery Bond Credit | $-8.27$ |
| Wildfire Hardening Charge | 1.05 |
| Competition Transition Charges (CTC) | 0.17 |
| Energy Cost Recovery Amount | $-3.19$ |
| PCIA | 15.01 |
| Taxes and Other | 4.01 |
| Total Electric Charges | $\$ 169.20$ |
""")

[['Your Electric Charges Breakdown (from page 2)', ''],
 ['Conservation Incentive', '$\\$ 20.08$'],
 ['Transmission', '36.94'],
 ['Distribution', '75.47'],
 ['Electric Public Purpose Programs', '16.35'],
 ['Nuclear Decommissioning', '$-0.10$'],
 ['Wildfire Fund Charge', '3.41'],
 ['Recovery Bond Charge', '8.27'],
 ['Recovery Bond Credit', '$-8.27$'],
 ['Wildfire Hardening Charge', '1.05'],
 ['Competition Transition Charges (CTC)', '0.17'],
 ['Energy Cost Recovery Amount', '$-3.19$'],
 ['PCIA', '15.01'],
 ['Taxes and Other', '4.01'],
 ['Total Electric Charges', '$\\$ 169.20$']]

# Save it as a CSV file 

In [16]:
import csv

In [17]:
def write_rows_to_csv(rows: list, destination: str):
    """Write *rows* to a UTF-8 CSV file at *destination*.

    Missing parent directories are created first. An existing file is
    overwritten. A confirmation message is printed afterwards.

    Args:
        rows: Iterable of rows, each an iterable of cell values.
        destination: Path of the CSV file to write. May be a bare filename,
            in which case the file is written to the current directory.
    """
    # Create the directory if it doesn't exist. A bare filename has an empty
    # dirname, and os.makedirs('') raises FileNotFoundError, so skip it then.
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline='' lets the csv module control line endings itself.
    with open(destination, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        writer.writerows(rows)

    print(f"✅ new table files created at {destination}.")

# Reference the table in the Markdown

In [19]:
from pathlib import Path

In [20]:
def read_md(md_path: str) -> str:
    """Return the full text of the file at *md_path*, decoded as UTF-8."""
    return Path(md_path).read_text(encoding='utf-8')

In [21]:
# Root folder that receives the transformed .md files and their extracted CSV tables.
# NOTE: the value ends with "/" and callers append "/{parent_folder}/...", so the
# generated paths contain a double slash.
base_destination_folder = "../data/ocr_table_extracted_results_output/"

In [52]:
def replace_tables_with_placeholders(md_path: str, list_of_table: list) -> str:
    """Swap each table in a Markdown file for a ``{{table_data: ...}}`` tag.

    Each table that ``table_to_rows`` turns into at least one row is written
    to ``{base_destination_folder}/{parent_folder}/tables/table-N.csv`` and its
    lines are replaced by one placeholder line naming that CSV. Tables that
    produce no rows are removed without a placeholder. ``N`` is the table's
    0-based position in *list_of_table*, so removed tables leave numbering gaps.

    Args:
        md_path: Path of the source Markdown file.
        list_of_table: Table records as returned by ``extract_markdown_tables``
            for the same file (``"start"``/``"end"`` inclusive line indices,
            ``"lines"`` list of ``(index, raw_line)``).

    Returns:
        The updated Markdown text, lines joined with ``"\\n"``.
    """
    # Read in the .md file
    md = read_md(md_path)

    # Split on "\n" only. The table indices come from readlines(), which breaks
    # on newlines alone, whereas str.splitlines() also breaks on form feeds,
    # vertical tabs, U+2028 and similar characters. A single such character
    # would shift every later index and corrupt the replacement.
    md_lines = md.split("\n")
    # Drop the empty tail left by a final newline, as splitlines() would.
    if md_lines[-1] == "":
        md_lines.pop()

    # Retrieve document_{number}
    parent_folder, _ = get_file_name_and_parent_folder(md_path)

    # Replace tables from the bottom up to avoid messing up line indices when replacing
    for table_index, table in reversed(list(enumerate(list_of_table))):
        start = table["start"]  # line index of table start (inclusive)
        end = table["end"]      # line index of table end (inclusive)

        # Build the table content string (raw lines keep their own newlines)
        table_content = "".join(table_line for _, table_line in table["lines"])

        rows = table_to_rows(table_content)
        if rows:
            # Only valid tables are kept: saved as CSV and referenced by a tag
            csv_filename = f"table-{table_index}.csv"
            write_rows_to_csv(rows, f"{base_destination_folder}/{parent_folder}/tables/{csv_filename}")

            tag = f"{{{{table_data: {csv_filename}}}}}"

            # Replace the lines for this table with the tag
            md_lines[start:end+1] = [tag]
        else:
            # Remove the lines which are not a valid table
            md_lines[start:end+1] = []

    print(f"✅ Processed {len(list_of_table)} table(s), return updated '{Path(md_path).name}' texts.\n")

    return "\n".join(md_lines)
        
def save_new_md(md: str, destination: str):
    """Write *md* to *destination* as UTF-8, creating parent folders as needed.

    An existing file is overwritten. A confirmation message is printed
    afterwards.

    Args:
        md: Markdown text to save.
        destination: Path of the file to write. May be a bare filename, in
            which case the file is written to the current directory.
    """
    target = Path(destination)

    # Create the directory if it doesn't exist. For a bare filename the parent
    # is Path('.'), which mkdir accepts, unlike os.makedirs('') which raises.
    target.parent.mkdir(parents=True, exist_ok=True)

    target.write_text(md, encoding='utf-8')

    print(f"✅ updated .md file created at {destination}.")

----

Example code

In [48]:
md_path = "../data/ocr_results_outputs/document_932/document_932.md"
tables = extract_markdown_tables(md_path)

# Read in the .md file
md = read_md(md_path)

# Split on "\n" only so the indices line up with extract_markdown_tables, which
# uses readlines(). str.splitlines() also breaks on form feeds and similar
# characters, which would shift every later index.
md_lines = md.split("\n")
# Drop the empty tail left by a final newline, as splitlines() would
if md_lines[-1] == "":
    md_lines.pop()

# Replace tables from the bottom up to avoid messing up line indices when replacing
for i, table in reversed(list(enumerate(tables, 1))):
    start = table["start"]       # line index of table start (inclusive)
    end = table["end"]           # line index of table end (inclusive)
    table_lines = table['lines'] # list of tuple (line number, content) of the table lines

    # Build the table content string
    table_content = "".join(table_line for _, table_line in table_lines)

    # Only need to retain the lines which are valid table
    rows = table_to_rows(table_content)
    if rows:
        # Name the CSV once so the placeholder points at the file actually written
        csv_filename = f"test_table_output_{i-1}.csv"
        write_rows_to_csv(rows, f"./{csv_filename}")

        tag = f"{{{{table_data: {csv_filename}}}}}"

        # Replace the lines for this table with the tag
        md_lines[start:end+1] = [tag]
    # Remove the lines which are not valid table
    else:
        md_lines[start:end+1] = []

print()

# Join back to string
new_md = "\n".join(md_lines)
print(new_md)

# Save the transformed file in new location
save_new_md(new_md, "./test_new_md.md")

Alignment line found at line 107.
End of table spotted at line 110.
Adding table from line 106 to 109.

Alignment line found at line 172.
End of table spotted at line 178.
Adding table from line 171 to 177.

Alignment line found at line 197.
End of table spotted at line 200.
Adding table from line 196 to 199.

Alignment line found at line 205.
End of table spotted at line 219.
Adding table from line 204 to 218.

Alignment line found at line 237.
End of table spotted at line 253.
Adding table from line 236 to 252.

Alignment line found at line 256.
End of table spotted at line 268.
Adding table from line 255 to 267.

Alignment line found at line 292.
End of table spotted at line 307.
Adding table from line 291 to 306.

Alignment line found at line 387.
End of table spotted at line 392.
Adding table from line 386 to 391.

Alignment line found at line 404.
End of table spotted at line 405.
Adding table from line 404 to 404.

Alignment line found at line 455.
End of table spotted at line 4

Simplified example code

In [54]:
md_path = "../data/ocr_results_outputs/document_932/document_932.md"
tables = extract_markdown_tables(md_path)

# Last expression of the cell: the notebook echoes the returned Markdown string.
replace_tables_with_placeholders(md_path, tables)

Alignment line found at line 107.
End of table spotted at line 110.
Adding table from line 106 to 109.

Alignment line found at line 172.
End of table spotted at line 178.
Adding table from line 171 to 177.

Alignment line found at line 197.
End of table spotted at line 200.
Adding table from line 196 to 199.

Alignment line found at line 205.
End of table spotted at line 219.
Adding table from line 204 to 218.

Alignment line found at line 237.
End of table spotted at line 253.
Adding table from line 236 to 252.

Alignment line found at line 256.
End of table spotted at line 268.
Adding table from line 255 to 267.

Alignment line found at line 292.
End of table spotted at line 307.
Adding table from line 291 to 306.

Alignment line found at line 387.
End of table spotted at line 392.
Adding table from line 386 to 391.

Alignment line found at line 404.
End of table spotted at line 405.
Adding table from line 404 to 404.

Alignment line found at line 455.
End of table spotted at line 4

'## Service For:\n\nTHE APRICOT PIT LP 400 E REMINGTON DR SUNNYVALE, CA 94087\n\n## Questions about your bill?\n\nMon-Fri 7 a.m.-7 p.m. Saturday 8 a.m.-5 p.m. Phone: 1-800-743-5000 www.pge.com/MyEnergy\n\n## Ways To Pay\n\nwww.pge.com/waystopay\n\n## Your Account Summary\n\nAmount Due on Previous Statement\n$957.02\nPayment(s) Received Since Last Statement\n$-957.02$\nPrevious Unpaid Balance\n$0.00\nCurrent PG\\&E Electric Delivery Charges\n$118.15\nSilicon Valley Clean Energy Electric Generation Charges\nCurrent Gas Charges\n$878.10$\nTotal Amount Due by 01/19/2023\n\\$1,063.15\n\n## Reportant Messages\n\nYour current electricity rate Your electricity usage is currently billed on a rate for a single-family home or common-use area of a multi-family complex. If this is incorrect, please call us at 1-800-743-5000 for a free rate analysis.\n\nContinued on last page\n\nPlease return this portion with your payment. No staples or paper clips. Do not fold. Thank you.\n\n99904134b143948000010b

Simplified example code with saving `.md` file

In [56]:
md_path = "../data/ocr_results_outputs/document_932/document_932.md"
tables = extract_markdown_tables(md_path)

# Retrieve document_{number}
parent_folder, _ = get_file_name_and_parent_folder(md_path)

# Build the rewritten Markdown first, then the path it is saved under
updated_md = replace_tables_with_placeholders(md_path, tables)
destination = f"{base_destination_folder}/{parent_folder}/{parent_folder}.md"
save_new_md(updated_md, destination)

Alignment line found at line 107.
End of table spotted at line 110.
Adding table from line 106 to 109.

Alignment line found at line 172.
End of table spotted at line 178.
Adding table from line 171 to 177.

Alignment line found at line 197.
End of table spotted at line 200.
Adding table from line 196 to 199.

Alignment line found at line 205.
End of table spotted at line 219.
Adding table from line 204 to 218.

Alignment line found at line 237.
End of table spotted at line 253.
Adding table from line 236 to 252.

Alignment line found at line 256.
End of table spotted at line 268.
Adding table from line 255 to 267.

Alignment line found at line 292.
End of table spotted at line 307.
Adding table from line 291 to 306.

Alignment line found at line 387.
End of table spotted at line 392.
Adding table from line 386 to 391.

Alignment line found at line 404.
End of table spotted at line 405.
Adding table from line 404 to 404.

Alignment line found at line 455.
End of table spotted at line 4

-----

Complete transformation code

In [66]:
list_of_md_files = get_file_paths("../data/ocr_results_outputs/")

# Run the extract -> replace -> save pipeline on every Markdown file found
for md_files in list_of_md_files:
    print(f'Starting {md_files} processing')
    tables = extract_markdown_tables(md_files)

    # Retrieve document_{number}
    parent_folder, _ = get_file_name_and_parent_folder(md_files)

    # Rewritten Markdown first, then the path it is saved under
    updated_md = replace_tables_with_placeholders(md_files, tables)
    destination = f"{base_destination_folder}/{parent_folder}/{parent_folder}.md"
    save_new_md(updated_md, destination)

    print()
    print(f'{md_files} transformation complete\n\n')

Starting ../data/ocr_results_outputs/document_1020/document_1020.md processing
Alignment line found at line 178.
End of table spotted at line 185.
Adding table from line 177 to 184.

Alignment line found at line 192.
End of table spotted at line 206.
Adding table from line 191 to 205.

✅ new table files created at ../data/ocr_table_extracted_results_output//document_1020/tables/table-1.csv.
✅ new table files created at ../data/ocr_table_extracted_results_output//document_1020/tables/table-0.csv.
✅ Processed 2 table(s), return updated 'document_1020.md' texts.

✅ updated .md file created at ../data/ocr_table_extracted_results_output//document_1020/document_1020.md.

../data/ocr_results_outputs/document_1020/document_1020.md transformation complete


Starting ../data/ocr_results_outputs/document_904/document_904.md processing
Alignment line found at line 29.
End of table spotted at line 34.
Adding table from line 28 to 33.

Alignment line found at line 86.
End of table spotted at line 97