In [None]:
!pip install pymupdf4llm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.17-py3-none-any.whl.metadata (4.1 kB)
Collecting pymupdf>=1.24.10 (from pymupdf4llm)
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.17-py3-none-any.whl (26 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.24.14 pymupdf4llm-0.0.17


In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
import os
import re
import tempfile
import zipfile

import gradio as gr
import pymupdf4llm

In [None]:
def extract_markdown_elements(lines):
    """
    Extract both content sections and tables from markdown lines.

    Lines are scanned in order. A run of consecutive lines that start with
    '|' (after stripping) is a table candidate; a line of the form
    '<one or more #> <text>' starts a new section; every other non-blank
    line is appended, stripped, to the current section's content.

    Args:
        lines (list): List of markdown lines

    Returns:
        tuple: (content_sections, tables)
            - content_sections: List of (heading, level, content), where
              level is the number of leading '#' characters and content is
              the section's lines joined with newlines
            - tables: List of (associated_heading, table_content), where
              associated_heading is the most recent heading seen (None if
              the table precedes every heading) and table_content is the
              table's original (unstripped) lines joined with newlines

    Notes:
        - A section is emitted only when it has both a heading and at least
          one content line; a heading with no content is omitted.
        - Text found before the first heading stays in the content buffer
          and therefore ends up in the first emitted section; it is lost
          if the input contains no heading at all.
        - A '|'-prefixed run of fewer than three lines is not recorded as a
          table; its lines are kept as ordinary content rather than being
          silently discarded.
    """
    # One or more '#', whitespace, then the heading text
    heading_re = re.compile(r'^(#+)\s+(.*)$')

    # Initialize containers
    content_sections = []
    tables = []

    # State tracking variables
    current_heading = None
    current_content = []
    current_level = 0

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Table detection
        if line.startswith('|'):
            table_content = []
            # Collect the entire run of pipe-prefixed lines; this inner loop
            # advances `i` itself, hence the `continue` below
            while i < len(lines) and lines[i].strip().startswith('|'):
                table_content.append(lines[i])
                i += 1

            # A table needs at least three rows (header, separator, one data row)
            if len(table_content) > 2:
                tables.append((current_heading, '\n'.join(table_content)))
            else:
                # Not a table: preserve the text as regular section content
                current_content.extend(row.strip() for row in table_content)
            continue

        # Heading detection
        heading_match = heading_re.match(line)
        if heading_match:
            # Save previous section if it exists
            if current_heading and current_content:
                content_sections.append((current_heading, current_level, '\n'.join(current_content)))
                current_content = []

            # Start new section
            current_heading = heading_match.group(2)
            current_level = len(heading_match.group(1))
            i += 1
            continue

        # Collect content (blank lines are skipped)
        if line:
            current_content.append(line)

        i += 1

    # Save last section if exists
    if current_heading and current_content:
        content_sections.append((current_heading, current_level, '\n'.join(current_content)))

    return content_sections, tables


In [None]:

def organize_markdown_content(input_pdf_path, output_dir):
    """
    Convert PDF to markdown, extract tables, headings, and organize content.

    Writes up to three UTF-8 files into output_dir:
        - document_content.md: every extracted section, heading then content
        - tables.md: every extracted table (only when at least one exists)
        - summary.md: counts of tables and content sections

    Args:
        input_pdf_path (str): Path to input PDF file
        output_dir (str): Directory to save organized markdown files
            (created if it does not exist)
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Convert PDF to markdown
    md_text = pymupdf4llm.to_markdown(input_pdf_path)

    # Split content into lines
    lines = md_text.split('\n')

    # Extract content sections and tables
    content_sections, tables = extract_markdown_elements(lines)

    # Write document content
    with open(os.path.join(output_dir, 'document_content.md'), 'w', encoding='utf-8') as f:
        for heading, level, content in content_sections:
            f.write('#' * level + ' ' + heading + '\n\n')
            f.write(content + '\n\n')

    # Write tables
    tables_path = os.path.join(output_dir, 'tables.md')
    if tables:
        with open(tables_path, 'w', encoding='utf-8') as f:
            f.write('# Tables\n\n')
            for heading, table in tables:
                # Tables found before any heading have no section to name
                if heading:
                    f.write(f'## From section: {heading}\n\n')
                f.write(table + '\n\n')
    elif os.path.exists(tables_path):
        # output_dir may be reused across runs: drop a tables.md left over
        # from a previous document so it does not contradict summary.md
        os.remove(tables_path)

    # Create summary file
    with open(os.path.join(output_dir, 'summary.md'), 'w', encoding='utf-8') as f:
        f.write('# Document Summary\n\n')
        f.write(f'- Total tables: {len(tables)}\n')
        f.write(f'- Total content sections: {len(content_sections)}\n')

In [None]:
def process_pdf(pdf_file):
    """
    Convert a PDF into organized markdown files and bundle them into a ZIP.

    Every call works in its own temporary directories, so concurrent
    requests cannot overwrite each other's files and nothing from an
    earlier run can end up in a later archive.

    Args:
        pdf_file (str): Path to the input PDF file.

    Returns:
        str: Path to the generated archive, named "organized_content.zip",
            inside a newly created temporary directory. The archive is not
            deleted by this function; the caller owns it.
    """
    # Dedicated directory for the archive so the file keeps a stable,
    # user-friendly name without colliding with other calls
    zip_dir = tempfile.mkdtemp(prefix="organized_content_")
    zip_filename = os.path.join(zip_dir, "organized_content.zip")

    # Scratch directory for the markdown files; removed once they are zipped
    with tempfile.TemporaryDirectory() as output_dir:
        # Run the PDF processing function
        organize_markdown_content(pdf_file, output_dir)

        # Zip the output directory (deflate: the payload is plain text)
        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(output_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Store paths relative to output_dir so the archive has
                    # no leading temporary-directory components
                    zipf.write(file_path, os.path.relpath(file_path, output_dir))

    return zip_filename

# Gradio UI
def gradio_interface(pdf_file):
    """
    Gradio callback: convert the uploaded PDF and return the ZIP to download.

    Args:
        pdf_file: The value supplied by the upload component. Either a path
            string or an object exposing the path as `.name`; None when the
            user submits without uploading anything.

    Returns:
        str: Path to the ZIP archive produced by process_pdf.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        # Surface a readable message in the UI instead of an AttributeError
        raise gr.Error("Please upload a PDF file.")

    # Works for both plain path strings and file-like objects with `.name`
    pdf_path = getattr(pdf_file, "name", pdf_file)
    return process_pdf(pdf_path)

# Define Gradio input and output, then start the app.
# file_types limits the upload picker to PDFs, matching what the converter expects.
gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload PDF File", file_types=[".pdf"]),
    outputs=gr.File(label="Download ZIP File"),
    title="PDF to Organized Markdown ZIP Converter",
    description="Upload a PDF file, and download a ZIP file with organized markdown content."
).launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a6593182b47e0f050c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


