<a href="https://colab.research.google.com/github/TijwaLtd/pdf_to_markdown/blob/main/bookToMarkDown.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install marker-pdf

Collecting marker-pdf
  Downloading marker_pdf-0.3.9-py3-none-any.whl.metadata (16 kB)
Collecting filetype<2.0.0,>=1.2.0 (from marker-pdf)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting ftfy<7.0.0,>=6.1.1 (from marker-pdf)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting pdftext<0.4.0,>=0.3.17 (from marker-pdf)
  Downloading pdftext-0.3.18-py3-none-any.whl.metadata (8.2 kB)
Collecting pydantic-settings<3.0.0,>=2.0.3 (from marker-pdf)
  Downloading pydantic_settings-2.6.0-py3-none-any.whl.metadata (3.5 kB)
Collecting python-dotenv<2.0.0,>=1.0.0 (from marker-pdf)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting rapidfuzz<4.0.0,>=3.8.1 (from marker-pdf)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting surya-ocr<0.7.0,>=0.6.11 (from marker-pdf)
  Downloading surya_ocr-0.6.12-py3-none-any.whl.metadata (30 kB)
Collecting tabled-pdf<0.2.0,>=0

In [None]:
import subprocess

def run_marker_single_with_progress(input_pdf: str, output_folder: str, batch_multiplier: int = 2):
    """
    Run the `marker_single` command and echo its standard output line by line.

    Parameters:
    input_pdf (str): Path of the PDF passed to `marker_single` as first argument.
    output_folder (str): Folder passed to `marker_single` as second argument.
    batch_multiplier (int): Value forwarded to the `--batch_multiplier` option.

    Returns:
    None. Each stdout line is printed with a "Progress: " prefix; if the command
    exits with a non-zero status, its collected stderr is printed with an
    "Error: " prefix.

    Raises:
    FileNotFoundError: If the `marker_single` executable cannot be found.
    """
    import threading  # local import: only needed to drain stderr concurrently

    # Command to be executed
    command = [
        "marker_single",
        input_pdf,
        output_folder,
        "--batch_multiplier", str(batch_multiplier)
    ]

    stderr_chunks = []

    def _drain_stderr(stream):
        # Runs in a helper thread: collect everything the child writes to stderr.
        stderr_chunks.append(stream.read())

    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) as process:
        # stderr must be consumed while stdout is being read. If it is left unread,
        # a child that writes more than one pipe buffer to stderr blocks forever,
        # and this loop then waits forever for more stdout (deadlock).
        drainer = threading.Thread(target=_drain_stderr, args=(process.stderr,), daemon=True)
        drainer.start()

        # Stream stdout line by line as the child produces it
        for line in process.stdout:
            print(f"Progress: {line.strip()}")

        # Both pipes are at EOF once the drainer finishes; then collect the exit status
        drainer.join()
        process.wait()

        # Handle any errors after completion
        if process.returncode != 0:
            print(f"Error: {''.join(stderr_chunks).strip()}")


In [None]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [None]:
%%time
pdf_file_path = "./thinkos.pdf"
output_folder = "./data"
run_marker_single_with_progress(pdf_file_path, output_folder)

Progress: Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Progress: Loaded detection model vikp/surya_layout3 on device cuda with dtype torch.float16
Progress: Loaded reading order model vikp/surya_order on device cuda with dtype torch.float16
Progress: Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Progress: Loaded texify model to cuda with torch.float16 dtype
Progress: Loaded recognition model vikp/surya_tablerec on device cuda with dtype torch.float16
Progress: Saved markdown to the ./data/thinkos folder
Progress: Total time: 146.45304036140442
CPU times: user 999 ms, sys: 125 ms, total: 1.12 s
Wall time: 4min 36s


In [None]:
import json
import os
from marker.convert import convert_single_pdf
from marker.models import load_all_models

def generate_markdown(full_text, images, tables=None):
    """
    Generate a Markdown representation from extracted text, images, and tables.

    Parameters:
    full_text (str): The text content extracted from the PDF.
    images (iterable): Image references to link to. Each item yielded by iterating
        `images` is used as a link target, so a list of paths or a dict keyed by
        file name both work. May be empty or None.
    tables (list): Optional list of tables, where each table is a list of rows and
        the first row is treated as the header. Cells are converted with `str()`.
        Empty tables are skipped.

    Returns:
    str: A string containing the Markdown formatted content.
    """
    # Collect fragments and join once instead of repeatedly extending a string
    parts = [
        "# Extracted PDF Content\n\n",
        "## Text Content\n\n",
        full_text + "\n\n",
    ]

    # Add images in Markdown format
    if images:
        parts.append("## Images\n\n")
        for number, img_path in enumerate(images, start=1):
            parts.append(f"![Image {number}]({img_path})\n\n")

    # Add tables in Markdown format
    if tables:
        parts.append("## Tables\n\n")
        for table_number, table in enumerate(tables, start=1):
            if not table:
                # Nothing to render (and no header row to size the separator from)
                continue
            parts.append(f"### Table {table_number}\n\n")
            for row_position, row in enumerate(table):
                parts.append("| " + " | ".join(str(cell) for cell in row) + " |\n")
                if row_position == 0:
                    # The separator must directly follow the header row; a table
                    # with the separator after the last row is not valid Markdown.
                    parts.append("|---" * len(row) + "|\n")
            parts.append("\n")

    return "".join(parts)


def process_pdf_to_markdown(pdf_file_path, output_folder, model_lst=None):
    """
    Convert a PDF file to Markdown format and generate a metadata JSON file.

    Parameters:
    pdf_file_path (str): The file path to the PDF to convert.
    output_folder (str): The directory where the output Markdown and metadata files
        will be saved. It is created if it does not exist.
    model_lst: Optional models previously returned by `load_all_models()`. When
        omitted, the models are loaded on every call; pass them in to reuse one
        set of loaded models across several conversions.

    Returns:
    tuple: Paths to the generated Markdown ("output.md") and metadata
    ("metadata.json") files inside `output_folder`.
    """
    # Load all models needed for conversion unless the caller supplied them
    if model_lst is None:
        model_lst = load_all_models()

    # Convert the PDF file
    full_text, images, out_meta = convert_single_pdf(pdf_file_path, model_lst)

    # Prepare the metadata output
    metadata_output = {
        "languages": out_meta.get("languages"),
        "filetype": out_meta.get("filetype", "pdf"),
        "pdf_toc": out_meta.get("pdf_toc", []),
        "computed_toc": out_meta.get("computed_toc", []),
        "pages": out_meta.get("pages", 0),
        "ocr_stats": out_meta.get("ocr_stats", {}),
        "block_stats": out_meta.get("block_stats", {}),
    }

    # Make sure the destination exists before any file is opened for writing
    os.makedirs(output_folder, exist_ok=True)

    # Define output file paths
    markdown_file_path = os.path.join(output_folder, "output.md")
    metadata_file_path = os.path.join(output_folder, "metadata.json")

    # Generate Markdown content
    # NOTE(review): `images` is only used to build links; nothing here writes image
    # files into output_folder, so those links resolve only if the images are saved
    # separately.
    tables = out_meta.get("tables", [])  # Extract tables from metadata if available
    markdown_content = generate_markdown(full_text, images, tables)

    # Write the Markdown content to a file. UTF-8 is set explicitly so the result
    # does not depend on the platform default encoding.
    with open(markdown_file_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    # Write the metadata to a JSON file
    with open(metadata_file_path, "w", encoding="utf-8") as json_file:
        json.dump(metadata_output, json_file, indent=4)

    return markdown_file_path, metadata_file_path


In [None]:
%%time
pdf_file_path = "./thinkos.pdf"
output_folder = "./data/book"
process_pdf_to_markdown(pdf_file_path, output_folder)

In [None]:
!pip install streamlit pyngrok

In [None]:
# Install required packages
!pip install streamlit pyngrok

# Import libraries
import streamlit as st
import os
from pyngrok import ngrok

# Set your ngrok authentication token directly here
NGROK_AUTH_TOKEN = "2nts24zokJ5F1lGp3zMKGiKVoFK_6uv15Kvda1mqLdhGF5soZ"  # Replace with your actual ngrok auth token

# Authenticate ngrok
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Save the code to a .py file to run with Streamlit
with open("app.py", "w") as f:
    f.write('''
import streamlit as st
import os

# Define the function (use the actual implementation for PDF processing)
def process_pdf_to_markdown(pdf_file_path, output_folder):
    # Example placeholder function; replace with actual processing code
    markdown_file_path = os.path.join(output_folder, "output.md")
    metadata_file_path = os.path.join(output_folder, "metadata.json")
    markdown_content = "# Sample Markdown Content\\nThis is a converted PDF content."
    return markdown_file_path, metadata_file_path, markdown_content

# Streamlit UI setup
st.title("PDF to Markdown Converter")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Save the uploaded file
    pdf_file_path = os.path.join("./data/book", uploaded_file.name)
    output_folder = "./data/book"
    os.makedirs(output_folder, exist_ok=True)

    with open(pdf_file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.success("File uploaded successfully!")

    # Process the PDF to Markdown
    with st.spinner("Processing PDF..."):
        markdown_file_path, metadata_file_path, markdown_content = process_pdf_to_markdown(pdf_file_path, output_folder)

    st.success("PDF converted to Markdown successfully!")

    # Display the Markdown content
    st.markdown("### Converted Markdown Content:")
    st.markdown(markdown_content)

    # Optionally provide a download link
    with open(markdown_file_path, "r") as md_file:
        st.download_button("Download Markdown", md_file, file_name="output.md")
''')

# Run Streamlit app using ngrok to make it accessible
port = 8501
public_url = ngrok.connect(port).public_url
print(f"Streamlit app is live at {public_url}")

# Start Streamlit app in the background
!streamlit run app.py --server.port {port} &
