In [2]:
import fitz
import os
import base64
import io
from PIL import Image

In [6]:
def get_image_caption(image_bytes: bytes) -> str:
    """VLM captioning. Replace with actual VLM call."""
    return "This is a placeholder caption for an image."

In [None]:
def table_to_markdown(table_data: list) -> str:
    """Converts a table (list of lists) to a Markdown format"""
    return "This is a placeholder for converted markdown of a table"

In [15]:
def process_pdf(pdf_path: str) -> str:
    """
        Processes a PDF, extracts text, images (gets captions), and tables,
        and returns a Markdown string.
    """
    doc = fitz.open(pdf_path)
    print(f"Processing PDF: {pdf_path} with {len(doc)} pages.")
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        
        page_content = []
        page_content.append(f"\n## Page {page_num + 1}\n")
        
        # Extracting Text
        text = page.get_text("text")
        if text.strip():
            # page_content.append("### Text\n")
            page_content.append(text.strip())
            # page_content.append("\n")
            pass
            
        
        # Extracting Images and getting caption
        image_list = page.get_images(full=True)
        print(f"Page: {page_num}")
        if image_list:
            # print(f"YESS: {page_num}")
            page_content.append("### Images\n")
            
            for img in image_list:
                
                # get the XREF of the image
                xref = img[0]

                base_image = doc.extract_image(xref)
                # base_image is a dictionary with lot of info
                
                # this is the bytes of the image, true representation of the image
                image_bytes = base_image["image"]
                
                # get the image extension(useful for saving the img)
                # image_ext = base_image["ext"]
                
                # Caption the image and add it to our page_content
                caption = get_image_caption(image_bytes)
                page_content.append(caption)

        
        # Extracting tables
        # PyMuPDF's table extraction is heuristic.
        # For complex tables, check pdfplumber or camelot-py.
        tables = page.find_tables()
        if tables.tables:
            # page_content.append("### Table\n")
            # Write logic here to convert table into plain text
            pass
                    
        print(page_content)

In [12]:
PDF_PATH = 'Data/Math for AI Coding Assignment.pdf'

In [13]:
process_pdf(PDF_PATH)

Processing PDF: Data/Math for AI Coding Assignment.pdf with 16 pages.
Page: 0
['\n## Page 1\n']
Page: 1
YESS: 1
['\n## Page 2\n', '### Images\n', 'This is a placeholder caption for an image.', 'This is a placeholder caption for an image.']
Page: 2
YESS: 2
['\n## Page 3\n', '### Images\n', 'This is a placeholder caption for an image.']
Page: 3
YESS: 3
['\n## Page 4\n', '### Images\n', 'This is a placeholder caption for an image.']
Page: 4
YESS: 4
['\n## Page 5\n', '### Images\n', 'This is a placeholder caption for an image.', 'This is a placeholder caption for an image.']
Page: 5
YESS: 5
['\n## Page 6\n', '### Images\n', 'This is a placeholder caption for an image.', 'This is a placeholder caption for an image.']
Page: 6
YESS: 6
['\n## Page 7\n', '### Images\n', 'This is a placeholder caption for an image.', 'This is a placeholder caption for an image.']
Page: 7
YESS: 7
['\n## Page 8\n', '### Images\n', 'This is a placeholder caption for an image.', 'This is a placeholder caption for an

In [17]:
content_list = ['\n## Page 2\n', 'How are you doing :)','\n', 'I am great!','\n','### Images\n', 'This is a placeholder caption for an image.', 'This is a placeholder caption for an image.']
markdown_text = '\n'.join(content_list)

print(markdown_text)


## Page 2

How are you doing :)


I am great!


### Images

This is a placeholder caption for an image.
This is a placeholder caption for an image.
