In [8]:
import pytesseract
from pdf2image import convert_from_path

import base64
from io import BytesIO
from PIL import Image

from ollama import chat
from pydantic import BaseModel
import json
from typing import Literal, List, Optional

In [6]:
def pil_image_to_base64(image: Image):
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    bytes = buffer.getvalue()
    base64_image = base64.b64encode(bytes).decode("utf-8")
    return base64_image

In [2]:
pdf_path = "../../assets/pdfs/SamplePDFWithWords.pdf"
page_images = convert_from_path(pdf_path)

In [None]:
# NOTE: This sample uses pytesseract and seems to miss some of the numbers on the first page.
for page_image in page_images:
    text = pytesseract.image_to_string(page_image)
    print("PAGE:")
    print(text)

PAGE:
Here is a sample document that has words in it. And, here are some more words. And some
more.

1

2

20
21
22
23

24

PAGE:
25
26

27

Now, there will be a table.

Food Primary Color Category
Apple Red Fruit
Carrot Orange Vegetable




In [7]:
# This sample will ask llama vision model to parse the text on each page into markdown.

pil_image_to_base64(page_images[0])

'iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAADLF0lEQVR4nOz9d4AV9d0/bs+yS2+CVKUXARso2LADFpR423tiojHGlhg1saWpscXYYsmtJpYoYi9RsEWxYVdUbFRpAkuRvsvW8/wx33t+59ldluWwZw9+vK6/dqed98ycaa/zmZm8VCoVAQAAAADff41yXQAAAAAAUD+EfQAAAAAQCGEfAAAAAARC2AcAAAAAgRD2AQAAAEAghH0AAAAAEAhhHwAAAAAEQtgHAAAAAIEQ9gEAAABAIIR9AAAAABAIYR8AAAAABELYBwAAAACBEPYBAAAAQCCEfQAAAAAQCGEfAAAAAARC2AcAAAAAgRD2AQAAAEAghH0AAAAAEAhhHwAAAAAEQtgHAAAAAIEQ9gEAAABAIIR9AAAAABAIYR8AAAAABELYBwAAAACBEPYBAAAAQCCEfQAAAAAQCGEfAAAAAARC2AcAAAAAgRD2AQAAAEAghH0AAAAAEAhhHwAAAAAEQtgHAAAAAIEQ9gEAAABAIIR9AAAAABAIYR8AAAAABELYBwAAAACBEPYBAAAAQCCEfQAAAAAQCGEfAAAAAARC2AcAAAAAgRD2AQAAAEAghH0AAAAAEAhhHwAAAAAEQtgHAAAAAIEQ9gEAAABAIIR9AAAAABAIYR8AAAAABELYBwAAAACBEPYBAAAAQCCEfQAAAAAQCGEfAAAAAARC2AcAAAAAgRD2AQAAAEAghH0AAAAAEAhhHwAAAAAEQtgHAAAAAIEQ9gEAAABAIIR9AAAAABAIYR8AAAAABELYBwAAAACBEPYBAAAAQCCEfQAAAAAQCGEfAAAAAARC2AcAAAAAgRD2AQAAAEAghH0AAAAAEAhhHwAAAAAEQtgHAAAAAIEQ9gEAAABAIIR9AAAAABAIYR8AAAAABELYBwAAAACBEPYBAAAAQCCEfQAAAAAQCGEfAAAAAARC2AcAAAAAgRD2AQAAAEAghH0

In [82]:
model = "llama3.2-vision:90b"
#model = "minicpm-v"

response = chat(
    model=model,
    messages=[
        {
            "role": "user", 
            "content": "Parse text from this PDF document page into Markdown format to reperesent the original formatting as closely as possible. Return only the markdown text. Do not explain anything. Do not use a single # character for headings. Use ## or ###. ONLY SHOW ACTUAL CONTENT FROM THE SCANNED DOCUMENT IMAGE.",
            "images": [ pil_image_to_base64(page_images[1]) ]
        }
    ],
    options={'temperature': 0.0, 'num_ctx': 2048}
)

response = response.message
print(response.content)

**25**

**26**

**27**

Now, there will be a table.

| Food | Primary Color | Category |
| --- | --- | --- |
| Apple | Red | Fruit |
| Carrot | Orange | Vegetable |


In [59]:
# Try to parse with anthropic API
import anthropic

import os
from dotenv import load_dotenv

# ANTHROPIC_API_KEY should be in .env file at root of project.
load_dotenv(override=True)

True

In [78]:
# NOTE: This will require ANTHROPIC_API_KEY to be configured
# and all usual costs for calling Anthropic's APIs will apply.
def convert_image_to_markdown_with_claude(page_image: Image):

    client = anthropic.Anthropic()
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": pil_image_to_base64(page_image)
                        }
                    },
                    {
                        "type": "text",
                        "text": "Parse text from this PDF document page into Markdown format to reperesent the original formatting as closely as possible. Return only the markdown text. Do not explain anything. Do not use a single # character for headings; use ## or ### instead. ONLY SHOW ACTUAL CONTENT FROM THE SCANNED DOCUMENT IMAGE.",
                    }
                ]
            }
        ],
    )

    return message.content[0].text

In [80]:
page_markdown_list = []
for page_image in page_images:
	page_markdown = convert_image_to_markdown_with_claude(page_image)
	print(page_markdown)
	page_markdown_list.append(page_markdown)

Here is a sample document that has words in it. And, here are some more words. And some more.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

26

27

Now, there will be a table.

| Food | Primary Color | Category |
|------|--------------|----------|
| Apple | Red | Fruit |
| Carrot | Orange | Vegetable |


In [81]:
combined_markdown = ""
for page_markdown in page_markdown_list:
	combined_markdown = combined_markdown + "\n" + page_markdown
	
from IPython.display import Markdown, display
display(Markdown(combined_markdown))


Here is a sample document that has words in it. And, here are some more words. And some more.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

26

27

Now, there will be a table.

| Food | Primary Color | Category |
|------|--------------|----------|
| Apple | Red | Fruit |
| Carrot | Orange | Vegetable |