In [13]:
# Instruction prompt sent as the first text part of every page request below.
# It asks the model to transcribe text verbatim, restate arrow/percentage
# callouts as full sentences, and convert-and-explain tables and plots.
# NOTE(review): the tables bullet spells "explnation"; the string is left
# untouched here because editing it changes the text sent to the model.
IMAGE_PROMPT = """
Given an image of a page from a market research report, your task is to convert all the information on the page into markdown format, preserving the original structure and content.

- Transcribe all text, including paragraphs and headings, verbatim from the page to markdown, maintaining the original format. Do not modify, omit, or add any text.
- If the page includes numerical information with arrows, percentages etc, describe the text in full sentences in markdown format. For example, if there's a box with the text "revenue of wine industry" and an arrow pointing up saying "10% (2015-2020)", describe this as "Revenue of wine industry increased by 10% from 2015 to 2020" instead of just copying the text.
- Do not explain any text that is clearly written in the page, including headings, subheadings, and paragraphs. Copy the text as it is.
- If the text structure is unclear, use your best judgement to format it in markdown.
- If the page contains tables, convert them into markdown table format and provide an explnation as well. Explain all the data that can be inferred from each table. For example, if a table shows sales data for different products, explain the sales trends and patterns with respect to each product. Try to provide as much detail as possible.
- If the page includes a plot or graph, describe it objectively in markdown format. Explain all the details that can be inferred from the plot or graph. For example, if a plot shows sales trends over time, describe the sales trends and patterns observed. Provide a detailed explanation of the data represented in the plot or graph.
- When explaining any component, understand the context of the entire page and be as specific as possible without any ambiguity. The position of the explanation should match the position of the component in the page.
- The output should not contain personal opinions or biases. Do not add personal comments or any information not present on the page. Avoid referring to the page or the report - explain without reference.
- Ensure no important information from the page is missed, as capturing all details is crucial.
"""

In [14]:
from pdf2image import convert_from_path
import os
import base64
import requests
import logging
from openai import OpenAI
import pickle
from langchain_core.documents import Document


# Never store the key in the notebook: reuse the value already exported in the
# environment and only prompt for it (without echo) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


# Model name passed to the chat completion request; it is also embedded in the
# names of the text and pickle output files written by this cell.
model = "gpt-4o-mini"

def request_openai_with_image(base64_image, max_tries=5):
    """Ask the OpenAI chat model to convert one page image into markdown.

    The request carries ``IMAGE_PROMPT``, the page image as a base64 PNG data
    URL, and a closing instruction. A failed request or a response without
    text is logged and retried, up to ``max_tries`` attempts in total.

    Args:
        base64_image: Base64 text of the page's PNG bytes.
        max_tries: Maximum number of attempts before giving up.

    Returns:
        The text content of the first choice, or ``""`` when every attempt
        failed.
    """
    system_prompt = "You are a profession converter that converts all the details in given image of page of a market research report to markdown format while preserving all the structure."
    messages = [
        # Message content must be a string or a list of content parts; the
        # system prompt is sent as a plain string rather than a bare dict.
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": IMAGE_PROMPT},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}",
                        "detail": "high"
                    },
                },
                {"type": "text", "text": "Please describe the provided page in markdown format. Strictly follow the criteria mentioned above to describe each component of the page."},
            ],
        },
    ]

    # Build the client once instead of once per attempt. A construction error
    # (e.g. missing credentials) is not retryable, so it is left to propagate.
    client = OpenAI()

    for attempt in range(1, max_tries + 1):
        # The request itself is what can fail, so it must sit inside the try
        # for the retry loop to have any effect.
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=10000,
            )
        except Exception as exc:
            logging.warning(
                "OpenAI request failed (attempt %d/%d): %s", attempt, max_tries, exc
            )
            continue

        print(response)

        try:
            response_txt = response.choices[0].message.content
        except (IndexError, AttributeError) as exc:
            logging.warning(
                "OpenAI response had no usable choice (attempt %d/%d): %s",
                attempt, max_tries, exc,
            )
            continue

        # Content may be None when the model returns no text; retry rather
        # than hand None back to a caller that expects a string.
        if response_txt is None:
            logging.warning(
                "OpenAI response had no text content (attempt %d/%d)",
                attempt, max_tries,
            )
            continue

        print(response_txt)
        return response_txt

    return ""

data_dir = "./data/report2"
pdf_path = os.path.join(data_dir, "health.pdf")
img_dir = os.path.join(data_dir, "imgs")
# exist_ok makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(img_dir, exist_ok=True)

# Output names embed the model so runs with different models land in
# different files. The "./_" prefix of the text file is kept as-is so the
# path matches files written by earlier runs.
output_text = os.path.join(data_dir, f"./_{model}_pages.txt")
output_pickle = os.path.join(data_dir, f"{model}_pages.pkl")


# One image per PDF page; each is saved as a PNG, sent to the model, and the
# returned markdown is wrapped in a Document tagged with its 1-based page number.
images = convert_from_path(pdf_path)
pages = []
for i, image in enumerate(images):
    image_path = os.path.join(img_dir, f'page_{i+1}.png')

    image.save(image_path, 'PNG')
    logging.info(f"Saved image {i+1} to {image_path}")
    base64_image = encode_image(image_path)
    response = request_openai_with_image(base64_image)

    doc = Document(page_content=response, metadata={"page": i+1})
    pages.append(doc)

# Save all pages as one text file, each preceded by a "Page N" line.
# The encoding is explicit because model output routinely contains non-ASCII
# characters that the platform default encoding may not be able to write.
txt_path = output_text
with open(txt_path, "w", encoding="utf-8") as f:
    for page in pages:
        print(page.page_content)
        f.write(f"Page {page.metadata['page']}\n")
        f.write(page.page_content)
        f.write("\n")

# Use a context manager so the pickle file is flushed and closed deterministically.
with open(output_pickle, "wb") as pickle_file:
    pickle.dump(pages, pickle_file)

ChatCompletion(id='chatcmpl-AE046HCyz9WyqqL1YhzYdeiZKyCIW', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```markdown\n# INDUSTRY REPORT\n## Healthcare and Social Assistance in the US\n\n### Mar 2024\n```', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1727900206, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_74ba47b4ac', usage=CompletionUsage(completion_tokens=24, prompt_tokens=37304, total_tokens=37328, prompt_tokens_details={'cached_tokens': 0}, completion_tokens_details={'reasoning_tokens': 0}))
```markdown
# INDUSTRY REPORT
## Healthcare and Social Assistance in the US

### Mar 2024
```
ChatCompletion(id='chatcmpl-AE049QHhvpF5PmmUgofDp1TFI2Hsq', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```markdown\n# About IBISWorld\n\nIBISWorld specializes in industry research with coverage on tho

In [18]:
from pdf2image import convert_from_path
import os
import base64
import requests
import logging
from openai import OpenAI
import anthropic
import pickle
from langchain_core.documents import Document

# Never store the key in the notebook: reuse the value already exported in the
# environment and only prompt for it (without echo) when it is missing.
if not os.environ.get("ANTHROPIC_API_KEY"):
    from getpass import getpass

    os.environ["ANTHROPIC_API_KEY"] = getpass("ANTHROPIC_API_KEY: ")

def encode_image(image_path):
    """Return the base64 text form of the file stored at ``image_path``.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        A ``str`` holding the base64 encoding of the file's bytes.
    """
    with open(image_path, "rb") as source:
        payload = source.read()
    b64_bytes = base64.b64encode(payload)
    return b64_bytes.decode('utf-8')


# Model name passed to the Anthropic messages request; it is also embedded in
# the names of the text and pickle output files written by this cell.
model = "claude-3-5-sonnet-20240620"

def request_openai_with_image(base64_image, max_tries=5):
    """Ask the Anthropic model to convert one page image into markdown.

    The function keeps its historical name for existing callers even though
    this version talks to ``anthropic.Anthropic``. The request carries
    ``IMAGE_PROMPT``, the page image as a base64 PNG block, and a closing
    instruction. A failed request or a response without a leading text block
    is logged and retried, up to ``max_tries`` attempts in total.

    Args:
        base64_image: Base64 text of the page's PNG bytes.
        max_tries: Maximum number of attempts before giving up.

    Returns:
        The text of the first content block, or ``""`` when every attempt
        failed.
    """
    system_prompt = "You are a profession converter that converts all the details in given image of page of a market research report to markdown format while preserving all the structure."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": IMAGE_PROMPT},
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": base64_image,
                    },
                },
                {"type": "text", "text": "Please describe the provided page in markdown format. Strictly follow the criteria mentioned above to describe each component of the page."},
            ],
        }
    ]

    # Build the client once instead of once per attempt. A construction error
    # (e.g. missing credentials) is not retryable, so it is left to propagate.
    client = anthropic.Anthropic()

    for attempt in range(1, max_tries + 1):
        # The request itself is what can fail, so it must sit inside the try
        # for the retry loop to have any effect.
        try:
            response = client.messages.create(
                model=model,
                max_tokens=5000,
                system=system_prompt,
                messages=messages,
            )
        except Exception as exc:
            logging.warning(
                "Anthropic request failed (attempt %d/%d): %s", attempt, max_tries, exc
            )
            continue

        print(response)

        # An empty content list, or a first block without ``text``, is treated
        # as a failed attempt instead of crashing the whole run.
        try:
            response_txt = response.content[0].text
        except (IndexError, AttributeError) as exc:
            logging.warning(
                "Anthropic response had no leading text block (attempt %d/%d): %s",
                attempt, max_tries, exc,
            )
            continue

        print(response_txt)
        return response_txt

    return ""


data_dir = "./data/report2"
pdf_path = os.path.join(data_dir, "health.pdf")
img_dir = os.path.join(data_dir, "imgs")
# exist_ok makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(img_dir, exist_ok=True)

# Output names embed the model so runs with different models land in
# different files. The "./_" prefix of the text file is kept as-is so the
# path matches files written by earlier runs.
output_text = os.path.join(data_dir, f"./_{model}_pages.txt")
output_pickle = os.path.join(data_dir, f"{model}_pages.pkl")


# One image per PDF page; each is saved as a PNG, sent to the model, and the
# returned markdown is wrapped in a Document tagged with its 1-based page number.
images = convert_from_path(pdf_path)
pages = []
for i, image in enumerate(images):
    image_path = os.path.join(img_dir, f'page_{i+1}.png')

    image.save(image_path, 'PNG')
    logging.info(f"Saved image {i+1} to {image_path}")
    base64_image = encode_image(image_path)
    response = request_openai_with_image(base64_image)

    doc = Document(page_content=response, metadata={"page": i+1})
    pages.append(doc)

# Save all pages as one text file, each preceded by a "Page N" line.
# The encoding is explicit because model output routinely contains non-ASCII
# characters that the platform default encoding may not be able to write.
txt_path = output_text
with open(txt_path, "w", encoding="utf-8") as f:
    for page in pages:
        print(page.page_content)
        f.write(f"Page {page.metadata['page']}\n")
        f.write(page.page_content)
        f.write("\n")

# Use a context manager so the pickle file is flushed and closed deterministically.
with open(output_pickle, "wb") as pickle_file:
    pickle.dump(pages, pickle_file)

Message(id='msg_012Ly8rHGXAzZBckQeHiDvvV', content=[TextBlock(text='# IBISWorld\n\n## INDUSTRY REPORT\n\n# Healthcare and Social Assistance in the US\n\n## Mar 2024\n\nThe image shows the cover page of an industry report by IBISWorld. The report focuses on Healthcare and Social Assistance in the US, and is dated March 2024. The title and information are presented on a dark blue background with red circular design elements in the corners and bottom of the page. The IBISWorld logo, featuring white text on a red background, is prominently displayed at the top left corner of the page.', type='text')], model='claude-3-5-sonnet-20240620', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=2083, output_tokens=118))
# IBISWorld

## INDUSTRY REPORT

# Healthcare and Social Assistance in the US

## Mar 2024

The image shows the cover page of an industry report by IBISWorld. The report focuses on Healthcare and Social Assistance in the US, and i