In [1]:
"""
Source: https://stackoverflow.com/questions/77469097/how-can-i-upload-a-pdf-to-chatgpt-using-the-api
"""

from openai import OpenAI

from openai.types.beta.threads.message_create_params import (
    Attachment,
    AttachmentToolFileSearch,
)
import os
from dotenv import load_dotenv
load_dotenv()


# Input PDF to summarise and the JSON file the summary will be written to.
info_filename = "IT Consulting in the US.pdf"
# splitext swaps only the final extension, whatever its case, so the output
# path can never end up identical to the input PDF (str.replace would leave
# e.g. "X.PDF" unchanged and the PDF would later be overwritten).
out_filename = os.path.splitext(info_filename)[0] + '.json'

# Read the summary template inside a context manager so the handle is closed.
with open('industry-at-a-glance-template.md') as template_file:
    template = template_file.read()

# Instructions sent with the PDF; the template is embedded between ''' markers.
prompt = f"""
Using the template below in triple quotes, create a summary of the document.
Return the result in JSON format.
Split paragraphs into sentences preceded by the subject of the sentence.
For example: "Revenue Growth": "Revenue has grown at a CAGR of 2.8% to $692.9 billion over the past five years."
Do not extract text from images, only focus on text.

'''
{template}
'''
"""

# Read the API key from the environment (load_dotenv() above also loads it
# from a local .env file) instead of embedding the secret in the notebook.
# NOTE: any key that was previously committed in this cell must be treated as
# compromised and revoked.
OAI_KEY = os.getenv("OPENAI_APIKEY") or os.getenv("OPENAI_API_KEY")
if not OAI_KEY:
    # Fail here with a clear message rather than on the first API call.
    raise RuntimeError(
        "OpenAI API key not found: set OPENAI_APIKEY (or OPENAI_API_KEY) "
        "in the environment or in a .env file."
    )
client = OpenAI(api_key=OAI_KEY)

# Create the assistant with the file_search tool enabled.
pdf_assistant = client.beta.assistants.create(
    model="gpt-4o",
    description="An assistant to extract the contents of PDF files.",
    tools=[{"type": "file_search"}],
    name="PDF assistant",
)

# Create thread
thread = client.beta.threads.create()

# Upload the PDF; the context manager closes the local file handle once the
# upload call returns.
with open(info_filename, "rb") as pdf_file:
    info_file = client.files.create(file=pdf_file, purpose="assistants")

# Add the user message: the prompt plus the uploaded file as an attachment.
client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    attachments=[
        Attachment(
            file_id=info_file.id, tools=[AttachmentToolFileSearch(type="file_search")]
        )
    ],
    content=prompt,
)

# Run the thread with the assistant and poll until the run stops.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=pdf_assistant.id,
    timeout=1000,
    temperature=0,
    # response_format={"type": "json_object"},
)

if run.status != "completed":
    # Build a single readable message: passing several positional args to an
    # exception makes it render as a tuple. last_error (when the run object
    # provides it) says why the run did not complete.
    failure_detail = getattr(run, "last_error", None)
    raise RuntimeError(f"Run failed: {run.status} (last_error={failure_detail})")

# Materialise the listing into a plain list so later cells can index it.
messages_cursor = client.beta.threads.messages.list(thread_id=thread.id)
messages = list(messages_cursor)

In [2]:
# Take the text of the first listed message (presumably the assistant's reply
# -- verify the listing order) and strip any Markdown code-fence markers so
# that only the JSON payload remains.
glance = (
    messages[0].content[0].text.value
    .replace("```json", '')
    .replace("```", '')
    .strip()
)
print(glance)

{
  "Industry at a Glance": {
    "Key Statistics": {
      "Revenue": "Revenue for the IT Consulting industry in the US is $710,683 million.",
      "Historical Revenue Growth": "Revenue has grown at a CAGR of 3.93% from 2002 to 2024.",
      "Projected Revenue Growth": "Revenue is projected to grow at a CAGR of 2.36% from 2024 to 2029.",
      "Profit Margins": "Profit is expected to remain relatively steady and account for 6.4% of revenue in 2023."
    },
    "Executive Summary": {
      "Industry Composition": "A sizable number of operators are small nonemployers or independent contractors.",
      "Technological Shifts": "The industry has experienced a shift towards cloud computing and data analytics.",
      "Revenue Growth": "Industry revenue has grown at a CAGR of 2.8% to $692.9 billion over the five years to 2023.",
      "Profit Stability": "Profit is expected to remain relatively steady and account for 6.4% of revenue in 2023.",
      "Future Projections": "Industry revenue 

In [3]:
import json

# Parse first so that invalid JSON raises before the output file is created,
# then write inside a context manager so the file is flushed and closed.
glance_data = json.loads(glance)
with open(out_filename, 'w') as out_file:
    json.dump(glance_data, out_file, indent=2)