In [2]:
import pandas as pd
from PyPDF2 import PdfReader, PdfWriter
import dotenv,os,json,time
from openai import OpenAI
# Load settings (including the API key) from a local .env file, if one exists,
# so that no credential has to be written into the notebook itself.
dotenv.load_dotenv()

# Shared OpenAI client used by the exploratory cells below.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [13]:
def extract_nth_page(input_pdf_path, output_pdf_path, n):
    """Copy page ``n`` (1-based) of a PDF into a new single-page PDF.

    Args:
        input_pdf_path: Path of the PDF to read from.
        output_pdf_path: Path where the single-page PDF is written.
        n: 1-based number of the page to extract.

    Returns:
        None. When ``n`` is smaller than 1 or larger than the page count,
        an error message is printed and no output file is written.
    """
    with open(input_pdf_path, "rb") as source:
        reader = PdfReader(source)

        # Guard clause: bail out before creating any output for a bad page number.
        if n < 1 or n > len(reader.pages):
            print(f"Error: The PDF does not have a page number {n}.")
            return

        # Callers count pages from 1, the reader's page list counts from 0.
        single_page = PdfWriter()
        single_page.add_page(reader.pages[n - 1])

        # The write happens inside the outer ``with``, i.e. while the source
        # handle is still open.
        with open(output_pdf_path, "wb") as target:
            single_page.write(target)

# Example usage: pull page 2 out of the full report.
input_pdf_path = "night_audits.pdf"
# Write into pdfs/ because the upload cells further down read
# "pdfs/one_page.pdf"; writing to the working directory left them pointing at
# a file this cell never produced.
output_pdf_path = "pdfs/one_page.pdf"

# Create the target folder on a fresh checkout so the write cannot fail.
os.makedirs(os.path.dirname(output_pdf_path), exist_ok=True)
extract_nth_page(input_pdf_path, output_pdf_path, 2)

In [40]:
# Upload the single-page PDF. The context manager closes the local file handle
# as soon as the upload call returns instead of leaking it for the kernel's
# lifetime.
with open("pdfs/one_page.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="assistants"
    )

file_id = file.id
print(f"Uploaded file ID: {file_id}")

# Attach the uploaded file to the vector store used for file search.
vector_store_id = "vs_67c90f8422dc819183e709c3b108fa49"
client.beta.vector_stores.files.create(
    vector_store_id=vector_store_id,
    file_id=file_id
)

print(f"Added file {file_id} to vector store {vector_store_id}")

# Point the assistant's file_search tool at that vector store.
assistant_id="asst_PR4MLhFeyXeKTd8A6shiznZS"
client.beta.assistants.update(
    assistant_id,
    tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}}
)

print(f"Updated assistant {assistant_id} to use vector store {vector_store_id}")

Uploaded file ID: file-MxbTQEhRLzfpoFMZZXhKoX
Added file file-MxbTQEhRLzfpoFMZZXhKoX to vector store vs_67c90f8422dc819183e709c3b108fa49
Updated assistant asst_PR4MLhFeyXeKTd8A6shiznZS to use vector store vs_67c90f8422dc819183e709c3b108fa49


In [None]:
# Step 1: Create a fresh thread for this question
thread = client.beta.threads.create()
thread_id = thread.id
query = """
Analyze the file attached and output the data in JSON format using the following keys: 
Account, Name, Current, 30Days, 60Days, 90Days, 120Days, Credits, Balance, Limit.
"""

# Step 2: Add the user's question to the thread
message = client.beta.threads.messages.create(
    thread_id=thread_id,
    role="user",
    content=query
)

print(f"Added message to thread: {query}")

# Step 3: Run the assistant on the thread
run = client.beta.threads.runs.create(
    thread_id=thread_id,
    assistant_id=assistant_id,
    tools=[{"type": "file_search"}]  # Ensure file search is enabled
)

print(f"Started run with ID: {run.id}")

# Step 4: Wait for the run to complete and retrieve the response.
# A run that ends unsuccessfully never becomes "completed", so those states
# must stop the loop too; otherwise the cell would poll forever.
while True:
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread_id,
        run_id=run.id
    )
    if run_status.status == "completed":
        break
    if run_status.status in ("failed", "cancelled", "expired", "incomplete"):
        raise RuntimeError(
            f"Run {run.id} ended with status {run_status.status!r}: "
            f"{run_status.last_error}"
        )
    time.sleep(2)  # Wait for 2 seconds before checking again

print("Run completed. Retrieving response...")

# Retrieve the messages on the thread
messages = client.beta.threads.messages.list(
    thread_id=thread_id
)

# Print the first assistant message found in the listing
for message in messages.data:
    if message.role == "assistant":
        print("Assistant's response:")
        print(message.content[0].text.value)
        break

Added message to thread: 
Analyze the file attached and output the data in JSON format using the following keys: 
Account, Name, Current, 30Days, 60Days, 90Days, 120Days, Credits, Balance, Limit.

Started run with ID: run_DD4TZ8RNRtFg1yWDrKzZ92Qp
Run completed. Retrieving response...
Assistant's response:
Here is the extracted data from the attached PDF in JSON format:

```json
[
    {
        "Account": "1911",
        "Name": "SURE WINNER FOODS",
        "Current": "0.00",
        "30Days": "0.00",
        "60Days": "0.00",
        "90Days": "0.00",
        "120Days": "(1.08)",
        "Credits": "(1.08)",
        "Balance": "(1.08)",
        "Limit": "0.00"
    },
    {
        "Account": "1925",
        "Name": "L & O PLUMBING & HEATING",
        "Current": "0.00",
        "30Days": "0.00",
        "60Days": "0.00",
        "90Days": "0.00",
        "120Days": "(2.88)",
        "Credits": "(2.88)",
        "Balance": "(2.88)",
        "Limit": "0.00"
    },
    {
        "Account":

In [47]:
# Empty the vector store: list the files currently attached to it, report how
# many the listing returned, then detach them one by one.
vector_store_files = client.beta.vector_stores.files.list(vector_store_id=vector_store_id)
print(len(vector_store_files.data))

for file in vector_store_files.data:
    client.beta.vector_stores.files.delete(vector_store_id=vector_store_id, file_id=file.id)
    print(f"Deleted file {file.id} from vector store {vector_store_id}")


0


In [3]:
def _extract_json_payload(text):
    """Return the JSON text inside the first fenced block of ``text``, or all of ``text`` if unfenced."""
    for fence in ("```json", "```"):
        if fence in text:
            # Keep only what sits between the opening fence and the next closing fence.
            return text.split(fence, 1)[1].split("```", 1)[0].strip()
    return text.strip()


def _wait_for_run(client, thread_id, run_id, poll_seconds=2):
    """Poll a run until it completes; raise RuntimeError if it ends unsuccessfully."""
    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread_id,
            run_id=run_id
        )
        if run_status.status == "completed":
            return run_status
        # These states never turn into "completed"; stop instead of looping forever.
        if run_status.status in ("failed", "cancelled", "expired", "incomplete"):
            raise RuntimeError(
                f"Run {run_id} ended with status {run_status.status!r}: "
                f"{run_status.last_error}"
            )
        time.sleep(poll_seconds)


def _clear_vector_store(client, vector_store_id):
    """Detach every file returned by the vector store listing from the store."""
    vector_store_files = client.beta.vector_stores.files.list(
        vector_store_id=vector_store_id)
    for stored_file in vector_store_files.data:
        client.beta.vector_stores.files.delete(
            vector_store_id=vector_store_id,
            file_id=stored_file.id)
        print(f"Deleted file {stored_file.id} from vector store {vector_store_id}")


def upload_and_query_pdf(pdf_path, query):
    """Upload a PDF, ask the assistant about it through file search, and return the parsed JSON reply.

    The API key, vector store ID and assistant ID are read from the
    environment (``OPENAI_API_KEY``, ``VECTOR_STORE_ID``, ``ASSISTANT_ID``)
    after loading a local ``.env`` file.

    Args:
        pdf_path: Path of the PDF to upload.
        query: Prompt sent to the assistant as the user message.

    Returns:
        The object produced by ``json.loads`` on the JSON found in the
        assistant's reply.

    Raises:
        RuntimeError: If a required ID is missing, the file cannot be indexed,
            the run ends unsuccessfully, or the assistant sends no message.
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    dotenv.load_dotenv()
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    # Validate configuration before uploading anything, so a missing setting
    # does not leave an orphaned upload behind.
    vector_store_id = os.getenv('VECTOR_STORE_ID')
    assistant_id = os.getenv('ASSISTANT_ID')
    if not vector_store_id or not assistant_id:
        raise RuntimeError(
            "VECTOR_STORE_ID and ASSISTANT_ID must be set in the environment or .env file")

    # The context manager closes the local handle once the upload has returned.
    with open(pdf_path, "rb") as pdf_file:
        file = client.files.create(
            file=pdf_file,
            purpose="assistants")
    file_id = file.id

    print(vector_store_id)
    try:
        # Wait until the store has finished indexing the file; a run started
        # earlier could search a store that does not contain the document yet.
        attached = client.beta.vector_stores.files.create_and_poll(
            file_id=file_id,
            vector_store_id=vector_store_id)
        if attached.status != "completed":
            raise RuntimeError(
                f"File {file_id} was not indexed (status {attached.status!r}): "
                f"{attached.last_error}")
        print(f"Added file {file_id} to vector store {vector_store_id}")

        client.beta.assistants.update(
            assistant_id,
            tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}})

        thread_id = client.beta.threads.create().id
        client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content=query)
        print(f"Added message to thread: {query}")

        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistant_id,
            tools=[{"type": "file_search"}])
        print(f"Started run with ID: {run.id}")

        _wait_for_run(client, thread_id, run.id)
        print("Run completed. Retrieving response...")

        messages = client.beta.threads.messages.list(
            thread_id=thread_id)
        for message in messages.data:
            if message.role == "assistant":
                print("Assistant's response:")
                text_out = message.content[0].text.value
                print(text_out)
                print("***************************************************")
                out = json.loads(_extract_json_payload(text_out))
                print("successfully extracted json")
                # Stop at the first assistant message in the listing, as the
                # exploratory cell earlier in this notebook does.
                return out

        raise RuntimeError("The assistant did not return any message")
    finally:
        # Always empty the store, even on failure, so files from this call
        # cannot leak into the next query.
        _clear_vector_store(client, vector_store_id)
    



query = """
Analyze the file attached and output the data in JSON format using the following keys: 
Account, Name, Current, 30Days, 60Days, 90Days, 120Days, Credits, Balance, Limit.
"""
out=upload_and_query_pdf("pdfs/one_page.pdf", query)

# Create the output folder on a fresh checkout so the write below cannot fail
# with FileNotFoundError.
os.makedirs("outputs", exist_ok=True)
with open("outputs/out.json", "w") as f:
    json.dump(out, f,indent=4)

    

vs_67c90f8422dc819183e709c3b108fa49
Added file file-3nai3Qhcp1BBmUv13UbHDA to vector store vs_67c90f8422dc819183e709c3b108fa49
Added message to thread: 
Analyze the file attached and output the data in JSON format using the following keys: 
Account, Name, Current, 30Days, 60Days, 90Days, 120Days, Credits, Balance, Limit.

Started run with ID: run_81aK3FaeGCNM6bTN6kMjzq4H
Run completed. Retrieving response...
Assistant's response:
```json
[
    {
        "Account": "1911",
        "Name": "SURE WINNER FOODS",
        "Current": "0.00",
        "30Days": "0.00",
        "60Days": "0.00",
        "90Days": "0.00",
        "120Days": "(1.08)",
        "Credits": "(1.08)",
        "Balance": "(1.08)",
        "Limit": "0.00"
    },
    {
        "Account": "1925",
        "Name": "L & O PLUMBING & HEATING",
        "Current": "0.00",
        "30Days": "0.00",
        "60Days": "0.00",
        "90Days": "0.00",
        "120Days": "(2.88)",
        "Credits": "(2.88)",
        "Balance": "(2.