In [7]:
import pandas as pd
from PyPDF2 import PdfReader, PdfWriter
import dotenv,os,json,time
from openai import OpenAI
from mistralai import Mistral, DocumentURLChunk
from pathlib import Path

# Load API keys from a local .env file into the process environment.
dotenv.load_dotenv()

# One handle per provider. Binding both to the same name `client` made the
# second assignment silently discard the OpenAI client created just before it.
openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
mistral_client = Mistral(api_key=os.getenv('MISTRAL_API_KEY'))

# Backward compatibility: after the original cell ran, `client` referred to the
# Mistral client, so that binding is preserved.
client = mistral_client

In [13]:
def extract_nth_page(input_pdf_path, output_pdf_path, n):
    """Copy page ``n`` (1-based) of a PDF into a new single-page PDF.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the single-page PDF is written to.
        n: 1-based number of the page to extract.

    Returns:
        None. When ``n`` is smaller than 1 or larger than the page count, an
        error message is printed and no output file is written.
    """
    with open(input_pdf_path, "rb") as source_stream:
        reader = PdfReader(source_stream)

        # Guard clause: bail out on page numbers the document does not have.
        if n < 1 or len(reader.pages) < n:
            print(f"Error: The PDF does not have a page number {n}.")
            return None

        # Pages are exposed 0-based, hence the n - 1.
        wanted_page = reader.pages[n - 1]
        single_page_writer = PdfWriter()
        single_page_writer.add_page(wanted_page)

        # The output is written while the source stream is still open.
        with open(output_pdf_path, "wb") as target_stream:
            single_page_writer.write(target_stream)

# Example usage: write page 2 of the source PDF to its own file.
# NOTE(review): both paths are relative to the working directory, while later
# cells read "pdfs/one_page.pdf" and "pdfs/night_audits.pdf" -- confirm which
# location is intended before relying on Restart & Run All.
input_pdf_path = "night_audits.pdf"
output_pdf_path = "one_page.pdf"

extract_nth_page(
    input_pdf_path,
    output_pdf_path,
    2,  # page numbers are 1-based
)

In [None]:
def _wait_for_run(client, thread_id, run_id, poll_seconds=2):
    """Poll a run until it reports "completed"; raise RuntimeError if it cannot."""
    # "requires_action" is included because nothing here submits tool outputs,
    # so a run waiting for them could never reach "completed" from this loop.
    dead_end_states = {"failed", "cancelled", "expired", "incomplete", "requires_action"}
    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread_id,
            run_id=run_id
        )
        if run_status.status == "completed":
            return run_status
        if run_status.status in dead_end_states:
            raise RuntimeError(
                f"Run {run_id} ended with status '{run_status.status}' "
                f"(last_error={getattr(run_status, 'last_error', None)})"
            )
        time.sleep(poll_seconds)  # Wait before checking again


def _parse_json_reply(reply_text):
    """Parse the JSON in an assistant reply, preferring a json-fenced block."""
    if "```json" in reply_text:
        payload = reply_text.split("```json")[1].split("```")[0]
    else:
        # No fenced block: try the whole reply instead of failing with IndexError.
        payload = reply_text
    return json.loads(payload.strip())


def upload_and_query_pdf(pdf_path, query):
    """Upload a PDF, ask an OpenAI assistant about it, and return the JSON answer.

    The file is uploaded with purpose "assistants" and attached to the vector
    store named by the VECTOR_STORE_ID environment variable. The assistant named
    by ASSISTANT_ID is pointed at that store, a new thread is created with
    ``query`` as the user message, and a run with the ``file_search`` tool is
    polled every 2 seconds until it finishes.

    Cleanup runs whether or not the query succeeds: every file returned by the
    vector-store listing call is removed from the store (not only the one
    uploaded here). The uploaded file object itself is not deleted;
    ``client.files.delete`` is never called.

    Args:
        pdf_path: Path of the PDF to upload.
        query: Text sent as the user message.

    Returns:
        The object decoded from the assistant's JSON reply. If the thread lists
        several assistant messages, the last one in the listing wins.

    Raises:
        RuntimeError: The run ended without completing, or the thread contains
            no assistant message.
        ValueError: The assistant reply is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    dotenv.load_dotenv()
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    # Context manager so the file handle is closed once the upload returns.
    with open(pdf_path, "rb") as pdf_stream:
        uploaded = client.files.create(
            file=pdf_stream,
            purpose="assistants")
    file_id = uploaded.id

    vector_store_id = os.getenv('VECTOR_STORE_ID')
    print(vector_store_id)

    try:
        client.beta.vector_stores.files.create(
            vector_store_id=vector_store_id,
            file_id=file_id)
        print(f"Added file {file_id} to vector store {vector_store_id}")

        assistant_id = os.getenv('ASSISTANT_ID')
        client.beta.assistants.update(
            assistant_id,
            tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}})

        thread_id = client.beta.threads.create().id
        client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content=query)
        print(f"Added message to thread: {query}")

        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistant_id,
            tools=[{"type": "file_search"}])
        print(f"Started run with ID: {run.id}")

        _wait_for_run(client, thread_id, run.id)
        print("Run completed. Retrieving response...")

        messages = client.beta.threads.messages.list(thread_id=thread_id)
        out = None
        found_reply = False
        for reply in messages.data:
            if reply.role != "assistant":
                continue
            print("Assistant's response:")
            text_out = reply.content[0].text.value
            print(text_out)
            print("***************************************************")
            out = _parse_json_reply(text_out)
            found_reply = True
            print("successfully extracted json")

        if not found_reply:
            # Previously this surfaced as UnboundLocalError on `return out`.
            raise RuntimeError("Run completed but the thread contains no assistant message")

        return out
    finally:
        # Empty the vector store even when the query failed, so that a failed
        # attempt does not leave its upload behind for the next call.
        vector_store_files = client.beta.vector_stores.files.list(
            vector_store_id=vector_store_id)
        for stored_file in vector_store_files.data:
            client.beta.vector_stores.files.delete(
                vector_store_id=vector_store_id,
                file_id=stored_file.id)
            print(f"Deleted file {stored_file.id} from vector store {vector_store_id}")
    



# Prompt sent to the assistant; the keys listed are the fields requested in the JSON reply.
query = """
Analyze the file attached and output the data in JSON format using the following keys: 
Account, Name, Current, 30Days, 60Days, 90Days, 120Days, Credits, Balance, Limit.
"""
out = upload_and_query_pdf("pdfs/one_page.pdf", query)

# Create the output directory first: without it, open() raises
# FileNotFoundError and the result of the API round-trip above is lost.
os.makedirs("outputs", exist_ok=True)
with open("outputs/out.json", "w") as f:
    json.dump(out, f, indent=4)

    

In [8]:
# OCR the full PDF with Mistral: upload it, get a signed URL for the upload,
# then run the OCR model against that URL.
client = Mistral(api_key=os.getenv('MISTRAL_API_KEY'))
pdf_file = Path("pdfs/night_audits.pdf")

uploaded_pdf = client.files.upload(
    file={
        "file_name": pdf_file.stem,  # file name without the ".pdf" extension
        "content": pdf_file.read_bytes(),
    },
    purpose="ocr"
)
# NOTE(review): expiry=1 is presumably in hours -- confirm against the Mistral files API.
signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=1)

pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url),
                                  model="mistral-ocr-latest",
                                  include_image_base64=True)

# model_dump_json() is the Pydantic v2 serializer; the v1-style .json() it
# replaces is deprecated.
response_dict = json.loads(pdf_response.model_dump_json())
json_string = json.dumps(response_dict, indent=4)

# include_image_base64=True asks for images as base64, so this print can be
# very large; print a slice of json_string if the output swamps the notebook.
print(json_string)

KeyboardInterrupt: 