In [5]:

import openai
from openai import OpenAI
import os
from dotenv import load_dotenv
import inspect
import nbimporter
import Extracting_Data_Project
import time
import numpy as np

load_dotenv()
api_key = os.getenv("API_KEY")
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI() 

from prompts import (
    bank_statement_prompt,
    passport_prompt,
    council_tax_prompt,
    payslip_prompt,
    driving_licence_prompt,
    accountant_certificate_prompt,
    p60_prompt,
    tyo_prompt,
    sa302_prompt
)

image_blocks = Extracting_Data_Project.convert_pdf_to_png(r"C:\Users\ranvi\OneDrive\Documents\Test_Data\P60_1.pdf")

#trying to do extraction using only LLM - gpt model does python for me - quickest 
times = []
for i in range(0,3):
    start = time.time()
    response = client.responses.create(
    model="gpt-4.1-mini",
    input=[{
        "role": "user",
        "content": [
            {"type": "input_text", "text": """You are an extemely high level data analyst and your task is to extract useful information from several different types of documents used in banking. From the images given,
            identify what kind of document has been given to you. Only select from one of these options. Bank Statement, Passport, Council Tax, Payslip, Driving Licence, 
            Accountant Certificate, P60, TYO, SA302. If you are unable to identify one of these documents, then output "Document not supported." and try summarise what this dcument is showing. Take care when distinguishing between SA302 or TYO, the TYO is a less detailed overall
            view of the year with totals calculated, meanwhile the SA302 is a break down of taxes paid over the year and a lot more detailed."""},
            *image_blocks
        ]
    }]
    )
    output = response.output_text

    response2 = client.responses.create(
    model="gpt-4.1-mini",
    input=[{
        "role": "user",
        "content":[
            {"type": "input_text", "text": """You have been given the output of a previous answer from a prompt given by the user which has specified the type of document you are supposed
            to analyse and extract information from. You have also been given several python functions that should be used for this task. You have already been given the 
            image blocks and document_type used in 'handle_file' so you can skip to the document processing. All the functions and strings called in handle-file have also been provided.
            Your task is to recreate the string output I would get from running handle_file (ignore the boolean value returned)."""},
            {"type": "input_text", "text": output},
            {"type":"input_text", "text": inspect.getsource(Extracting_Data_Project.handle_file)},
            {"type":"input_text", "text": inspect.getsource(Extracting_Data_Project.extract_json_from_document)},
            {"type":"input_text", "text": bank_statement_prompt},
            {"type":"input_text", "text": passport_prompt},
            {"type":"input_text", "text": council_tax_prompt},
            {"type":"input_text", "text": payslip_prompt},
            {"type":"input_text", "text": driving_licence_prompt},
            {"type":"input_text", "text": accountant_certificate_prompt},
            {"type":"input_text", "text": p60_prompt},
            {"type":"input_text", "text": tyo_prompt},
            {"type":"input_text", "text": sa302_prompt},
            *image_blocks
        ]
    }]
    )
    end = time.time()
    #print(response2.output_text)
    #print(f"time: {round(end - start,2)}s")
    times.append(end - start)

print(round(np.sum(np.array(times))/3,2))

#Alot faster using only LLM but slightly less accurate 

#Human must give document, LLM cant do anything till document given so can't be fully agentic

18.78


In [7]:
#Creating tools as AI agents to carry out extraction

#Function calling in API to decide which document is being extracted (using openAI documentation)


import json

def extract_json_from_document(prompt, image_blocks):
    response = client.responses.create(
    model="gpt-4.1-mini",
    input=[{
        "role": "user",
        "content": [
            {"type": "input_text", "text": prompt},
            *image_blocks
        ]
    }]
)
    return(response.output_text)

#image_blocks = Extracting_Data_Project.convert_pdf_to_png(r"C:\Users\ranvi\OneDrive\Documents\Test_Data\bank-statement1.pdf")


tools = [
    {
    "type": "function",
    "name": "extracting_agent",
    "description": """Takes in images of a document and decides what type of document it is from the list: Bank Statement, Passport, Council Tax, Payslip, Driving Licence, 
        Accountant Certificate, P60, TYO, SA302 or other. And then extracts specific information from document depending on its type.""",
    "parameters":{
        "type": "object",
        "properties": {
            "document_type": {"type": "string"}
        },
        "required": ["document_type"]
    }
}
]

def extracting_agent(document_type, images, client):
    if document_type == "Bank Statement":
        return(extract_json_from_document(bank_statement_prompt, images))
    
    elif document_type == "Passport":
        return(extract_json_from_document(passport_prompt, images))
          
    elif document_type == "Council Tax":
        return(extract_json_from_document(council_tax_prompt, images))
        
    elif document_type == "Payslip":
        return(extract_json_from_document(payslip_prompt, images))
    
    elif document_type == "Driving Licence":
        return(extract_json_from_document(driving_licence_prompt, images))

    elif document_type == "Accountant Certificate":
        return(extract_json_from_document(accountant_certificate_prompt, images))
        
    elif document_type == "P60":
        return(extract_json_from_document(p60_prompt, images))
    
    elif document_type == "TYO":
        return(extract_json_from_document(tyo_prompt, images))
        
    elif document_type == "SA302":
        return(extract_json_from_document(sa302_prompt, images))
    else:
        response = client.responses.create(
        model="gpt-4.1-mini",
        input=[{
            "role": "user",
            "content": [
                {"type": "input_text", "text": """You are a high level data extraction model and have been given a document that isn't supported by the
                software. You should still try identify what the document is and try summarise it. Your response should start exactly like this 
                'Document type not supported but document appears to be ......' """},
                *images
            ]
        }]
    )
    return(response.output_text)
    
def summarise_agent(images):
    response = client.responses.create(
        model="gpt-4.1-mini",
        input=[{
            "role": "user",
            "content": [
                {"type": "input_text", "text": """You are a high level data extraction model and have been given a document that isn't supported by the
                software. You should still try identify what the document is and try summarise it. Your response should start exactly like this 
                'Document type not supported but document appears to be ......' """},
                *images
            ]
        }]
    )
    return(response.output_text)
times = []
for i in range(0,3):
    start = time.time()
    response = client.responses.create(
        model="gpt-4.1-mini",
        input= [{
        "role": "user",
        "content": [
            {"type": "input_text", "text": """Decide what the document is that has been submitted in prompt."""},
            *image_blocks
        ]
    }],
        tools=tools,
    )

    args = json.loads(response.output[0].arguments)
    #print(args["document_type"])
    document = args["document_type"]
    output=extracting_agent(document, image_blocks, client)
    #print(f"time: {round(end - start,2)}s")
    times.append(end -start)
print(round(np.sum(np.array(times))/3,2))


-1634.26


In [1]:
print((24.02 + 24.67 + 32)/3)

26.896666666666665
