In [1]:
import io
import os
import tempfile
import tkinter as tk
from tkinter import filedialog

import fitz  # PyMuPDF
import openai
import pandas as pd
from google.cloud import vision


In [2]:
# Point the Google client library at a service-account key file. A value that
# is already present in the environment takes precedence, so the notebook also
# runs on machines where the key is not at the fallback path below.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:\\Users\\prati\\Downloads\\Creds.json",
)
# Shared Vision API client; extract_text_from_image sends its requests through it.
client = vision.ImageAnnotatorClient()

In [3]:
# Module-level set, initially empty. No function visible in this notebook reads
# this global.
# NOTE(review): looks like a leftover from an earlier design — confirm before
# removing.
extracted_and_mentioned_data = set()

In [4]:
def extract_text_from_image(image_path):
    """Run Google Cloud Vision text detection on a single image file.

    Args:
        image_path: Path of the image file to read and submit for OCR.

    Returns:
        The ``description`` of the first text annotation in the response, or
        an empty string when the response contains no text annotations.

    Raises:
        RuntimeError: If the Vision response carries an error message.
    """
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    response = client.text_detection(image=vision.Image(content=image_bytes))

    # Without this check a failed request would be indistinguishable from an
    # image that simply contains no text (both would return "").
    if response.error.message:
        raise RuntimeError(
            f"Vision API text detection failed for {image_path!r}: "
            f"{response.error.message}"
        )

    annotations = response.text_annotations
    return annotations[0].description if annotations else ""



In [5]:
def extract_pdf_text(pdf_file_path):
    """OCR every page of a PDF and return the combined text.

    Each page is rendered through a 300/72 scaling matrix, saved as a JPEG in
    a private temporary directory, and passed to extract_text_from_image.

    Args:
        pdf_file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        The per-page OCR results joined with ", " in page order. A document
        with no pages yields an empty string.
    """
    page_texts = []
    render_matrix = fitz.Matrix(300 / 72, 300 / 72)

    pdf_document = fitz.open(pdf_file_path)
    try:
        # A private temporary directory keeps the page images out of the
        # working directory, avoids name clashes between concurrent runs, and
        # is cleaned up even when rendering or OCR raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            for i in range(pdf_document.page_count):
                pixmap = pdf_document[i].get_pixmap(matrix=render_matrix)
                image_path = os.path.join(temp_dir, f"temp_image_{i}.jpg")
                pixmap.save(image_path, "jpeg")
                page_texts.append(extract_text_from_image(image_path))
    finally:
        # Always release the document handle, including on failure.
        pdf_document.close()

    return ", ".join(page_texts)


In [6]:
def split_text_into_chunks(text, chunk_size, overlap):
    """Split ``text`` into consecutive chunks that share ``overlap`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk repeats from the end of the
            previous one; must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in order. The list is empty for empty
        input, and the last chunk always ends at the end of ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step would never advance `start`, so the loop below
        # would not terminate.
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        chunks.append(text[start:start + chunk_size])
        # Stop once a chunk reaches the end of the text; continuing would only
        # emit a tail that is already fully contained in this chunk.
        if start + chunk_size >= text_length:
            break
        start += step
    return chunks

In [7]:
def process_and_extract_data(result_string, *, chunk_size=16000, overlap=300):
    """Ask the OpenAI chat API to extract insurance details from OCR text.

    The text is split with split_text_into_chunks and each chunk is sent as a
    separate chat request. Every reply line of the form ``keyword: data`` is
    turned into one row; only the first usable value per keyword is kept.

    Args:
        result_string: The full text to analyse.
        chunk_size: Maximum number of characters sent per request.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A pandas DataFrame with the columns "Keyword" and "Data" (empty, but
        with both columns, when nothing was extracted).

    Raises:
        RuntimeError: If the OPENAI_API_KEY environment variable is not set.
    """
    # The secret is read from the environment so it is never stored in the
    # notebook or its version history.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before calling "
            "process_and_extract_data."
        )
    openai.api_key = api_key

    system_message = {
        "role": "system",
        "content": "Find all the data in the pdf that you think will be important to an insurance broker. The output should contain Name of the company getting the inurance, Insurance Provider, Insurance Name, Total amount insured for, How the total amount insured is split, Insurance start day, Insurance End day, Risk Location(Risk Location may be multiple, try to find them all), this is not an exhaustive list, try to extract everything you can> If anything is not mentioned, do not return anything. I do not want you to send not mentioned"
    }

    extracted_rows = []
    seen_keywords = set()

    for chunk in split_text_into_chunks(result_string, chunk_size, overlap):
        content = chunk
        if seen_keywords:
            # Drop source lines that contain a keyword already extracted from
            # an earlier chunk before sending the text.
            content = " ".join(
                line
                for line in chunk.split("\n")
                if not any(keyword in line for keyword in seen_keywords)
            )

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-16k",
            messages=[system_message, {"role": "user", "content": content}],
            temperature=0.5,
            max_tokens=4096
        )
        reply = response['choices'][0]['message']['content']

        for line in reply.split("\n"):
            if ":" not in line:
                continue
            keyword, data = (part.strip() for part in line.split(":", 1))
            if not keyword or not data:
                continue
            # Skip placeholder answers *before* registering the keyword, so a
            # "not mentioned" reply for one chunk cannot block a real value
            # found in a later chunk.
            if "not mentioned" in data.lower():
                continue
            if keyword in seen_keywords:
                continue
            seen_keywords.add(keyword)
            extracted_rows.append({"Keyword": keyword, "Data": data})

    # Explicit columns keep the frame's shape stable when no rows were found.
    return pd.DataFrame(extracted_rows, columns=["Keyword", "Data"])


In [8]:
import gradio as gr

In [None]:
def run_pipeline(pdf_file):
    """OCR an uploaded PDF, extract insurance data and save it as an Excel file.

    Args:
        pdf_file: Uploaded-file object whose ``name`` attribute is the path of
            the PDF on disk, or None when no file has been selected.

    Returns:
        A text report: the extracted table followed by the path it was saved
        to, or a short message when no PDF or no output file was chosen.
    """
    if pdf_file is None:
        return "No PDF file selected."

    result_string = extract_pdf_text(pdf_file.name)
    extracted_table = process_and_extract_data(result_string)

    # A hidden Tk root is needed only to host the native "save as" dialog;
    # destroy it afterwards so repeated runs do not accumulate Tk instances.
    root = tk.Tk()
    root.withdraw()
    try:
        excel_file_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel files", "*.xlsx")],
        )
    finally:
        root.destroy()

    if not excel_file_path:
        return "No output Excel file selected."

    # The dialog already appends the default extension when none was typed;
    # only add it if the chosen name still lacks it.
    if not excel_file_path.lower().endswith(".xlsx"):
        excel_file_path += ".xlsx"

    extracted_table.to_excel(excel_file_path, index=False)

    return (
        extracted_table.to_string(index=False)
        + "\n"
        + f"Extracted data saved to '{excel_file_path}'."
    )

# Gradio UI definition: a single file input labelled "Select PDF file" is
# handed to run_pipeline and the returned value is displayed as text.
# NOTE(review): gr.inputs.File(type="file") and capture_session look like
# legacy Gradio API — confirm they are accepted by the installed Gradio version.
iface = gr.Interface(
    fn=run_pipeline,
    inputs=[
        gr.inputs.File(type="file", label="Select PDF file"),
    ],
    outputs="text",
    live=True,
    capture_session=True
)

# Launch the interface only when this file is executed as the main module.
if __name__ == "__main__":
    iface.launch()