In [None]:
# Use %pip (not !pip) so the packages are installed into the environment of the running kernel.
# PyPDF2 and requests are imported by the next cell as well, so they are installed alongside the SDK.
# NOTE(review): the import cell uses `ContentFormat`; newer SDK releases appear to have renamed it to
# `DocumentContentFormat` - pin a compatible SDK version here if that import fails (TODO confirm).
%pip install azure-ai-documentintelligence PyPDF2 requests

Import Required Libraries

In [2]:
import os
from PyPDF2 import PdfReader, PdfWriter  # For splitting the PDF
import requests
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat

Endpoint and Key Settings

In [3]:
# Configuration for Azure Document Intelligence.
# Values are read from environment variables so that secrets never have to be
# typed into (and saved with) the notebook. The empty-string fallback keeps the
# original placeholder behavior when a variable is not set.
AZURE_ENDPOINT = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")  # Document Intelligence endpoint
AZURE_KEY = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")  # Key for the Document Intelligence endpoint

In [4]:
# Configuration for the OpenAI API.
# Secrets come from environment variables rather than being stored in the
# notebook; the empty-string fallback keeps the original placeholder behavior.
OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "")  # API key for OpenAI
# This value is posted to directly by the request functions, so it must be the
# complete request URL rather than just the resource base address.
OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_CHAT_COMPLETIONS_URL", "")  # Azure OpenAI endpoint
# Headers sent with every OpenAI request.
OPENAI_HEADERS = {
    "Content-Type": "application/json",
    "api-key": OPENAI_API_KEY,
}

Split the Long PDF

In [5]:
def split_pdf(input_pdf_path, output_pdf_paths, num_splits):
    """
    Splits the input PDF into `num_splits` smaller PDFs of near-equal length.

    Pages are distributed as evenly as possible: the first
    `total_pages % num_splits` parts each receive one extra page. If
    `num_splits` exceeds the page count, the trailing parts are written with
    no pages and their reported range has a start greater than its end.

    :param input_pdf_path: Path to the input PDF file.
    :param output_pdf_paths: List of paths for the output PDF files. Must hold
        at least `num_splits` entries; only the first `num_splits` are written.
    :param num_splits: Number of parts to split the PDF into (must be >= 1).
    :return: A tuple `(total_pages, page_ranges)`, where `page_ranges` is a list
        of 1-based, inclusive `(first_page, last_page)` tuples, one per part.
    :raises ValueError: If `num_splits` is less than 1, or fewer than
        `num_splits` output paths are supplied.
    """
    # Validate up front so a bad call fails before any file is read or written
    # (previously: ZeroDivisionError, or IndexError after writing some parts).
    if num_splits < 1:
        raise ValueError(f"num_splits must be at least 1, got {num_splits}")
    if len(output_pdf_paths) < num_splits:
        raise ValueError(
            f"Expected at least {num_splits} output paths, got {len(output_pdf_paths)}"
        )

    reader = PdfReader(input_pdf_path)
    total_pages = len(reader.pages)
    pages_per_split, remainder = divmod(total_pages, num_splits)

    start_page = 0
    page_ranges = []

    for i in range(num_splits):
        writer = PdfWriter()
        # The first `remainder` parts absorb the pages left over by the integer division.
        end_page = start_page + pages_per_split + (1 if i < remainder else 0)

        for j in range(start_page, end_page):
            writer.add_page(reader.pages[j])

        with open(output_pdf_paths[i], "wb") as output_file:
            writer.write(output_file)

        # Report human-friendly 1-based, inclusive page numbers.
        page_ranges.append((start_page + 1, end_page))
        start_page = end_page

    print(f"PDF successfully split into {output_pdf_paths}")
    return total_pages, page_ranges

Call Azure Document Intelligence

In [7]:
def analyze_pdf_and_save_results(file_paths, output_markdown_paths):
    """
    Analyzes each PDF file and saves the results to markdown files.

    Each file is read fully into memory, sent to the "prebuilt-layout" model
    with markdown output requested, and the returned content is written to the
    output path at the same position in `output_markdown_paths`.

    :param file_paths: List of paths to the PDF files to analyze.
    :param output_markdown_paths: List of paths to save the markdown results.
        Must hold at least as many entries as `file_paths`.
    :return: List with the markdown content of each analyzed file, in input order.
    :raises ValueError: If fewer output paths than input files are supplied.
    """
    # Fail fast: without this check a short output list only surfaced as an
    # IndexError after earlier files had already been sent to the service.
    if len(output_markdown_paths) < len(file_paths):
        raise ValueError(
            f"Expected at least {len(file_paths)} output paths, got {len(output_markdown_paths)}"
        )

    results = []

    # Use the client as a context manager so it is closed when the analysis
    # finishes or fails part-way through.
    with DocumentIntelligenceClient(
        endpoint=AZURE_ENDPOINT, credential=AzureKeyCredential(AZURE_KEY)
    ) as document_intelligence_client:
        for file_path, markdown_path in zip(file_paths, output_markdown_paths):
            with open(file_path, "rb") as file:
                file_bytes = file.read()

            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                AnalyzeDocumentRequest(bytes_source=file_bytes),
                output_content_format=ContentFormat.MARKDOWN,
            )
            # Blocks until the long-running analysis operation has completed.
            result = poller.result()

            results.append(result.content)
            with open(markdown_path, "w", encoding="utf-8") as output_file:
                output_file.write(result.content)

            print(f"Analysis result for file {file_path} saved to {markdown_path}")
    return results

Prepare the Azure OpenAI Request

In [8]:
def generate_payload(document_content, question):
    """
    Build the request body used to query one part of the split document.

    :param document_content: Text of the document part to place in the prompt.
    :param question: The question to answer from that text.
    :return: Dict holding the system and user messages plus the sampling
        settings (temperature, top_p, max_tokens).
    """
    system_prompt = (
        "You are an AI assistant analyzing a long document that has been split into multiple parts. "
        "Not all parts contain answers to the question, and if no part contains the answer, "
        "you should respond that no answers were found in the document."
    )
    # The closing sentence gives the model a fixed phrase to use when this part has no answer.
    user_prompt = (
        "This is part of a long document split into smaller sections. "
        f"Based on the following document content, help to answer the question: {question}\n\n"
        f"Document Content:\n{document_content}\n\n"
        "If the document does not contain the answer, respond with: 'Answers not found in this part.'"
    )

    payload = {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    }
    payload.update(temperature=0.2, top_p=0.95, max_tokens=3000)
    return payload




Ask the Question Against a Single Split Document

In [9]:
def ask_openai(file_path, question, timeout=300):
    """
    Sends a request to the OpenAI API for a specific document part.

    :param file_path: Path to the split document file.
    :param question: The question to ask the document.
    :param timeout: Seconds to wait for the HTTP request before giving up.
        Without a timeout, `requests` may wait indefinitely on a stalled connection.
    :return: The decoded JSON response body.
    :raises requests.HTTPError: If the API answers with an error status code.
    :raises requests.Timeout: If no response arrives within `timeout` seconds.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        document_content = file.read()

    payload = generate_payload(document_content, question)
    response = requests.post(
        OPENAI_ENDPOINT, headers=OPENAI_HEADERS, json=payload, timeout=timeout
    )
    response.raise_for_status()
    result = response.json()

    # Check if the response explicitly states no answer was found.
    # The message content may be null; fall back to "" so the substring check
    # below cannot raise TypeError.
    answer = result['choices'][0]['message']['content'] or ""
    if "Answers not found in this part." in answer:
        print(f"No answers found in {file_path}")
    return result

Combine the answers for the final response

In [10]:
def combine_responses(responses, question, timeout=300):
    """
    Sends a final request to OpenAI to combine and summarize the responses.

    :param responses: List of responses from the split documents. Each item is
        indexed as `response['choices'][0]['message']['content']`.
    :param question: The original question.
    :param timeout: Seconds to wait for the HTTP request before giving up.
        Without a timeout, `requests` may wait indefinitely on a stalled connection.
    :return: The decoded JSON response body of the combining request.
    :raises requests.HTTPError: If the API answers with an error status code.
    :raises requests.Timeout: If no response arrives within `timeout` seconds.
    """
    # Label each partial answer with its 1-based part number so the model can
    # tell the parts apart.
    combined_content = "\n\n".join(
        f"Response from part {i + 1}:\n{response['choices'][0]['message']['content']}"
        for i, response in enumerate(responses)
    )
    payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are an AI assistant analyzing a long document that has been split into multiple parts. Not all parts contain answers to the question, and if no part contains the answer, you should respond that no answers were found in the document."
            },
            {
                "role": "user",
                "content": f"The following are responses from different parts of a split document. Combine these responses and provide a final answer to the question: {question}\n\n{combined_content}\n\nIf none of the parts contain the answer, respond with: 'No answers were found in the document.'"
            }
        ],
        "temperature": 0.2,
        "top_p": 0.95,
        "max_tokens": 3000
    }
    response = requests.post(
        OPENAI_ENDPOINT, headers=OPENAI_HEADERS, json=payload, timeout=timeout
    )
    response.raise_for_status()
    return response.json()

Question to ask

In [11]:
# Question sent with every document part and again with the final combining request
QUESTION = (
    "Based on the document, help to answer the following question: "
    "What is personal data?"
)

Main

In [None]:
if __name__ == "__main__":
    # Source document and the number of parts it is divided into
    NUM_SPLITS = 3
    input_pdf_path = "CELEX_32016R0679_EN_TXT.pdf"

    # One PDF and one markdown file per part, numbered from 1
    split_output_paths = [f"split_part_{n}.pdf" for n in range(1, NUM_SPLITS + 1)]
    markdown_output_paths = [f"split_part_{n}.md" for n in range(1, NUM_SPLITS + 1)]

    # Step 1: cut the PDF into NUM_SPLITS smaller PDFs
    total_pages, page_ranges = split_pdf(input_pdf_path, split_output_paths, NUM_SPLITS)

    # Step 2: convert every part to markdown with Document Intelligence
    analyze_pdf_and_save_results(split_output_paths, markdown_output_paths)

    # Step 3: put the question to each markdown part in turn
    responses = []
    for file_path in markdown_output_paths:
        response = ask_openai(file_path, QUESTION)
        responses.append(response)

    # Step 4: merge the per-part answers into one final answer
    final_response = combine_responses(responses, QUESTION)
    final_answer = final_response['choices'][0]['message']['content']

    # Show the result
    print("\n--- Final Answer ---", final_answer, sep="\n")