In [30]:
import json
import logging
import os
import time
import asyncio
import re
from typing import Dict, List, Any, Optional, Tuple, Union
from pathlib import Path
from datetime import datetime
import traceback
import warnings

import google.generativeai as genai
from google.genai import types

import fitz
import PyPDF2

import httpx

from pydantic import BaseModel, Field, validator
from pydantic.types import constr, conint

import pathlib
import shutil
from io import BytesIO

# Configure root logging for the notebook: INFO level, each record rendered as
# "timestamp - level - logger name - message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s'
)
# Logger named after the current module namespace (__name__).
logger = logging.getLogger(__name__)

# Ignore every warning whose category is UserWarning.
# NOTE(review): this is a blanket filter, so UserWarnings raised by any
# imported library are hidden as well — confirm that is intended.
warnings.filterwarnings("ignore", category=UserWarning)


In [31]:
from dotenv import load_dotenv
# Load variables from the .env file into the process environment.
load_dotenv()

# Read the Gemini API key from the environment and hand it to the SDK.
# os.getenv returns None when GEMINI_API_KEY is unset, and that None is passed
# to genai.configure unchanged.
# NOTE(review): there is no check for a missing key here — confirm how
# genai.configure behaves when api_key is None.
api_key = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=api_key)

In [32]:
import fitz  # PyMuPDF

def extract_fields(pdf_path):
    """Collect the form widgets of a PDF, grouped by 1-based page number.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        dict: Maps page number (counting from 1) to the list of field dicts
        found on that page, in widget iteration order. Pages without widgets
        do not appear. Every field dict has the keys ``name``, ``type``,
        ``value``, ``page``, ``field_type``, ``field_type_string`` and
        ``field_label``; checkbox entries additionally carry ``on_state``.
    """
    fields_by_page = {}

    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for w in page.widgets() or []:
                is_checkbox = (w.field_type == fitz.PDF_WIDGET_TYPE_CHECKBOX)

                field = {
                    "name": w.field_name,
                    # Every non-checkbox widget is reported as "text"; the
                    # actual widget kind stays available in field_type and
                    # field_type_string below.
                    "type": "checkbox" if is_checkbox else "text",
                    "value": w.field_value,
                    "page": page_num,
                    "field_type": w.field_type,
                    "field_type_string": w.field_type_string,
                    "field_label": w.field_label,
                }

                # Add on_state only for checkboxes
                if is_checkbox:
                    # Best effort: any failure falls back to "Yes".
                    # NOTE(review): confirm that the installed PyMuPDF Widget
                    # really exposes button_get_on_state_name(); if it does
                    # not, the AttributeError is swallowed here and every
                    # checkbox silently gets the default.
                    try:
                        field["on_state"] = w.button_get_on_state_name()  # e.g., "Yes", "On"
                    except Exception:
                        field["on_state"] = "Yes"  # safe default

                # Group while iterating; pages are visited in order, so the
                # result matches a collect-then-group pass.
                fields_by_page.setdefault(page_num, []).append(field)
    finally:
        # Release the underlying file handle even when a page fails to parse.
        doc.close()

    return fields_by_page


In [33]:
# Try extract_fields on one sample PA form; the resulting page -> fields
# mapping is displayed and reused by the cells that follow.
abdulla_pa_fields = extract_fields("Input Data/Abdulla/PA.pdf")


In [34]:
# Display the page-grouped field dictionary returned by extract_fields.
# NOTE(review): this prints every field of every page; consider showing a
# single page or a short slice to keep the cell output readable.
abdulla_pa_fields

{2: [{'name': 'CB1',
   'type': 'checkbox',
   'value': 'Off',
   'page': 2,
   'field_type': 2,
   'field_type_string': 'CheckBox',
   'field_label': 'Start of treatment',
   'on_state': 'Yes'},
  {'name': 'T2',
   'type': 'text',
   'value': '',
   'page': 2,
   'field_type': 7,
   'field_type_string': 'Text',
   'field_label': 'Start date: (MM)'},
  {'name': 'T3',
   'type': 'text',
   'value': '',
   'page': 2,
   'field_type': 7,
   'field_type_string': 'Text',
   'field_label': 'Start date: (DD)'},
  {'name': 'T4',
   'type': 'text',
   'value': '',
   'page': 2,
   'field_type': 7,
   'field_type_string': 'Text',
   'field_label': 'Start date: (YYYY)'},
  {'name': 'CB5',
   'type': 'checkbox',
   'value': 'Off',
   'page': 2,
   'field_type': 2,
   'field_type_string': 'CheckBox',
   'field_label': 'Continuation of therapy',
   'on_state': 'Yes'},
  {'name': 'T6',
   'type': 'text',
   'value': '',
   'page': 2,
   'field_type': 7,
   'field_type_string': 'Text',
   'field_label

In [35]:
# Prompt template used to enrich one page of PA form fields.
# It is rendered with str.format: {page_fields} is the only placeholder, and
# literal braces of the JSON example are therefore doubled ({{ and }}).
# The example in <RESPONSE_FORMAT> is wrapped in [ ... ] so that, once
# rendered, it is a valid JSON array as the requirements demand.
PROMPT_PA = """
You are an expert medical document processing assistant specializing in Prior Authorization (PA) form analysis and field mapping.
Your task is to process and enrich PA form field data with detailed contextual information.

Given Input:
1. A structured dataset containing PA form field definitions including:
   - Field names (e.g. CB1, T1)
   - Field types (checkbox, text, etc.)
   - Page numbers
   - Field labels
   - Current values

2. The complete Prior Authorization form PDF document

Required Processing:
For each form field, analyze sequentially by page number and:

1. Extract the implicit question being asked by the field:
   - For checkboxes: Rephrase the field_label into a yes/no question starting with "Is", "Does", or "Has".
   - For text fields: Frame as an information request.
   - For date fields: Specify what event/action the date refers to.

2. Generate rich contextual information that includes:
   - The section/category the field belongs to.
   - Whether it's a primary question or sub-question.
   - Whose information is being requested (patient, provider, insurer).
   - Any dependencies on other fields.
   - Clinical relevance of the requested information.

<CRITICAL_REQUIREMENTS>
- Every field must have both "question" and "context" added.
- Context must be specific and clinically relevant.
- Maintain logical relationships between fields.
- Preserve exact field names and field labels from the input.
- Keep context concise but informative (25 words max).
- Output must be a valid JSON array only.
- Do not include markdown formatting, triple backticks, or any extra commentary.
</CRITICAL_REQUIREMENTS>

<RESPONSE_FORMAT>
[
  {{
    "name": "CB1",
    "type": "checkbox",
    "page": 2,
    "field_label": "Start of treatment",
    "question": "Is this a new treatment start for the patient?",
    "context": "Initial checkbox in treatment timeline section indicating whether patient is beginning new therapy versus continuing existing treatment."
  }},
  {{
    "name": "T2",
    "type": "text",
    "page": 2,
    "field_label": "Start date: (MM)",
    "question": "What is the month of treatment start?",
    "context": "2-digit month format for planned medication initiation date in treatment scheduling section."
  }}
]
</RESPONSE_FORMAT>

<PA_FORM_DATA>
This is a JSON array of field definitions:
{page_fields}
</PA_FORM_DATA>

Return only the JSON array as described. Do not include any text outside the JSON array.
"""


In [36]:
async def query_gemini_async(prompt, pdf_path, model="gemini-2.5-flash"):
    """Send ``prompt`` together with the PDF at ``pdf_path`` to Gemini.

    The SDK calls used here are synchronous, so upload, generation and
    cleanup run in the event loop's default executor instead of blocking
    the loop.

    Args:
        prompt: Prompt text sent alongside the uploaded PDF.
        pdf_path: Path of the PDF to upload.
        model: Name of the Gemini model to query.

    Returns:
        ``response.text`` of the generation call. The request is configured
        with the ``application/json`` response MIME type.

    Raises:
        Exceptions from the upload or generation call propagate unchanged.
        A failure to delete the uploaded file is logged and not raised.
    """
    filepath = pathlib.Path(pdf_path)
    # Inside a coroutine the running loop is the one to use.
    loop = asyncio.get_running_loop()

    # Configure model for JSON output
    generation_config = genai.GenerationConfig(
        response_mime_type="application/json"
    )

    def _upload_generate_cleanup():
        # Upload the PDF, query the model, then always remove the upload.
        uploaded = genai.upload_file(path=filepath)
        try:
            return genai.GenerativeModel(
                model,
                generation_config=generation_config
            ).generate_content([uploaded, prompt])
        finally:
            # The form is a patient document: do not leave a copy behind for
            # every request. Cleanup is best effort so that it can never
            # mask the model response or the original error.
            try:
                genai.delete_file(uploaded.name)
            except Exception:
                logger.warning(
                    "Could not delete uploaded file %s", uploaded.name, exc_info=True
                )

    response = await loop.run_in_executor(None, _upload_generate_cleanup)

    return response.text

async def process_pa_fields_async(pa_fields_data, pdf_path):
    """Enrich the fields of every page concurrently.

    For each ``page -> fields`` entry of ``pa_fields_data`` the fields are
    serialised with ``json.dumps``, substituted into ``PROMPT_PA`` and sent
    to ``query_gemini_async`` together with ``pdf_path``. All pages are
    awaited together with ``asyncio.gather``.

    Args:
        pa_fields_data: Mapping of page number to that page's field list.
        pdf_path: Path of the PDF forwarded to ``query_gemini_async``.

    Returns:
        dict: Page number -> the value returned by ``query_gemini_async``
        for that page, in the iteration order of ``pa_fields_data``.
    """
    async def enrich_one(page_no, field_defs):
        rendered = PROMPT_PA.format(page_fields=json.dumps(field_defs))
        return page_no, await query_gemini_async(rendered, pdf_path)

    # One coroutine per page, all awaited at once.
    pending = []
    for page_no, field_defs in pa_fields_data.items():
        pending.append(enrich_one(page_no, field_defs))
    completed = await asyncio.gather(*pending)

    # gather keeps submission order, so pages are reported in input order.
    enhanced_fields = {}
    for page_no, raw_reply in completed:
        enhanced_fields[page_no] = raw_reply
        print(f"Page {page_no} processed")

    return enhanced_fields

# Usage: enrich the fields extracted from the sample PA form, one request per
# page. The bare top-level `await` is not valid in a plain Python script; it
# relies on the notebook front end running the cell inside an event loop.
enhanced_pa_data = await process_pa_fields_async(abdulla_pa_fields, "Input Data/Abdulla/PA.pdf")



Page 2 processed
Page 3 processed
Page 4 processed
Page 5 processed


In [37]:
# Display the per-page results; each value is the text returned by
# query_gemini_async for that page (still a JSON string, not parsed).
enhanced_pa_data

{2: '[\n  {\n    "name": "CB1",\n    "type": "checkbox",\n    "value": "Off",\n    "page": 2,\n    "field_type": 2,\n    "field_type_string": "CheckBox",\n    "field_label": "Start of treatment",\n    "question": "Is this a new treatment start for the patient?",\n    "context": "Primary question in the treatment timeline section for new medication initiation. Patient information."\n  },\n  {\n    "name": "T2",\n    "type": "text",\n    "value": "",\n    "page": 2,\n    "field_type": 7,\n    "field_type_string": "Text",\n    "field_label": "Start date: (MM)",\n    "question": "What is the month of treatment start?",\n    "context": "Month of planned medication initiation date. Required if \'Start of treatment\' is selected. Patient information."\n  },\n  {\n    "name": "T3",\n    "type": "text",\n    "value": "",\n    "page": 2,\n    "field_type": 7,\n    "field_type_string": "Text",\n    "field_label": "Start date: (DD)",\n    "question": "What is the day of treatment start?",\n    "co