In [57]:
import json
import logging
import os
import time
import asyncio
import re
from typing import Dict, List, Any, Optional, Tuple, Union
from pathlib import Path
from datetime import datetime
import traceback
import warnings

import google.generativeai as genai
from google.genai import types

import fitz
import PyPDF2

import httpx

from pydantic import BaseModel, Field, validator
from pydantic.types import constr, conint

import pathlib
import shutil
from io import BytesIO

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s'
)
logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore", category=UserWarning)


In [58]:
from dotenv import load_dotenv
load_dotenv()  # This loads your .env file

api_key = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=api_key)

# PA DATA

In [59]:
def extract_fields_with_positions(pdf_path):
   doc = fitz.open(pdf_path)
   fields = []
   for page_num, page in enumerate(doc, start=1):
       for w in page.widgets() or []:
           field = {
               "name": w.field_name,
               "type": "checkbox" if w.field_type == fitz.PDF_WIDGET_TYPE_CHECKBOX else "text",
               "value": w.field_value,
               "page": page_num,
               "field_type": w.field_type,
               "field_type_string": w.field_type_string,
               "field_label": w.field_label,
           }
           fields.append(field)

   # Group fields by page
   fields_by_page = {}
   for field in fields:
       page_num = field['page']
       if page_num not in fields_by_page:
           fields_by_page[page_num] = []
       fields_by_page[page_num].append(field)
   
   return fields_by_page

In [60]:
# Test it
abdulla_pa_fields = extract_fields_with_positions("Input Data/Abdulla/PA.pdf")


In [61]:
PROMPT_PA = """You are an expert medical document processing assistant specializing in Prior Authorization (PA) form analysis and field mapping. Your task is to process and enrich PA form field data with detailed contextual information.

Given Input:
1. A structured dataset containing PA form field definitions including:
   - Field names (e.g. CB1, T1)
   - Field types (checkbox, text, etc.)
   - Page numbers
   - Field labels
   - Current values

2. The complete Prior Authorization form PDF document

Required Processing:
For each form field, analyze sequentially by page number and:

1. Extract the implicit question being asked by the field
   - For checkboxes: Frame the label as a yes/no question
   - For text fields: Frame as an information request
   - For dates: Specify what event/action the date refers to

2. Generate rich contextual information that includes:
   - The section/category the field belongs to
   - Whether it's a primary question or sub-question
   - Whose information is being requested (patient, provider, insurer)
   - Any dependencies on other fields
   - Clinical relevance of the requested information

<CRITICAL_REQUIREMENTS>
- Every field must have both question and context added
- Context must be specific and clinically relevant
- Maintain logical relationships between fields
- Preserve exact field names and labels
- Keep context concise but informative (25 words max)
- Only output valid JSON
</CRITICAL_REQUIREMENTS>

<RESPONSE_FORMAT>
Each output JSON object should only contain the fields - name, type, page, field_label, question, context in the following format:
{{"name": "CB1",
 "type": "checkbox",
 "page": 2,
 "field_label": "Start of treatment",
 "question": "Is this a new treatment start for the patient?",
 "context": "Initial checkbox in treatment timeline section indicating whether patient is beginning new therapy versus continuing existing treatment."}}
{{"name": "T2",
 "type": "text", 
 "page": 2,
 "field_label": "Start date: (MM)",
 "question": "What is the month of treatment start?",
 "context": "2-digit month format for planned medication initiation date in treatment scheduling section."}}
</RESPONSE_FORMAT>

<PA_FORM_DATA>
{page_fields}
</PA_FORM_DATA>

Return valid JSON array only. No explanations outside the JSON."""

In [62]:
async def query_gemini_async(prompt, pdf_path, model="gemini-2.5-flash"):
    import pathlib
    
    filepath = pathlib.Path(pdf_path)
    loop = asyncio.get_event_loop()
    
    # Configure model for JSON output
    generation_config = genai.GenerationConfig(
        response_mime_type="application/json"
    )
    
    response = await loop.run_in_executor(
        None,
        lambda: genai.GenerativeModel(
            model,
            generation_config=generation_config
        ).generate_content([
            genai.upload_file(path=filepath),
            prompt
        ])
    )
    
    return response.text

async def process_pa_fields_async(pa_fields_data, pdf_path):
    async def process_page(page_num, page_fields):
        prompt = PROMPT_PA.format(page_fields=json.dumps(page_fields))
        result = await query_gemini_async(prompt, pdf_path)
        return page_num, result
    
    # Create tasks for all pages
    tasks = [process_page(page, fields) for page, fields in pa_fields_data.items()]
    
    # Run all pages concurrently
    results = await asyncio.gather(*tasks)
    
    # Simple results structure
    enhanced_fields = {}
    for page, result in results:
        enhanced_fields[page] = result
        print(f"Page {page} processed")
    
    return enhanced_fields

# Usage
enhanced_pa_data = await process_pa_fields_async(abdulla_pa_fields, "Input Data/Abdulla/PA.pdf")



Page 2 processed
Page 3 processed
Page 4 processed
Page 5 processed


In [63]:
all_pages_data = {}

for page_num, json_response in enhanced_pa_data.items():
    page_data = json.loads(json_response)
    all_pages_data[page_num] = page_data

# Save as one pretty JSON file
with open("pa_all_fields.json", "w") as f:
    json.dump(all_pages_data, f, indent=2, ensure_ascii=False)

print(f"Saved enhanced PA fields")

Saved enhanced PA fields


In [64]:
# Simple flat JSON with field names as keys
all_fields = {}

for page_num, json_response in enhanced_pa_data.items():
    fields = json.loads(json_response)
    for field in fields:
        all_fields[field['name']] = field

with open("pa_all_fields_flat.json", "w") as f:
    json.dump(all_fields, f, indent=2, ensure_ascii=False)

print(f"Saved {len(all_fields)} fields")

Saved 323 fields


# MEDICAL DATA

In [85]:
REFERRAL_PACKAGE_PROMPT = """You are an expert medical document processing assistant specializing in Prior Authorization (PA) forms and medical documentation. You are given a list of PA form fields with their associated context and questions. Your task is to thoroughly analyze the provided PDF referral package and extract all relevant information to accurately fill out the PA form.

## CRITICAL INSTRUCTIONS:
1. **NEVER leave answer fields empty or null** - always provide a specific value
2. **For missing information**: Use "Not documented" or "Not specified" instead of empty strings
3. **For checkbox fields**: Always answer with either "Yes" or "No" (never true/false or empty)
4. **For text fields**: Provide the exact information or "Not available" if truly missing
5. **For dates**: Use MM/DD/YYYY format (unless format is specified) or "Not specified" if date is missing 
6. **Be thorough**: Review the ENTIRE document multiple times to find all relevant information

## DETAILED EXTRACTION GUIDELINES:

### Patient Information:
- Extract ALL demographic details (name, DOB, address, phone, insurance)
- Look for patient information in headers, footers, cover pages, and forms
- Check multiple pages for complete contact information

### Medical Information:
- **Diagnoses**: Extract primary and secondary diagnoses with ICD-10 codes if available
- **Medications**: Include exact drug names, strengths, frequencies, routes of administration
- **Treatment History**: Look for previous medications tried, dates, outcomes, failures
- **Clinical Notes**: Extract relevant symptoms, assessments, lab results
- **Provider Details**: Include all prescribing physicians, NPIs, addresses, phone numbers

### Administrative Details:
- **Insurance**: Member IDs, group numbers, prior authorization numbers
- **Facility Information**: Infusion centers, pharmacies, administration locations
- **Dates**: Treatment start dates, last treatment dates, prescription dates

## ANSWER FORMAT REQUIREMENTS:

**For Checkbox Fields (CB prefixes):**
- Answer ONLY with "Yes" or "No" 
- If unclear, use clinical judgment based on available information
- Example: If asking about "Start of treatment" and document shows new prescription → "Yes"

**For Text Fields (T prefixes):**
- Provide exact values from the document
- For dates: Use MM/DD/YYYY format (e.g., "05/22/2024") unless format is specified 
- For names: Use exact spelling and format from document
- For missing info: Use "Not documented" instead of leaving blank

**For Yes/No Questions:**
- Base answers on clinical evidence in the document
- If patient has the condition/medication/history mentioned → "Yes"
- If explicitly stated they don't have it or no evidence found → "No"

## VALIDATION CHECKLIST:
Before submitting, ensure:
- ✓ Every field has a non-empty answer
- ✓ All checkbox answers are "Yes" or "No"
- ✓ All dates follow MM/DD/YYYY format
- ✓ Patient demographics are complete
- ✓ Medication information is detailed and accurate
- ✓ No fields are left with null, empty strings, or boolean values

<PA_FORM_DATA>
{pa_form_fields}
</PA_FORM_DATA>

<RESPONSE FORMAT>
[
  {{
    "name": "CB1",
    "page": 2,
    "field_label": "Start of treatment",
    "answer": "Yes"
  }},
  {{
    "name": "T2",
    "page": 2,
    "field_label": "Start date: (MM)",
    "answer": "05"
  }}
]
</RESPONSE FORMAT>

**CRITICAL**: Every field must have a specific answer - no empty strings, no null values, no boolean true/false."""

In [86]:
from pydantic import BaseModel, Field
from typing import List

class PAFormAnswer(BaseModel):
    name: str
    page: int
    field_label: str
    answer: str = Field(description="answer to the question based on the referral package PDF")

In [87]:
async def fill_pa_page_with_pydantic(page_fields, referral_pdf_path, page_num):
    prompt = REFERRAL_PACKAGE_PROMPT.format(
        pa_form_fields=json.dumps(page_fields, indent=2)
    )
    
    filepath = pathlib.Path(referral_pdf_path)
    loop = asyncio.get_event_loop()
    
    response = await loop.run_in_executor(
        None,
        lambda: genai.GenerativeModel(
            "gemini-2.5-flash",
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json",
            )
        ).generate_content([
            genai.upload_file(path=filepath),
            prompt
        ])
    )
    
    return page_num, json.loads(response.text)

In [88]:
async def fill_all_pages_async(enhanced_fields_by_page, referral_pdf_path):
    
    tasks = [
        fill_pa_page_with_pydantic(page_fields, referral_pdf_path, page_num)
        for page_num, page_fields in enhanced_fields_by_page.items()
    ]
    
    results = await asyncio.gather(*tasks)
    
    filled_pages = {}
    for page_num, page_results in results:
        filled_pages[page_num] = page_results
        print(f"Page {page_num} completed")
    
    return filled_pages

In [84]:
# Load flat file
with open("pa_all_fields.json", "r") as f:
    enhanced_fields_by_page = json.load(f)
    
# Now run it
filled_results = await fill_all_pages_async(enhanced_fields_by_page, "Input Data/Abdulla/referral_package.pdf")

Page 2 completed
Page 3 completed
Page 4 completed
Page 5 completed


In [89]:
filled_results

{'2': [{'name': 'CB1',
   'page': 2,
   'field_label': 'Start of treatment',
   'answer': 'Yes'},
  {'name': 'T2', 'page': 2, 'field_label': 'Start date: (MM)', 'answer': '05'},
  {'name': 'T3', 'page': 2, 'field_label': 'Start date: (DD)', 'answer': '23'},
  {'name': 'T4',
   'page': 2,
   'field_label': 'Start date: (YYYY)',
   'answer': '2024'},
  {'name': 'CB5',
   'page': 2,
   'field_label': 'Continuation of therapy',
   'answer': 'No'},
  {'name': 'T6',
   'page': 2,
   'field_label': 'Date of last treatment: (MM)',
   'answer': 'Not applicable'},
  {'name': 'T7',
   'page': 2,
   'field_label': 'Date of last treatment: (DD)',
   'answer': 'Not applicable'},
  {'name': 'T8',
   'page': 2,
   'field_label': 'Date of last treatment: (YYYY)',
   'answer': 'Not applicable'},
  {'name': 'T9',
   'page': 2,
   'field_label': 'Precertification Requested By:',
   'answer': 'Erfan Rostami, BSN, RN'},
  {'name': 'T10',
   'page': 2,
   'field_label': 'Phone:',
   'answer': '615-343-1176'}

In [90]:
# Flatten filled_results to simple array format
flattened_results = []

for page_num, page_fields in filled_results.items():
    for field in page_fields:
        flattened_results.append({
            "name": field["name"],
            "page": field["page"], 
            "field_label": field["field_label"],
            "answer": field["answer"]
        })

# Save as JSON
import json
with open("pa_results_flattened.json", "w") as f:
    json.dump(flattened_results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(flattened_results)} fields to flattened JSON")


KeyError: 'answer'

In [None]:
# Flatten to dictionary with field names as keys
all_fields_dict = {}

for page_num, page_results in filled_results.items():
   for field in page_results:
       all_fields_dict[field['name']] = {
           "field_label": field['field_label'],
           "answer": field['answer']
       }

# Save as flat dictionary
#with open("filled_pa_results_flat_dict.json", "w") as f:
   #json.dump(all_fields_dict, f, indent=2, ensure_ascii=False)

print(f"Saved {len(all_fields_dict)} fields to flat dictionary")

Saved 323 fields to flat dictionary


In [91]:
flattened_results

[{'name': 'CB1',
  'page': 2,
  'field_label': 'Start of treatment',
  'answer': 'Yes'},
 {'name': 'T2', 'page': 2, 'field_label': 'Start date: (MM)', 'answer': '05'},
 {'name': 'T3', 'page': 2, 'field_label': 'Start date: (DD)', 'answer': '23'},
 {'name': 'T4',
  'page': 2,
  'field_label': 'Start date: (YYYY)',
  'answer': '2024'},
 {'name': 'CB5',
  'page': 2,
  'field_label': 'Continuation of therapy',
  'answer': 'No'},
 {'name': 'T6',
  'page': 2,
  'field_label': 'Date of last treatment: (MM)',
  'answer': 'Not applicable'},
 {'name': 'T7',
  'page': 2,
  'field_label': 'Date of last treatment: (DD)',
  'answer': 'Not applicable'},
 {'name': 'T8',
  'page': 2,
  'field_label': 'Date of last treatment: (YYYY)',
  'answer': 'Not applicable'},
 {'name': 'T9',
  'page': 2,
  'field_label': 'Precertification Requested By:',
  'answer': 'Erfan Rostami, BSN, RN'},
 {'name': 'T10', 'page': 2, 'field_label': 'Phone:', 'answer': '615-343-1176'},
 {'name': 'T11', 'page': 2, 'field_label': 

In [None]:
flaten_field = {}
for page_field in abdulla_pa_fields.values():
    for field in page_field:
        flaten_field[field['name']] = field

flaten_field

{'CB1': {'name': 'CB1',
  'type': 'checkbox',
  'value': 'Off',
  'page': 2,
  'field_type': 2,
  'field_type_string': 'CheckBox',
  'field_label': 'Start of treatment'},
 'T2': {'name': 'T2',
  'type': 'text',
  'value': '',
  'page': 2,
  'field_type': 7,
  'field_type_string': 'Text',
  'field_label': 'Start date: (MM)'},
 'T3': {'name': 'T3',
  'type': 'text',
  'value': '',
  'page': 2,
  'field_type': 7,
  'field_type_string': 'Text',
  'field_label': 'Start date: (DD)'},
 'T4': {'name': 'T4',
  'type': 'text',
  'value': '',
  'page': 2,
  'field_type': 7,
  'field_type_string': 'Text',
  'field_label': 'Start date: (YYYY)'},
 'CB5': {'name': 'CB5',
  'type': 'checkbox',
  'value': 'Off',
  'page': 2,
  'field_type': 2,
  'field_type_string': 'CheckBox',
  'field_label': 'Continuation of therapy'},
 'T6': {'name': 'T6',
  'type': 'text',
  'value': '',
  'page': 2,
  'field_type': 7,
  'field_type_string': 'Text',
  'field_label': 'Date of last treatment: (MM)'},
 'T7': {'name':

In [None]:
import json

# Step 0) Make sure abdulla_form_answers is a *list of dicts*

abdulla_form_answers = flattened_results

# Step 1) Fast lookup: (page, name) -> answer
answer_map = {
   (ans["page"], ans["name"]): ans["answer"]
   for ans in abdulla_form_answers
}

# Step 2) Write the answers back into abdulla_pa_fields in-place
for page_fields in abdulla_pa_fields.values():
   for fld in page_fields:
       ans = answer_map.get((fld["page"], fld["name"]))
       if ans is None:
           continue  # no answer supplied -> leave as-is
           
       if fld["type"] == "checkbox":
           if ans == "Yes":
               fld["value"] = True
           else:
               fld["value"] = False
       else:
           fld["value"] = ans


In [None]:
def fill_pa_form(pdf_path, field_mapping, out_path=None):
   doc = fitz.open(pdf_path)
   
   for page in doc:
       for w in page.widgets() or []:
           data = field_mapping.get(w.field_name)
           if data is None:
               continue
               
           val = data["value"]
           
           if w.field_type == fitz.PDF_WIDGET_TYPE_CHECKBOX:
               checked = bool(val) and str(val).lower() not in {
                   "off", "0", "false", "no"
               }
               w.field_value = (w.button_caption or "Yes") if checked else "Off"
           else:
               w.field_value = str(val)
               
           w.update()
   
   # Save
   if out_path:
       # full new file - no incremental flag
       doc.save(out_path, deflate=True, encryption=fitz.PDF_ENCRYPT_KEEP)
   else:
       buf = BytesIO()
       doc.save(buf, deflate=True, encryption=fitz.PDF_ENCRYPT_KEEP)
       buf.seek(0)
       return buf
       
   doc.close()

In [None]:
# Create field mapping
field_mapping = {}
for page_fields in abdulla_pa_fields.values():
   for field in page_fields:
       field_mapping[field['name']] = field

# Fill and save the PDF
fill_pa_form("Input Data/Abdulla/PA.pdf", field_mapping, "filled_PA_form.pdf")
print("PDF form filled and saved")

MuPDF error: format error: partial block in aes filter

PDF form filled and saved


{}