In [1]:
import json
import logging
import os
import time
import asyncio
import re
from typing import Dict, List, Any, Optional, Tuple, Union
from pathlib import Path
from datetime import datetime
import traceback
import warnings

import google.generativeai as genai
from google.genai import types

import fitz
import PyPDF2

import httpx

from pydantic import BaseModel, Field, validator
from pydantic.types import constr, conint

import pathlib
import shutil
from io import BytesIO

# Configure logging once for the whole notebook: INFO level, with timestamp,
# level, logger name and message on every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s'
)
# Notebook-wide logger; extract_medical_data reports failures through it.
logger = logging.getLogger(__name__)

# Silence every UserWarning raised anywhere in the process.
# NOTE(review): this filter is broad and also hides warnings emitted by
# third-party libraries — consider narrowing it with `module=` / `message=`.
warnings.filterwarnings("ignore", category=UserWarning)




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment so the
# API key never has to be written into the notebook itself.
load_dotenv()

api_key = os.getenv('GEMINI_API_KEY')
if not api_key:
    # Surface the misconfiguration here instead of as an opaque
    # authentication error on the first model call further down.
    logger.warning("GEMINI_API_KEY is not set; check your .env file before calling Gemini.")
genai.configure(api_key=api_key)

# STAGE 1: Extract structured medical data from the referral package PDF

In [3]:
def call_gemini_model(pdf_path, prompt):
    """
    Send a PDF plus a text prompt to Gemini 2.5 Flash and return the reply text.

    The PDF is uploaded, passed to the model together with ``prompt``, and
    deleted from the service again before the function returns — including
    when generation fails, so failed calls do not leave uploads behind.

    Args:
        pdf_path: Path of the PDF file to upload.
        prompt: Instruction text sent alongside the uploaded file.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        Any exception raised by the SDK during upload, generation or cleanup
        is propagated unchanged.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    sample_file = genai.upload_file(path=pdf_path)
    try:
        response = model.generate_content([sample_file, prompt])
    finally:
        # Always remove the upload, even if generate_content raised.
        genai.delete_file(sample_file.name)
    return response.text

In [None]:
def clean_json_response_med(response_text):
    """
    Reduce a raw model reply to the JSON object it contains.

    Steps, in order:
      1. If the reply has a Markdown code fence (```json preferred, otherwise a
         bare ```), keep only the text between that fence and the next one.
      2. If both a '{' and a '}' are present, keep the span from the first '{'
         to the last '}' inclusive.
      3. Strip surrounding whitespace.

    Args:
        response_text: Text returned by the model.

    Returns:
        str: The trimmed text; it is not parsed or validated here.
    """
    fence = "```"
    # Try the language-tagged fence first, then the bare fence.
    for marker in (fence + "json", fence):
        if marker in response_text:
            response_text = response_text.split(marker)[1].split(fence)[0]
            break

    first_brace = response_text.find('{')
    last_brace = response_text.rfind('}')
    if first_brace >= 0 and last_brace >= 0:
        response_text = response_text[first_brace:last_brace + 1]

    return response_text.strip()

In [None]:
def extract_medical_data(pdf_path, output_path="extracted_medical_data.json"):
    """
    Extract structured medical data from PDF using Gemini 2.5 Flash

    The model reply is cleaned with clean_json_response_med, parsed as JSON,
    written to ``output_path`` and returned.

    Args:
        pdf_path: Path to medical PDF file
        output_path: Where the parsed JSON is saved. Defaults to
            "extracted_medical_data.json" in the working directory.

    Returns:
        dict: Structured medical data, or None if the model call, the JSON
        parsing or the file write failed (the error is logged with its
        traceback).
    """

    prompt = """
    Analyze this medical document and extract ALL available information. Organize the data into logical categories and return as a JSON object.
    Follow this structure pattern but include ALL data you find:
    {
        "patient_demographics": {
            "first_name": "example_value",
            "last_name": "example_value",
            "any_other_demographic_fields_you_find": "value"
        },
        "insurance_information": {
            "member_id": "example_value",
            "any_other_insurance_fields_you_find": "value"
        },
        "medical_information": {
            "diagnosis": "example_value",
            "any_other_medical_fields_you_find": "value"
        },
        "add_more_categories_as_needed": {
            "any_field_name": "any_value"
        }
    }
    Instructions:
    - Find EVERY piece of information in the document
    - Create appropriate category names for different types of data
    - Use descriptive field names for each piece of information
    - Include dates, numbers, names, addresses, codes - everything
    - If you find data that doesn't fit existing categories, create new ones
    - Use MM/DD/YYYY format for dates
    - Return ONLY the JSON object
    - Be comprehensive - don't miss anything
    """

    try:
        response_text = call_gemini_model(pdf_path, prompt)
        cleaned_response = clean_json_response_med(response_text)
        medical_data = json.loads(cleaned_response)

        # Save JSON to file; the encoding is pinned so the result does not
        # depend on the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(medical_data, f, indent=2)

        return medical_data

    except Exception as e:
        # Best-effort by design: callers get None instead of an exception.
        # logger.exception keeps the traceback so the cause stays diagnosable.
        logger.exception(f"Error extracting medical data: {str(e)}")
        return None



In [42]:
# Run the Stage 1 extraction on the sample referral package; the result is
# None if extraction failed (extract_medical_data logs the error).
medical_data = extract_medical_data("./Input Data/Abdulla/referral_package.pdf")

In [40]:
# Display the extracted structure.
# NOTE(review): this dumps the whole record (names, addresses, phone numbers)
# into the saved notebook output — clear or redact it before sharing. The
# execution counts also show this cell ran before the latest extraction run.
medical_data

{'fax_transmission_details': {'fax_date': '04/22/2024',
  'fax_time': '11:32',
  'fax_pages': '001/028',
  'sender_information': {'center_name': 'Better Life Multiple Sclerosis Center',
   'address': '3320 Montgomery Dr. Nashville, TN 37361',
   'fax_number': '615-562-4820',
   'phone_number': '615-562-4848',
   'doctors': ['Dr. Asriel Han', 'Dr. Aditya Shah'],
   'from': 'Erfan Rostami, BSN, RN',
   'from_phone': '615-343-1176',
   'from_fax': '615-343-1219'},
  'recipient_information': {'to': 'Golden Gate Infusion Center',
   'fax_number': '614-278-3355',
   'phone_number': '614895-7655'},
  'pages_including_cover_sheet': '1',
  'comments': ['Arabic - Spoken / English-Written',
   'Rituxan (Truxima) TP',
   'MRI Reports',
   'Hospital D/c note',
   'Demographics'],
  'disclaimer': 'The documents accompanying this transmission may contain health information that is legally protected. This information is intended only for the use of the individual or entity named above. The authorized 

# PA DATA: Extract the Prior Authorization form fields and enrich them with context


In [7]:
def extract_fields_with_positions(pdf_path):
    """
    Collect every form widget in a PDF, grouped by the page it sits on.

    Note: despite the function name, no widget coordinates are recorded; only
    the attributes listed below are captured.

    Args:
        pdf_path: Path of the PDF form to inspect.

    Returns:
        dict: Maps 1-based page number to a list of field dicts with the keys
        ``name``, ``type``, ``value``, ``page``, ``field_type``,
        ``field_type_string`` and ``field_label``. ``type`` is "checkbox" for
        checkbox widgets and "text" for every other widget type. Pages
        without widgets have no entry.
    """
    fields_by_page = {}

    # The context manager closes the document even if reading a widget fails;
    # the original left the file handle open.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for w in page.widgets() or []:
                field = {
                    "name": w.field_name,
                    "type": "checkbox" if w.field_type == fitz.PDF_WIDGET_TYPE_CHECKBOX else "text",
                    "value": w.field_value,
                    "page": page_num,
                    "field_type": w.field_type,
                    "field_type_string": w.field_type_string,
                    "field_label": w.field_label,
                }
                # Group by page while iterating instead of in a second pass.
                fields_by_page.setdefault(page_num, []).append(field)

    return fields_by_page



In [None]:
# Extract the form widgets of the sample PA form, grouped by page.
# NOTE(review): the saved output below ("Total fields: ..." and per-page
# counts) is not printed by the current extract_fields_with_positions — it
# appears to come from an earlier version of the cell; re-run to refresh.
abdulla_pa_fields = extract_fields_with_positions("Input Data/Abdulla/PA.pdf")


Total fields: 335
Page 2: 115 fields
Page 3: 81 fields
Page 4: 77 fields
Page 5: 62 fields


In [47]:
def clean_json_response_pa(response_text):
    """
    Reduce a raw model reply to the JSON array or object it contains.

    Steps, in order:
      1. If the reply has a Markdown code fence (```json preferred, otherwise a
         bare ```), keep only the text between that fence and the next one.
      2. Strip surrounding whitespace.
      3. Locate the outermost ``[...]`` span and the outermost ``{...}`` span
         and keep whichever one opens first. An object that merely contains
         an array (``{"a": [1, 2]}``) is therefore returned whole rather than
         being cut down to the inner array.

    Args:
        response_text: Text returned by the model.

    Returns:
        str: The trimmed text; it is not parsed or validated here. If no
        bracketed span is found the stripped text is returned unchanged.
    """
    if "```json" in response_text:
        response_text = response_text.split("```json")[1].split("```")[0]
    elif "```" in response_text:
        response_text = response_text.split("```")[1].split("```")[0]

    response_text = response_text.strip()

    # Collect (start, end) for each bracket kind that has an opener followed
    # by a closer somewhere after it.
    candidates = []
    for opener, closer in (("[", "]"), ("{", "}")):
        start = response_text.find(opener)
        end = response_text.rfind(closer) + 1
        if start != -1 and end > start:
            candidates.append((start, end))

    if candidates:
        # The span that opens first is the outermost JSON value.
        start, end = min(candidates)
        response_text = response_text[start:end]

    return response_text

In [56]:
def add_field_context_by_page(fields_by_page, pa_form_pdf_path, output_path="enhanced_pa_fields666.json"):
    """
    Ask Gemini to describe what each PA form field expects, one page at a time.

    For every page a prompt containing that page's fields (as JSON) is sent
    together with the form PDF via call_gemini_model. The reply is cleaned
    with clean_json_response_pa, parsed, and merged into one dict keyed by
    field name. The merged dict is written to ``output_path`` and returned.

    A page whose reply cannot be parsed is logged and skipped so the other
    pages are still processed; errors from the model call itself propagate.

    Args:
        fields_by_page: Mapping of page number to a list of field dicts, as
            produced by extract_fields_with_positions.
        pa_form_pdf_path: Path of the PA form PDF sent with every prompt.
        output_path: Where the merged result is saved. Defaults to
            "enhanced_pa_fields666.json" in the working directory.

    Returns:
        dict: Field name -> enriched field dict. A name that occurs more than
        once keeps only the entry merged last.
    """
    enhanced_fields = {}

    for page_num, page_fields in fields_by_page.items():
        print(f"Processing page {page_num} with {len(page_fields)} fields...")

        # The example is wrapped in [ ] so it matches the JSON array the
        # prompt asks for; default=str keeps non-JSON field values from
        # aborting serialisation.
        prompt = f"""
You are analyzing a Prior Authorization (PA) form for medical treatments. Your task is to understand what each form field is asking for and provide clear context.

INSTRUCTIONS:
1. Look at each field's name and field_label to understand what it's asking for
2. Write a clear, specific context explaining what medical information should go in this field
3. Focus on practical medical data that would be found in patient records
4. Be specific - don't just repeat the field label
5. Think about what a nurse or doctor would need to know to fill this field

FORM FIELDS TO ANALYZE:
{json.dumps(page_fields, indent=2, default=str)}

OUTPUT FORMAT:
Return ONLY a JSON array of objects. Each object must have exactly these fields:
- name: the original field name
- type: the field type (text/checkbox)
- page: the page number
- field_label: the original field label
- context: ONE clear sentence explaining what specific medical information goes here

EXAMPLE OUTPUT:
[
{{
 "name": "CB1",
 "type": "checkbox",
 "page": 2,
 "field_label": "Start of treatment",
 "context": "Check this box to indicate the patient is beginning their first course of this medication treatment."
}},
{{
 "name": "T2",
 "type": "text",
 "page": 2,
 "field_label": "Start date: (MM)",
 "context": "Enter the month (01-12) when the patient will begin taking this medication."
}}
]

Return valid JSON array only. No markdown, no explanations outside the JSON.
"""

        response_text = call_gemini_model(pa_form_pdf_path, prompt)
        cleaned_response = clean_json_response_pa(response_text)

        try:
            page_enhanced_fields = json.loads(cleaned_response)

            # Convert array to dict if needed
            if isinstance(page_enhanced_fields, list):
                page_enhanced_fields = {
                    field['name']: field for field in page_enhanced_fields
                }

            enhanced_fields.update(page_enhanced_fields)
        except (ValueError, KeyError, TypeError) as e:
            # Invalid JSON, an entry without "name", or a non-mapping reply:
            # drop this page only instead of losing every page.
            logger.error(f"Skipping page {page_num}: could not parse model response: {e}")
            continue

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(enhanced_fields, f, indent=2)

    return enhanced_fields

# Usage: the next cell runs add_field_context_by_page on the extracted PA fields.


In [57]:
# Enrich every extracted PA field with model-written context, page by page;
# the function also saves the merged result to a JSON file.
enhanced_pa_fields = add_field_context_by_page(abdulla_pa_fields, "Input Data/Abdulla/PA.pdf")

Processing page 2 with 115 fields...
Processing page 3 with 81 fields...
Processing page 4 with 77 fields...
Processing page 5 with 62 fields...


In [None]:
# Builds the richer "question + context" prompt for one page of PA form fields.
def form_pa_prompt(page_fields):
    """
    Build the enrichment prompt for one page of PA form fields.

    The fields are embedded as JSON between the <PA_FORM_DATA> tags, so the
    model sees double-quoted keys and ``null`` instead of a Python ``repr``
    with single quotes and ``None``.

    Args:
        page_fields: The field dicts of one page (any JSON-serialisable
            structure; other values are converted with ``str``). A string is
            assumed to be pre-serialised and is embedded unchanged.

    Returns:
        str: The complete prompt text.
    """
    if isinstance(page_fields, str):
        fields_json = page_fields
    else:
        fields_json = json.dumps(page_fields, indent=2, default=str)

    return f"""You are an expert medical document processing assistant specializing in Prior Authorization (PA) form analysis and field mapping. Your task is to process and enrich PA form field data with detailed contextual information.

Given Input:
1. A structured dataset containing PA form field definitions including:
   - Field names (e.g. CB1, T1)
   - Field types (checkbox, text, etc.)
   - Page numbers
   - Field labels
   - Current values

2. The complete Prior Authorization form PDF document

Required Processing:
For each form field, analyze sequentially by page number and:

1. Extract the implicit question being asked by the field
   - For checkboxes: Frame the label as a yes/no question
   - For text fields: Frame as an information request
   - For dates: Specify what event/action the date refers to

2. Generate rich contextual information that includes:
   - The section/category the field belongs to
   - Whether it's a primary question or sub-question
   - Whose information is being requested (patient, provider, insurer)
   - Any dependencies on other fields
   - Clinical relevance of the requested information

<CRITICAL_REQUIREMENTS>
- Every field must have both question and context added
- Context must be specific and clinically relevant
- Maintain logical relationships between fields
- Preserve exact field names and labels
- Keep context concise but informative (25 words max)
- Only output valid JSON
</CRITICAL_REQUIREMENTS>

<RESPONSE_FORMAT>
Each output JSON object should only contain the fields - name, type, page, field_label, question, context in the following format:
{{"name": "CB1",
 "type": "checkbox",
 "page": 2,
 "field_label": "Start of treatment",
 "question": "Is this a new treatment start for the patient?",
 "context": "Initial checkbox in treatment timeline section indicating whether patient is beginning new therapy versus continuing existing treatment."}}
{{"name": "T2",
 "type": "text", 
 "page": 2,
 "field_label": "Start date: (MM)",
 "question": "What is the month of treatment start?",
 "context": "2-digit month format for planned medication initiation date in treatment scheduling section."}}
</RESPONSE_FORMAT>

Example object structure:
{{
  "name": "CB1",
  "type": "checkbox",
  "page": 2,
  "field_label": "Start of treatment",
  "question": "Is this a new treatment start for the patient?",
  "context": "Initial checkbox in treatment timeline section indicating whether patient is beginning new therapy versus continuing existing treatment."
}}

<PA_FORM_DATA>
{fields_json}
</PA_FORM_DATA>

Return valid JSON array only. No explanations outside the JSON."""

# Async counterpart of call_gemini_model, so several pages can be queried concurrently.
async def query_gemini_async(prompt, pdf_path, model="gemini-2.5-flash"):
    """
    Upload a PDF, query a Gemini model with it, and return the reply text.

    The blocking SDK calls run in the event loop's default executor so the
    loop stays free for other tasks. The upload is deleted again once the
    request finishes, whether it succeeded or failed.

    Args:
        prompt: Instruction text sent alongside the uploaded file.
        pdf_path: Path of the PDF file to upload.
        model: Name of the Gemini model to use.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        Any exception raised by the SDK during upload, generation or cleanup
        is propagated unchanged.
    """
    filepath = pathlib.Path(pdf_path)

    def _upload_generate_cleanup():
        # Runs in a worker thread: upload, generate, then always delete.
        uploaded = genai.upload_file(path=filepath)
        try:
            return genai.GenerativeModel(model).generate_content([uploaded, prompt])
        finally:
            genai.delete_file(uploaded.name)

    # get_running_loop is the supported way to reach the loop from inside a
    # coroutine; get_event_loop is deprecated for this use.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, _upload_generate_cleanup)

    return response.text



In [None]:
async def process_page(page, pdf_path="Input Data/Abdulla/PA.pdf", fields_by_page=None):
    """
    Build the enrichment prompt for one page and query Gemini with it.

    Args:
        page: Key of the page in the field mapping.
        pdf_path: Path of the PA form PDF sent with the prompt. Defaults to
            the sample form used throughout this notebook.
        fields_by_page: Mapping of page key to that page's field dicts.
            Defaults to the notebook-level ``abdulla_pa_fields``.

    Returns:
        tuple: ``(page, result)`` where ``result`` is the raw reply text.
    """
    if fields_by_page is None:
        fields_by_page = abdulla_pa_fields
    prompt = form_pa_prompt(fields_by_page[page])
    result = await query_gemini_async(prompt, pdf_path)
    return page, result

# One coroutine per page key of abdulla_pa_fields; gather awaits them all and
# returns their (page, result) tuples in the same order as `tasks`.
# The bare top-level `await` relies on the notebook's running event loop.
tasks = [process_page(page) for page in abdulla_pa_fields]
results = await asyncio.gather(*tasks)

# Process the results to convert to proper JSON
# The merged dict is keyed by field name, so a name that appears more than
# once keeps only the entry merged last.
abdulla_pa_fields_with_context = {}
for page, result in results:
    try:
        # Clean and parse the JSON response
        cleaned_result = clean_json_response_pa(result)
        page_data = json.loads(cleaned_result)
        
        # Convert array to dictionary format
        if isinstance(page_data, list):
            for field in page_data:
                abdulla_pa_fields_with_context[field['name']] = field
        else:
            # A non-list reply is merged as-is (presumably already keyed by
            # field name — TODO confirm against real responses).
            abdulla_pa_fields_with_context.update(page_data)
            
    except Exception as e:
        # A page that fails to parse is reported and skipped; the remaining
        # pages are still merged and saved.
        print(f"Error processing page {page}: {e}")
        continue

# Save to JSON file
with open("enhanced_pa_fields.json", "w") as f:
    json.dump(abdulla_pa_fields_with_context, f, indent=2)

# NOTE(review): the saved output reports 322 fields while the extraction
# output earlier in the notebook lists 335, and it carries a check mark that
# this print does not emit — the output looks stale and some fields may be
# missing or overwritten; re-run and compare the counts.
print(f"Saved {len(abdulla_pa_fields_with_context)} enhanced PA fields to enhanced_pa_fields.json")

✅ Saved 322 enhanced PA fields to enhanced_pa_fields.json
