In [1]:
import json
import logging
import os
import time
import asyncio
import re
from typing import Dict, List, Any, Optional, Tuple, Union
from pathlib import Path
from datetime import datetime
import traceback
import warnings

import google.generativeai as genai
from google.genai import types

import fitz
import PyPDF2

import httpx

from pydantic import BaseModel, Field, validator
from pydantic.types import constr, conint

import pathlib
import shutil
from io import BytesIO

# Configure the root logger: INFO and above, timestamped one-line records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the functions defined in this notebook.
logger = logging.getLogger(__name__)

# Hide every UserWarning for the remainder of the session.
warnings.simplefilter("ignore", category=UserWarning)




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()  # This loads your .env file

# The key is read from the environment so it never appears in the notebook itself.
api_key = os.getenv('GEMINI_API_KEY')
if not api_key:
    # Report the misconfiguration here rather than leaving it to show up as an
    # opaque authentication error on the first model call.
    logger.warning(
        "GEMINI_API_KEY is not set (check your .env file); Gemini API calls will likely fail."
    )
genai.configure(api_key=api_key)

# STAGE 1

In [3]:
def call_gemini_model(pdf_path, prompt):
    """
    Simple function to call Gemini 2.5 Flash with PDF and prompt

    The PDF is uploaded, sent to the model together with the prompt, and the
    upload is deleted again before the function returns or raises.

    Args:
        pdf_path: Path of the PDF file to upload.
        prompt: Instruction text sent to the model alongside the uploaded file.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        Any exception raised by the upload, generation or delete calls is
        propagated unchanged; nothing is caught here.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    sample_file = genai.upload_file(path=pdf_path)
    try:
        response = model.generate_content([sample_file, prompt])
    finally:
        # Delete the upload even when generation fails, so a failed run does
        # not leave the document behind on the remote service.
        genai.delete_file(sample_file.name)
    return response.text

In [7]:
def extract_medical_data(pdf_path, output_path="extracted_medical_data.json"):
    """
    Extract structured medical data from PDF using Gemini 2.5 Flash

    Sends the PDF and a fixed extraction prompt through ``call_gemini_model``,
    trims the reply down to its JSON payload with ``clean_json_response``,
    parses it, and optionally writes the parsed result to disk.

    Args:
        pdf_path: Path to medical PDF file
        output_path: File the parsed JSON is written to (2-space indented).
            Defaults to "extracted_medical_data.json" in the current working
            directory; pass None to skip writing.

    Returns:
        The parsed JSON (expected to be a dict of categories), or None when
        any step fails. Failures are logged with their traceback.
    """

    prompt = """
    Analyze this medical document and extract ALL available information. Organize the data into logical categories and return as a JSON object.
    Follow this structure pattern but include ALL data you find:
    {
        "patient_demographics": {
            "first_name": "example_value",
            "last_name": "example_value",
            "any_other_demographic_fields_you_find": "value"
        },
        "insurance_information": {
            "member_id": "example_value",
            "any_other_insurance_fields_you_find": "value"
        },
        "medical_information": {
            "diagnosis": "example_value",
            "any_other_medical_fields_you_find": "value"
        },
        "add_more_categories_as_needed": {
            "any_field_name": "any_value"
        }
    }
    Instructions:
    - Find EVERY piece of information in the document
    - Create appropriate category names for different types of data
    - Use descriptive field names for each piece of information
    - Include dates, numbers, names, addresses, codes - everything
    - If you find data that doesn't fit existing categories, create new ones
    - Use MM/DD/YYYY format for dates
    - Return ONLY the JSON object
    - Be comprehensive - don't miss anything
    """

    try:
        response_text = call_gemini_model(pdf_path, prompt)
        cleaned_response = clean_json_response(response_text)
        medical_data = json.loads(cleaned_response)

        # Save JSON to file (skipped when the caller passes output_path=None)
        if output_path is not None:
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(medical_data, f, indent=2)

        return medical_data

    except Exception as e:
        # Best-effort by design: callers get None instead of an exception.
        # logger.exception keeps the traceback, so the failing step (API call,
        # JSON parsing or file write) can still be identified from the log.
        logger.exception(f"Error extracting medical data: {str(e)}")
        return None

def clean_json_response(response_text):
    """Reduce a Gemini reply to the JSON object text it contains.

    If the reply holds a Markdown code fence (```json preferred, plain ```
    otherwise), only the text between the first such fence and the next
    ``` is kept. The result is then narrowed to the span from the first
    '{' to the last '}' when both are present, and surrounding whitespace
    is stripped. The returned value is a string; it is not parsed here.
    """
    text = response_text

    # Try the language-tagged fence first, then fall back to a bare fence.
    for fence in ("```json", "```"):
        if fence in text:
            text = text.split(fence)[1].split("```")[0]
            break

    first_brace = text.find('{')
    last_brace = text.rfind('}')

    # Only slice when both an opening and a closing brace were found.
    if first_brace >= 0 and last_brace >= 0:
        text = text[first_brace:last_brace + 1]

    return text.strip()

In [8]:
# Run the extraction on the sample referral package.
referral_pdf_path = "./Input Data/Abdulla/referral_package.pdf"
medical_data = extract_medical_data(referral_pdf_path)

In [9]:
# Show the extracted data as this cell's output.
# NOTE(review): the rendered output below includes names, phone and fax numbers
# taken from the referral document - clear or redact it before sharing the notebook.
medical_data

{'fax_transmission_details': {'fax_date': '04/22/2024',
  'fax_time': '11:32',
  'fax_number_pages': '001/028',
  'sender_information': {'center_name': 'Better Life Multiple Sclerosis Center',
   'address': '3320 Montgomery Dr. Nashville, TN 37361',
   'fax': '615-562-4820',
   'phone': '615-562-4848',
   'doctors': ['Dr. Asriel Han', 'Dr. Aditya Shah'],
   'from_name': 'Erfan Rostami, BSN, RN',
   'from_phone': '615-343-1176',
   'from_fax': '615-343-1219'},
  'recipient_information': {'to': 'Golden Gate Infusion Center',
   'fax': '614-278-3355',
   'phone': '614-895-7655'},
  'pages_including_cover_sheet': '1',
  'comments': ['Arabic - Spoken / English - Written',
   'Rituxan (Truxima) TP',
   'MRI Reports',
   'Hospital D/C Note',
   'Demographics'],
  'disclaimer': 'The documents accompanying this transmission may contain health information that is legally protected. This information is intended only for the use of the individual or entity named above. The authorized recipient of 