In [28]:
import google.generativeai as genai
import pathlib
import httpx
import os

from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Check if API key is loaded
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY not found in environment variables. Please check your .env file.")

# Configure the API
genai.configure(api_key=api_key)

# Retrieve and encode the PDF byte
filepath = pathlib.Path('Input Data/Adbulla/PA.pdf')


In [29]:
prompt = """Please analyze this Prior Authorization (PA) document and extract all available information into a structured JSON format. This document is a prior authorization form that may contain multiple pages with patient information, provider details, clinical notes, insurance information, and handwritten annotations.ations.ations.

IMPORTANT: This document may contain handwritten text, stamps, signatures, and various form fields. Pay special attention to:
- Handwritten patient names, dates, and clinical notes (may be in cursive, print, or mixed styles)
- Stamped information (dates, provider stamps, approval stamps, fax transmission stamps)
- Checkboxes that may be marked with X, checkmarks, circles, or filled squares
- Form fields that may be filled in by hand, typed, or a combination
- Multiple pages that may contain different types of information
- Partial or incomplete information where forms are only partially filled

EXTRACTION GUIDELINES:
1. If information is not clearly visible or present, use null (not empty strings)
2. Preserve exact text as it appears, including spelling variations
3. For dates, maintain the original format found in the document
4. For checkboxes, note what is marked and what the options were
5. Capture both typed and handwritten content separately when both exist
6. Include any crossed-out or corrected information in notes

PRIOR AUTHORIZATION SPECIFIC PATTERNS TO LOOK FOR:OR:OR:
- PA request numbers or case numbers
- Requested medications/treatments/procedures with specific details details details
- Medical necessity justification and clinical criterial criterial criteria
- Insurance plan information and coverage detailscoverage detailscoverage details
- Provider certification and attestation statementstion statementstion statements
- Formulary alternatives or step therapy requirementsquirementsquirements
- Dosage, frequency, and duration of requested treatmentsted treatmentsted treatment
- Previous treatment history and failure documentatione documentatione documentation
- Clinical guidelines and evidence-based criteriased criteriased criteria
- Approval/denial status and effective dates- Approval/denial status and effective dates- Approval/denial status and effective dates

Follow this comprehensive JSON structure, filling in only the fields that are clearly present in the document:Follow this comprehensive JSON structure, filling in only the fields that are clearly present in the document:Follow this comprehensive JSON structure, filling in only the fields that are clearly present in the document:

{
    "document_metadata": {
        "document_type": "Prior Authorization", "Prior Authorization", "Prior Authorization",
        "form_name": "",
        "form_version": "",,,
        "total_pages": "",
        "document_date": "",
        "submission_method": "",",",
        "document_quality": ""  "document_quality": ""  "document_quality": ""
    },
    "patient_information": {rmation": {rmation": {
        "name": {
            "full_name": "",
            "first_name": "",,,
            "last_name": "",
            "middle_initial": "", "", "",
            "name_source": ""  "name_source": ""  "name_source": ""
        },
        "demographics": {
            "date_of_birth": "",irth": "",irth": "",
            "age": "",
            "gender": "",
            "weight": "",
            "height": ""  "height": ""  "height": ""
        },
        "contact_information": {tion": {tion": {
            "address": {
                "street": "",",",
                "city": "",
                "state": "",
                "zip_code": ""
            },
            "phone_numbers": {
                "primary": "",
                "secondary": ""
            }
        },
        "identifiers": {ifiers": {ifiers": {
            "mrn": "",
            "patient_id": "",
            "account_number": "": "": ""
        }
    },
    "insurance_information": {e_information": {e_information": {
        "primary_insurance": {rimary_insurance": {rimary_insurance": {
            "insurance_name": "",ame": "",ame": "",
            "member_id": "",": "",": "",
            "group_number": "",
            "policy_number": "","","",
            "plan_type": "",
            "subscriber_name": "",   "subscriber_name": "",   "subscriber_name": "",
            "subscriber_relationship": "",      "subscriber_relationship": "",      "subscriber_relationship": "",
            "subscriber_dob": """"""
        },
        "authorization_info": {
            "pa_number": "",
            "case_number": "",
            "request_date": "",
            "effective_date": "",: "",: "",
            "expiration_date": "",,,
            "status": "",
            "approval_date": "",te": "",te": "",
            "denial_date": "",,,
            "appeal_rights": ""
        }
    },
    "healthcare_providers": {
        "prescribing_provider": {
            "name": "",
            "title": "",
            "specialty": "",
            "practice_name": "",: "",: "",
            "address": "",  "address": "",  "address": "",
            "phone": "",
            "fax": "",
            "npi": "",
            "dea_number": "",,,
            "license_number": ""
        },,,
        "facility_information": {  "facility_information": {  "facility_information": {
            "name": "",
            "type": "",
            "address": "","","",
            "phone": "",
            "fax": "",
            "npi": ""
        }
    },
    "requested_treatment": {nt": {nt": {
        "medication_information": {rmation": {rmation": {
            "drug_name": "",": "",": "",
            "generic_name": "",
            "brand_name": "",: "",: "",
            "ndc_number": "",  "ndc_number": "",  "ndc_number": "",
            "strength": "",
            "dosage_form": "",m": "",m": "",
            "route_of_administration": "",stration": "",stration": "",
            "dosage": "",
            "frequency": "", "", "",
            "quantity": "",: "",: "",
            "days_supply": "",
            "number_of_refills": ""  "number_of_refills": ""  "number_of_refills": ""
        },
        "procedure_information": {rmation": {rmation": {
            "procedure_name": "",: "",: "",
            "cpt_codes": [],
            "hcpcs_codes": [],,,
            "procedure_description": "",cription": "",cription": "",
            "place_of_service": "",rvice": "",rvice": "",
            "frequency": "",": "",": "",
            "duration": "": "": ""
        },
        "device_information": {evice_information": {evice_information": {
            "device_name": "",
            "manufacturer": "",er": "",er": "",
            "model_number": "",er": "",er": "",
            "hcpcs_code": "","","",
            "quantity": ""
        }
    },
    "clinical_information": {ical_information": {ical_information": {
        "diagnoses": {  "diagnoses": {  "diagnoses": {
            "primary_diagnosis": "",nosis": "",nosis": "",
            "secondary_diagnoses": [],: [],: [],
            "icd_10_codes": [],
            "diagnosis_date": ""
        },
        "medical_necessity": {
            "clinical_rationale": "",: "",: "",
            "medical_necessity_statement": "",ement": "",ement": "",
            "supporting_clinical_data": "",ata": "",ata": "",
            "treatment_goals": "", "", "",
            "expected_outcomes": ""
        },
        "clinical_criteria": {
            "meets_criteria": "",",",
            "criteria_met": [],
            "contraindications": [],],],
            "safety_considerations": ""ations": ""ations": ""
        },
        "medications": {
            "current_medications": [],ions": [],ions": [],
            "allergies": [],,,
            "adverse_reactions": []s": []s": []
        },
        "treatment_history": {treatment_history": {treatment_history": {
            "previous_treatments": [],      "previous_treatments": [],      "previous_treatments": [],
            "treatment_outcomes": [],es": [],es": [],
            "reasons_for_failure": [],or_failure": [],or_failure": [],
            "step_therapy_completed": "",: "",: "",
            "formulary_alternatives_tried": []ied": []ied": []
        }
        nclcnical_fi_dnngd: {{
            "examination_findings": ""   "examination_findings": "",
             lab_  "ulbs,[],
            "imaging_studies": [],
            "symptoms": [],
            "symptom_onset": "",
            "duration_of_condition": "",
            "severity_level": "",
            "functional_impact": ""
        },
        "medications": {
            "current_medications": [],
            "dosages": [],
            "frequencies": [],
            "medication_allergies": [],
            "allergy_reactions": []
        },
        "medical_history": {
            "relevant_history": "",
            "previous_treatments": [],
            "treatment_outcomes": "",
            "surgical_history": "",
            "family_history": "",
            "social_history": ""
        },
        "clinical_findings": {
            "examination_findings": "",
            "vital_signs": {},
            "lab_results": [],
            "imaging_studies": [],
            "diagnostic_tests": []
        },
        "assessment_and_plan": {
            "clinical_assessment": "",
            "treatment_plan": "",
            "goals_of_referral": "",
            "expected_outcomes": ""
        }
    },
    "handwritten_content": {
        "handwritten_sections": {
            "patient_information": [],
            "clinical_notes": [],
            "provider_notes": [],
            "appointment_notes": [],
            "other_handwritten": []
        },
        "signatures_and_dates": {
            "provider_signature": "",
            "signature_date": "",
            "patient_signature": "",
            "patient_signature_date": "",
            "witness_signature": ""
        },
        "stamps_and_markings": {
            "date_stamps": [],
            "approval_stamps": [],
            "received_stamps": [],
            "other_stamps": []
        }
    },
    "form_data": {
        "checkboxes_marked": {
            "service_requests": [],
            "urgency_selections": [],
            "appointment_preferences": [],
            "consent_acknowledgments": [],
            "other_selections": []
        },
        "form_fields_completed": [],
        "sections_completed": [],
        "incomplete_sections": []
    },
    "administrative_details": {
        "processing_information": {
            "submission_date": "",
            "received_date": "",
            "processed_date": "",
            "case_number": "",
            "reference_number": "",
            "priority_level": ""
        },
        "approval_status": {
            "status": "",
            "approval_date": "",
            "approved_by": "",
            "expiration_date": "",
            "limitations": ""
        },
        "contact_information": {
            "primary_contact": "",
            "contact_phone": "",
            "contact_email": "",
            "best_time_to_call": ""
        }
    },
    "additional_information": {
        "supporting_documentation": [],
        "special_instructions": "",
        "patient_preferences": "",
        "transportation_needs": "",
        "interpreter_needed": "",
        "attachments_mentioned": [],
        "notes_and_comments": ""
    }
}

FINAL INSTRUCTIONS:
- Return ONLY valid JSON without any markdown formatting or explanatory text
- Use null for any field where information is not clearly present or legible
- Preserve original formatting and spelling from the document
- If handwriting is unclear, note this in the appropriate field with [unclear handwriting]
- For partially legible text, include what you can read followed by [partial]
"""

# Create the model
model = genai.GenerativeModel('gemini-2.0-flash')

# Generate content
response = model.generate_content([
    prompt,
    {
        "mime_type": "application/pdf",
        "data": filepath.read_bytes()
    }
])

# Store the response text
PA_text = response.text
# Print the response text
print("Extracted JSON:")
print(response.text)

Extracted JSON:
```json
{
    "document_metadata": {
        "document_type": "Prior Authorization",
        "form_name": "MEDICARE FORM\nMedication Precertification Request",
        "form_version": "GR-68535-3 (1-25)",
        "total_pages": "5",
        "document_date": null,
        "submission_method": null,
        "document_quality": null
    },
    "patient_information": {
        "name": {
            "full_name": null,
            "first_name": null,
            "last_name": null,
            "middle_initial": null,
            "name_source": null
        },
        "demographics": {
            "date_of_birth": null,
            "age": null,
            "gender": null,
            "weight": null,
            "height": null
        },
        "contact_information": {
            "address": {
                "street": null,
                "city": null,
                "state": null,
                "zip_code": null
            },
            "phone_numbers": {
               

In [30]:
import os
from mistralai import Mistral
from dotenv import load_dotenv

load_dotenv()
client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

uploaded = client.files.upload(
    file={"file_name": "referral_package.pdf", "content": open('Input Data/Adbulla/referral_package.pdf', "rb")},
    purpose="ocr"
)
signed = client.files.get_signed_url(file_id=uploaded.id)

ocr = client.ocr.process(
    model="mistral-ocr-latest",
    document={"type": "document_url", "document_url": signed.url},
    include_image_base64=False
)

# Save structured content in a variable
referral_package_text = ""
for pg in ocr.pages:
    referral_package_text += pg.markdown + "\n\n"

# Print structured content
for pg in ocr.pages:
    print(pg.markdown)



04/22/2024 WED 11:32 FAX

001/028

# FAX TRANSMISSION

**Better Life Multiple Sclerosis Center**

3320 Montgomery Dr. Nashville, TN 37361

**BetterLife**

F 615-562-4820 P: 615-562-4848

Dr. Asriel Han | Dr. Aetya Shan

---

**TO:**

Golden Gate Infusion Center

**Fax:** 614-225-3355 **Phone:** 614-295-7655

**From:** Erran Rostami, BSN, RN

**P:** 615-343-1176

**F:** 615-343-1219

---

**Page**

(including cover sheet)

---

**Comments:**

- Arabic - spoken / English - written
- Rituxen (Truxima) TP
- MRI Reports
- Hospital DIC Make
- Demographics

---

The documents accompanying this transmission may contain health information that is legally protected. This information is intended only for the use of the individual or entity named above. The authorized recipient of this information is prohibited from disclosing this information to any other party unless permitted by law or regulation.

If you are not the intended recipient, you are hereby notified that any use, disclosure, copying 

In [35]:
prompt = """Please analyze the referral package markdown text and the PA JSON text to merge and enhance the structured data. 

Your task is to:
1. Extract relevant information from the referral package markdown
2. Map this information to the appropriate fields in the PA JSON structure
3. Fill in missing fields in the PA JSON using data from the referral package
4. Ensure data consistency between both sources
5. Preserve existing PA data while enhancing it with referral package information

Focus on these key areas:
- Patient demographics and identifiers
- Provider information and contact details
- Clinical information and diagnoses
- Treatment requests and medical necessity
- Insurance and authorization details
- Administrative information

Guidelines:
- If PA JSON already has a field populated, keep that value unless the referral package has more complete/accurate information
- Use null for fields where no clear information is available
- Preserve exact text formatting and spelling from source documents
- Note any discrepancies between the two sources in a comments field

Return the enhanced PA JSON structure with merged data from both sources."""

# Use the correct Gemini API syntax
model = genai.GenerativeModel('gemini-2.0-flash')

response = model.generate_content([
    prompt,
    f"REFERRAL PACKAGE DATA:\n{referral_package_text}",
    f"PRIOR AUTHORIZATION DATA:\n{PA_text}"
])

print("Merged JSON:")
print(response.text)

Merged JSON:
```json
{
    "document_metadata": {
        "document_type": "Prior Authorization",
        "form_name": "MEDICARE FORM\nMedication Precertification Request",
        "form_version": "GR-68535-3 (1-25)",
        "total_pages": "5",
        "document_date": null,
        "submission_method": "Fax",
        "document_quality": null
    },
    "patient_information": {
        "name": {
            "full_name": "Shakh Abdulla",
            "first_name": "Shakh",
            "last_name": "Abdulla",
            "middle_initial": null,
            "name_source": "Referral Package"
        },
        "demographics": {
            "date_of_birth": "04/01/2001",
            "age": 23,
            "gender": "male",
            "weight": null,
            "height": null
        },
        "contact_information": {
            "address": {
                "street": "425 Sherman Ave APT D",
                "city": "Nashville",
                "state": "TN",
                "zip_code": "