# 🧾 Insurance Claim Auditor – Demo Notebook

In [None]:
import re

# Simulated Claude Sonnet prompt and logic
def detect_document_type(text):
    """
    Classify a document as "OCF", "Invoice", or "Other" by keyword matching.

    The check is case-insensitive. OCF form codes take precedence over the
    word "invoice", so a document mentioning both is classified as "OCF".

    Args:
        text: Full text of the document. ``None`` or an empty string is
            treated as a document with no recognisable markers.

    Returns:
        "OCF" if the text mentions OCF-18, OCF-21, or OCF-23;
        "Invoice" if it contains the word "invoice";
        "Other" otherwise.
    """
    # Guard against missing input so callers get "Other" instead of an
    # AttributeError from calling .lower() on None.
    if not text:
        return "Other"

    lower_text = text.lower()

    if any(form_code in lower_text for form_code in ("ocf-21", "ocf-18", "ocf-23")):
        return "OCF"

    # A plain substring test is sufficient: any text matching the former
    # pattern r"invoice\s+#?\d+" necessarily contains "invoice", so the regex
    # alternative could never change the outcome.
    if "invoice" in lower_text:
        return "Invoice"

    return "Other"

# `ocf_text` is otherwise only assigned in a later cell, so on a fresh kernel
# (Restart Kernel -> Run All) this cell would fail with NameError. Read the
# sample document here so the cell is self-sufficient.
ocf_path = "../data/sample_documents/ocf21_jane_doe.txt"
with open(ocf_path, "r") as file:
    ocf_text = file.read()

# Classify the sample document and report the result
doc_type = detect_document_type(ocf_text)
print("📄 Detected Document Type:", doc_type)


📄 Detected Document Type: OCF

In [None]:
# Only invoices and OCF forms are routed on to structured extraction;
# anything else is reported as unsupported.
print(
    "✅ Proceeding with structured extraction..."
    if doc_type in ("Invoice", "OCF")
    else "⚠️ Document type not supported for processing."
)


✅ Proceeding with structured extraction...

In [None]:
# Read the sample OCF-21 form from disk into `ocf_text`
ocf_path = "../data/sample_documents/ocf21_jane_doe.txt"
with open(ocf_path, mode="r") as file:
    ocf_text = file.read()

# Echo the document under a heading and a 50-character rule
print("📄 Sample OCF-21 Document:", "=" * 50, ocf_text, sep="\n")


In [None]:
import json

# Hand-written stand-in for the structured JSON an LLM (Claude or LLaMA)
# would return: one accepted claim, one flagged duplicate, and the total.
approved_claim = {
    "amount": "$600",
    "provider": "Toronto Rehab Inc.",
    "service": "Chiropractic",
    "date": "2024-03-20",
}
duplicate_claim = {
    "amount": "$1,200",
    "provider": "Toronto Rehab Inc.",
    "date": "2024-03-18",
    "reason": "Duplicate of Invoice #1245",
}
extracted_json = {
    "valid_claims": [approved_claim],
    "flagged_duplicates": [duplicate_claim],
    "total_approved": "$600",
}

# Pretty-print the simulated extraction
print("🧠 Extracted JSON:")
print(json.dumps(extracted_json, indent=2))


{
  "valid_claims": [
    {
      "amount": "$600",
      "provider": "Toronto Rehab Inc.",
      "service": "Chiropractic",
      "date": "2024-03-20"
    }
  ],
  "flagged_duplicates": [
    {
      "amount": "$1,200",
      "provider": "Toronto Rehab Inc.",
      "date": "2024-03-18",
      "reason": "Duplicate of Invoice #1245"
    }
  ],
  "total_approved": "$600"
}

In [None]:
# Evaluation: count the accepted and duplicate entries in the simulated
# extraction and read back the approved total it reports.
num_valid, num_duplicates = (
    len(extracted_json[section]) for section in ("valid_claims", "flagged_duplicates")
)
total = extracted_json["total_approved"]

# One summary line per metric
for label, value in (
    ("✅ Valid Entries:", num_valid),
    ("⚠️ Duplicates Flagged:", num_duplicates),
    ("💰 Total Approved:", total),
):
    print(label, value)


✅ Valid Entries: 1
⚠️ Duplicates Flagged: 1
💰 Total Approved: $600

In [None]:
import json
import base64
from PIL import Image
import io

# System prompt for Claude Sonnet: frames the model as a document analyst and
# asks for concise, structured, accurate answers.
system_prompt = " ".join(
    (
        "You are a professional document analyst specializing in extracting meaningful insights",
        "from images and text. Your responses should be concise, structured, and highly accurate.",
    )
)

# Read the stand-in "scan" as raw bytes. The path currently points at the
# sample text file and is only a placeholder for a real scanned image.
image_path = "../data/sample_documents/ocf21_jane_doe.txt"  # placeholder for scanned image path
with open(image_path, "rb") as f:
    fake_image_data = f.read()  # Simulated image input

# Base64-encode the bytes and turn them into text so they can sit in a JSON payload
img_b64 = base64.b64encode(fake_image_data).decode("utf-8")

# Instruction text sent to the model alongside the image(s). It spells out the
# exact JSON schema to return (document type, claimed services, flagged
# duplicates) and asks for JSON only, with no surrounding explanation.
content_prompt = '''
You are given an insurance claim document (either an invoice or OCF).
Your task is to extract relevant fields in the following JSON format:

{
  "document_type": "Invoice | OCF | Other",
  "claimed_services": [
    {
      "service": "...",
      "provider": "...",
      "amount": "...",
      "date": "..."
    }
  ],
  "duplicates_flagged": [
    {
      "invoice_id": "...",
      "reason": "Duplicate/Overlap"
    }
  ]
}

Strictly return the above JSON format without explanations. Only include duplicates if they are explicitly marked or repeated.
'''

# Build the single user turn for Claude Sonnet: one base64 image block per
# entry in `images`, followed by the text instruction.
images = [img_b64]  # Simulated image input

image_blocks = [
    {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/jpeg",
            "data": encoded_page,
        },
    }
    for encoded_page in images
]
instruction_block = {"type": "text", "text": content_prompt}

messages = [
    {
        "role": "user",
        "content": image_blocks + [instruction_block],
    }
]

# Assemble the request for Bedrock and serialise it to a JSON string.
# NOTE(review): both `temperature` and `top_p` are set to 0.7; for strict JSON
# extraction a lower temperature may be preferable -- confirm against the
# Anthropic sampling-parameter guidance before sending real requests.
request_payload = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 1500,
    "system": system_prompt,
    "top_p": 0.7,
    "temperature": 0.7,
    "messages": messages,
    "stop_sequences": ["### END"],
}
body = json.dumps(request_payload)

# Show the payload in indented form for inspection
print("📦 Claude Sonnet Payload:")
print(json.dumps(request_payload, indent=2))


In [None]:
import json

from decimal import Decimal

# Load mock Claude Sonnet response. JSON is UTF-8 by specification, so decode
# it explicitly rather than relying on the platform's default encoding.
with open("../data/extracted_json/../mock_claude_invoice_output.json", encoding="utf-8") as f:
    response = json.load(f)

# Extract info. `or []` also covers keys that are present but null in the
# JSON, which `.get(key, [])` alone would pass through as None.
claimed_services = response.get("claimed_services") or []
duplicates = response.get("duplicates_flagged") or []

valid_count = len(claimed_services)
duplicate_count = len(duplicates)

# Parse amounts with Decimal rather than int(): int() raises ValueError on
# amounts that carry cents (e.g. "$1,200.50"), and Decimal keeps currency
# arithmetic exact. Whole-dollar totals still print exactly as before.
total_value = sum(
    Decimal(str(s["amount"]).replace("$", "").replace(",", ""))
    for s in claimed_services
)

print("✅ Valid Claims:", valid_count)
print("⚠️ Duplicates Flagged:", duplicate_count)
print("💰 Total Value Claimed:", f"${total_value}")
print("📝 Document Type Detected:", response.get("document_type"))


In [None]:
from IPython.display import Image, display

# Render the image for the complex (discrepancy) test case inline
discrepancy_image = Image(filename="../data/sample_documents/../mock_invoice_discrepancy_test.png")
display(discrepancy_image)


In [None]:
import json

from decimal import Decimal

# Load simulated Claude output. JSON is UTF-8 by specification, so decode it
# explicitly rather than relying on the platform's default encoding.
with open("../data/sample_documents/../mock_claude_invoice_discrepancy_output.json", encoding="utf-8") as f:
    result = json.load(f)

# Read the sections defensively (missing or null -> empty list), matching how
# the mock invoice response is read elsewhere in this notebook.
services = result.get("claimed_services") or []
flagged_duplicates = result.get("duplicates_flagged") or []

# Sum the claimed amounts and collect every service carrying a discrepancy flag
total_claimed = 0
discrepancies = []
for svc in services:
    # Decimal rather than int(): int() raises ValueError on amounts with
    # cents (e.g. "$1,200.50"). Whole-dollar totals still print as before.
    amt = Decimal(str(svc["amount"]).replace("$", "").replace(",", ""))
    total_claimed += amt
    if "discrepancy_flag" in svc:
        discrepancies.append({
            # The extraction prompt's schema does not list "invoice_id" under
            # claimed_services, so fall back to a placeholder instead of
            # raising KeyError when the model omits it.
            "invoice_id": svc.get("invoice_id", "unknown"),
            "issue": svc["discrepancy_flag"]
        })

# Print summary
print("📋 Document Type:", result.get("document_type"))
print("✅ Valid Entries:", len(services))
print("⚠️ Duplicates:", len(flagged_duplicates))
print("❗ Discrepancies Detected:", len(discrepancies))
print("💰 Total Claimed:", f"${total_claimed}")
print()

# Show details of discrepancies
for d in discrepancies:
    print(f"Invoice {d['invoice_id']}: {d['issue']}")


✅ Valid Entries: 1
⚠️ Duplicates Flagged: 1
💰 Total Approved: $600