In [None]:
# Step 1: Install the Python packages and the Tesseract OCR engine (pytesseract
# is only a wrapper and needs the tesseract binary to be installed)
!pip install -q streamlit openai PyMuPDF pdfplumber pytesseract pillow python-docx
!sudo apt install -q tesseract-ocr libtesseract-dev

# Install Cloudflare Tunnel
# Download the latest cloudflared release binary, make it executable and move it
# to /usr/local/bin, the path the tunnel step invokes it from
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared
!mv cloudflared /usr/local/bin/

# Step 2: Create application files with guaranteed filling
import os

# Create app.py
# Source of app.py, the Streamlit front end.  This is a regular (non-raw)
# string, so escapes meant for app.py itself are doubled here (e.g. "\\n").
app_py_content = """
import streamlit as st
import os
import tempfile
from modules import extract_text_from_pdf, extract_kv_pairs, fill_template

st.set_page_config(
    page_title="Insurance Document Automation",
    page_icon="📑",
    layout="wide"
)

st.title("📑 USAA Insurance Document Automation")
st.caption("Upload DOCX template and PDF photo reports to generate filled insurance documents")

with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input("OpenRouter API Key", type="password")
    st.info("Get API key from [OpenRouter](https://openrouter.ai/keys)")

    # Model selection: display label -> OpenRouter model ID ("provider/model")
    model_options = {
        "Claude Haiku (Fast)": "anthropic/claude-3-haiku",
        "DeepSeek Chat": "deepseek/deepseek-chat",
        "Google Gemini Pro": "google/gemini-pro",
        "Mistral 7B": "mistralai/mistral-7b-instruct"
    }
    selected_model = st.selectbox("Select AI Model", list(model_options.keys()), index=0)
    st.info(f"Using model: {model_options[selected_model]}")

# File upload section
col1, col2 = st.columns(2)
with col1:
    template_file = st.file_uploader("Upload DOCX Template", type=["docx"])

with col2:
    pdf_files = st.file_uploader("Upload PDF Photo Reports",
                                type=["pdf"],
                                accept_multiple_files=True)

# The button stays disabled until both a template and at least one PDF are uploaded
process_btn = st.button("Generate Document", type="primary", disabled=not (template_file and pdf_files))


def fail_processing(status, message):
    # Mark the status box as failed (otherwise it keeps showing "running"),
    # show the error and end this script run.
    status.update(label="Processing failed", state="error", expanded=True)
    st.error(message)
    st.stop()


if process_btn:
    if not api_key:
        st.error("Please enter your OpenRouter API key")
        st.stop()

    with st.status("Processing documents...", expanded=True) as status:
        # Save uploaded files temporarily
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Save template
            template_path = os.path.join(tmp_dir, "template.docx")
            with open(template_path, "wb") as f:
                f.write(template_file.getvalue())

            # Process PDFs
            combined_text = ""
            for pdf_file in pdf_files:
                # Keep only the base name so the file always lands inside tmp_dir
                pdf_path = os.path.join(tmp_dir, os.path.basename(pdf_file.name))
                with open(pdf_path, "wb") as f:
                    f.write(pdf_file.getbuffer())

                st.write(f"Processing {pdf_file.name}...")
                combined_text += extract_text_from_pdf(pdf_path) + "\\n\\n"

            # Display extracted text (preview capped at 5000 characters)
            with st.expander("View extracted text"):
                st.text(combined_text[:5000] + "..." if len(combined_text) > 5000 else combined_text)

            # Extract key-value pairs
            st.write("Analyzing content with AI...")
            try:
                context_data = extract_kv_pairs(combined_text, api_key, model_options[selected_model])
                st.json(context_data)
            except Exception as e:
                fail_processing(status, f"Error during AI processing: {str(e)}")

            # Fill template
            st.write("Generating final document...")
            try:
                output_docx = fill_template(template_path, context_data)
                status.update(label="Processing complete!", state="complete", expanded=False)
            except Exception as e:
                fail_processing(status, f"Error filling template: {str(e)}")

    # Download button
    st.download_button(
        label="Download Filled Document",
        data=output_docx,
        file_name="filled_insurance_report.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )

    # Show success message
    st.success("Document generated successfully! Click the download button above.")
"""

# Write as UTF-8 explicitly: the script contains emoji, which cannot be encoded
# when the platform's default text encoding is not UTF-8.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_py_content)

# Create modules.py with guaranteed filling using direct XML manipulation
# Source of modules.py (PDF text extraction, field extraction via LLM or regex,
# DOCX template filling).  This is a regular (non-raw) string, so backslashes
# and triple quotes meant for modules.py are escaped here.
modules_py_content = """
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image
import io
import json
import re
import openai
import tempfile
import zipfile
import os
import shutil
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape
from docx import Document
from docx.shared import Pt
import sys

# Model retried when the requested model fails
FALLBACK_MODEL = "anthropic/claude-3-haiku"

def extract_text_from_pdf(pdf_path):
    \"\"\"Extract text from a PDF, using OCR for pages with little embedded text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, each page followed by a blank line.
    \"\"\"
    page_texts = []

    # First try with PyMuPDF for text-based PDFs
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text = page.get_text()
                if not (text and len(text.strip()) > 50):
                    # Likely image-based page: render it and run OCR instead
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes()))
                    text = pytesseract.image_to_string(img)
                page_texts.append(text)
    except Exception as e:
        print(f"PyMuPDF error: {e}, falling back to pdfplumber")
        # Discard partial PyMuPDF output so no page ends up in the result twice
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    page_texts.append(text)

    return "".join(text + "\\n\\n" for text in page_texts)

def _parse_json_object(raw):
    \"\"\"Parse a model reply into a dict, tolerating text around the JSON object.

    Raises:
        TypeError, ValueError: If no JSON object can be parsed from the reply.
    \"\"\"
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        # Some models wrap the JSON in code fences or commentary even when
        # asked not to; retry on the outermost {...} span.
        if not isinstance(raw, str):
            raise
        start = raw.find("{")
        end = raw.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(raw[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("Model response is not a JSON object")
    return parsed

def _request_json(client, model_id, prompt):
    \"\"\"Send the prompt to one model and return its reply parsed as a dict.\"\"\"
    response = client.chat.completions.create(
        model=model_id,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2000,
        response_format={"type": "json_object"}
    )
    return _parse_json_object(response.choices[0].message.content)

def extract_kv_pairs(text, api_key, model_id):
    \"\"\"Extract structured claim data from report text using an LLM.

    The requested model is tried first, then FALLBACK_MODEL, and finally the
    regex-based extract_kv_fallback, so a dict is returned even when both
    model calls fail.

    Args:
        text: Report text; only the first 12000 characters are sent.
        api_key: OpenRouter API key.
        model_id: OpenRouter model ID to try first.

    Returns:
        A dict of the extracted fields.
    \"\"\"
    client = openai.OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=api_key
    )

    # Create a more robust prompt with examples.  The report text is truncated
    # to stay within the token limit.
    prompt = f\"\"\"
    Extract insurance claim information from the following report text.
    Return ONLY valid JSON with no additional commentary. Use these exact keys:
    - claim_number
    - insured_name
    - address_of_loss
    - date_of_loss
    - policy_number
    - damage_description
    - roof_damage_details
    - fence_damage
    - pool_damage
    - food_loss_amount
    - mortgage_company

    Example Output:
    {{
      "claim_number": "CL123456",
      "insured_name": "John Doe",
      "address_of_loss": "123 Main St, Anytown, ST 12345",
      "date_of_loss": "2023-08-15",
      "policy_number": "POL987654",
      "damage_description": "Wind damage to roof and fence",
      "roof_damage_details": "Multiple missing shingles on southwest section",
      "fence_damage": "3 sections destroyed",
      "pool_damage": "None",
      "food_loss_amount": "250",
      "mortgage_company": "ABC Mortgage"
    }}

    Report Text:
    {text[:12000]}
    \"\"\"

    # Try the requested model first
    try:
        return _request_json(client, model_id, prompt)
    except Exception as e:
        # Fallback to Claude Haiku if the requested model fails
        try:
            print(f"Model {model_id} failed, falling back to {FALLBACK_MODEL}")
            return _request_json(client, FALLBACK_MODEL, prompt)
        except Exception as fallback_e:
            # Final fallback to regex extraction
            error_msg = f"Primary model error: {str(e)}\\nFallback model error: {str(fallback_e)}"
            print(error_msg)
            return extract_kv_fallback(text)

def extract_kv_fallback(text):
    \"\"\"Extract claim fields with regular expressions when the LLM is unavailable.

    Args:
        text: Report text to search.

    Returns:
        A dict with one entry per known field; fields that cannot be located
        are set to "Not found".
    \"\"\"
    print("Using regex fallback extraction")
    data = {}

    # Heuristic patterns, searched case-insensitively with DOTALL
    patterns = {
        "claim_number": r"(?:Claim\\s*[#:]?|CL\\s*)\\s*([A-Z0-9-]+)",
        "policy_number": r"(?:Policy\\s*[#:]?|POL\\s*)\\s*([A-Z0-9-]+)",
        "insured_name": r"Insured:\\s*(.+?)(?=\\n|Address)",
        "date_of_loss": r"Date\\s*of\\s*Loss:\\s*(\\d{1,2}[/-]\\d{1,2}[/-]\\d{4})",
        "address_of_loss": r"Address\\s*of\\s*Loss:\\s*(.+?)(?=\\n|Date)",
        "damage_description": r"Damage\\s*Description:\\s*(.+?)(?=\\n|Roof)",
        "roof_damage_details": r"roof(?:.*?damage)?[.:]\\s*(.+?)(?=\\n|Fence)",
        "fence_damage": r"fence(?:.*?damage)?[.:]\\s*(.+?)(?=\\n|Pool)",
        "pool_damage": r"pool(?:.*?damage)?[.:]\\s*(.+?)(?=\\n|Food)",
        "food_loss_amount": r"food\\s*loss\\s*amount\\D*(\\d+)",
        "mortgage_company": r"mortgage\\s*company[\\s\\S]*?([A-Za-z0-9\\s&]+)"
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
            data[key] = match.group(1).strip()
        elif key in text:
            # Pattern failed but the literal key occurs: take the rest of its line
            start = text.find(key) + len(key)
            end = text.find("\\n", start)
            if end == -1:
                # Key is on the last line; without this the final character is lost
                end = len(text)
            data[key] = text[start:end].strip(": ")[:100]
        else:
            data[key] = "Not found"

    return data

def _replace_placeholders(xml_content, values):
    \"\"\"Replace {{key}} placeholders (with optional inner spaces) by XML-escaped values.\"\"\"
    for key, value in values.items():
        # Escape &, < and > so extracted text cannot break the document XML
        replacement = escape(str(value))
        variants = [
            f"{{{{{key}}}}}",       # {{key}}
            f"{{{{ {key} }}}}",     # {{ key }}
            f"{{{{{key} }}}}",      # {{key }}
            f"{{{{ {key}}}}}",      # {{ key}}
        ]
        for variant in variants:
            xml_content = xml_content.replace(variant, replacement)
    return xml_content

def fill_template(template_path, context_data):
    \"\"\"Populate a DOCX template by replacing placeholders in its document XML.

    Only placeholders stored contiguously in the XML are replaced.  All other
    archive entries are copied unchanged, in their original order.

    Args:
        template_path: Path of the DOCX template.
        context_data: Dict of field values; missing or None values fall back
            to the defaults below.

    Returns:
        An io.BytesIO positioned at the start of the filled DOCX.

    Raises:
        FileNotFoundError: If the archive contains no document.xml.
    \"\"\"
    # Set default values for missing fields
    defaults = {
        "claim_number": "Unknown",
        "insured_name": "Unknown",
        "address_of_loss": "Unknown",
        "date_of_loss": "Unknown",
        "policy_number": "Unknown",
        "damage_description": "No description provided",
        "roof_damage_details": "No roof damage details",
        "fence_damage": "No fence damage",
        "pool_damage": "No pool damage",
        "food_loss_amount": "0",
        "mortgage_company": "Unknown"
    }

    # Merge context with defaults; a None value (JSON null) must not override
    # a default, or it would be rendered as the text "None"
    merged_context = dict(defaults)
    merged_context.update(
        (key, value) for key, value in context_data.items() if value is not None
    )

    output = io.BytesIO()
    with zipfile.ZipFile(template_path, "r") as template_zip:
        names = template_zip.namelist()
        if "word/document.xml" in names:
            document_name = "word/document.xml"
        elif "document.xml" in names:
            # Alternative location
            document_name = "document.xml"
        else:
            raise FileNotFoundError("document.xml not found in DOCX file")

        # Copy entry by entry, reusing each ZipInfo so the original order and
        # per-entry compression are kept
        with zipfile.ZipFile(output, "w", zipfile.ZIP_DEFLATED) as filled_zip:
            for entry in template_zip.infolist():
                payload = template_zip.read(entry.filename)
                if entry.filename == document_name:
                    xml_content = payload.decode("utf-8")
                    payload = _replace_placeholders(xml_content, merged_context).encode("utf-8")
                filled_zip.writestr(entry, payload)

    # Return as BytesIO object, rewound for reading
    output.seek(0)
    return output
"""

# Write as UTF-8 explicitly (the encoding Python assumes for source files)
# instead of relying on the platform's default text encoding.
with open("modules.py", "w", encoding="utf-8") as f:
    f.write(modules_py_content)

# Step 3: Upload your sample files
from google.colab import files
import shutil

def _store_upload(uploaded, dest_dir, description):
    """Move the first uploaded file into *dest_dir* and return its filename.

    Args:
        uploaded: Mapping keyed by uploaded filename, as returned by
            ``files.upload()``.
        dest_dir: Directory the file is moved into; created if missing.
        description: Human-readable name of the expected file, used in the
            error message.

    Raises:
        RuntimeError: If nothing was uploaded (e.g. the dialog was cancelled).
    """
    if not uploaded:
        raise RuntimeError(f"No {description} was uploaded; re-run this cell and choose a file.")
    filename = next(iter(uploaded))
    os.makedirs(dest_dir, exist_ok=True)
    shutil.move(filename, f"{dest_dir}/{filename}")
    return filename


# Create directories
os.makedirs("templates", exist_ok=True)
os.makedirs("photo_reports", exist_ok=True)

# Upload template
print("Upload DOCX template file:")
uploaded_template = files.upload()
template_filename = _store_upload(uploaded_template, "templates", "DOCX template")

# Upload photo report
print("Upload PDF photo report:")
uploaded_report = files.upload()
report_filename = _store_upload(uploaded_report, "photo_reports", "PDF photo report")

print("Files uploaded successfully!")

# Step 4: Run Streamlit with Cloudflare Tunnel
import subprocess
import threading
import time
from IPython.display import display, Markdown

# Start Streamlit in background
def run_streamlit():
    """Run ``streamlit run app.py`` headlessly on port 8501; blocks until it exits."""
    command = [
        "streamlit", "run", "app.py",
        "--server.port", "8501",
        "--server.headless", "true",
    ]
    subprocess.run(command)

# run_streamlit blocks while the server runs, so it gets its own thread;
# daemon=True means the thread does not prevent the interpreter from exiting.
streamlit_thread = threading.Thread(target=run_streamlit, daemon=True)
streamlit_thread.start()
print("Streamlit server starting... (wait 15 seconds)")
# Fixed delay before continuing; the server's readiness is not checked
time.sleep(15)

# Start Cloudflare Tunnel
def run_cloudflared():
    """Run cloudflared as a tunnel to http://localhost:8501; blocks until it exits.

    The process's stdout and stderr are both written to ``tunnel.log`` in the
    current directory, which is truncated on each call.
    """
    # The context manager guarantees the log handle is closed once cloudflared
    # exits, instead of leaking an inline-opened file object.
    with open('tunnel.log', 'w') as log_file:
        subprocess.run(
            ["/usr/local/bin/cloudflared", "tunnel", "--url", "http://localhost:8501"],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

# run_cloudflared blocks while the tunnel runs, so it gets its own thread;
# daemon=True means the thread does not prevent the interpreter from exiting.
tunnel_thread = threading.Thread(target=run_cloudflared, daemon=True)
tunnel_thread.start()
print("Cloudflare Tunnel starting... (wait 20 seconds)")
# Fixed delay to give cloudflared time to write its output to tunnel.log
time.sleep(20)

# Extract the public URL
import re


def find_tunnel_url(log_text):
    """Return the first ``https://<name>.trycloudflare.com`` URL in *log_text*.

    Returns None when the text contains no such URL.
    """
    # The dots are escaped with a single backslash: inside a raw string a
    # doubled backslash would demand a literal backslash character in the
    # hostname, so the pattern could never match a real URL.
    url_match = re.search(r'https://[a-z0-9-]+\.trycloudflare\.com', log_text)
    return url_match.group(0) if url_match else None


try:
    # Poll the log for up to 30 seconds rather than reading it once after a
    # fixed delay, since the URL may be written later than expected.
    deadline = time.monotonic() + 30
    while True:
        with open('tunnel.log', 'r') as f:
            log_content = f.read()
        public_url = find_tunnel_url(log_content)
        if public_url or time.monotonic() >= deadline:
            break
        time.sleep(1)

    if public_url:
        display(Markdown(f"### [ACCESS YOUR STREAMLIT APP HERE]({public_url})"))
        print(f"Public URL: {public_url}")
    else:
        print("Public URL not found in logs. Here's the log content:")
        print(log_content)

except Exception as e:
    print(f"Error getting URL: {str(e)}")

# Remind the user that the app is only reachable while this session runs
_banner_rule = "=" * 80
print("\n" + _banner_rule)
for _banner_line in (
    "IMPORTANT: Keep this Colab tab open to maintain the connection",
    "The application will be accessible as long as this session is active",
):
    print(_banner_line)
print(_banner_rule)

Reading package lists...
Building dependency tree...
Reading state information...
libtesseract-dev is already the newest version (4.1.1-2.1build1).
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
--2025-06-25 06:41:01--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2025.6.1/cloudflared-linux-amd64 [following]
--2025-06-25 06:41:01--  https://github.com/cloudflare/cloudflared/releases/download/2025.6.1/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/015db4d3-519

Saving Eberl-GuideOne REPORT TEMPLATE_XM8.docx to Eberl-GuideOne REPORT TEMPLATE_XM8.docx
Upload PDF photo report:


Saving Photo Report - 3.pdf to Photo Report - 3.pdf
Files uploaded successfully!
Streamlit server starting... (wait 15 seconds)
Cloudflare Tunnel starting... (wait 20 seconds)
Public URL not found in logs. Here's the log content:
2025-06-25T06:42:42Z INF Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
2025-06-25T06:42:42Z INF Requesting new quick Tunnel on trycloudflare.com...
2025-06-25T06:42:46Z INF +-----------------------------------------------------------------