# Query Pipelining on Gradio

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases than the ones in the recorded output.
# NOTE(review): pytesseract presumably also needs the tesseract-ocr system binary for image OCR — confirm it is available on the runtime.
!pip install pymupdf python-docx pytesseract openai chromadb sentence-transformers langchain langchain_community langchain_chroma --upgrade gradio --upgrade google-search-results

Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting openai
  Downloading openai-1.82.0-py3-none-any.whl.metadata (25 kB)
Collecting chromadb
  Downloading chromadb-1.0.10-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fast

In [1]:
import os
from getpass import getpass

# Prompt with getpass (no echo) so the key is not written into the saved cell output,
# then expose it through the environment for the OpenAI-backed components created later.
OPENAI_API_KEY = getpass("🔑 Enter your OpenAI API key: ").strip()
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [2]:
from getpass import getpass

# Prompt with getpass (no echo) so the key is not written into the saved cell output.
# `os` is imported by the previous cell.
SERP_API_KEY = getpass("🔑 Enter your SERP API key: ").strip()
os.environ["SERPAPI_API_KEY"] = SERP_API_KEY

# GIS Components with Outcome Metrics Classification

In [None]:
import pandas as pd
import plotly.graph_objects as go
import gradio as gr

# Load county-level outcome data from Google Drive (requires the Drive mount above).
# The plotting callback below reads the 'County', 'INTPTLAT' and 'INTPTLON' columns
# plus one column per metric listed in OUTCOME_METRICS.
map_df = pd.read_csv("/content/drive/MyDrive/LLM/CalWorks/Vector Database/Asset/[GIS]ca_counties_outcome.csv")

# The 15 outcome metrics offered in the map's metric dropdown.
# Each name is used as a column of map_df; the plot treats the column's values as
# category labels (presumably "low" / "medium" / "high" — TODO confirm against the CSV).
OUTCOME_METRICS = [
    'Ancillary_Access_Rate',
    'Education_and_Skills_Dev_Rate',
    'Employment_Rate',
    'Engagement_Rate',
    'Exits_With_Earnings',
    'Family_Stabilization_to_WTW_Eng',
    'First_Activity_Rate',
    'OCAT_Appraisal_to_Next_Activity',
    'OCAT_Timeliness_Rate',
    'PostCWEmployment',
    'Reentry_After_Exit_with_Earning',
    'Reentry',
    'Sanction_Rate',
    'Sanction_Resolution_Rate',
    'Orientation_Attendance_2024'
]

# Sorted distinct county names found in the data; used as the choices of the county filter dropdown.
COUNTIES = sorted(map_df['County'].unique())

# Plot callback with optional county filter
def plot_outcome_map(metric, selected_counties):
    """Build a map of county markers coloured by their category for one metric.

    Args:
        metric: Name of a column in ``map_df`` whose values are category
            labels. "low", "medium" and "high" (case-insensitive) get fixed
            colours; any other label falls back to a default colour.
        selected_counties: Optional collection of county names. When empty or
            ``None`` every county in ``map_df`` is plotted.

    Returns:
        A ``plotly.graph_objects.Figure`` containing one marker trace per
        distinct non-null category present in the (filtered) data.
    """
    # The frame is only read, never mutated, so no defensive copy is needed.
    df = map_df
    if selected_counties:
        df = df[df['County'].isin(selected_counties)]

    color_map = {
        "low": "#d62728",     # red → poor outcome
        "medium": "#ff7f0e",  # orange → moderate
        "high": "#2ca02c",    # green → favorable
    }
    fallback_color = '#636EFA'

    fig = go.Figure()

    for label in df[metric].dropna().unique():
        mask = df[metric] == label
        # Coerce to text so numeric/boolean category codes do not crash on
        # .title()/.lower(); string labels behave exactly as before.
        label_text = str(label)
        display_name = label_text.title()
        fig.add_trace(go.Scattermapbox(
            lat=df.loc[mask, 'INTPTLAT'],
            lon=df.loc[mask, 'INTPTLON'],
            text=df.loc[mask, 'County'],
            mode='markers',
            name=display_name,
            marker=go.scattermapbox.Marker(
                size=8,
                color=color_map.get(label_text.strip().lower(), fallback_color)
            ),
            hovertemplate=(
                "<b>County</b>: %{text}<br>"
                f"<b>Category</b>: {display_name}"
            )
        ))

    fig.update_layout(
        # The style is set once here; the original also passed the equivalent
        # ``mapbox_style`` keyword, which was redundant.
        mapbox=dict(
            center={"lat": 37.5, "lon": -119.5},
            zoom=5.2,  # Or try zoom=4.5
            style="open-street-map"
        ),
        margin=dict(l=0, r=0, t=30, b=0),
        title=f"{metric.replace('_',' ').title()} Classification by County"
    )

    return fig


# Multi-modal extension

In [None]:
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image
import pytesseract
from docx import Document as WordDoc

def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded PDF, Word, Excel, CSV or image file.

    Args:
        uploaded_file: Either a filesystem path (``str`` / ``os.PathLike``, which
            is what ``gr.File(type="filepath")`` passes to callbacks) or a
            file-like object exposing a ``name`` attribute. ``None`` is allowed.

    Returns:
        The extracted text: ``""`` for ``None``; page or paragraph texts joined
        by blank lines for PDF/.docx; CSV text for spreadsheets; OCR output for
        images; or the string ``"❌ Unsupported file type."`` when the extension
        is not recognised.
    """
    import os

    if uploaded_file is None:
        return ""

    # Accept both plain paths and file-like uploads. The original code assumed a
    # file-like object and crashed with AttributeError when given a path string.
    if isinstance(uploaded_file, (str, os.PathLike)):
        source = os.fspath(uploaded_file)
        name = source.lower()
        is_path = True
    else:
        source = uploaded_file
        name = str(getattr(uploaded_file, "name", "") or "").lower()
        is_path = False

    # PDF: use PyMuPDF
    if name.endswith(".pdf"):
        if is_path:
            doc = fitz.open(source)
        else:
            doc = fitz.open(stream=source.read(), filetype="pdf")
        try:
            return "\n\n".join(page.get_text() for page in doc)
        finally:
            doc.close()  # release the document handle even if extraction fails

    # Word: .docx
    elif name.endswith(".docx"):
        doc = WordDoc(source)
        return "\n\n".join(para.text for para in doc.paragraphs if para.text.strip())

    # Excel / CSV: re-serialise the first sheet / the table as CSV text
    elif name.endswith((".xlsx", ".xls")):
        return pd.read_excel(source).to_csv(index=False)
    elif name.endswith(".csv"):
        return pd.read_csv(source).to_csv(index=False)

    # Image: OCR
    elif name.endswith((".jpg", ".jpeg", ".png")):
        with Image.open(source) as image:
            return pytesseract.image_to_string(image)

    return "❌ Unsupported file type."



# Final LLM Chat Interface on Gradio

In [None]:
# Revised QA system with county-wise comparison and map-reduce summarization
import os
import gradio as gr
import json
import ast
from datetime import datetime
from collections import defaultdict
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document as LCDocument
from langchain_community.utilities import SerpAPIWrapper

# Config
# Directory handed to Chroma as its persist_directory (lives on the mounted Drive).
PERSIST_DIR = "/content/drive/MyDrive/LLM/CalWorks/Vector Database/Output/chroma_sip_csa_db"
# Name of the Chroma collection that is opened from PERSIST_DIR.
COLLECTION_NAME  = "sip_csa_chunks"
# JSON file that load_query_log() reads and save_query_log() rewrites.
QUERY_LOG_PATH = "/content/drive/MyDrive/LLM/CalWorks/Vector Database/Output/query_log.json"
# Initial value of the "Top k chunks" slider in the UI.
TOP_K_DEFAULT = 5
# County names that extract_counties_from_query() looks for in a question.
CALIFORNIA_COUNTIES = ["Alameda", "Alpine", "Amador", "Butte", "Calaveras", "Colusa", "Contra Costa",
    "Del Norte", "El Dorado", "Fresno", "Glenn", "Humboldt", "Imperial", "Inyo", "Kern", "Kings", "Lake", "Lassen",
    "Los Angeles", "Madera", "Marin", "Mariposa", "Mendocino", "Merced", "Modoc", "Mono", "Monterey", "Napa",
    "Nevada", "Orange", "Placer", "Plumas", "Riverside", "Sacramento", "San Benito", "San Bernardino",
    "San Diego", "San Francisco", "San Joaquin", "San Luis Obispo", "San Mateo", "Santa Barbara", "Santa Clara",
    "Santa Cruz", "Shasta", "Sierra", "Siskiyou", "Solano", "Sonoma", "Stanislaus", "Sutter", "Tehama", "Trinity",
    "Tulare", "Tuolumne", "Ventura", "Yolo", "Yuba"]
# Uploaded-file text is truncated to this many characters before it is sent to the LLM.
MAX_CHAR_LIMIT = 80000

def extract_counties_from_query(query, counties=None):
    """Return the set of county names a question refers to.

    Args:
        query: The user's question.
        counties: Optional iterable of county names to look for; defaults to
            ``CALIFORNIA_COUNTIES``.

    Returns:
        Every county when the question contains one of the "all counties"
        trigger phrases; otherwise the counties whose name appears in the
        question as a whole word or phrase (case-insensitive).
    """
    import re

    if counties is None:
        counties = CALIFORNIA_COUNTIES

    lowered = query.lower()
    if any(w in lowered for w in ["all counties", "county-wise", "statewide", "compare counties"]):
        return set(counties)

    # Match on word boundaries: plain substring tests produced false positives
    # such as "Lake" in "lakeside", "Marin" in "marine" or "Kern" in "kernel".
    return {
        county for county in counties
        if re.search(rf"(?<!\w){re.escape(county.lower())}(?!\w)", lowered)
    }

def load_query_log():
    """Read the persisted query log from QUERY_LOG_PATH.

    Returns:
        The parsed JSON content of the log file, or an empty dict when the
        file does not exist yet.
    """
    try:
        log_file = open(QUERY_LOG_PATH, "r")
    except FileNotFoundError:
        # First run: nothing has been logged yet.
        return {}
    with log_file:
        return json.load(log_file)

def save_query_log(counter):
    """Overwrite QUERY_LOG_PATH with ``counter`` serialised as indented JSON.

    Args:
        counter: JSON-serialisable mapping holding the whole query log.
    """
    with open(QUERY_LOG_PATH, "w") as log_file:
        serialised = json.dumps(counter, indent=2)
        log_file.write(serialised)

# Embeddings and Retriever
# Keys are re-read from the environment (set by the input cells at the top of the notebook);
# os.getenv returns None when a variable is missing.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")
# Embedding function handed to Chroma.
# NOTE(review): presumably must match the model used when the collection was built — confirm.
openai_ef = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=OPENAI_API_KEY)
# Open the Chroma collection stored under PERSIST_DIR.
vectorstore = Chroma(collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIR, embedding_function=openai_ef)
# Retriever used by query_interface() to fetch report chunks for a question.
retriever = vectorstore.as_retriever()
# Web search wrapper used only when the user enables "Include Web Search".
search_tool = SerpAPIWrapper(serpapi_api_key=SERPAPI_API_KEY)

# LLM + Chains
# One chat model instance (temperature 0.0) is shared by the summariser and the QA chain.
llm = ChatOpenAI(temperature=0.0, model_name="gpt-4o")
# Map-reduce summarisation chain used by summarize_with_map_reduce().
summarizer = load_summarize_chain(llm, chain_type="map_reduce")

def summarize_with_map_reduce(docs):
    """Summarise retrieved chunks with the map-reduce chain.

    Each chunk is rewrapped as a new document whose text starts with a
    numbered source header (county, report type, section, page — taken from
    the chunk's metadata, with placeholder values when a key is absent)
    followed by the stripped chunk text.

    Args:
        docs: Sequence of documents exposing ``metadata`` and ``page_content``.

    Returns:
        Whatever ``summarizer.run`` returns for the rewrapped documents.
    """
    def _with_header(position, chunk):
        # Build "[n] 📍 county | report type | Section: … | Page …" above the chunk text.
        info = chunk.metadata
        header = (
            f"[{position}] 📍 {info.get('county', 'Unknown')} | {info.get('report_type', 'Unknown')} "
            f"| Section: {info.get('section', 'Unknown Section')} | Page {info.get('page', '?')}"
        )
        return LCDocument(page_content=f"{header}\n{chunk.page_content.strip()}")

    labelled = [_with_header(number, chunk) for number, chunk in enumerate(docs, start=1)]
    return summarizer.run(labelled)

# Prompt used for both report Q&A and uploaded-file analysis. It has four slots:
#   context      – summarised report excerpts, or the uploaded file's text
#   external     – optional web-search results ("" when not used)
#   user_context – short note about what the query targets
#   question     – the user's question
qa_prompt_template = PromptTemplate(
    input_variables=["context", "question", "external", "user_context"],
    template="""
You are analyzing CalWORKs CSA and SIP county reports.

You may use the following sources:
- Internal report excerpts (Context)
- Optional external web information (External Info)
- User-supplied input (User Context)

Answer the question based on the provided information. If relevant, incorporate insights from external or user context.
If no reliable answer is available, say so directly without guessing.

Context:
{context}

External Info:
{external}

User Context:
{user_context}

Question: {question}

Answer:
"""
)

# Chain that fills the prompt and calls the shared LLM; callers read the "text" key of its result.
qa_chain = LLMChain(llm=llm, prompt=qa_prompt_template)
# In-memory copy of the persisted query log; query_interface() adds entries and saves it back.
query_log = load_query_log()

def analyze_uploaded_file(uploaded_file, query=""):
    """Answer a question about (or summarise) an uploaded file with the QA chain.

    Args:
        uploaded_file: The value delivered by the file-upload component, or
            ``None`` when nothing was uploaded.
        query: Optional question about the file; when blank (or ``None``) a
            default "summarise the key points" question is used.

    Returns:
        The LLM's answer, or a message starting with "❌" when no file was
        given, the file could not be read, its type is unsupported, no text
        could be extracted, or the LLM call failed.
    """
    if uploaded_file is None:
        return "❌ Please upload a file."

    # Extraction touches the filesystem and third-party parsers; report failures
    # in the UI instead of letting the callback raise.
    try:
        file_text = extract_text_from_file(uploaded_file)
    except Exception as e:
        return f"❌ Failed to read file: {str(e)}"

    if not file_text or not file_text.strip():
        return "❌ Could not extract meaningful text from the file."

    # The extractor signals an unknown extension with this sentinel; return it
    # to the user rather than sending it to the LLM as if it were document text.
    if file_text.strip() == "❌ Unsupported file type.":
        return file_text.strip()

    file_text = file_text[:MAX_CHAR_LIMIT]
    query = (query or "").strip()
    if not query:
        query = "Summarize the key points or findings in the uploaded document."

    try:
        response = qa_chain.invoke({
            "context": clean_unicode(file_text),
            "question": clean_unicode(query),
            "external": "",
            "user_context": "This is a file upload with optional query input."
        })["text"]
        return response.strip()
    except Exception as e:
        return f"❌ Failed to process file with LLM: {str(e)}"


def clean_unicode(text: str) -> str:
    """Return ``text`` without the characters that cannot be encoded as UTF-8.

    The string is round-tripped through UTF-8 with encoding errors ignored, so
    unencodable code points (such as lone surrogates) are dropped and
    everything else is kept unchanged.
    """
    utf8_bytes = text.encode(encoding="utf-8", errors="ignore")
    return str(utf8_bytes, "utf-8")

def query_interface(query, external_query, top_k, use_external, force_all_counties=False):
    """Answer a question from the report collection, optionally adding web search.

    Steps: retrieve chunks, optionally keep only those from counties named in
    the question, summarise them, optionally run a web search, ask the QA
    chain, then record the query in the persisted log.

    Args:
        query: The user's question.
        external_query: Search string for the optional web search.
        top_k: Number of chunks to use. May arrive as a float from the slider,
            so it is coerced to an integer of at least 1.
        use_external: Whether to run the web search.
        force_all_counties: When true, skip the per-county filtering even if
            the question names specific counties.

    Returns:
        A 3-tuple ``(answer_text, top_queries_text, external_info_text)``.
        ``answer_text`` holds the answer followed by the excerpts used, or a
        short message when the question is empty or nothing was retrieved.
    """
    query = (query or "").strip()
    external_query = (external_query or "").strip()
    if not query:
        return "Please enter a question.", get_top_queries(), ""

    # Slider values can be floats; slicing with a float raises TypeError.
    top_k = max(1, int(top_k))

    mentioned = extract_counties_from_query(query)
    docs = retriever.get_relevant_documents(query, k=top_k * 2)

    if mentioned and not force_all_counties:
        # Keep only chunks from the named counties; tolerate a missing/None county.
        docs = [
            doc for doc in docs
            if str(doc.metadata.get("county") or "").title() in mentioned
        ][:top_k * 2]
    else:
        docs = docs[:top_k]

    if not docs:
        return "❌ No relevant documents found.", get_top_queries(), ""

    summarized = summarize_with_map_reduce(docs)
    target_counties = sorted({str(doc.metadata.get('county') or 'Unknown') for doc in docs})
    user_context = f"Query targets: {', '.join(target_counties)}"

    external_info = ""

    if use_external and external_query:
        try:
            raw = search_tool.run(external_query)

            # Convert raw string to list if it looks like one
            if isinstance(raw, str) and raw.strip().startswith("["):
                try:
                    raw = ast.literal_eval(raw)
                except Exception:
                    pass  # If parsing fails, fallback to treating it as raw string

            if isinstance(raw, list):
                cleaned = [
                    clean_unicode(s).strip(" '\"\n")
                    for s in raw if isinstance(s, str) and s.strip()
                ]
                external_info = "\n".join(f"• {s}" for s in cleaned[:20])
            else:
                external_info = clean_unicode(str(raw))[:8000].strip()

        except Exception as e:
            # Best effort: a failed search is reported in the External Info box.
            external_info = f"External search failed: {e}"

    try:
        response = qa_chain.invoke({
            "context": clean_unicode(summarized),
            "question": clean_unicode(query),
            "external": external_info,
            "user_context": user_context
        })["text"]
    except Exception as e:
        response = f"❌ QA failed: {str(e)}"

    timestamp = datetime.now().isoformat()
    query_log[timestamp] = {
        "query": query,
        "external_query": external_query,
        "counties": sorted(list(mentioned)),
        "used_external": use_external,
        "is_countywise": force_all_counties or len(mentioned) > 3,
        "answer_preview": response[:200]
    }
    try:
        save_query_log(query_log)
    except OSError as e:
        # The answer is already computed; a log-write failure must not discard it.
        print(f"⚠️ Could not save query log: {e}")

    excerpts = "\n\n---\n\n".join([
        f"[{i+1}] 📍 {doc.metadata.get('county', 'Unknown')} | {doc.metadata.get('report_type', 'Unknown')} | Section: {doc.metadata.get('section', 'Unknown')} | Page {doc.metadata.get('page', '?')}\n{doc.page_content.strip()}"
        for i, doc in enumerate(docs)
    ])

    return response.strip() + "\n\n📚 Used Excerpts:\n\n" + excerpts, get_top_queries(), external_info.strip()


def get_top_queries(n=20):
    """Format the most frequently asked questions from the query log.

    Args:
        n: Maximum number of distinct questions to list.

    Returns:
        One numbered line per question ("rank. question — countx"), most
        frequent first, or "No queries yet." when there is nothing to list.
    """
    counts = {}
    for record in query_log.values():
        question = record["query"]
        counts[question] = counts.get(question, 0) + 1

    # Stable sort: questions with equal counts keep their first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    lines = [
        f"{rank}. {question} — {count}x"
        for rank, (question, count) in enumerate(ranked[:n], start=1)
    ]
    if not lines:
        return "No queries yet."
    return "\n".join(lines)

# Gradio UI
def build_ui():
    """Assemble the Gradio Blocks app.

    Layout: a question/answer column, a file-upload analysis column, a
    "Top Queries" column, and below them a map of outcome metrics driven by a
    metric dropdown and an optional county filter.

    Returns:
        The ``gr.Blocks`` instance, ready to be launched.
    """
    with gr.Blocks() as demo:
        gr.Image("/content/drive/MyDrive/LLM/CalWorks/Vector Database/Asset/cdss-logo.png", show_label=False, container=False, width=500)
        gr.Markdown("### 🐻🌲 CalWORKs County QA System")
        gr.Markdown("""📌 **Prompt Tips for County-Wise and Single-County Questions**

You can ask:
- *What is Santa Clara doing to reduce exits and re-entries?*
- *Compare child care availability across counties.*
- *Which counties cite transportation as a major barrier?*

Use phrases like **"county-wise"**, **"statewide"**, or **specific county names** to guide the system.
Avoid vague yes/no questions — ask for comparisons, summaries, or trends.
Include **keywords like** "compare counties", "county-wise", or "statewide" to ensure multi-county retrieval.
""")

        with gr.Row():
            with gr.Column(scale=3):
                qbox = gr.Textbox(label="Ask a Question", placeholder="e.g., Which counties improved work participation?")
                extbox = gr.Textbox(label="External Search Query")
                # step=1 keeps the slider on whole numbers; the value is used as a chunk count.
                topk = gr.Slider(1, 50, value=TOP_K_DEFAULT, step=1, label="Top k chunks")
                extflag = gr.Checkbox(label="Include Web Search", value=False)
                # Named ask_button (not `go`) so it does not shadow the plotly `go` alias.
                ask_button = gr.Button("Answer")
                answer = gr.Textbox(label="Answer", lines=10)
                extout = gr.Textbox(label="External Info", lines=10, max_lines=20, show_copy_button=True)
            with gr.Column(scale=3):
                gr.Markdown("### 📎 Upload a File")
                file_upload = gr.File(
                    label="Upload File (PDF, Word, Excel, Image)",
                    # ".jpeg" added: the text extractor already accepts it.
                    file_types=[".pdf", ".docx", ".xlsx", ".xls", ".csv", ".jpg", ".jpeg", ".png"],
                    type="filepath"
                )
                query_input = gr.Textbox(label="Optional Question", placeholder="Leave blank to summarize key findings")
                analyze_button = gr.Button("Analyze File")
                file_output = gr.Textbox(label="LLM Summary", lines=10, max_lines=20)
            with gr.Column(scale=1):
                freq = gr.Textbox(label="Top Queries", value=get_top_queries(), lines=20)

        gr.Markdown("### 🗺️ Map of CalWORKs Outcome Metrics by County")
        with gr.Row():
            with gr.Column(scale=1):
                metric_dropdown = gr.Dropdown(
                    choices=OUTCOME_METRICS,
                    value=OUTCOME_METRICS[0],
                    label="Select Outcome Metric"
                )
                county_dropdown = gr.Dropdown(
                    choices=COUNTIES,
                    multiselect=True,
                    label="Select Counties (optional)"
                )
            with gr.Column(scale=4):
                map_plot = gr.Plot(label="Classification by County")

        ask_button.click(query_interface, [qbox, extbox, topk, extflag], [answer, freq, extout])
        analyze_button.click(analyze_uploaded_file, inputs=[file_upload, query_input], outputs=[file_output])

        # Draw the map once on page load, then redraw whenever either dropdown changes.
        map_inputs = [metric_dropdown, county_dropdown]
        demo.load(plot_outcome_map, inputs=map_inputs, outputs=[map_plot])
        metric_dropdown.change(plot_outcome_map, inputs=map_inputs, outputs=[map_plot])
        county_dropdown.change(plot_outcome_map, inputs=map_inputs, outputs=[map_plot])

        gr.Image("/content/drive/MyDrive/LLM/CalWorks/Vector Database/Asset/calworks_logo.jpeg", show_label=False, container=False, width=1600)

    return demo

# Build the app and start serving it.
# debug=True keeps this cell running so errors and logs appear in its output.
# share=True publishes a temporary public gradio.live URL: anyone with the link can use the app
# (and the API keys entered above) until the link expires.
demo = build_ui()
demo.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://39c067c3ab84bb20d2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 2191, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 1702, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
           ^^^^^