# ANC Vertex: Wikipedia + FAISS + Vertex AI + StateGraph

This notebook combines two Vertex AI workflows in one place:

- Baseline notebook pipeline: Wikipedia -> chunking -> FAISS -> Vertex-hosted Q&A with `ChatVertexAI`
- Canonical orchestration diagram + run flow via shared `naturalist_companion.stategraph_shared`

Provider mapping used here:
- `ChatOllama` -> `ChatVertexAI`
- `OllamaEmbeddings` -> `VertexAIEmbeddings`
- Same `WikipediaLoader` + `FAISS` retrieval flow


## Prerequisites (Run First)

### Google Cloud / Vertex AI setup

- Authenticate locally before running notebook cells: `gcloud auth application-default login`
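- Set project and region env vars as needed: `GOOGLE_CLOUD_PROJECT` and `VERTEX_LOCATION` (when `VERTEX_LOCATION` is unset, the notebook falls back to `GOOGLE_CLOUD_REGION`, then to `us-central1`).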
- Ensure Vertex AI API is enabled for your project.
- Optional: set `GCP_PROJECT` if you use that env var name instead of `GOOGLE_CLOUD_PROJECT`.

### Python dependency notes

- In a fresh kernel, uncomment and run the `%pip install` lines in the next code cell to install notebook dependencies.
- If you hit `TqdmWarning: IProgress not found`, run `%pip install -q ipywidgets jupyterlab_widgets` and restart the kernel.
- If you hit macOS OpenMP kernel crashes, launch Jupyter with `KMP_DUPLICATE_LIB_OK=TRUE`.

### IDE notebook stability tips

- Ensure the notebook kernel points to this project's virtualenv interpreter, i.e. `<repo-root>/.venv/bin/python` (for example `/Users/ryan/Developer/naturalist-companion/.venv/bin/python`).
- After dependency changes, restart the kernel and then run all cells from the top.
- Keep one notebook kernel active at a time while testing provider configs.


In [None]:
# Uncomment in fresh environments:

# %pip install -q -r ../requirements-gcp-dev.txt


# %pip install -q ipywidgets

# %pip install -q jupyterlab_widgets


In [None]:
# Databricks-only setup (safe no-op outside Databricks)
import os
from pathlib import Path

# Defaults that hold outside Databricks. The preflight cell reads DBX_REPO_SRC_PATH and
# DBX_REPOS_ROOT through globals(), so these names must keep their spelling.
DBX_NOTEBOOK_PATH = ""
DBX_REPO_SRC_PATH = ""
DBX_REPOS_ROOT = Path("/Workspace/Repos")

# First signal: a non-blank DATABRICKS_RUNTIME_VERSION environment variable.
IS_DATABRICKS_RUNTIME = bool(str(os.environ.get("DATABRICKS_RUNTIME_VERSION", "")).strip())

# Second signal: a notebook path obtained through `dbutils`. `dbutils` is not defined in
# this notebook, so any failure here (including NameError) simply keeps the defaults.
try:
    notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()  # type: ignore[name-defined]
    DBX_NOTEBOOK_PATH = str(notebook_path)
    IS_DATABRICKS_RUNTIME = True
except Exception:
    pass

# For a notebook path shaped like <prefix>/notebooks/<name>, point at /Workspace<prefix>/src.
if "/notebooks/" in DBX_NOTEBOOK_PATH:
    _dbx_repo_root = DBX_NOTEBOOK_PATH.split("/notebooks/", 1)[0]
    DBX_REPO_SRC_PATH = "/Workspace" + _dbx_repo_root + "/src"

if not IS_DATABRICKS_RUNTIME:
    print("[databricks-only] Not running on Databricks; this cell is a no-op.")
else:
    print("[databricks-only] Runtime context detected.")
    if DBX_NOTEBOOK_PATH:
        print(f"[databricks-only] notebook_path={DBX_NOTEBOOK_PATH}")
    if DBX_REPO_SRC_PATH:
        print(f"[databricks-only] repo_src_path={DBX_REPO_SRC_PATH}")


In [None]:
#######################################################################################################

###### Dependency Preflight (Fail Fast)                                                           ######

#######################################################################################################


import importlib
import os
import sys
from pathlib import Path


def _candidate_src_paths():
    """Return candidate ``src`` directories for ``naturalist_companion``, highest priority first.

    Order of candidates:
      1. the ``NATURALIST_COMPANION_SRC`` environment variable, when non-blank;
      2. the ``DBX_REPO_SRC_PATH`` notebook global, when non-blank;
      3. ``src`` under the working directory, its parent, and its grandparent;
      4. every ``*/*/src`` directory under the repos root (``DBX_REPOS_ROOT`` global,
         default ``/Workspace/Repos``) that contains a ``naturalist_companion`` entry.

    Duplicates, compared by string form, are dropped and the first occurrence wins.
    Only the repos-root scan touches the filesystem; the other entries are returned
    whether or not they exist.
    """
    notebook_globals = globals()
    ordered = []

    explicit_src = os.environ.get("NATURALIST_COMPANION_SRC", "").strip()
    if explicit_src:
        ordered.append(Path(explicit_src))

    # Set by the Databricks-only setup cell; absent or empty elsewhere.
    databricks_src = str(notebook_globals.get("DBX_REPO_SRC_PATH", "") or "").strip()
    if databricks_src:
        ordered.append(Path(databricks_src))

    here = Path.cwd()
    ordered += [base / "src" for base in (here, here.parent, here.parent.parent)]

    scan_root = Path(str(notebook_globals.get("DBX_REPOS_ROOT", Path("/Workspace/Repos"))))
    if scan_root.exists():
        ordered.extend(
            package_dir.parent
            for package_dir in scan_root.glob("*/*/src/naturalist_companion")
        )

    # dict preserves insertion order, and setdefault keeps the first path seen per key.
    unique_by_text = {}
    for candidate in ordered:
        unique_by_text.setdefault(str(candidate), candidate)
    return list(unique_by_text.values())


# Make the highest-priority checkout that actually contains the package importable.
for src_path in _candidate_src_paths():
    if not (src_path / "naturalist_companion").exists():
        continue
    # Stop at the first real match even when it is already on sys.path. Falling through
    # (as happens when this cell is re-run) would insert a lower-priority checkout at
    # index 0, where it would shadow the preferred one.
    if str(src_path) not in sys.path:
        sys.path.insert(0, str(src_path))
    break


# Each entry is (human-readable label, module names to try in order); the first module
# that imports satisfies the check.
CHECKS = [
    ("FAISS backend", ["faiss"]),
    ("LangGraph runtime", ["langgraph"]),
    ("Wikipedia loader module", ["langchain_community.document_loaders", "langchain.document_loaders"]),
    ("Text splitter module", ["langchain_text_splitters", "langchain.text_splitter"]),
    ("LangChain vectorstore module", ["langchain_community.vectorstores"]),
    ("LangChain in-memory docstore module", ["langchain_community.docstore.in_memory"]),
    ("Vertex integration", ["langchain_google_vertexai"]),
    ("Google AI Platform SDK", ["google.cloud.aiplatform"]),
    ("Naturalist stategraph module", ["naturalist_companion.stategraph_shared"]),
]
resolved = {}
missing = []

for label, module_candidates in CHECKS:
    last_error = None
    for module_name in module_candidates:
        try:
            importlib.import_module(module_name)
        except ImportError as import_error:
            # Remember only the most recent failure for the report below.
            last_error = f"{type(import_error).__name__}: {import_error}"
            continue
        resolved[label] = module_name
        break
    else:
        # No candidate imported.
        missing.append((label, module_candidates, last_error))

if missing:
    # When other dependencies are missing too, leave the stategraph entry out of the
    # report (presumably its failure is a knock-on effect of those).
    dependency_gaps = [entry for entry in missing if entry[0] != "Naturalist stategraph module"]
    if dependency_gaps:
        missing = dependency_gaps

    lines = ["[preflight] Missing required notebook dependencies:"]
    for label, module_candidates, last_error in missing:
        lines.append(f"- {label}: expected one of {', '.join(module_candidates)}")
        if last_error:
            lines.append(f"  last error: {last_error}")

    lines.extend(
        [
            "",
            "Run the install cell above, restart the kernel, and retry.",
            "Install hint: %pip install -q -r ../requirements-gcp-dev.txt",
            "Set NATURALIST_COMPANION_SRC (or DBX_REPO_SRC_PATH in the Databricks-only cell) if needed.",
        ]
    )
    raise ModuleNotFoundError("\n".join(lines))

print("[preflight] Dependency check passed.")
for label, module_name in resolved.items():
    print(f"  - {label}: {module_name}")


In [None]:
#######################################################################################################

###### Python Package Imports for this notebook                                                  ######

#######################################################################################################


import os
import json
import re
import warnings
from pathlib import Path
from typing import Literal
from threading import Event, Thread
from time import perf_counter

from IPython.display import Image, Markdown, display

# Mitigate common macOS OpenMP duplicate-library crashes in notebook kernels.
# setdefault leaves any value the user already exported (e.g. when launching Jupyter) untouched.
# NOTE(review): when cells run top to bottom, the preflight cell has already imported `faiss`
# before this line executes — confirm the variable still takes effect when set this late,
# or move it ahead of the preflight imports.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

# Silence noisy tqdm widget warning in IDE notebooks when rich progress widgets are unavailable.
# `message` is a regular expression; the leading `.*` lets it match anywhere in the warning text.
warnings.filterwarnings("ignore", message=".*IProgress not found.*")


# LangChain moved WikipediaLoader in newer releases; keep backward compatibility.
try:
    from langchain_community.document_loaders import WikipediaLoader
except ImportError:
    from langchain.document_loaders import WikipediaLoader

try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS
from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings


def _start_heartbeat(task_label: str, every_s: float = 8.0):
    """Start a daemon thread that prints a progress line every ``every_s`` seconds.

    Returns the ``threading.Event`` that controls the thread; call ``.set()`` on it to
    stop the heartbeat. The reported elapsed time is the sum of completed intervals,
    not a wall-clock measurement.
    """
    stop_signal = Event()

    def _report_progress():
        seconds_waited = 0.0
        while True:
            # wait() returns True as soon as the event is set, which ends the thread.
            if stop_signal.wait(every_s):
                return
            seconds_waited += every_s
            print(f"[{task_label}] still running... {seconds_waited:.0f}s elapsed")

    Thread(target=_report_progress, daemon=True).start()
    return stop_signal


from naturalist_companion.wikipedia_tools import display_wikipedia_images_for_pages


# Optional import: the StateGraph cells check STATEGRAPH_AVAILABLE and skip themselves
# when the shared module could not be imported.
STATEGRAPH_AVAILABLE = False
_stategraph_import_error = None
try:
    from naturalist_companion.stategraph_shared import (
        build_stategraph_app,
        run_i81_eval_harness,
        run_stategraph,
    )
except Exception as stategraph_import_error:
    # Keep the exception for later inspection; the `as` name is unbound after this block.
    _stategraph_import_error = stategraph_import_error
    print(f"[stategraph] import error: {type(stategraph_import_error).__name__}: {stategraph_import_error}")
    print("[stategraph] StateGraph cells can be skipped until this import works.")
else:
    STATEGRAPH_AVAILABLE = True


# Informational probe only: report whether ipywidgets is importable in this kernel.
try:
    import ipywidgets as _ipywidgets
except ImportError:
    print("[env] ipywidgets not found in this kernel. Run `%pip install -q ipywidgets` and restart the kernel.")
else:
    print(f"[env] ipywidgets detected: v{_ipywidgets.__version__}")


def _first_nonblank_env(*names: str, default: str = "") -> str:
    """Return the stripped value of the first named env var that is non-blank, else ``default``."""
    for name in names:
        value = os.environ.get(name, "").strip()
        if value:
            return value
    return default


def _show_vertex_auth_status() -> None:
    """Print the Vertex project/location that model clients will be built with.

    Reads the values from ``_vertex_client_kwargs`` so the printed status cannot drift
    from what is actually passed to the clients. Prints a hint instead when no project
    is configured.
    """
    client_kwargs = _vertex_client_kwargs()
    project = client_kwargs.get("project", "")
    location = client_kwargs.get("location", "")
    if project:
        print(f"[env] Vertex config detected: project={project}, location={location}")
    else:
        print("[env] Vertex project is not set. Set GOOGLE_CLOUD_PROJECT (or GCP_PROJECT) before model calls.")


def _vertex_client_kwargs() -> dict:
    """Build the ``project``/``location`` keyword arguments shared by the Vertex clients.

    Project comes from ``GOOGLE_CLOUD_PROJECT``, then ``GCP_PROJECT``; the key is omitted
    when neither is set. Location comes from ``VERTEX_LOCATION``, then
    ``GOOGLE_CLOUD_REGION``, then ``"us-central1"``.

    A variable that is set but blank is treated as unset, so an empty
    ``GOOGLE_CLOUD_PROJECT`` no longer hides a valid ``GCP_PROJECT`` (and likewise for
    the location variables).
    """
    kwargs = {}
    project = _first_nonblank_env("GOOGLE_CLOUD_PROJECT", "GCP_PROJECT")
    location = _first_nonblank_env("VERTEX_LOCATION", "GOOGLE_CLOUD_REGION", default="us-central1")
    if project:
        kwargs["project"] = project
    if location:
        kwargs["location"] = location
    return kwargs


def _build_vertex_embeddings(model_name: str):
    """Create a ``VertexAIEmbeddings`` client for ``model_name``.

    The constructor is first called with ``model_name=``; if that raises ``TypeError``
    it is retried with ``model=`` (presumably to cope with differing library versions).
    Project/location keyword arguments come from ``_vertex_client_kwargs``.
    """
    shared_kwargs = _vertex_client_kwargs()
    try:
        client = VertexAIEmbeddings(model_name=model_name, **shared_kwargs)
    except TypeError:
        client = VertexAIEmbeddings(model=model_name, **shared_kwargs)
    return client


def _build_vertex_chat(model_name: str, temperature: float):
    """Create a ``ChatVertexAI`` client for ``model_name`` at the given ``temperature``.

    The constructor is first called with ``model_name=``; if that raises ``TypeError``
    it is retried with ``model=`` (presumably to cope with differing library versions).
    Project/location keyword arguments come from ``_vertex_client_kwargs``.
    """
    shared_kwargs = _vertex_client_kwargs()
    try:
        client = ChatVertexAI(model_name=model_name, temperature=temperature, **shared_kwargs)
    except TypeError:
        client = ChatVertexAI(model=model_name, temperature=temperature, **shared_kwargs)
    return client


# Print the detected Vertex project/location once, when this cell runs.
_show_vertex_auth_status()


#######################################################################################################

###### Config (Define LLMs, Embeddings, Vector Store, Data Loader specs)                         ######

#######################################################################################################


# DataLoader Config
# Stage 1 joins these terms with spaces into a single Wikipedia search query.
query_terms = [
    "roadcut",
    "geology",
    "sedimentary rock",
    "stratigraphy",
]
max_docs = 12  # Realistic retrieval setting for richer context.

# Stage 2 chunking + batching controls (keep small for interactive runs).
# chunk_size / chunk_overlap are passed to RecursiveCharacterTextSplitter;
# embedding_batch_size is the number of chunks added to the FAISS index per step.
chunk_size = 1200
chunk_overlap = 150
embedding_batch_size = 8


# Retriever Config
# k is used for the Stage 2 similarity search and as top_k when building answer context.
k = 4
EMBEDDING_MODEL = "text-embedding-005"


# LLM Config
# NOTE(review): confirm this model ID is still served for your Vertex project/region.
LLM_MODEL = "gemini-1.5-flash"
# Also used, capped at 0.3, for the place-image query planner.
TEMPERATURE = 0.25


# Response style controls (Roadside Geology audience: curious drivers, practical field learners).
# NOTE(review): RESPONSE_TONE is not referenced by any cell visible in this notebook.
RESPONSE_TONE = "field-guide"
# Interpolated into the Stage 3 voice instructions as the per-section bullet cap.
MAX_BULLETS_PER_SECTION = 4


# Local artifact settings
# Subdirectory name for the saved index, under ANC_FAISS_DIR or the default home-directory location.
FAISS_NAMESPACE = "anc_gcp"


# StateGraph Config
STATEGRAPH_PROVIDER: Literal["vertex"] = "vertex"
STATEGRAPH_LIVE_MAX_DOCS = 16
# Settings shared by the single-run and eval-harness configs below.
STATEGRAPH_COMMON_CONFIG = {
    "max_retrieval_attempts": 3,
    "citation_coverage_threshold": 0.80,
    "runtime_mode": "realistic",
    "llm_temperature": TEMPERATURE,
    "llm_model": LLM_MODEL,
    "live_max_docs": STATEGRAPH_LIVE_MAX_DOCS,
}
# The two configs differ only in artifact_root.
STATEGRAPH_RUN_CONFIG = {"artifact_root": "out/stategraph/notebook_runs", **STATEGRAPH_COMMON_CONFIG}
STATEGRAPH_EVAL_CONFIG = {"artifact_root": "out/stategraph/notebook_eval", **STATEGRAPH_COMMON_CONFIG}


## Query Prompt (Edit This Cell)

Use the next code cell to set the active question(s).

Question types this notebook is designed for:
- Detour geology: legal pull-offs or short walks near a route segment
- Safety-first prompts: where to stop and what to avoid roadside
- Route constraints: city/exit anchors plus max detour minutes
- Beginner field interpretation: what visual clues to look for and why they matter

Tip: Include your nearest city or exit and your max detour time to improve stop recommendations.


In [None]:
# Primary prompt: drives place-image query generation, the Stage 2 similarity search,
# and the Stage 3 answer.
example_question = "I am on I-81 near Hagerstown with a 30-minute detour. Where can I safely stop to observe folded Valley-and-Ridge strata, and what exactly should I look for?"

# Additional prompts answered one by one in Stage 3b.
example_questions = [
    "I am driving I-81 near Bristol, TN. Give me two legal pull-off stops where I can see clear sedimentary layering, and tell me exactly what to look for.",
    "Near I-81 between Winchester and Strasburg, where can I safely stop to see Valley-and-Ridge structure, and what field clues confirm folding?",
    "I have 45 minutes near Hagerstown, MD. What roadside geology stop gives the best payoff for a beginner, and what story does the outcrop tell?",
    "Along I-81 in the Shenandoah Valley, point me to a short-walk stop to compare rock type and landform, then explain why that match matters.",
    "On an I-81 drive day, suggest one stop where I can observe evidence of ancient seas or sediment transport, with specific visual clues.",
]

# StateGraph run can use the same prompt by default; edit independently if desired.
stategraph_question = example_question

def _parse_place_query_list(raw: str, max_items: int = 6) -> list[str]:
    """Extract a clean list of place queries from a model reply.

    The reply is narrowed to its outermost ``[...]`` span when one exists and parsed as
    JSON. If that fails, each non-empty line becomes one entry, with leading/trailing
    list markers (spaces, hyphens, bullets, tabs) stripped.

    Args:
        raw: Model output; any value is coerced with ``str`` and falsy input yields ``[]``.
        max_items: Maximum number of entries to return.

    Returns:
        Unique, non-empty entries in first-seen order, skipping any that mention
        "interstate", "highway", "i-81" or "route ". JSON that is not a list yields ``[]``.
    """
    text = str(raw or "").strip()
    if not text:
        return []
    match = re.search(r"\[[\s\S]*\]", text)
    if match:
        text = match.group(0)
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # "\u2022" is the bullet character. The previous literal held its mis-encoded
        # bytes, so bulleted lines kept their bullet.
        data = [ln.strip(" -\u2022\t") for ln in text.splitlines() if ln.strip()]

    limit = int(max_items)
    out: list[str] = []
    for item in data if isinstance(data, list) else []:
        value = str(item or "").strip()
        if not value:
            continue
        low = value.lower()
        # Road names are excluded; the callers want natural landscape queries.
        if any(token in low for token in ("interstate", "highway", "i-81", "route ")):
            continue
        if value not in out:
            out.append(value)
        if len(out) >= limit:
            break
    return out


def _generate_place_image_queries_with_model(question: str, max_items: int = 6) -> list[str]:
    """Ask the chat model for natural-landscape Wikipedia queries related to ``question``.

    The model reply is parsed with ``_parse_place_query_list``. If the call raises, or
    parsing yields nothing, a fixed list of Appalachian landscape queries is returned
    instead, truncated to ``max_items`` (at least one entry). Any exception from the
    model call is printed rather than raised.
    """
    prompt_parts = (
        "Return only JSON: an array of concise Wikipedia search queries for NATURAL LANDSCAPES ",
        "near this route question. Exclude highways, interstates, and city-only queries. ",
        "Prefer valleys, ridges, mountains, parks, overlooks, and geologic landforms. ",
        f"Question: {question}",
    )
    prompt = "".join(prompt_parts)

    try:
        # The planner temperature is capped at 0.3 regardless of the notebook setting.
        planner = _build_vertex_chat(LLM_MODEL, min(0.3, float(TEMPERATURE)))
        reply = planner.invoke(prompt)
        parsed = _parse_place_query_list(getattr(reply, "content", reply), max_items=max_items)
        if parsed:
            return parsed
    except Exception as query_error:
        print(f"[query] place query generation fallback: {type(query_error).__name__}: {query_error}")

    fallback_queries = [
        "Shenandoah Valley overlooks",
        "Blue Ridge Mountains viewpoints",
        "Appalachian Valley and Ridge outcrops",
        "Catoctin Mountain Park geology",
        "Great North Mountain ridge overlook",
    ]
    keep = max(1, int(max_items))
    return fallback_queries[:keep]


# Runs at cell execution: this calls the chat model (or uses the built-in fallback list).
# Stage 1 later uses place_image_queries for its place image previews.
place_image_queries = _generate_place_image_queries_with_model(example_question, max_items=6)
print("[query] place_image_queries:")
for query_idx, query_text in enumerate(place_image_queries, start=1):
    print(f"  {query_idx}. {query_text}")


In [None]:
#######################################################################################################

###### Stage 1/3: Wikipedia Data Load                                                            ######

#######################################################################################################


# Collapse the configured search terms into one query string (a plain string is used as-is).
print("[stage 1/3] Starting Wikipedia document load...")
query = " ".join(query_terms) if isinstance(query_terms, list) else query_terms
print(f"[stage 1/3] query={query!r}, max_docs={max_docs}")

# The load blocks, so a background heartbeat prints progress until its event is set.
stage1_heartbeat = _start_heartbeat("stage 1/3 wikipedia load", every_s=8.0)
t0 = perf_counter()
try:
    docs = WikipediaLoader(query=query, load_max_docs=max_docs).load()
finally:
    # Stop the heartbeat whether the load succeeded or raised.
    stage1_heartbeat.set()
t1 = perf_counter()

print(f"[stage 1/3] Loaded {len(docs)} document(s) in {t1 - t0:.2f}s")
if not docs:
    raise RuntimeError("No documents loaded from Wikipedia. Adjust query_terms/max_docs and re-run stage 1.")

# Show up to three loaded pages, with placeholders when title/source metadata is missing.
print("[stage 1/3] Sample titles:")
for sample_idx, sample_doc in enumerate(docs[:3], start=1):
    sample_title = str((sample_doc.metadata or {}).get("title") or f"doc_{sample_idx}")
    sample_source = str((sample_doc.metadata or {}).get("source") or "n/a")
    print(f"  {sample_idx}. {sample_title} ({sample_source})")

# The same helper is called with loaded documents here and with plain query strings below.
print("[stage 1/3] Wikipedia image previews from loaded pages...")
display_wikipedia_images_for_pages(docs, max_images=min(3, len(docs)))

print("[stage 1/3] Wikipedia image previews near place queries...")
display_wikipedia_images_for_pages(place_image_queries, max_images=min(5, len(place_image_queries)))


In [None]:
#######################################################################################################

###### Stage 2/3: Build + Save FAISS Index, Then Retrieve                                        ######

#######################################################################################################


# Guard: Stage 1 must have produced a non-empty `docs` list.
if "docs" not in globals() or not docs:
    raise RuntimeError("`docs` not found. Run Stage 1/3 first.")

print(f"[stage 2/3] Building embeddings with model={EMBEDDING_MODEL!r} and Vertex config={_vertex_client_kwargs()}...")
print(
    f"[stage 2/3] Chunking docs with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}, "
    f"embedding_batch_size={embedding_batch_size}"
)

# Split the loaded pages into overlapping chunks before embedding.
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
split_docs = splitter.split_documents(docs)
if not split_docs:
    raise RuntimeError("Chunking produced 0 documents. Adjust chunk_size/chunk_overlap and retry.")

total_chars = sum(len(str(d.page_content or "")) for d in split_docs)
print(f"[stage 2/3] Prepared {len(split_docs)} chunk(s), total_chars={total_chars}")

embeddings = _build_vertex_embeddings(EMBEDDING_MODEL)

# Clamp to at least one chunk per batch so the range() step below is never zero.
batch_size = max(1, int(embedding_batch_size))
# Reset to None so a failed run never leaves a stale index from an earlier execution.
vector_store = None

stage2_heartbeat = _start_heartbeat("stage 2/3 embedding/index", every_s=8.0)
t2 = perf_counter()
try:
    for start in range(0, len(split_docs), batch_size):
        batch = split_docs[start : start + batch_size]
        b0 = perf_counter()
        # The first batch creates the index; every later batch is appended to it.
        if vector_store is None:
            vector_store = FAISS.from_documents(batch, embeddings)
        else:
            vector_store.add_documents(batch)
        b1 = perf_counter()

        # Progress is reported in chunks completed, capped at the total for the last batch.
        done = min(start + batch_size, len(split_docs))
        pct = (100.0 * done) / len(split_docs)
        print(
            f"[stage 2/3] Embedded batch {start // batch_size + 1}: "
            f"{done}/{len(split_docs)} chunks ({pct:.1f}%) in {b1 - b0:.2f}s"
        )
finally:
    # Stop the heartbeat whether embedding succeeded or raised.
    stage2_heartbeat.set()

t3 = perf_counter()
if vector_store is None:
    raise RuntimeError("Vector store was not created.")

print(f"[stage 2/3] Built FAISS index in {t3 - t2:.2f}s")


# Save location: $ANC_FAISS_DIR/<namespace> when set, otherwise under the user's home directory.
faiss_base = os.environ.get("ANC_FAISS_DIR", "").strip()
if faiss_base:
    faiss_dir = (Path(faiss_base).expanduser() / FAISS_NAMESPACE).resolve()
else:
    faiss_dir = (Path.home() / "DATA" / "naturalist-companion" / "faiss" / FAISS_NAMESPACE).resolve()

faiss_dir.mkdir(parents=True, exist_ok=True)
vector_store.save_local(str(faiss_dir))
print(f"[stage 2/3] Saved FAISS index to: {faiss_dir}")


# Smoke-test retrieval against the in-memory index with the primary question.
print(f"[stage 2/3] Running similarity search for question={example_question!r}, k={k}...")
results = vector_store.similarity_search(example_question, k=k)
print(f"[stage 2/3] Retrieved {len(results)} result(s)")

for result_idx, result_doc in enumerate(results, start=1):
    result_title = str((result_doc.metadata or {}).get("title") or f"result_{result_idx}")
    result_source = str((result_doc.metadata or {}).get("source") or "n/a")
    # Preview only: first 220 characters with newlines flattened.
    result_snippet = str(result_doc.page_content or "")[:220].replace("\n", " ")
    print(f"  {result_idx}. {result_title} ({result_source})")
    print(f"     {result_snippet}...")

print("[stage 2/3] Wikipedia image previews from retrieved pages...")
display_wikipedia_images_for_pages(results, max_images=min(4, len(results)))


In [None]:
#######################################################################################################

###### Stage 3/3: Generate Answer with ChatVertexAI                                                ######

#######################################################################################################


# Guard: Stage 2 must have finished building the index. Stage 2 assigns
# `vector_store = None` before embedding starts, so a run that failed part-way leaves
# the name defined but unusable; checking for None catches that case as well.
if globals().get("vector_store") is None:
    raise RuntimeError("`vector_store` not found. Run Stage 2/3 first.")

print(f"[stage 3/3] Generating answer with model={LLM_MODEL!r} and Vertex config={_vertex_client_kwargs()}...")
llm = _build_vertex_chat(LLM_MODEL, TEMPERATURE)

# Appended to every answer prompt as its style/format contract.
voice_instructions = f"""
You are writing in a concise Roadside Geology field-guide voice for curious drivers.
Tone:
- Plainspoken, observant, and practical (not academic).
- Emphasize what can be seen from legal/safe pull-offs or short walks.
- Explain key geology in everyday language, then add one precise term when useful.
- Include safety and access realism (do not suggest unsafe roadside behavior).
Output format:
1) "Where to stop" (up to {MAX_BULLETS_PER_SECTION} bullets)
2) "What to look for" (up to {MAX_BULLETS_PER_SECTION} bullets)
3) "Why it matters" (2-4 sentences)
4) "Citations" (Wikipedia URLs only)
""".strip()


def _context_for_question(question: str, top_k: int = 2) -> str:
    """Retrieve chunks for ``question`` and format them as a numbered context block.

    Queries the notebook-global ``vector_store`` for at least one result. Each result
    becomes one line of the form ``[n] title (source) :: snippet``, where the snippet
    is the first 450 characters of the chunk with newlines replaced by spaces. Missing
    title/source metadata falls back to ``result_<n>`` / ``n/a``.
    """

    def _render(position, doc):
        title = str((doc.metadata or {}).get("title") or f"result_{position}")
        source = str((doc.metadata or {}).get("source") or "n/a")
        snippet = str(doc.page_content or "")[:450].replace("\n", " ")
        return f"[{position}] {title} ({source}) :: {snippet}"

    hits = vector_store.similarity_search(question, k=max(1, top_k))
    return "\n".join(_render(position, doc) for position, doc in enumerate(hits, start=1))


def answer_question(question: str) -> str:
    """Answer ``question`` with the notebook-global chat model, grounded in retrieved context.

    Builds a prompt from the question, the context block for the top ``k`` chunks, and
    the shared voice instructions, then invokes ``llm`` while a heartbeat prints
    progress. Returns the reply's ``content`` attribute (or the reply itself when it
    has none) as a string.

    Raises:
        RuntimeError: If the model call returns ``None``.
    """
    retrieved_context = _context_for_question(question, top_k=max(1, k))
    prompt = "".join(
        [
            "Use only the provided Wikipedia-grounded context when you can.\n\n",
            f"Question: {question}\n\n",
            f"Context:\n{retrieved_context}\n\n",
            f"Style requirements:\n{voice_instructions}",
        ]
    )

    heartbeat_stop = _start_heartbeat("stage 3/3 llm", every_s=8.0)
    call_started = perf_counter()
    reply = None
    try:
        reply = llm.invoke(prompt)
    finally:
        # Stop the heartbeat whether the call succeeded or raised.
        heartbeat_stop.set()
    elapsed = perf_counter() - call_started
    print(f"[stage 3/3] LLM response received in {elapsed:.2f}s")

    if reply is None:
        raise RuntimeError("LLM did not return a response.")
    return str(getattr(reply, "content", reply))


# Answer the primary question once; primary_answer stays available to later cells.
print(f"[stage 3/3] Primary question:\n- {example_question}")
primary_answer = answer_question(example_question)
print("\nAnswer:\n")
print(primary_answer)


In [None]:
#######################################################################################################

###### Stage 3b/3: Run All Example Questions                                                     ######

#######################################################################################################


# Guards: Stage 3 must have defined answer_question, and the question list must be non-empty.
if "answer_question" not in globals():
    raise RuntimeError("`answer_question` not found. Run Stage 3/3 first.")

if "example_questions" not in globals() or not example_questions:
    raise RuntimeError("`example_questions` is empty. Check config cell.")

# Collected as {"question": ..., "answer": ...} dicts, in the order asked.
all_answers = []
print(f"[stage 3b/3] Running {len(example_questions)} example question(s)...")

for question_idx, question_text in enumerate(example_questions, start=1):
    divider = "=" * 110
    print("\n" + divider)
    print(f"[stage 3b/3] Question {question_idx}/{len(example_questions)}")
    print(question_text)
    print(divider)

    answer_text = answer_question(question_text)
    all_answers.append({"question": question_text, "answer": answer_text})

    print("\nResponse:\n")
    print(answer_text)

print(f"\n[stage 3b/3] Completed {len(all_answers)} question(s).")


## Canonical Workflow Diagram (StateGraph)

StateGraph is the canonical workflow diagram for this notebook because it reflects the shared orchestration logic and is less likely to drift than a separate static PlantUML drawing.


In [None]:
# Uncomment in fresh environments:
# %pip install -q -r ../requirements-gcp-dev.txt

# Report whether the setup cell managed to import the shared StateGraph module.
if not STATEGRAPH_AVAILABLE:
    print("[stategraph] Unavailable: fix setup/import paths above; diagram/run/eval cells will skip.")
else:
    print("[stategraph] Ready: naturalist_companion.stategraph_shared imported in setup cells.")


In [None]:
# Compile the shared StateGraph app and draw it; skipped when the import failed.
if STATEGRAPH_AVAILABLE:
    provider: Literal["vertex"] = STATEGRAPH_PROVIDER
    app = build_stategraph_app(provider=provider)
    print('Compiled StateGraph successfully for provider:', provider)

    # Prefer a rendered PNG; if rendering fails for any reason, show the error and
    # print the Mermaid source text instead.
    try:
        png_bytes = app.get_graph().draw_mermaid_png()
        display(Image(data=png_bytes, width=880))
    except Exception as graph_render_error:
        display(Markdown(f'Graph render fallback (text). Error: `{type(graph_render_error).__name__}: {graph_render_error}`'))
        print(app.get_graph().draw_mermaid())
else:
    print("[stategraph] Skipping canonical diagram: naturalist_companion not available.")


In [None]:
# Run the StateGraph workflow once for stategraph_question and summarize the outcome.
if STATEGRAPH_AVAILABLE:
    result = run_stategraph(
        stategraph_question,
        provider=STATEGRAPH_PROVIDER,
        config=STATEGRAPH_RUN_CONFIG,
    )
    final_output = result['final_output']
    print('Question:', stategraph_question)
    print('Provider:', final_output['provider'])
    print('Route:', final_output['route_decision']['decision'])

    quality_report = final_output['quality']
    print('Quality passed:', quality_report['passed'])
    quality_reasons = quality_report.get('reasons', [])
    if quality_reasons:
        print('Quality reasons:', ', '.join(quality_reasons))
    else:
        print('Quality reasons:', 'none')

    print('Attempts:', final_output['retrieval_attempts'])
    print('Artifact dir:', result['artifact_dir'])

    print('Response:')
    answer_payload = final_output['answer']
    print(answer_payload['response'])
    print('Citation image previews:')
    display_wikipedia_images_for_pages(answer_payload.get('citations', []), max_images=4)
else:
    print("[stategraph] Skipping run_stategraph: naturalist_companion not available.")


In [None]:
# Run the I-81 evaluation harness and print its summary and artifact location.
if STATEGRAPH_AVAILABLE:
    report = run_i81_eval_harness(provider=STATEGRAPH_PROVIDER, config=STATEGRAPH_EVAL_CONFIG)
    print(report['summary'])
    print(report['artifact_root'])
else:
    print("[stategraph] Skipping eval harness: naturalist_companion not available.")
