# ANC Vertex: Wikipedia + FAISS + Vertex AI + StateGraph

This notebook combines two Vertex AI workflows in one place:

- Baseline notebook pipeline: Wikipedia -> chunking -> FAISS -> Vertex-hosted Q&A with `ChatVertexAI`
- Canonical orchestration diagram + run flow via shared `naturalist_companion.stategraph_shared`

Provider mapping used here:
- `ChatOllama` -> `ChatVertexAI`
- `OllamaEmbeddings` -> `VertexAIEmbeddings`
- Same `WikipediaLoader` + `FAISS` retrieval flow


## Prerequisites (Run First)

### Google Cloud / Vertex AI setup

- Authenticate locally before running notebook cells: `gcloud auth application-default login`
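- Set the project and region environment variables as needed: `GOOGLE_CLOUD_PROJECT` and `VERTEX_LOCATION` (if `VERTEX_LOCATION` is unset, the notebook falls back to `GOOGLE_CLOUD_REGION`, then to `us-central1`).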
- Ensure Vertex AI API is enabled for your project.
- Optional: set `GCP_PROJECT` if you use that env var name instead of `GOOGLE_CLOUD_PROJECT`.

### Python dependency notes

- In a fresh environment, uncomment the `%pip install` lines in the next code cell and run it to install the notebook dependencies.
- If you hit `TqdmWarning: IProgress not found`, run `pip install ipywidgets jupyterlab_widgets` and restart the kernel.
- If you hit macOS OpenMP kernel crashes, launch Jupyter with `KMP_DUPLICATE_LIB_OK=TRUE`.

### IDE notebook stability tips

- Ensure the notebook kernel points to this project's virtualenv interpreter (for example `/Users/ryan/Developer/naturalist-companion/.venv/bin/python`; adjust the path to your own checkout).
- After dependency changes, restart the kernel and then run all cells from the top.
- Keep one notebook kernel active at a time while testing provider configs.


In [None]:
# Uncomment in fresh environments:

# %pip install -q -r ../requirements-gcp-dev.txt


# %pip install -q ipywidgets

# %pip install -q jupyterlab_widgets


In [None]:
#######################################################################################################

###### Dependency Preflight (Fail Fast)                                                           ######

#######################################################################################################


import importlib
import os
import sys
from pathlib import Path


def _candidate_src_paths():
    """Return candidate ``src`` directories in priority order, without duplicates.

    Order of candidates:
      1. The ``NATURALIST_COMPANION_SRC`` environment override, when non-blank.
      2. A ``/Workspace/<repo>/src`` path derived from the Databricks notebook
         context, when that context can be read.
      3. ``src`` under the current directory, its parent, and its grandparent.
      4. The ``src`` directory of every ``/Workspace/Repos/*/*`` checkout that
         contains ``src/naturalist_companion``.

    Only the ``/Workspace/Repos`` entries are known to exist; callers must check
    the rest themselves.
    """
    found = []

    override = os.environ.get("NATURALIST_COMPANION_SRC", "").strip()
    if override:
        found.append(Path(override))

    # `dbutils` is only defined on Databricks; anywhere else this lookup raises
    # and the candidate is simply skipped.
    try:
        nb_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
        if "/notebooks/" in nb_path:
            repo_root = "/Workspace" + nb_path.split("/notebooks/", 1)[0]
            found.append(Path(repo_root) / "src")
    except Exception:
        pass

    here = Path.cwd()
    found += [base / "src" for base in (here, here.parent, here.parent.parent)]

    workspace_repos = Path("/Workspace/Repos")
    if workspace_repos.exists():
        found += [pkg.parent for pkg in workspace_repos.glob("*/*/src/naturalist_companion")]

    # De-duplicate on the string form; the first occurrence keeps its position.
    unique = {}
    for path in found:
        unique.setdefault(str(path), path)
    return list(unique.values())


# Make the highest-priority checkout of `naturalist_companion` importable.
# The search stops at the first candidate that contains the package even when
# that path is already on sys.path; otherwise re-running this cell could push a
# lower-priority checkout in front of the one already in use.
for src_path in _candidate_src_paths():
    if not (src_path / "naturalist_companion").exists():
        continue
    if str(src_path) not in sys.path:
        sys.path.insert(0, str(src_path))
    break


# Each entry is (label shown to the user, module names to try in order).
# The first module name that imports cleanly satisfies the check.
CHECKS = [
    ("FAISS backend", ["faiss"]),
    ("LangGraph runtime", ["langgraph"]),
    ("Wikipedia loader module", ["langchain_community.document_loaders", "langchain.document_loaders"]),
    ("Text splitter module", ["langchain_text_splitters", "langchain.text_splitter"]),
    ("LangChain vectorstore module", ["langchain_community.vectorstores"]),
    ("LangChain in-memory docstore module", ["langchain_community.docstore.in_memory"]),
    ("Vertex integration", ["langchain_google_vertexai"]),
    ("Google AI Platform SDK", ["google.cloud.aiplatform"]),
    ("Naturalist stategraph module", ["naturalist_companion.stategraph_shared"]),
]
resolved = {}
missing = []

for label, module_candidates in CHECKS:
    last_error = None
    for module_name in module_candidates:
        try:
            importlib.import_module(module_name)
        except Exception as exc:
            # Remember only the most recent failure; it is reported if no candidate works.
            last_error = f"{type(exc).__name__}: {exc}"
        else:
            resolved[label] = module_name
            break
    else:
        # No candidate imported (the inner loop never hit `break`).
        missing.append((label, module_candidates, last_error))

if missing:
    # When anything besides the stategraph module is missing, leave the
    # stategraph entry out of the report (presumably because its failure is a
    # knock-on effect of the other missing packages -- confirm).
    other_missing = [entry for entry in missing if entry[0] != "Naturalist stategraph module"]
    if other_missing:
        missing = other_missing

    message_lines = ["[preflight] Missing required notebook dependencies:"]
    for label, module_candidates, last_error in missing:
        message_lines.append(f"- {label}: expected one of {', '.join(module_candidates)}")
        if last_error:
            message_lines.append(f"  last error: {last_error}")

    message_lines += [
        "",
        "Run the install cell above, restart the kernel, and retry.",
        "Install hint: %pip install -q -r ../requirements-gcp-dev.txt",
        "If this repo is synced in Databricks, set NATURALIST_COMPANION_SRC to your repo src path if needed.",
    ]
    raise ModuleNotFoundError("\n".join(message_lines))

print("[preflight] Dependency check passed.")
for label, module_name in resolved.items():
    print(f"  - {label}: {module_name}")


In [None]:
#######################################################################################################

###### Python Package Imports for this notebook                                                  ######

#######################################################################################################


import json
import os
import warnings
from threading import Event, Thread
from urllib.parse import quote, unquote, urlparse
from urllib.request import Request, urlopen

from IPython.display import Image, Markdown, display

# Mitigate common macOS OpenMP duplicate-library crashes in notebook kernels.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

# Silence noisy tqdm widget warning in IDE notebooks when rich progress widgets are unavailable.
warnings.filterwarnings("ignore", message=".*IProgress not found.*")


# LangChain moved WikipediaLoader in newer releases; keep backward compatibility.
try:
    from langchain_community.document_loaders import WikipediaLoader
except ImportError:
    from langchain.document_loaders import WikipediaLoader

try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings


def _start_heartbeat(label: str, every_s: float = 8.0):
    """Start a background progress ticker and return the Event that stops it.

    A daemon thread prints a "still running" line tagged with ``label`` each
    time ``every_s`` seconds pass. Call ``.set()`` on the returned event to end
    it. The elapsed figure is the sum of completed intervals, not a wall-clock
    measurement.
    """
    stop_signal = Event()

    def _tick_until_stopped():
        waited = 0.0
        while True:
            # Event.wait() returns True as soon as the event is set, False on timeout.
            if stop_signal.wait(every_s):
                return
            waited += every_s
            print(f"[{label}] still running... {waited:.0f}s elapsed")

    Thread(target=_tick_until_stopped, daemon=True).start()
    return stop_signal


WIKIPEDIA_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"


def _wiki_api_get(params):
    """GET ``WIKIPEDIA_API_ENDPOINT`` with ``params`` and return the parsed JSON.

    Keys and values are stringified and fully percent-encoded. The call is
    best-effort: any failure while fetching or decoding the response yields
    ``{}`` instead of raising.
    """
    encoded_pairs = []
    for name, val in params.items():
        encoded_pairs.append(quote(str(name), safe="") + "=" + quote(str(val), safe=""))
    query_string = "&".join(encoded_pairs)

    request = Request(
        f"{WIKIPEDIA_API_ENDPOINT}?{query_string}",
        headers={"User-Agent": "naturalist-companion/0.1 (notebook image preview)"},
    )
    try:
        with urlopen(request, timeout=10) as response:
            body = response.read()
        return json.loads(body.decode("utf-8"))
    except Exception:
        # Image previews are optional; never let a network/JSON problem break the cell.
        return {}


def _title_from_wikipedia_url(url):
    """Extract a human-readable page title from a ``.../wiki/<Title>`` URL.

    Returns the percent-decoded text after the first ``/wiki/`` in the URL path,
    with underscores turned into spaces and surrounding whitespace removed.
    Returns None when the path has no ``/wiki/`` segment or the title is empty.
    """
    path = urlparse(str(url or "")).path
    _, found, remainder = path.partition("/wiki/")
    if not found:
        return None
    title = unquote(remainder).replace("_", " ").strip()
    return title if title else None



def _wiki_title_from_search(query):
    """Return the title of the top Wikipedia search hit for ``query``, or None.

    None is returned when the API call yields no results (including the
    empty payload ``_wiki_api_get`` produces on failure) or a blank title.
    """
    search_params = {
        "action": "query",
        "list": "search",
        "format": "json",
        "formatversion": 2,
        "srlimit": 1,
        "srsearch": query,
    }
    payload = _wiki_api_get(search_params)
    hits = (payload.get("query") or {}).get("search") or []
    top_title = str(hits[0].get("title") or "").strip() if hits else ""
    return top_title or None


def _iter_page_refs(items):
    """Yield ``{"title": ..., "url": ...}`` dicts for a mixed list of page references.

    Accepted item shapes:
      * ``str``: a Wikipedia URL (title taken from the path) or free text
        (title resolved through a Wikipedia search, which is a network call).
        The yielded URL is rebuilt from the title and percent-encoded, so
        titles containing characters such as ``?``, ``#`` or parentheses still
        form a valid link.
      * ``dict``: uses ``title`` and ``url`` (falling back to ``source``).
      * any object with a mapping-like ``metadata`` attribute: uses
        ``metadata["title"]`` and ``metadata["source"]``.

    For dict/object items a missing title is derived from the URL when
    possible, and ``url`` may be an empty string. Items that yield no title,
    or whose ``metadata`` is not mapping-like, are skipped. ``None`` or an
    empty ``items`` yields nothing.
    """
    for item in items or []:
        if isinstance(item, str):
            raw = item.strip()
            if not raw:
                continue
            title = _title_from_wikipedia_url(raw) or _wiki_title_from_search(raw)
            if title:
                # Keep "/", ":" and "," readable (common in titles); encode the rest.
                encoded = quote(title.replace(" ", "_"), safe="/:,")
                yield {"title": title, "url": f"https://en.wikipedia.org/wiki/{encoded}"}
            continue

        if isinstance(item, dict):
            fields = item
            url = str(item.get("url") or item.get("source") or "").strip()
        else:
            fields = getattr(item, "metadata", None) or {}
            if not callable(getattr(fields, "get", None)):
                # Unsupported shape: skip it rather than fail the whole preview.
                continue
            url = str(fields.get("source") or "").strip()

        title = str(fields.get("title") or "").strip()
        if not title and url:
            title = _title_from_wikipedia_url(url) or ""
        if title:
            yield {"title": title, "url": url}


def _wiki_thumbnail_for_title(title, thumb_px=640):
    """Return an image URL for the page ``title`` via the pageimages API, or None.

    The thumbnail (requested ``thumb_px`` wide) is preferred; the original
    image is used when a page entry has no thumbnail source. None is returned
    when no page entry carries an image.
    """
    response = _wiki_api_get(
        {
            "action": "query",
            "prop": "pageimages",
            "format": "json",
            "formatversion": 2,
            "redirects": 1,
            "piprop": "thumbnail|original",
            "pithumbsize": int(thumb_px),
            "titles": title,
        }
    )
    page_entries = (response.get("query") or {}).get("pages") or []
    for entry in page_entries:
        if isinstance(entry, dict):
            thumbnail_info = entry.get("thumbnail") or {}
            original_info = entry.get("original") or {}
            image_source = thumbnail_info.get("source") or original_info.get("source")
            if image_source:
                return str(image_source)
    return None


def display_wikipedia_images_for_pages(items, max_images=4, thumb_px=640):
    """Render inline thumbnail previews for the Wikipedia pages referenced by ``items``.

    Args:
        items: Page references in any shape ``_iter_page_refs`` accepts
            (strings, dicts, or objects with ``metadata``).
        max_images: Maximum number of previews to render. Zero or a negative
            value renders nothing.
        thumb_px: Requested thumbnail width; the displayed width is capped at 720.

    Pages are de-duplicated by title, and pages without an image are skipped
    without counting toward ``max_images``. A notice is printed when no
    preview was rendered.
    """
    limit = int(max_images)
    seen = set()
    shown = 0

    # With a non-positive limit, do not even start iterating: resolving page
    # references can trigger network lookups, and nothing would be displayed.
    refs = _iter_page_refs(items) if limit > 0 else ()
    for ref in refs:
        title = ref["title"]
        if title in seen:
            continue
        seen.add(title)

        image_url = _wiki_thumbnail_for_title(title, thumb_px=thumb_px)
        if not image_url:
            continue

        shown += 1
        # Percent-encode the fallback link so titles with "?", "#" or
        # parentheses do not break the URL or the Markdown link syntax.
        page_url = ref.get("url") or (
            "https://en.wikipedia.org/wiki/" + quote(title.replace(" ", "_"), safe="/:,")
        )
        display(Markdown(f"**Wikipedia image preview: {title}**"))
        display(Image(url=image_url, width=min(int(thumb_px), 720)))
        display(Markdown(f"[Open page]({page_url})"))

        # Stop right at the limit so no further references are resolved.
        if shown >= limit:
            break

    if shown == 0:
        print("[wiki-images] No thumbnail images found for the selected pages.")


# Report whether ipywidgets is importable in this kernel. Informational only:
# any failure in this block (the import or the version lookup) prints an
# install hint and execution continues.
try:
    import ipywidgets as _ipywidgets
    print(f"[env] ipywidgets detected: v{_ipywidgets.__version__}")
except Exception:
    print("[env] ipywidgets not found in this kernel. Run `%pip install -q ipywidgets` and restart the kernel.")


def _show_vertex_auth_status() -> None:
    """Print the Vertex AI project/location resolved from environment variables.

    Project: ``GOOGLE_CLOUD_PROJECT``, else ``GCP_PROJECT``.
    Location: ``VERTEX_LOCATION``, else ``GOOGLE_CLOUD_REGION``, else ``us-central1``.
    A variable that is set but empty or blank counts as unset, so it does not
    hide the fallback. This only inspects configuration; it does not verify
    credentials.
    """
    project = (
        os.environ.get("GOOGLE_CLOUD_PROJECT", "").strip()
        or os.environ.get("GCP_PROJECT", "").strip()
    )
    location = (
        os.environ.get("VERTEX_LOCATION", "").strip()
        or os.environ.get("GOOGLE_CLOUD_REGION", "").strip()
        or "us-central1"
    )
    if project:
        print(f"[env] Vertex config detected: project={project}, location={location}")
    else:
        print("[env] Vertex project is not set. Set GOOGLE_CLOUD_PROJECT (or GCP_PROJECT) before model calls.")


def _vertex_client_kwargs() -> dict:
    """Build the ``project``/``location`` keyword arguments for Vertex AI clients.

    Project: ``GOOGLE_CLOUD_PROJECT``, else ``GCP_PROJECT``; the key is omitted
    when neither has a value.
    Location: ``VERTEX_LOCATION``, else ``GOOGLE_CLOUD_REGION``, else ``us-central1``.
    A variable that is set but empty or blank counts as unset, so it does not
    hide the fallback.

    Returns:
        A dict with ``location`` and, when a project is configured, ``project``
        (inserted first).
    """
    project = (
        os.environ.get("GOOGLE_CLOUD_PROJECT", "").strip()
        or os.environ.get("GCP_PROJECT", "").strip()
    )
    location = (
        os.environ.get("VERTEX_LOCATION", "").strip()
        or os.environ.get("GOOGLE_CLOUD_REGION", "").strip()
        or "us-central1"
    )

    kwargs = {}
    if project:
        kwargs["project"] = project
    kwargs["location"] = location
    return kwargs


def _build_vertex_embeddings(model_name: str):
    """Create a ``VertexAIEmbeddings`` client for ``model_name``.

    The model is first passed as ``model_name=``; if the constructor raises
    ``TypeError`` the call is retried with ``model=`` (presumably to tolerate
    differing library versions -- confirm). Project/location come from
    ``_vertex_client_kwargs()``.
    """
    client_kwargs = _vertex_client_kwargs()
    try:
        embeddings_client = VertexAIEmbeddings(model_name=model_name, **client_kwargs)
    except TypeError:
        embeddings_client = VertexAIEmbeddings(model=model_name, **client_kwargs)
    return embeddings_client


def _build_vertex_chat(model_name: str, temperature: float):
    """Create a ``ChatVertexAI`` client for ``model_name`` at ``temperature``.

    The model is first passed as ``model_name=``; if the constructor raises
    ``TypeError`` the call is retried with ``model=`` (presumably to tolerate
    differing library versions -- confirm). Project/location come from
    ``_vertex_client_kwargs()``.
    """
    client_kwargs = _vertex_client_kwargs()
    try:
        chat_client = ChatVertexAI(model_name=model_name, temperature=temperature, **client_kwargs)
    except TypeError:
        chat_client = ChatVertexAI(model=model_name, temperature=temperature, **client_kwargs)
    return chat_client


_show_vertex_auth_status()


#######################################################################################################

###### Config (Define LLMs, Embeddings, Vector Store, Data Loader specs)                         ######

#######################################################################################################


# DataLoader Config
# Search terms for WikipediaLoader; Stage 1 joins a list with spaces into one query string.
query_terms = [
    "roadcut",
    "geology",
    "sedimentary rock",
    "stratigraphy",
]
max_docs = 3  # Fast local iteration setting.

# Stage 2 chunking + batching controls (keep small for interactive runs).
# chunk_size / chunk_overlap are passed to RecursiveCharacterTextSplitter;
# embedding_batch_size is the number of chunks embedded and added to the index per step.
chunk_size = 1200
chunk_overlap = 150
embedding_batch_size = 8


# Retriever Config
# k: number of chunks retrieved per question (Stage 2 preview and Stage 3 context).
k = 1
EMBEDDING_MODEL = "text-embedding-005"


# LLM Config
# NOTE(review): confirm this model ID is still available in your Vertex AI project/region.
LLM_MODEL = "gemini-1.5-flash"
TEMPERATURE = 0.0


# Response style controls (Roadside Geology audience: curious drivers, practical field learners).
# MAX_BULLETS_PER_SECTION is interpolated into the Stage 3 prompt.
# NOTE(review): RESPONSE_TONE is not referenced by any cell visible here -- confirm it is still needed.
RESPONSE_TONE = "field-guide"
MAX_BULLETS_PER_SECTION = 4


## Query Prompt (Edit This Cell)

Use the next code cell to set the active question(s).

Question types this notebook is designed for:
- Detour geology: legal pull-offs or short walks near a route segment
- Safety-first prompts: where to stop and what to avoid roadside
- Route constraints: city/exit anchors plus max detour minutes
- Beginner field interpretation: what visual clues to look for and why they matter

Tip: Include your nearest city or exit and your max detour time to improve stop recommendations.


In [None]:
# Primary question: used for the Stage 2 retrieval preview and the Stage 3 answer.
example_question = "I am on I-81 near Hagerstown with a 30-minute detour. Where can I safely stop to observe folded Valley-and-Ridge strata, and what exactly should I look for?"

# Batch of questions answered one by one in Stage 3b.
example_questions = [
    "I am driving I-81 near Bristol, TN. Give me two legal pull-off stops where I can see clear sedimentary layering, and tell me exactly what to look for.",
    "Near I-81 between Winchester and Strasburg, where can I safely stop to see Valley-and-Ridge structure, and what field clues confirm folding?",
    "I have 45 minutes near Hagerstown, MD. What roadside geology stop gives the best payoff for a beginner, and what story does the outcrop tell?",
    "Along I-81 in the Shenandoah Valley, point me to a short-walk stop to compare rock type and landform, then explain why that match matters.",
    "On an I-81 drive day, suggest one stop where I can observe evidence of ancient seas or sediment transport, with specific visual clues.",
]

# StateGraph run can use the same prompt by default; edit independently if desired.
stategraph_question = example_question

# Free-text place names; Stage 1 resolves each to a Wikipedia page via search
# and shows that page's thumbnail.
place_image_queries = [
    "Hagerstown, Maryland",
    "Bristol, Tennessee",
    "Winchester, Virginia",
    "Strasburg, Virginia",
    "Shenandoah Valley",
]


In [None]:
#######################################################################################################

###### Stage 1/3: Wikipedia Data Load                                                            ######

#######################################################################################################


from time import perf_counter

# Stage 1: load up to `max_docs` Wikipedia pages for the configured query into `docs`
# (consumed by Stage 2), then show image previews.
print("[stage 1/3] Starting Wikipedia document load...")
# A list of terms is joined into a single space-separated query; any other value is used as-is.
query = " ".join(query_terms) if isinstance(query_terms, list) else query_terms
print(f"[stage 1/3] query={query!r}, max_docs={max_docs}")

# The heartbeat prints progress lines while the blocking load runs; it is always
# stopped, even when the load raises.
heartbeat = _start_heartbeat("stage 1/3 wikipedia load", every_s=8.0)
t0 = perf_counter()
try:
    docs = WikipediaLoader(query=query, load_max_docs=max_docs).load()
finally:
    heartbeat.set()
t1 = perf_counter()

print(f"[stage 1/3] Loaded {len(docs)} document(s) in {t1 - t0:.2f}s")
if not docs:
    raise RuntimeError("No documents loaded from Wikipedia. Adjust query_terms/max_docs and re-run stage 1.")

# List at most the first three loaded pages, with placeholders for missing metadata.
print("[stage 1/3] Sample titles:")
for i, doc in enumerate(docs[:3], start=1):
    title = str((doc.metadata or {}).get("title") or f"doc_{i}")
    source = str((doc.metadata or {}).get("source") or "n/a")
    print(f"  {i}. {title} ({source})")

print("[stage 1/3] Wikipedia image previews from loaded pages...")
display_wikipedia_images_for_pages(docs, max_images=min(3, len(docs)))

# Place names are plain strings, so each one is resolved through a Wikipedia search first.
print("[stage 1/3] Wikipedia image previews near place queries...")
display_wikipedia_images_for_pages(place_image_queries, max_images=min(5, len(place_image_queries)))


In [None]:
#######################################################################################################

###### Stage 2/3: Build + Save FAISS Index, Then Retrieve                                        ######

#######################################################################################################


import os
from pathlib import Path
from time import perf_counter

# Stage 2 (build): chunk the Stage 1 documents, embed them in batches into a
# FAISS index (`vector_store`), and save the index to disk.
# The guard fires both when `docs` was never defined and when it is empty.
if "docs" not in globals() or not docs:
    raise RuntimeError("`docs` not found. Run Stage 1/3 first.")

print(f"[stage 2/3] Building embeddings with model={EMBEDDING_MODEL!r} and Vertex config={_vertex_client_kwargs()}...")
print(
    f"[stage 2/3] Chunking docs with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}, "
    f"embedding_batch_size={embedding_batch_size}"
)

splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
split_docs = splitter.split_documents(docs)
if not split_docs:
    raise RuntimeError("Chunking produced 0 documents. Adjust chunk_size/chunk_overlap and retry.")

total_chars = sum(len(str(d.page_content or "")) for d in split_docs)
print(f"[stage 2/3] Prepared {len(split_docs)} chunk(s), total_chars={total_chars}")

embeddings = _build_vertex_embeddings(EMBEDDING_MODEL)

# Clamp to at least one chunk per batch so a zero/negative setting cannot stall the loop.
batch_size = max(1, int(embedding_batch_size))
vector_store = None

heartbeat = _start_heartbeat("stage 2/3 embedding/index", every_s=8.0)
t2 = perf_counter()
try:
    for start in range(0, len(split_docs), batch_size):
        batch = split_docs[start : start + batch_size]
        b0 = perf_counter()
        # The first batch creates the index; later batches are appended to it.
        if vector_store is None:
            vector_store = FAISS.from_documents(batch, embeddings)
        else:
            vector_store.add_documents(batch)
        b1 = perf_counter()

        # `done` is capped because the last batch may be shorter than batch_size.
        done = min(start + batch_size, len(split_docs))
        pct = (100.0 * done) / len(split_docs)
        print(
            f"[stage 2/3] Embedded batch {start // batch_size + 1}: "
            f"{done}/{len(split_docs)} chunks ({pct:.1f}%) in {b1 - b0:.2f}s"
        )
finally:
    # Always stop the heartbeat thread, even when embedding fails mid-way.
    heartbeat.set()

t3 = perf_counter()
if vector_store is None:
    raise RuntimeError("Vector store was not created.")

print(f"[stage 2/3] Built FAISS index in {t3 - t2:.2f}s")


# Save location: <ANC_FAISS_DIR>/anc_gcp when the env var is set (with ~ expanded),
# otherwise ~/DATA/naturalist-companion/faiss/anc_gcp.
faiss_base = os.environ.get("ANC_FAISS_DIR", "").strip()
if faiss_base:
    faiss_dir = (Path(faiss_base).expanduser() / "anc_gcp").resolve()
else:
    faiss_dir = (Path.home() / "DATA" / "naturalist-companion" / "faiss" / "anc_gcp").resolve()

faiss_dir.mkdir(parents=True, exist_ok=True)
vector_store.save_local(str(faiss_dir))
print(f"[stage 2/3] Saved FAISS index to: {faiss_dir}")


# Stage 2 (retrieve): run the primary question against the fresh index and
# print a short preview of each hit so retrieval quality can be checked
# before the LLM is involved.
print(f"[stage 2/3] Running similarity search for question={example_question!r}, k={k}...")
results = vector_store.similarity_search(example_question, k=k)
print(f"[stage 2/3] Retrieved {len(results)} result(s)")

for i, res in enumerate(results, start=1):
    title = str((res.metadata or {}).get("title") or f"result_{i}")
    source = str((res.metadata or {}).get("source") or "n/a")
    # Flatten real newline characters so the 220-character preview stays on one
    # line. (The previous pattern "\\n" matched a literal backslash followed by
    # "n", which never occurs in page text, so newlines were left in place.)
    snippet = str(res.page_content or "")[:220].replace("\n", " ")
    print(f"  {i}. {title} ({source})")
    print(f"     {snippet}...")

print("[stage 2/3] Wikipedia image previews from retrieved pages...")
display_wikipedia_images_for_pages(results, max_images=min(4, len(results)))


In [None]:
#######################################################################################################

###### Stage 3/3: Generate Answer with ChatVertexAI                                                ######

#######################################################################################################


from time import perf_counter

# Stage 3 setup: requires the FAISS index from Stage 2, then creates the chat
# model and the style instructions appended to every prompt.
if "vector_store" not in globals():
    raise RuntimeError("`vector_store` not found. Run Stage 2/3 first.")

print(f"[stage 3/3] Generating answer with model={LLM_MODEL!r} and Vertex config={_vertex_client_kwargs()}...")
llm = _build_vertex_chat(LLM_MODEL, TEMPERATURE)

# f-string: MAX_BULLETS_PER_SECTION is substituted once, when this cell runs.
# Re-run the cell after changing that setting. `.strip()` drops the leading and
# trailing newline introduced by the triple-quoted layout.
voice_instructions = f"""
You are writing in a concise Roadside Geology field-guide voice for curious drivers.
Tone:
- Plainspoken, observant, and practical (not academic).
- Emphasize what can be seen from legal/safe pull-offs or short walks.
- Explain key geology in everyday language, then add one precise term when useful.
- Include safety and access realism (do not suggest unsafe roadside behavior).
Output format:
1) "Where to stop" (up to {MAX_BULLETS_PER_SECTION} bullets)
2) "What to look for" (up to {MAX_BULLETS_PER_SECTION} bullets)
3) "Why it matters" (2-4 sentences)
4) "Citations" (Wikipedia URLs only)
""".strip()


def _context_for_question(question: str, top_k: int = 2) -> str:
    """Build the numbered context block for ``question`` from the FAISS index.

    Retrieves ``max(1, top_k)`` chunks from the global ``vector_store`` and
    renders each as ``[i] title (source) :: snippet``. The snippet is the first
    450 characters of the chunk with newlines replaced by spaces. Entries are
    joined with newlines.
    """
    hits = vector_store.similarity_search(question, k=max(1, top_k))

    def _render(rank, hit):
        # Missing metadata falls back to a positional title and "n/a" source.
        title = str((hit.metadata or {}).get("title") or f"result_{rank}")
        source = str((hit.metadata or {}).get("source") or "n/a")
        snippet = str(hit.page_content or "")[:450].replace("\n", " ")
        return f"[{rank}] {title} ({source}) :: {snippet}"

    return "\n".join(_render(rank, hit) for rank, hit in enumerate(hits, start=1))


def answer_question(question: str) -> str:
    """Answer ``question`` with the chat model, grounded in retrieved context.

    Retrieves ``max(1, k)`` chunks, assembles a prompt from the question, the
    context block and the global ``voice_instructions``, and invokes ``llm``.
    A heartbeat prints progress while the call is in flight and is always
    stopped afterwards. Returns the response content as a string.
    """
    context_block = _context_for_question(question, top_k=max(1, k))
    # Sections are separated by one blank line.
    prompt = "\n\n".join(
        [
            "Use only the provided Wikipedia-grounded context when you can.",
            f"Question: {question}",
            f"Context:\n{context_block}",
            f"Style requirements:\n{voice_instructions}",
        ]
    )

    ticker = _start_heartbeat("stage 3/3 llm", every_s=8.0)
    started = perf_counter()
    try:
        reply = llm.invoke(prompt)
    finally:
        ticker.set()
    print(f"[stage 3/3] LLM response received in {perf_counter() - started:.2f}s")
    return str(reply.content)


# Answer the primary question and keep the text in `primary_answer` for later inspection.
print(f"[stage 3/3] Primary question:\n- {example_question}")
primary_answer = answer_question(example_question)
print("\nAnswer:\n")
print(primary_answer)


In [None]:
#######################################################################################################

###### Stage 3b/3: Run All Example Questions                                                     ######

#######################################################################################################


# Stage 3b: answer every entry in `example_questions` in order and collect the
# results in `all_answers` as {"question", "answer"} dicts.
if "answer_question" not in globals():
    raise RuntimeError("`answer_question` not found. Run Stage 3/3 first.")

if "example_questions" not in globals() or not example_questions:
    raise RuntimeError("`example_questions` is empty. Check config cell.")

all_answers = []
print(f"[stage 3b/3] Running {len(example_questions)} example question(s)...")

for i, q in enumerate(example_questions, start=1):
    # Banner separating one question's output from the next.
    print("\n" + "=" * 110)
    print(f"[stage 3b/3] Question {i}/{len(example_questions)}")
    print(q)
    print("=" * 110)

    # One LLM call per question; a failure here stops the loop with earlier
    # answers still available in `all_answers`.
    answer = answer_question(q)
    all_answers.append({"question": q, "answer": answer})

    print("\nResponse:\n")
    print(answer)

print(f"\n[stage 3b/3] Completed {len(all_answers)} question(s).")


## Canonical Workflow Diagram (StateGraph)

StateGraph is the canonical workflow diagram for this notebook because it reflects the shared orchestration logic and is less likely to drift than a separate static PlantUML drawing.


In [None]:
# Uncomment in fresh environments:
# %pip install -q -r ../requirements-gcp-dev.txt

from IPython.display import Image, Markdown, display

from naturalist_companion.stategraph_shared import (
    build_stategraph_app,
    run_i81_eval_harness,
    run_stategraph,
)


In [None]:
# Compile the shared StateGraph for the Vertex provider and draw it.
provider = 'vertex'
app = build_stategraph_app(provider=provider)
print('Compiled StateGraph successfully for provider:', provider)

# Render a real image (PNG bytes) instead of plain Mermaid text.
# If PNG rendering fails for any reason, show the error and print the Mermaid
# source so the diagram is still available as text.
try:
    png_bytes = app.get_graph().draw_mermaid_png()
    display(Image(data=png_bytes))
except Exception as exc:
    display(Markdown(f'Graph render fallback (text). Error: `{type(exc).__name__}: {exc}`'))
    print(app.get_graph().draw_mermaid())


In [None]:
# Run the shared StateGraph once for `stategraph_question` and print a summary.
# NOTE(review): the meaning of the config keys is defined in
# naturalist_companion.stategraph_shared; from their names they appear to set the
# artifact output directory, a retrieval retry cap, and a citation-coverage
# quality threshold -- confirm there.
result = run_stategraph(
    stategraph_question,
    provider='vertex',
    config={'artifact_root': 'out/stategraph/notebook_runs', 'max_retrieval_attempts': 3, 'citation_coverage_threshold': 0.80},
)
# The keys read below are required in the result; a missing one raises KeyError.
final_output = result['final_output']
print('Question:', stategraph_question)
print('Provider:', final_output['provider'])
print('Route:', final_output['route_decision']['decision'])
print('Quality passed:', final_output['quality']['passed'])
# 'reasons' is optional; print "none" when it is absent or empty.
quality_reasons = final_output['quality'].get('reasons', [])
print('Quality reasons:', ', '.join(quality_reasons) if quality_reasons else 'none')
print('Attempts:', final_output['retrieval_attempts'])
print('Artifact dir:', result['artifact_dir'])
print('Response:')
print(final_output['answer']['response'])
print('Citation image previews:')
# Citations may be absent; an empty list just prints the "no thumbnails" notice.
display_wikipedia_images_for_pages(final_output['answer'].get('citations', []), max_images=4)


In [None]:
# Run the shared I-81 evaluation harness with the same retry/coverage settings as
# the single run above, writing artifacts under a separate root, then print the
# summary and the artifact location from the returned report.
report = run_i81_eval_harness(provider='vertex', config={'artifact_root': 'out/stategraph/notebook_eval', 'max_retrieval_attempts': 3, 'citation_coverage_threshold': 0.80})
print(report['summary'])
print(report['artifact_root'])
