# 🔎 Vertex AI Search — Title Finder (v2, aware of your data layout)

Ta wersja:
- Rozumie **strukturę Twojego indeksu** (wg `metadata_20250829.ndjson`).
- Potrafi **zmapować linki `gs://...`** z indeksu na **lokalne pliki** rozpakowane z `missions.zip`.
- Daje **fuzzy wyszukiwanie po tytule/haśle** (RapidFuzz) + re-ranking, a także szybki podgląd planu/transkryptu/metryk jeśli są dostępne lokalnie.

> Jeśli uruchamiasz u siebie: ustaw `GOOGLE_APPLICATION_CREDENTIALS` i podaj `serving_config`, aby faktycznie pytać **Vertex Search**. Gdy chcesz tylko przeglądać lokalne dane, też zadziała (patrz sekcja „Tryb lokalny”).

---

## 0) Instalacja pakietów (jeśli potrzeba)
Uruchom poniższe polecenie, jeśli nie masz jeszcze tych pakietów:

```
%pip install -q google-cloud-discoveryengine rapidfuzz pandas ipywidgets
```

In [None]:
# %pip install -q google-cloud-discoveryengine rapidfuzz pandas ipywidgets

## 1) Importy + konfiguracja

In [1]:

import re, json, sys, os
from typing import Any, Dict, List, Optional
from pathlib import Path
import pandas as pd

# Optional fuzzy matcher: use RapidFuzz when it imports, otherwise expose a
# crude substring-based stand-in under the same `fuzz` name.
try:
    from rapidfuzz import fuzz
except Exception:
    _FUZZY = False

    def _simple_ratio(a: str, b: str) -> float:
        """Return a 0-100 containment score for two strings, ignoring case.

        The score is 0.0 when either string is empty or neither contains the
        other; otherwise it is the length of the shorter string as a
        percentage of the longer one.
        """
        left = (a or '').lower()
        right = (b or '').lower()
        if not (left and right):
            return 0.0
        if left in right or right in left:
            return 100.0 * min(len(left), len(right)) / max(len(left), len(right))
        return 0.0

    class fuzz:
        """Fallback offering the two scorer names this notebook calls."""

        @staticmethod
        def partial_ratio(a, b):
            return _simple_ratio(a, b)

        @staticmethod
        def token_set_ratio(a, b):
            return _simple_ratio(a, b)
else:
    _FUZZY = True

# Discovery Engine (Vertex AI Search) client. The import is optional: when it
# fails, both names are set to None so later code can detect that and stay in
# local mode.
try:
    from google.cloud import discoveryengine_v1 as de
    from google.api_core.client_options import ClientOptions
except Exception:
    de = ClientOptions = None
    print("ℹ️  google-cloud-discoveryengine niedostępny — tryb lokalny nadal działa.")

# Locations of the supplied data files.
META_PATH = Path('/mnt/data/metadata_20250829.ndjson')   # NDJSON with metadata
MISSIONS_DIR = Path('/mnt/data/missions_extracted')      # extracted missions.zip

# Extract missions.zip only when the target directory does not exist yet.
ZIP_PATH = Path('/mnt/data/missions.zip')
if not MISSIONS_DIR.exists() and ZIP_PATH.exists():
    import zipfile
    with zipfile.ZipFile(ZIP_PATH, 'r') as z:
        z.extractall(MISSIONS_DIR)
    print(f"📦 Rozpakowano: {ZIP_PATH} -> {MISSIONS_DIR}")

# Load the metadata locally: one JSON document per non-blank line.
if META_PATH.exists():
    # Read explicitly as UTF-8 (the platform default encoding may differ) and
    # close the handle deterministically instead of leaving it to the GC.
    with META_PATH.open(encoding='utf-8') as fh:
        meta_rows = [json.loads(line) for line in fh if line.strip()]
    df_meta = pd.json_normalize(meta_rows)
else:
    # No metadata file: an empty frame lets local search return no rows.
    df_meta = pd.DataFrame()

print('Kolumny metadanych:', list(df_meta.columns))

# Map gs://... links onto local files. Several files may share one basename
# (for example one plan.json per mission), so besides the basename lookup we
# keep every candidate path and let the URI's parent folders pick among them.
_local_index: Dict[str, Path] = {}             # basename -> last file seen with that name
_local_candidates: Dict[str, List[Path]] = {}  # basename -> every file with that name
if MISSIONS_DIR.exists():
    for p in MISSIONS_DIR.rglob('*'):
        if p.is_file():
            _local_index[p.name] = p
            _local_candidates.setdefault(p.name, []).append(p)


def resolve_local_from_uri(uri: Optional[str]) -> Optional[Path]:
    """Return the local file that mirrors ``uri``, or None when there is none.

    Args:
        uri: A link such as ``gs://bucket/dir/file.json``. Anything that is
            not a non-empty string (None, '', a NaN placeholder coming from
            pandas) resolves to None.

    Returns:
        The indexed local path whose name equals the URI's basename. When
        several local files share that name, the one whose parent folders
        match the most trailing folders of the URI wins; if no candidate
        shares any folder, the plain basename lookup is used.
    """
    if not isinstance(uri, str) or not uri:
        return None
    base = os.path.basename(uri)
    candidates = _local_candidates.get(base, [])
    if len(candidates) > 1:
        # Folder names of the URI, innermost first (the file name is dropped).
        uri_dirs = [seg for seg in uri.split('/') if seg][:-1]
        uri_dirs.reverse()

        def _shared_depth(path: Path) -> int:
            """Count how many innermost folders of ``path`` match the URI's."""
            depth = 0
            for wanted, actual in zip(uri_dirs, reversed(path.parent.parts)):
                if wanted != actual:
                    break
                depth += 1
            return depth

        best = max(candidates, key=_shared_depth)
        if _shared_depth(best) > 0:
            return best
    return _local_index.get(base)

def infer_api_endpoint(serving_config: str) -> str:
    """Derive the Discovery Engine API host from a serving-config resource name.

    Args:
        serving_config: Resource name containing a ``/locations/<loc>/``
            segment. When the segment is missing, ``global`` is assumed.

    Returns:
        ``<loc>-discoveryengine.googleapis.com`` for a regional or
        multi-regional location, and the unprefixed
        ``discoveryengine.googleapis.com`` for ``global`` (the global
        location has no ``global-`` prefixed host).
    """
    m = re.search(r"/locations/([^/]+)/", serving_config or "")
    loc = m.group(1) if m else "global"
    if loc == "global":
        return "discoveryengine.googleapis.com"
    return f"{loc}-discoveryengine.googleapis.com"


Kolumny metadanych: []


## 2) Funkcje: Vertex Search + fuzzy re-ranking + lokalne podglądy

In [2]:

def _to_plain(value: Any) -> Any:
    """Recursively turn mapping/sequence wrappers into plain dicts and lists.

    Struct payloads returned by the client are exposed through map/sequence
    wrapper objects rather than ``dict``/``list``, so ``isinstance(x, dict)``
    checks and ``json.dumps`` would not work on them directly.
    """
    from collections.abc import Mapping, Sequence

    if isinstance(value, Mapping):
        return {key: _to_plain(item) for key, item in value.items()}
    if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
        return [_to_plain(item) for item in value]
    return value


def _none_if_nan(value: Any) -> Any:
    """Map pandas' NaN placeholder for a missing cell to None."""
    if isinstance(value, float) and value != value:
        return None
    return value


def search_vertex(
    serving_config: Optional[str],
    query: str,
    page_size: int = 30,
    api_endpoint: Optional[str] = None,
    filter_expression: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search missions through Vertex AI Search, or locally when that is not possible.

    When ``serving_config`` is given and the Discovery Engine client imported
    successfully, the query is sent to Vertex AI Search. Otherwise the function
    runs in local mode (tryb lokalny) and filters ``df_meta`` by simple
    substring matching.

    Args:
        serving_config: Full serving-config resource name; falsy for local mode.
        query: Free-text query.
        page_size: Page size passed to the search request (unused in local mode).
        api_endpoint: API host override; inferred from ``serving_config`` if None.
        filter_expression: Optional filter set on the request (unused in local mode).

    Returns:
        One dict per hit with the keys ``engine_score``, ``mission_id``,
        ``display_id``, ``mission_type``, ``outcome``, ``tags``, ``approved``,
        ``final_score``, ``lang``, ``nodes_count``, ``edges_count``,
        ``plan_uri``, ``transcript_uri``, ``metrics_uri``, ``txt_uri`` and
        ``_struct`` (the full source record).
    """
    rows: List[Dict[str, Any]] = []

    if serving_config and de is not None:
        endpoint = api_endpoint or infer_api_endpoint(serving_config)
        client = de.SearchServiceClient(client_options=ClientOptions(api_endpoint=endpoint))

        req = de.SearchRequest(serving_config=serving_config, query=query, page_size=page_size)
        # Query expansion and spell correction are best-effort extras: skip
        # them when the installed client does not expose these specs.
        try:
            req.query_expansion_spec = de.SearchRequest.QueryExpansionSpec(
                condition=de.SearchRequest.QueryExpansionSpec.Condition.AUTO
            )
        except (AttributeError, TypeError, ValueError):
            pass
        try:
            req.spell_correction_spec = de.SearchRequest.SpellCorrectionSpec(
                mode=de.SearchRequest.SpellCorrectionSpec.Mode.AUTO
            )
        except (AttributeError, TypeError, ValueError):
            pass
        if filter_expression:
            req.filter = filter_expression

        # NOTE(review): client.search returns a pager; presumably iterating it
        # continues past the first page — confirm whether the result list
        # should be capped at page_size.
        for r in client.search(request=req):
            doc = r.document
            # Convert to plain containers first so that the nested `links`
            # object really is a dict and `_struct` stays JSON-serializable.
            struct = _to_plain(doc.struct_data) if doc.struct_data is not None else {}
            if not isinstance(struct, dict):
                struct = {}
            links = struct.get('links')
            if not isinstance(links, dict):
                links = {}
            tags = struct.get('tags', [])
            if isinstance(tags, dict):
                tags = list(tags.values())
            elif isinstance(tags, str):
                tags = [tags]

            rows.append({
                'engine_score': getattr(r, 'score', None),
                'mission_id': struct.get('mission_id'),
                'display_id': struct.get('display_id'),
                'mission_type': struct.get('mission_type'),
                'outcome': struct.get('outcome'),
                'tags': tags,
                'approved': struct.get('approved'),
                'final_score': struct.get('final_score'),
                'lang': struct.get('lang'),
                'nodes_count': struct.get('nodes_count'),
                'edges_count': struct.get('edges_count'),
                'plan_uri': links.get('plan_uri'),
                'transcript_uri': links.get('transcript_uri'),
                'metrics_uri': links.get('metrics_uri'),
                'txt_uri': links.get('txt_uri'),
                '_struct': struct,
            })
        return rows

    # --- Local mode (tryb lokalny): no connection to Vertex AI Search ---
    if df_meta.empty:
        return rows

    # Text columns to search (display_id / mission_id / outcome); when none of
    # them exists, fall back to every structData.* column.
    preferred = ('structData.display_id', 'structData.mission_id', 'structData.outcome')
    cols = [c for c in df_meta.columns if c in preferred]
    if not cols:
        cols = [c for c in df_meta.columns if c.startswith('structData.')]

    # Output key -> flattened metadata column.
    field_columns = (
        ('mission_id', 'structData.mission_id'),
        ('display_id', 'structData.display_id'),
        ('mission_type', 'structData.mission_type'),
        ('outcome', 'structData.outcome'),
        ('tags', 'structData.tags'),
        ('approved', 'structData.approved'),
        ('final_score', 'structData.final_score'),
        ('lang', 'structData.lang'),
        ('nodes_count', 'structData.nodes_count'),
        ('edges_count', 'structData.edges_count'),
        ('plan_uri', 'structData.links.plan_uri'),
        ('transcript_uri', 'structData.links.transcript_uri'),
        ('metrics_uri', 'structData.links.metrics_uri'),
        ('txt_uri', 'structData.links.txt_uri'),
    )

    tokens = (query or '').strip().lower().split()
    for _, rec in df_meta.iterrows():
        # Skip missing cells so that the literal text "nan"/"None" never
        # becomes searchable content.
        present = (_none_if_nan(rec.get(c)) for c in cols)
        text = ' '.join(str(v) for v in present if v is not None).lower()
        # An empty query matches everything; otherwise any single token suffices.
        if not tokens or any(tok in text for tok in tokens):
            row: Dict[str, Any] = {'engine_score': None}
            for key, column in field_columns:
                # Missing cells become None (not NaN) so that callers'
                # truthiness checks and `or ''` defaults behave as expected.
                row[key] = _none_if_nan(rec.get(column))
            row['_struct'] = rec.to_dict()
            rows.append(row)
    return rows

def fuzzy_score(query: str, row: Dict[str, Any]) -> float:
    """Return the best fuzzy-match score of ``query`` against a row's title fields.

    The fields ``display_id``, ``mission_id`` and ``outcome`` are each scored
    with ``fuzz.token_set_ratio`` and ``fuzz.partial_ratio``; the highest of
    the six scores is returned (0.0 when nothing matches).

    Args:
        query: Text typed by the user.
        row: Result row; missing keys are treated as empty text.

    Returns:
        The maximum score as a float.
    """
    def _as_text(value: Any) -> str:
        # Missing values (None or pandas' NaN placeholder) become '', and
        # other non-strings such as numeric ids are stringified, so the
        # scorers always receive str arguments.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return value if isinstance(value, str) else str(value)

    needle = _as_text(query)
    best = 0.0
    for key in ('display_id', 'mission_id', 'outcome'):
        candidate = _as_text(row.get(key))
        for scorer in (fuzz.token_set_ratio, fuzz.partial_ratio):
            best = max(best, float(scorer(needle, candidate) or 0.0))
    return best

def rerank_by_title(query: str, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Score every row against ``query`` and sort the list best-first, in place.

    Each row gains ``fuzzy_score`` (from ``fuzzy_score``) and
    ``combined_score`` = 0.85 * fuzzy score + 0.15 * engine score, where a
    missing or non-numeric engine score counts as 0.0. Rows are ordered by
    combined score, then by engine score, both descending.

    Returns:
        The same list object that was passed in.
    """
    def _ordering(item: Dict[str, Any]):
        return (item.get('combined_score', 0.0), item.get('engine_score') or 0.0)

    for hit in rows:
        title_score = fuzzy_score(query, hit)
        hit['fuzzy_score'] = title_score

        raw_engine = hit.get('engine_score') or 0.0
        try:
            engine = float(raw_engine)
        except Exception:
            engine = 0.0

        hit['combined_score'] = (0.85 * title_score) + (0.15 * engine)

    rows.sort(key=_ordering, reverse=True)
    return rows

def as_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    """Build a DataFrame holding a fixed selection of fields from each row.

    Keys missing from a row are filled with None; keys outside the selection
    (such as ``_struct``) are left out.
    """
    selected = (
        'combined_score', 'fuzzy_score', 'engine_score',
        'display_id', 'mission_id', 'mission_type', 'outcome',
        'tags', 'approved', 'final_score', 'lang',
        'nodes_count', 'edges_count',
        'plan_uri', 'transcript_uri', 'metrics_uri', 'txt_uri',
    )
    records = []
    for row in rows:
        records.append({name: row.get(name) for name in selected})
    return pd.DataFrame(records)

def load_local_json_from_uri(uri: Optional[str]) -> Optional[Dict[str, Any]]:
    """Load the local mirror of ``uri`` and parse it as JSON.

    Returns:
        The parsed JSON value, or None when the URI has no existing local
        file or when reading/parsing fails for any reason.
    """
    local_path = resolve_local_from_uri(uri)
    if not (local_path and local_path.exists()):
        return None
    try:
        raw = local_path.read_text(encoding='utf-8')
        parsed = json.loads(raw)
    except Exception:
        # Best-effort preview: an unreadable or malformed file is reported
        # to the caller as "not available".
        parsed = None
    return parsed

def load_local_text_from_uri(uri: Optional[str]) -> Optional[str]:
    """Load the local mirror of ``uri`` as UTF-8 text.

    Returns:
        The file content, or None when the URI has no existing local file or
        when reading fails for any reason.
    """
    local_path = resolve_local_from_uri(uri)
    if not (local_path and local_path.exists()):
        return None
    try:
        content = local_path.read_text(encoding='utf-8')
    except Exception:
        # Best-effort preview: an unreadable file is reported as "not available".
        content = None
    return content


In [3]:
## 3) Uruchom wyszukiwanie


# Parameters:
serving_config = "projects/815755318672/locations/us/collections/default_collection/dataStores/external-memory-connector_1756845276280_gcs_store/servingConfigs/default_config"  # leave empty to work locally (on metadata_20250829.ndjson)
api_endpoint = None
query = "zbuduj adaptacyjny system ml continuous learning"  # type an approximate title / phrase
page_size = 30
filter_expression = None  # e.g. 'approved = true AND lang = "pl"'


def _top_level_keys(payload):
    """Return the keys of a JSON object, or None when the payload is not a dict.

    The loaders hand back whatever ``json.loads`` produced, so a file holding
    a JSON array (or a scalar) must not be asked for ``.keys()``.
    """
    return list(payload) if isinstance(payload, dict) else None


rows = search_vertex(serving_config, query, page_size, api_endpoint, filter_expression)
if not rows:
    print("Brak wyników.")
else:
    rows = rerank_by_title(query, rows)
    df = as_dataframe(rows)
    display(df)

    # Preview of the best hit and of its locally mirrored artefacts.
    best = rows[0]
    print("\n== Najlepsze trafienie ==")
    print(json.dumps({
        k: best.get(k) for k in [
            "display_id","mission_id","mission_type","outcome",
            "tags","final_score","approved","lang",
            "plan_uri","transcript_uri","metrics_uri","txt_uri"
        ]
    }, ensure_ascii=False, indent=2))

    plan = load_local_json_from_uri(best.get('plan_uri'))
    transcript = load_local_json_from_uri(best.get('transcript_uri'))
    metrics = load_local_json_from_uri(best.get('metrics_uri'))
    txt = load_local_text_from_uri(best.get('txt_uri'))

    if plan:
        print("\n== plan.json (local mirror) ==")
        keys = _top_level_keys(plan)
        if keys is None:
            print(f"(JSON najwyższego poziomu to {type(plan).__name__}, nie obiekt — pomijam listę kluczy)")
        else:
            print(f"Klucze: {keys[:10]} ...  (total: {len(keys)})")
            # Counts are shown only for fields that really are lists.
            nodes = plan.get('nodes')
            edges = plan.get('edges')
            nn = len(nodes) if isinstance(nodes, list) else None
            ee = len(edges) if isinstance(edges, list) else None
            if nn is not None or ee is not None:
                print(f"Nodes: {nn}, Edges: {ee}")
    else:
        print("\n(plan.json niedostępny lokalnie lub nieczytelny)")

    # transcript.json and metrics.json get the same key preview.
    for label, payload in (('transcript', transcript), ('metrics', metrics)):
        if payload:
            print(f"\n== {label}.json (local mirror) ==")
            keys = _top_level_keys(payload)
            if keys is None:
                print(f"(JSON najwyższego poziomu to {type(payload).__name__}, nie obiekt — pomijam listę kluczy)")
            else:
                print(f"Klucze: {keys[:10]} ... (total: {len(keys)})")
        else:
            print(f"\n({label}.json niedostępny lokalnie lub nieczytelny)")

    if txt:
        print("\n== txt (local mirror) ==")
        # Show at most the first 500 characters.
        print(txt[:500] + ('...' if len(txt) > 500 else ''))
    else:
        print("\n(txt niedostępny lokalnie)")


## 4) Eksport wyników (opcjonalnie)

In [None]:

export_csv = "/mnt/data/search_results_v2.csv"
export_jsonl = "/mnt/data/search_results_v2.jsonl"


def _json_fallback(value):
    """Convert a value ``json.dumps`` cannot encode into something it can.

    Rows carry the raw source record under ``_struct``, which may hold
    mapping/sequence wrappers or array-library scalars rather than plain
    Python containers. Anything unrecognized is written as its ``str()``.
    """
    from collections.abc import Iterable, Mapping

    if isinstance(value, Mapping):
        return dict(value)
    if hasattr(value, 'tolist'):
        return value.tolist()
    if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
        return list(value)
    return str(value)


if 'df' in globals():
    df.to_csv(export_csv, index=False)
    # Serialize every row before opening the file, so that a serialization
    # error cannot leave a half-written JSONL file behind.
    lines = [
        json.dumps(r, ensure_ascii=False, default=_json_fallback) + "\n"
        for r in rows
    ]
    with open(export_jsonl, "w", encoding="utf-8") as f:
        f.writelines(lines)
    print("Zapisano:", export_csv, "i", export_jsonl)
else:
    print("Najpierw uruchom wyszukiwanie.")
