In [2]:
from exporter_missions_gcs import export_local_by_filename_date_split

# NOTE(review): the recorded output of this cell is an ImportError — the name
# `export_local_by_filename_date_split` could not be imported from
# exporter_missions_gcs. Confirm the exported function name before re-running.
ndjson_uris = export_local_by_filename_date_split(
    input_dir="memory/missions",                          # local mission_YYYYMMDD_HHMMSS_*.json files
    output_missions_root_gcs="gs://external_memory/missions", # TXT/JSON artifacts
    output_index_root_gcs="gs://external_memory/index",  # NDJSON only
    pattern="*.json"
)
print(ndjson_uris)

ImportError: cannot import name 'export_local_by_filename_date_split' from 'exporter_missions_gcs' (/home/jupyter/olga_zydziak/version_beta/Folder/advanced_agents_system/exporter_missions_gcs.py)

In [1]:
from exporter_missions_gcs import export_local_by_filename_date

# NOTE(review): the recorded output of this cell is an ImportError — the name
# `export_local_by_filename_date` could not be imported from
# exporter_missions_gcs. Confirm the exported function name before re-running.
ndjson_uris = export_local_by_filename_date(
    input_dir="memory/missions",          # where your mission_YYYYMMDD_HHMMSS_*.json files live
    output_root_gcs="gs://external_memory/missions",
    pattern="*.json"                      # defaults to *.json
)
print(ndjson_uris)

ImportError: cannot import name 'export_local_by_filename_date' from 'exporter_missions_gcs' (/home/jupyter/olga_zydziak/version_beta/Folder/advanced_agents_system/exporter_missions_gcs.py)

In [3]:
from pathlib import Path, PurePath
import json

def sniff_local_jsons(folder="memory/missions", pattern="*.json", limit=50):
    """Try to parse local JSON files and report the ones that fail.

    Globs ``pattern`` inside ``folder``, sorts the matches and attempts to
    decode at most the first ``limit`` of them as UTF-8 JSON. A summary line
    and up to 20 failures are printed.

    Args:
        folder: Directory to scan.
        pattern: Glob pattern handed to ``Path.glob``.
        limit: Maximum number of (sorted) matches to inspect.

    Returns:
        A list of ``(path_str, repr(exception))`` tuples, one per inspected
        file that could not be read or parsed.
    """

    def _load_error(json_path):
        # Any failure (I/O, decoding, JSON syntax) marks the file as bad.
        try:
            json.loads(json_path.read_text(encoding="utf-8"))
        except Exception as exc:
            return repr(exc)
        return None

    candidates = sorted(Path(folder).glob(pattern))

    bad = []
    for json_path in candidates[:limit]:
        failure = _load_error(json_path)
        if failure is not None:
            bad.append((str(json_path), failure))

    print(f"Bad local files: {len(bad)}")
    # Only the first 20 failures are echoed to keep the cell output short.
    for file_name, failure in bad[:20]:
        print("-", file_name, "->", failure)
    return bad

# Parse-check the local mission JSON files; `bad_local` keeps the (path, error) pairs.
bad_local = sniff_local_jsons("memory/missions")

Bad local files: 0


In [4]:
from google.cloud import storage
import json

def sniff_ndjson(gs_uri):
    """Download an NDJSON object from GCS and report lines that are not valid JSON.

    Blank lines are skipped. A summary line and up to 10 failures are printed.

    Args:
        gs_uri: ``gs://bucket/object`` URI of the NDJSON file; anything else
            fails the leading assertion.

    Returns:
        A list of ``(line_number, repr(exception), first_200_chars)`` tuples,
        with 1-based line numbers.
    """
    assert gs_uri.startswith("gs://")
    bucket_name, _sep, object_path = gs_uri[len("gs://"):].partition("/")
    ndjson_blob = storage.Client().bucket(bucket_name).blob(object_path)
    data = ndjson_blob.download_as_text(encoding="utf-8").splitlines()

    def _parse_error(text):
        # Returns the repr of the parsing failure, or None when the text is valid JSON.
        try:
            json.loads(text)
        except Exception as exc:
            return repr(exc)
        return None

    bad = []
    for line_no, raw in enumerate(data, start=1):
        if raw.strip():
            failure = _parse_error(raw)
            if failure is not None:
                bad.append((line_no, failure, raw[:200]))

    # "Lines" counts every physical line, including the blank ones skipped above.
    print(f"Lines: {len(data)}, bad: {len(bad)}")
    for i, err, preview in bad[:10]:
        print(f"- line {i}: {err}\n  {preview}...")
    return bad

In [6]:
# Parse-check every line of the exported NDJSON metadata file stored in GCS.
bad = sniff_ndjson("gs://external_memory/missions/2025/08/29/metadata_20250829.ndjson")

Lines: 2, bad: 0


In [9]:
from google.cloud import storage
import json

def validate_ndjson_links(gs_ndjson_uri: str, sample=50):
    """Check that NDJSON records point at existing, non-empty GCS objects.

    Downloads the NDJSON file, then for each non-blank line reads
    ``record["content"]["uri"]`` and looks the object up in GCS.

    Args:
        gs_ndjson_uri: ``gs://bucket/object`` URI of the NDJSON file; anything
            else fails the leading assertion.
        sample: Stop after this many records have had their object looked up.
            Lines rejected as ``json-parse`` or ``no-uri`` do not count.

    Returns:
        A list of ``(kind, line_number, detail)`` tuples (1-based line
        numbers), where ``kind`` is one of:

        * ``"json-parse"``      - the line is not valid JSON (detail: repr of the error)
        * ``"no-uri"``          - no usable ``gs://`` string under ``content.uri``
                                  (detail: ``str()`` of whatever was found)
        * ``"missing-content"`` - the referenced object does not exist (detail: the URI)
        * ``"empty-content"``   - the referenced object has size 0 (detail: the URI)
    """
    assert gs_ndjson_uri.startswith("gs://")
    bucket_name, _, path = gs_ndjson_uri[5:].partition("/")
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(path)
    lines = blob.download_as_text(encoding="utf-8").splitlines()
    problems = []
    checked = 0
    for i, line in enumerate(lines, 1):
        if not line.strip():
            continue
        try:
            obj = json.loads(line)
        except Exception as e:
            problems.append(("json-parse", i, repr(e)))
            continue
        # A line can be valid JSON without being an object (e.g. a list), and
        # "content" / "uri" can hold unexpected types; report those as
        # "no-uri" instead of crashing with AttributeError.
        content = obj.get("content") if isinstance(obj, dict) else None
        uri = content.get("uri") if isinstance(content, dict) else None
        if not isinstance(uri, str) or not uri.startswith("gs://"):
            problems.append(("no-uri", i, str(uri)))
            continue
        bkt, _, pfx = uri[5:].partition("/")
        # get_blob() returns None for a missing object and otherwise a blob with
        # its metadata (including size) loaded, so a single request covers both
        # the existence check and the size check.
        target = client.bucket(bkt).get_blob(pfx)
        if target is None:
            problems.append(("missing-content", i, uri))
        elif target.size == 0:
            problems.append(("empty-content", i, uri))
        checked += 1
        if checked >= sample:
            break
    return problems


# Verify that each record's content.uri points at an existing, non-empty GCS object
# (looking up at most 200 records); an empty list means no problems were found.
problems = validate_ndjson_links("gs://external_memory/missions/2025/08/29/metadata_20250829.ndjson", sample=200)
print(problems)

[]


In [18]:
# pip install google-cloud-discoveryengine google-cloud-storage
from google.cloud import discoveryengine_v1 as de
from google.api_core.client_options import ClientOptions
from google.cloud import storage
import json

def _read_txt_head_from_gcs(gs_uri: str, max_chars: int = 200) -> str:
    """Return the beginning of a UTF-8 text object stored in GCS.

    Args:
        gs_uri: ``gs://bucket/object`` URI. An empty value or a URI with a
            different scheme yields ``""`` without touching GCS.
        max_chars: Number of leading characters to keep.

    Returns:
        The whole text when it has at most ``max_chars`` characters, otherwise
        its first ``max_chars`` characters followed by an ellipsis character.
    """
    if not (gs_uri and gs_uri.startswith("gs://")):
        return ""

    bucket_name, _sep, object_path = gs_uri[len("gs://"):].partition("/")
    text_blob = storage.Client().bucket(bucket_name).blob(object_path)
    text = text_blob.download_as_text(encoding="utf-8")

    if len(text) <= max_chars:
        return text
    return text[:max_chars] + "…"

def vertex_search_with_snippets(serving_config_path: str, location: str, query: str, top_k: int = 3, filter_str: str = ""):
    """Query a Vertex AI Search serving config and return up to ``top_k`` hits with snippets.

    For every hit the snippet text is resolved in this order, stopping at the
    first source that yields something:

    1. ``snippet_info`` entries on the search result, when that attribute exists;
    2. ``derived_struct_data["snippets"]`` entries, skipping entries whose
       ``snippet_status`` is present and not ``"SUCCESS"``;
    3. ``derived_struct_data["contentSnippet"]``;
    4. the first 200 characters of the object behind ``document.content.uri``,
       read through ``_read_txt_head_from_gcs``.

    Args:
        serving_config_path: Full resource name of the serving config.
        location: Data store location, e.g. ``"us"``, ``"eu"`` or ``"global"``.
        query: Free-text search query.
        top_k: Page size of the request and, when positive, the maximum number
            of hits returned.
        filter_str: Filter expression passed through to the search request.

    Returns:
        A list of dicts with the keys ``"id"``, ``"snippet"``, ``"txt_uri"``
        and ``"links"`` (the ``links`` entry of the document's ``struct_data``,
        or ``{}``).
    """
    # Regional locations use a location-prefixed endpoint; "global" uses the
    # library's default endpoint, so no override is passed in that case.
    client_options = (
        None
        if location == "global"
        else ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
    )
    client = de.SearchServiceClient(client_options=client_options)

    req = de.SearchRequest(
        serving_config=serving_config_path,
        query=query,
        page_size=top_k,
        filter=filter_str,
        content_search_spec=de.SearchRequest.ContentSearchSpec(
            snippet_spec=de.SearchRequest.ContentSearchSpec.SnippetSpec(
                return_snippet=True
            )
        ),
    )

    out = []
    resp = client.search(request=req)
    for r in resp:
        doc = r.document
        # 1) preferred: r.snippet_info (list of snippets), if the result exposes it
        snippet_texts = []
        if getattr(r, "snippet_info", None):
            for s in r.snippet_info:
                if getattr(s, "snippet", None):
                    snippet_texts.append(s.snippet)

        derived = getattr(doc, "derived_struct_data", None) if doc else None

        # 2) snippets returned inside derived_struct_data["snippets"]
        if not snippet_texts and derived:
            for item in derived.get("snippets") or []:
                if not hasattr(item, "get"):
                    continue
                status = item.get("snippet_status")
                if status and status != "SUCCESS":
                    # Placeholder entry (no real snippet); let later fallbacks run.
                    continue
                text = item.get("snippet")
                if text:
                    snippet_texts.append(text)

        # 3) fallback: derived_struct_data.contentSnippet (often empty)
        if not snippet_texts and derived:
            cs = derived.get("contentSnippet")
            if cs:
                snippet_texts.append(cs)

        # 4) last resort: read the beginning of the .txt object from GCS
        txt_uri = (doc.content.uri if doc and doc.content and getattr(doc.content, "uri", None) else None)
        if not snippet_texts and txt_uri:
            snippet_texts.append(_read_txt_head_from_gcs(txt_uri, max_chars=200))

        out.append({
            "id": (doc.id if doc else ""),
            "snippet": " … ".join(snippet_texts) if snippet_texts else "",
            "txt_uri": txt_uri,
            "links": (doc.struct_data or {}).get("links", {}) if doc else {},
        })

        # Iterating the search response keeps fetching further pages, so
        # page_size alone does not bound the result count. Stop explicitly
        # (before the iterator is advanced again) once top_k hits are collected.
        if top_k and len(out) >= top_k:
            break
    return out


In [19]:
SERVING_CONFIG = "projects/815755318672/locations/us/collections/default_collection/dataStores/external-memory-connector_1756845276280_gcs_store/servingConfigs/default_config"

# NOTE(review): in the recorded run this call failed with
# "400 Invalid filter syntax ... Unsupported field \"structData.approved\"".
# The field reference in filter_str needs to be corrected (or the filter
# dropped) — confirm the expected syntax against the Vertex AI Search filter docs.
hits = vertex_search_with_snippets(
    serving_config_path=SERVING_CONFIG,
    location="us",
    query="Jak zaprojektować mechanizm retry i rollback w odkrywaniu przyczynowości?",
    top_k=3,
    filter_str='structData.approved = true'  # for a quick test you can remove the filter
)
# Print each hit's id, a snippet truncated to 140 characters, and its TXT / plan links.
for h in hits:
    print(h["id"], "→", (h["snippet"][:140] + "…") if len(h["snippet"])>140 else h["snippet"])
    print("TXT:", h["txt_uri"])
    print("PLAN:", h["links"].get("plan_uri"))
    print("---")

InvalidArgument: 400 Invalid filter syntax 'structData.approved = true'. Parsing filter failed with error: Unsupported field "structData.approved" on comparison operators. Comparison operators include ">", "<", ">=", "<=", "="..

In [16]:
# --- usage example ---
# NOTE(review): `test_vertex_search` is not defined in any cell visible here, so this
# cell presumably relies on leftover kernel state — confirm where it comes from.
# SERVING_CONFIG is assigned below but not passed to the call in this cell.

SERVING_CONFIG = "projects/815755318672/locations/us/collections/default_collection/dataStores/external-memory-connector_1756845276280_gcs_store/servingConfigs/default_config"

results = test_vertex_search(
    query="Jak zaprojektować mechanizm retry i rollback w odkrywaniu przyczynowości?",
    project_id="dark-data-discovery",
    location="us",
    data_store_id="external-memory-connector_1756845276280_gcs_store"
)
# The recorded output shows an empty snippet and a `None` uri for the single hit.
for r in results:
    print(r["id"], "->", r["snippet"], r["uri"])

20250829_212954_1e55a591 ->  None
