# Debug Vector Search Metadata

Goal: trace why `embedding_metadata` is empty by comparing raw Vertex API neighbors and `functions.core.search.search` output.

In [30]:
# Setup import path
import sys
from pathlib import Path

project_root = Path.cwd().resolve().parents[1]
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(project_root)

/Users/shin.t/Desktop/Projects/mhesi_hyde_recommendation/hyde_core_recommendation_engine/items_pipeline


In [31]:
# NPY
from functions.utils.gcs import load_data_from_gcs_prefix

gcs_prefix = "gs://hyde-datalake-feeds/stu_p000/embedding/"
npy_items = load_data_from_gcs_prefix(
    gcs_prefix,
    field_name="embedding01",
    file_type="npy",
)

print(f"Loaded NPY objects: {len(npy_items)}")
if npy_items:
    first = npy_items[0]
    print(type(first))
    print(first)
    if isinstance(first, list):
        print(f"First NPY outer length: {len(first)}")

Loaded NPY objects: 5
<class 'list'>
[-0.0016069455305114388, -0.01654176227748394, 0.009188651107251644, -0.12363864481449127, 0.016019757837057114, 0.017166830599308014, -0.016681546345353127, 0.01630791276693344, 0.006883992347866297, 0.03049732744693756, -0.028930820524692535, 0.008886635303497314, 0.016103964298963547, 0.04205628111958504, 0.1796730011701584, 0.0227912999689579, 0.012724732980132103, -0.021377502009272575, -0.013593843206763268, 0.004518724046647549, -0.037661440670490265, 0.04135395213961601, -0.01618782989680767, -0.04981079697608948, -0.009179255925118923, -0.03359142690896988, 0.03878185525536537, 0.033550702035427094, 0.06920751184225082, -0.04246381297707558, -0.01581304334104061, 0.013842585496604443, 0.034261543303728104, 0.011929886415600777, -0.032177917659282684, 0.010637162253260612, -0.006842692382633686, 0.022234270349144936, 0.0018493917305022478, 0.01564491167664528, -0.04065341129899025, 0.032908279448747635, 0.0006790155894123018, -0.013730106875

In [32]:
# Hardcoded test params
endpoint_id = "projects/810737581373/locations/asia-southeast1/indexEndpoints/2127009641979183104"
deployed_index_id = "deployed_items_endpoint"
query_type = "vector"
top_k = 10

print(endpoint_id)
print(deployed_index_id)

projects/810737581373/locations/asia-southeast1/indexEndpoints/2127009641979183104
deployed_items_endpoint


In [40]:
# Raw Vertex find_neighbors debug
import time
from google.cloud import aiplatform
from functions.utils.load_config import load_config

config = load_config()
aiplatform.init(project=config["project_id"], location=config["region"])
endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name=endpoint_id)

t0 = time.monotonic()
neighbors = endpoint.find_neighbors(
    deployed_index_id=deployed_index_id,
    queries=[npy_items[0]],
    num_neighbors=top_k,
    return_full_datapoint=True,
)
t1 = time.monotonic()

print(f"raw_find_neighbors_runtime_sec={t1 - t0:.6f}")
print(f"neighbor_batches={len(neighbors)}")
print(f"neighbors_in_first_batch={len(neighbors[0]) if neighbors else 0}")

raw_find_neighbors_runtime_sec=0.178812
neighbor_batches=1
neighbors_in_first_batch=10


In [41]:
neighbors[0][0].restricts

[Namespace(name='lesson_title', allow_tokens=['SQL for Data Science'], deny_tokens=[]),
 Namespace(name='level', allow_tokens=['Beginner'], deny_tokens=[]),
 Namespace(name='category_name', allow_tokens=['Computer Science'], deny_tokens=[]),
 Namespace(name='status', allow_tokens=['active'], deny_tokens=[]),
 Namespace(name='short_description', allow_tokens=['Learn how to use and apply the powerful language of SQL to better communicate and extract data from databases - a must for anyone working in the data science field.'], deny_tokens=[]),
 Namespace(name='link', allow_tokens=['https://www.edx.org/course/sql-for-data-science'], deny_tokens=[])]

In [26]:
# Inspect raw neighbor metadata fields
if not neighbors or not neighbors[0]:
    raise ValueError("No neighbors returned.")

for i, n in enumerate(neighbors[0][:3], start=1):
    datapoint = getattr(n, "datapoint", None)
    print(f"--- raw neighbor #{i} ---")
    print("distance/score:", getattr(n, "distance", None), getattr(n, "score", None))
    print(n)
    if datapoint is None:
        print("datapoint: None")
        continue

    print("datapoint_id:", getattr(datapoint, "datapoint_id", None))
    embedding_metadata = getattr(datapoint, "embedding_metadata", None)
    metadata = getattr(datapoint, "metadata", None)
    print("embedding_metadata type:", type(embedding_metadata))
    print("embedding_metadata value:", embedding_metadata)
    print("metadata type:", type(metadata))
    print("metadata value:", metadata)

--- raw neighbor #1 ---
distance/score: 0.8386043906211853 None
MatchNeighbor(id='01KAWP740P6W5ECCTGXRV03DP2', distance=0.8386043906211853, sparse_distance=None, feature_vector=[0.0010502171935513616, -0.021800972521305084, 0.017659900709986687, -0.06019212678074837, -0.0014229955850169063, -0.0022963276132941246, -0.0122545026242733, 0.010641653090715408, 0.005220016930252314, 0.003261236008256674, -0.01534280739724636, -0.0085119204595685, -0.00018404908769298345, 0.0018089332152158022, 0.11643503606319427, -0.016182661056518555, 0.010303773917257786, -0.022009696811437607, -0.005931884050369263, -0.008938450366258621, -0.012056990526616573, 0.009906433522701263, -0.01408492960035801, -0.02273079752922058, 0.004428801126778126, -0.0006333597120828927, 0.02405938319861889, 0.01704927533864975, 0.03978236764669418, -0.033322565257549286, -0.026998475193977356, -0.0006198057089932263, 0.030220594257116318, 0.007963652722537518, -0.010633542202413082, 0.02737933024764061, 0.0017597564728

In [None]:

result = endpoint.read_index_datapoints(
    deployed_index_id=deployed_index_id,
    ids=["01KAWP740P6W5ECCTGXRV03DP2"]
)

print(result[0].emb)

01KAWP740P6W5ECCTGXRV03DP2


In [None]:
MatchNeighbor(id='01KAWP740P6W5ECCTGXRV03DP2', distance=0.8386043906211853, sparse_distance=None, 
              feature_vector=[0.0010502171935513616, -0.021800972521305084,... 0.008101140148937702, -0.019485989585518837], crowding_tag='0', 
              restricts=[Namespace(name='level', allow_tokens=['Beginner'], deny_tokens=[]), Namespace(name='category_id', allow_tokens=['cat_computer_science'], deny_tokens=[]), 
                         Namespace(name='category_name', allow_tokens=['Computer Science'], deny_tokens=[]), 
                         Namespace(name='sub_cat_name', allow_tokens=['Programming Languages'], deny_tokens=[]), 
                         Namespace(name='skill_name', allow_tokens=['Computer Science Skills'], deny_tokens=[]), 
                         Namespace(name='status', allow_tokens=['active'], deny_tokens=[]), Namespace(name='university', allow_tokens=['IBM'], deny_tokens=[])], 
                         numeric_restricts=[NumericNamespace(name='created_at', value_int=1740312060, value_float=None, value_double=None, op=None), 
                                            NumericNamespace(name='updated_at', value_int=1763553660, value_float=None, value_double=None, op=None)], 
                                            sparse_embedding_values=[], sparse_embedding_dimensions=[])


In [None]:
# Inline search logic (no import from functions.core.search)
from typing import Any

import vertexai
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import Namespace
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

from functions.utils.validators import apply_defaults


def _build_namespace_filters(restricts: list[dict[str, Any]] | None) -> list[Namespace]:
    filters: list[Namespace] = []
    for item in restricts or []:
        namespace = item.get("namespace") or item.get("name")
        if not namespace:
            continue
        allow = item.get("allow") or item.get("allow_list") or []
        deny = item.get("deny") or item.get("deny_list") or []
        filters.append(Namespace(namespace, list(allow), list(deny)))
    return filters


def _extract_neighbor(neighbor: Any) -> dict[str, Any]:
    datapoint = getattr(neighbor, "datapoint", None)
    if datapoint is not None:
        neighbor_id = getattr(datapoint, "datapoint_id", None) or getattr(datapoint, "id", None)
        metadata = getattr(datapoint, "embedding_metadata", None) or getattr(datapoint, "metadata", None)
    else:
        neighbor_id = getattr(neighbor, "id", None)
        metadata = None

    score = getattr(neighbor, "distance", None)
    if score is None:
        score = getattr(neighbor, "score", None)

    return {
        "id": neighbor_id,
        "score": score,
        "metadata": metadata,
    }


payload = {
    "endpoint_id": endpoint_id,
    "deployed_index_id": deployed_index_id,
    "query": npy_items[0],
    "query_type": query_type,
    "top_k": top_k,
    "restricts": [],
}

defaults = config.get("search", {})
request = apply_defaults(payload, defaults)
request["restricts"] = request.get("restricts") or []

if request["query_type"] == "text":
    vertexai.init(project=config["project_id"], location=config["region"])
    model = TextEmbeddingModel.from_pretrained(
        request.get("embedding_model_name")
        or defaults.get("embedding_model_name")
        or config.get("embed_data", {}).get("embedding_model_name")
        or "text-embedding-005"
    )
    output_dimensionality = int(
        request.get("dimension")
        or defaults.get("dimension")
        or config.get("embed_data", {}).get("dimension")
        or 768
    )
    embedding = model.get_embeddings(
        [TextEmbeddingInput(text=request["query"], task_type="RETRIEVAL_QUERY")],
        output_dimensionality=output_dimensionality,
    )[0]
    embedding_values = [float(v) for v in embedding.values]
else:
    embedding_values = [float(v) for v in request["query"]]

aiplatform.init(project=config["project_id"], location=config["region"])
endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name=request["endpoint_id"])
filters = _build_namespace_filters(request["restricts"])

t0 = time.monotonic()
neighbors = endpoint.find_neighbors(
    deployed_index_id=request["deployed_index_id"],
    queries=[embedding_values],
    num_neighbors=int(request.get("top_k", 10)),
    return_full_datapoint=True,
    filter=filters or None,
)
t1 = time.monotonic()

results = [_extract_neighbor(n) for n in neighbors[0]] if neighbors else []
print(f"inline_search_runtime_sec={t1 - t0:.6f}")
print("num_recommendations:", len(results))
print("first 3 metadata values:")
for i, item in enumerate(results[:3], start=1):
    print(f"#{i} id={item.get('id')} metadata={item.get('metadata')}")