# Load Data From GCS (One File Type Per Cell)

Run the setup cell first. Then run the cells you need.

In [None]:
# Setup import path for local package imports
import sys
from pathlib import Path

project_root = Path.cwd().resolve().parents[1]
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(project_root)

In [None]:
# TXT
from functions.utils.gcs import load_data_from_gcs_prefix

gcs_prefix = "gs://your-bucket/your-prefix/"
txt_items = load_data_from_gcs_prefix(
    gcs_prefix,
    field_name="txt_prefix",
    file_type="txt",
)

print(f"Loaded TXT lines: {len(txt_items)}")
print(txt_items[:5])

In [10]:
# NPY
from functions.utils.gcs import load_data_from_gcs_prefix

gcs_prefix = "gs://hyde-datalake-feeds/stu_p000/embedding/"
npy_items = load_data_from_gcs_prefix(
    gcs_prefix,
    field_name="embedding01",
    file_type="npy",
)

print(f"Loaded NPY objects: {len(npy_items)}")
if npy_items:
    first = npy_items[0]
    print(type(first))
    print(first)
    if isinstance(first, list):
        print(f"First NPY outer length: {len(first)}")

Loaded NPY objects: 5
<class 'list'>
[-0.0016069455305114388, -0.01654176227748394, 0.009188651107251644, -0.12363864481449127, 0.016019757837057114, 0.017166830599308014, -0.016681546345353127, 0.01630791276693344, 0.006883992347866297, 0.03049732744693756, -0.028930820524692535, 0.008886635303497314, 0.016103964298963547, 0.04205628111958504, 0.1796730011701584, 0.0227912999689579, 0.012724732980132103, -0.021377502009272575, -0.013593843206763268, 0.004518724046647549, -0.037661440670490265, 0.04135395213961601, -0.01618782989680767, -0.04981079697608948, -0.009179255925118923, -0.03359142690896988, 0.03878185525536537, 0.033550702035427094, 0.06920751184225082, -0.04246381297707558, -0.01581304334104061, 0.013842585496604443, 0.034261543303728104, 0.011929886415600777, -0.032177917659282684, 0.010637162253260612, -0.006842692382633686, 0.022234270349144936, 0.0018493917305022478, 0.01564491167664528, -0.04065341129899025, 0.032908279448747635, 0.0006790155894123018, -0.013730106875

In [9]:
# JSON / JSONL
from functions.utils.gcs import load_data_from_gcs_prefix

gcs_prefix = "gs://hyde-datalake-feeds/stu_p000/embedding/"
json_items = load_data_from_gcs_prefix(
    gcs_prefix,
    field_name="embedding01",
    file_type="npy",
)

print(f"Loaded JSON items: {len(json_items)}")
print(json_items[:2])

Loaded JSON items: 5
[[-0.0016069455305114388, -0.01654176227748394, 0.009188651107251644, -0.12363864481449127, 0.016019757837057114, 0.017166830599308014, -0.016681546345353127, 0.01630791276693344, 0.006883992347866297, 0.03049732744693756, -0.028930820524692535, 0.008886635303497314, 0.016103964298963547, 0.04205628111958504, 0.1796730011701584, 0.0227912999689579, 0.012724732980132103, -0.021377502009272575, -0.013593843206763268, 0.004518724046647549, -0.037661440670490265, 0.04135395213961601, -0.01618782989680767, -0.04981079697608948, -0.009179255925118923, -0.03359142690896988, 0.03878185525536537, 0.033550702035427094, 0.06920751184225082, -0.04246381297707558, -0.01581304334104061, 0.013842585496604443, 0.034261543303728104, 0.011929886415600777, -0.032177917659282684, 0.010637162253260612, -0.006842692382633686, 0.022234270349144936, 0.0018493917305022478, 0.01564491167664528, -0.04065341129899025, 0.032908279448747635, 0.0006790155894123018, -0.013730106875300407, 0.00269

In [13]:
# NPY -> core.search (vector query) with runtime per search
import time

from api.schemas.search import SearchRequest
from functions.core.search import search
from functions.utils.load_config import load_config

# Hardcoded test values
endpoint_id = "projects/810737581373/locations/asia-southeast1/indexEndpoints/2127009641979183104"
deployed_index_id = "deployed_items_endpoint"
query_type = "vector"
top_k = 10

config = load_config()

if not npy_items:
    raise ValueError("npy_items is empty. Run the NPY cell first and verify GCS prefix.")

for i, vec in enumerate(npy_items, start=1):
    query_vector = [float(x) for x in vec]

    print("vector length:", len(query_vector))
    print("vector sample:", query_vector[:5])

    payload = SearchRequest(
        endpoint_id=endpoint_id,
        deployed_index_id=deployed_index_id,
        query=query_vector,
        query_type=query_type,
        top_k=top_k,
        restricts=[],
    )

    start = time.monotonic()
    result = search(payload, config)
    elapsed = time.monotonic() - start

    print(f"search #{i} runtime_sec={elapsed:.6f} recommendations={result.get('num_recommendations')}")
    print(result.get("results", [])[:2])
    print()

vector length: 768
vector sample: [-0.0016069455305114388, -0.01654176227748394, 0.009188651107251644, -0.12363864481449127, 0.016019757837057114]
search #1 runtime_sec=0.878652 recommendations=10
[{'id': '01KAWP740P6W5ECCTGXRV03DP2', 'score': 0.8386043906211853, 'metadata': None}, {'id': '01KAWP740BY7G74JPB1DPRK1AA', 'score': 0.8224852681159973, 'metadata': None}]

vector length: 768
vector sample: [0.011962340213358402, -0.018131067976355553, -0.00435173325240612, -0.13027994334697723, 0.011078780516982079]
search #2 runtime_sec=0.383121 recommendations=10
[{'id': '01KAWP740P6W5ECCTGXRV03DP2', 'score': 0.7672320604324341, 'metadata': None}, {'id': '01KAWP740BY7G74JPB1DPRK1AA', 'score': 0.7482328414916992, 'metadata': None}]

vector length: 768
vector sample: [-0.01656429097056389, -0.0004332702956162393, 0.0013909724075347185, -0.11225864291191101, 0.01024330873042345]
search #3 runtime_sec=0.440075 recommendations=10
[{'id': '01KAWP740P6W5ECCTGXRV03DP2', 'score': 0.8213900923728943,