# Quickstart for creating a Vertex Vector Search Index

## Setup

### pip installs

In [2]:
# !pip install --upgrade --user google-cloud-aiplatform google-cloud-storage

In [3]:
# # Restart kernel after installs so that your environment can access the new packages
# import IPython
# import time

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

### imports

In [4]:
import pandas as pd
import numpy as np
import json
import uuid

from google.cloud import aiplatform
from google.cloud import storage
from google.cloud import bigquery

# logging
import logging
logging.disable(logging.WARNING)

#python warning 
import warnings
warnings.filterwarnings("ignore")

In [5]:
print(f'BigQuery SDK version      : {bigquery.__version__}')
print(f'Vertex AI SDK version     : {aiplatform.__version__}')
print(f'Cloud Storage SDK version : {storage.__version__}')

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### Define vars and GCP env config

`CREATE_NEW_ASSETS`
* True creates new GCS buckets and Vector Search instances, etc.
* False skips these steps (in case you need to re-run notebook to include new variables you create)

In [7]:
# create new gcs bucket, vs index, etc.?
CREATE_NEW_ASSETS         = False 

**Edit these to define naming conventions and stay organized across different versions**

In [8]:
# naming convention for all cloud resources
VERSION        = "vpc1"                        # TODO
PREFIX         = f'vvs-vectorio-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = vvs-vectorio-vpc1


#### Edit these as needed

`VPC_NETWORK_NAME`
* if index will be **deployed to a private endpoint** within a VPC network, edit this with the name of your VPC network name
* if index will be **deployed to a public endpoint**, leave blank

> For details on configuring VPC for Vertex AI Vector Search, see **[Set up a VPC Network Peering connection](https://cloud.google.com/vertex-ai/docs/vector-search/setup/vpc)**

<div class="alert alert-block alert-warning">
    <b>⚠️ if deploying index to index endpoint within a VPC network, you must interact with the index endpoint from within the VPC network, i.e., run this notebook in a Vertex Workbench instance within that VPC ⚠️</b>
</div>


In [9]:
VPC_NETWORK_NAME = "ucaip-haystack-vpc-network"  # e.g., "your-vpc-name" | ""

In [10]:
if VPC_NETWORK_NAME:
    USE_PUBLIC_ENDPOINTS = False
else:
    USE_PUBLIC_ENDPOINTS = True
    
print(f"VPC_NETWORK_NAME     = {VPC_NETWORK_NAME}")
print(f"USE_PUBLIC_ENDPOINTS = {USE_PUBLIC_ENDPOINTS}")

VPC_NETWORK_NAME     = ucaip-haystack-vpc-network
USE_PUBLIC_ENDPOINTS = False


In [11]:
# locations / regions for cloud resources
REGION            = 'us-central1'        
BQ_REGION         = 'US'

print(f"REGION    = {REGION}")
print(f"BQ_REGION = {BQ_REGION}")

REGION    = us-central1
BQ_REGION = US


#### Let these ride

In [12]:
# let these ride
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

PROJECT_NUM              = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM              = PROJECT_NUM[0]

# service account
VERTEX_SA                = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com'

# full VPC network name
VPC_NETWORK_FULL         = f"projects/{PROJECT_NUM}/global/networks/{VPC_NETWORK_NAME}"

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

print(f"PROJECT_ID       = {PROJECT_ID}")
print(f"PROJECT_NUM      = {PROJECT_NUM}")
print(f"VERTEX_SA        = {VERTEX_SA}")
print(f"VPC_NETWORK_FULL = {VPC_NETWORK_FULL}")
print(f"BUCKET_NAME      = {BUCKET_NAME}")
print(f"BUCKET_URI       = {BUCKET_URI}")

PROJECT_ID       = hybrid-vertex
PROJECT_NUM      = 934903580331
VERTEX_SA        = 934903580331-compute@developer.gserviceaccount.com
VPC_NETWORK_FULL = projects/934903580331/global/networks/ucaip-haystack-vpc-network
BUCKET_NAME      = vvs-vectorio-vpc1-hybrid-vertex
BUCKET_URI       = gs://vvs-vectorio-vpc1-hybrid-vertex


In [13]:
!gcloud config set project $PROJECT_ID

Updated property [core/project].


### Enable APIs, if needed

In [14]:
# !gcloud services enable compute.googleapis.com \
#                         aiplatform.googleapis.com \
#                         storage.googleapis.com \
#                         iam.googleapis.com

### Create GCS bucket

In [16]:
if CREATE_NEW_ASSETS:
    
    # create new bucket
    ! gsutil mb -l $REGION $BUCKET_URI
    
    ### give Service account Admin to GCS
    !gcloud projects add-iam-policy-binding $PROJECT_ID \
        --member=serviceAccount:$VERTEX_SA \
        --role=roles/storage.admin
    
    ### uncomment if org policy prevents granting Admin:
    # ! gsutil iam ch serviceAccount:{$VERTEX_SA}:roles/storage.objects.get $BUCKET_URI
    # ! gsutil iam ch serviceAccount:{$VERTEX_SA}:roles/storage.objects.create $BUCKET_URI
    # ! gsutil iam ch serviceAccount:{$VERTEX_SA}:roles/storage.objects.list $BUCKET_URI
    
    
print(f"{VERTEX_SA} should have access to {BUCKET_URI}")

934903580331-compute@developer.gserviceaccount.com should have access to gs://vvs-vectorio-vpc1-hybrid-vertex


### Save notebook config 

> to easily use in other GCP related notebooks

In [17]:
config = f"""
PREFIX                   = \"{PREFIX}\"
VERSION                  = \"{VERSION}\"

PROJECT_ID               = \"{PROJECT_ID}\"
PROJECT_NUM              = \"{PROJECT_NUM}\"

REGION                   = \"{REGION}\"
BQ_REGION                = \"{BQ_REGION}\"

VERTEX_SA                = \"{VERTEX_SA}\"

VPC_NETWORK_NAME         = \"{VPC_NETWORK_NAME}\"
VPC_NETWORK_FULL         = \"{VPC_NETWORK_FULL}\"

USE_PUBLIC_ENDPOINTS     = \"{USE_PUBLIC_ENDPOINTS}\"

BUCKET_NAME              = \"{BUCKET_NAME}\"
BUCKET_URI               = \"{BUCKET_URI}\"
"""
print(config)


PREFIX                   = "vvs-vectorio-vpc1"
VERSION                  = "vpc1"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

USE_PUBLIC_ENDPOINTS     = "False"

BUCKET_NAME              = "vvs-vectorio-vpc1-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-vpc1-hybrid-vertex"



In [18]:
!echo '{config}' | gsutil cp - {BUCKET_URI}/config/notebook_env.py

Copying from <STDIN>...
/ [1 files][    0.0 B/    0.0 B]                                                
Operation completed over 1 objects.                                              


### Initialize Google Cloud SDK Clients

In [19]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=REGION)

# bigquery client
bq_client = bigquery.Client(
    project=PROJECT_ID,
    # location=BQ_REGION
)

# Prepare StackOverflow data
You use the [Stack Overflow dataset](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow) of question and answers hosted on BigQuery.

> This public dataset is hosted in Google BigQuery and is included in BigQuery's 1TB/mo of free tier processing. This means that each user receives 1TB of free BigQuery processing every month, which can be used to run queries on this public dataset.

The BigQuery table is too large to fit into memory, so you need to write a generator called `query_bigquery_chunks` to yield chunks of the dataframe for processing. Additionally, an extra column `title_with_body` is added, which is a concatenation of the question title and body.

In [20]:
import math
from typing import Any, Generator

In [47]:
# QUERY_TEMPLATE = """
#         SELECT distinct q.id, 
#             q.title, 
#             q.body
#         FROM (
#             SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` 
#             WHERE Score > 0 
#             ORDER BY View_Count desc
#             ) AS q 
#         LIMIT {limit} OFFSET {offset};
#         """

QUERY_TEMPLATE = """
        SELECT DISTINCT q.id, 
           q.title, 
           q.body, 
           q.score, 
           q.tags,
        FROM (
            SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` 
            WHERE score > 0 
            ORDER BY view_count DESC
            ) AS q 
        LIMIT {limit} OFFSET {offset};
        """


def query_bigquery_chunks(
    max_rows: int, 
    rows_per_chunk: int, 
    start_chunk: int = 0
) -> Generator[pd.DataFrame, Any, None]:
    
    for offset in range(start_chunk, max_rows, rows_per_chunk):
        query = QUERY_TEMPLATE.format(limit=rows_per_chunk, offset=offset)
        query_job = bq_client.query(query)
        rows = query_job.result()
        df = rows.to_dataframe()
        df["title_with_body"] = df.title + "\n" + df.body
        df['tags_split'] = df['tags'].apply(lambda x: x.split('|', maxsplit=1)[0])
        yield df

In [48]:
# Get a dataframe of 1000 rows for demonstration purposes
df_test = next(
    query_bigquery_chunks(
        max_rows=1000, 
        rows_per_chunk=1000
    )
)

# Examine the data
df_test.head()

Unnamed: 0,id,title,body,score,tags,title_with_body,tags_split
0,62387396,Unable install SQLserver tool in ubuntu 20,<p>I installed SQLserver in my ubuntu. But whe...,11,sql-server|bash|ubuntu|installation,Unable install SQLserver tool in ubuntu 20\n<p...,sql-server
1,60270794,"Vue 3 composition API, how to get context pare...",<p>I'm running into an issue with Vue 3 (alpha...,16,vue.js|vuejs3|vue-composition-api,"Vue 3 composition API, how to get context pare...",vue.js
2,49413937,Why TypeError: axios.create is not a function?...,<p>I'm trying to test my axios API functions i...,13,unit-testing|testing|axios|axios-mock-adapter,Why TypeError: axios.create is not a function?...,unit-testing
3,49665571,[Vue warn]: Unknown custom element: <nuxt-link...,<p><strong>Problem</strong></p>\n<p>I'm using ...,31,vue.js|jestjs|vue-router|nuxt.js|vue-test-utils,[Vue warn]: Unknown custom element: <nuxt-link...,vue.js
4,44971694,How to use `jsonb_set` on column with null values,<p>I am using Postgres 9.6 and I have a JSONB ...,15,json|postgresql|jsonb|postgresql-9.6,How to use `jsonb_set` on column with null val...,json


In [49]:
df_test.shape

(1000, 7)

## Instantiate the text encoding model

> Use the [Vertex AI Embeddings for Text API](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) developed by Google for converting text to embeddings

### Define helper functions

* `encode_texts_to_embeddings()`: convert sentences to embeddings

* `generate_batches()`: splits sentences into batches of 5 before sending to the embedding API

* `encode_text_to_embedding_batched()`: calls `generate_batches()` to handle batching, calls embedding API via `encode_texts_to_embeddings()`, handles rate-limiting using `time.sleep` *(Note: For production use cases, use more sophisticated rate-limiting mechanism that takes retries into account)*

In [24]:
import os 

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [25]:
from typing import List, Optional

# Load the "Vertex AI Embeddings for Text" model
from vertexai.preview.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")


# Define an embedding method that uses the model
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        return [None for _ in range(len(sentences))]

In [26]:
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple, Dict

import numpy as np
from tqdm.auto import tqdm


# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], 
    batch_size: int
) -> Generator[List[str], None, None]:
    for i in range(0, len(sentences), batch_size):
        yield sentences[i : i + batch_size]


def encode_text_to_embedding_batched(
    sentences: List[str], 
    api_calls_per_second: int = 10, 
    batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:

    embeddings_list: List[List[float]] = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(
                executor.submit(functools.partial(encode_texts_to_embeddings), batch)
            )
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for sentence, embedding in zip(sentences, embeddings_list)
    ]
    embeddings_list_successful = np.squeeze(
        np.stack([embedding for embedding in embeddings_list if embedding is not None])
    )
    return is_successful, embeddings_list_successful

*test encoding function*

In [27]:
# Encode a subset of questions for validation
questions = df_test.title.tolist()[:500]

is_successful, question_embeddings = encode_text_to_embedding_batched(
    sentences=df.title.tolist()[:500],
)

# Filter for successfully embedded sentences
questions = np.array(questions)[is_successful]

  0%|          | 0/100 [00:00<?, ?it/s]

In [28]:
print(questions.shape)
questions[0]

(500,)


'Unable install SQLserver tool in ubuntu 20'

In [29]:
print(question_embeddings.shape)
# question_embeddings[0]

(500, 768)


In [30]:
# len(is_successful)
is_successful[0]

True

In [31]:
# sentences=df.title.tolist()[:10]

# batches_test = generate_batches(sentences, batch_size=5)
# next(batches_test)

In [32]:
DIMENSIONS = len(question_embeddings[0])

print(DIMENSIONS)

768


In [33]:
import random

question_index = random.randint(0, 99)

print(f"Query question = {questions[question_index]}")

# Get similarity scores for each embedding by using dot-product.
scores = np.dot(question_embeddings[question_index], question_embeddings.T)

# Print top 20 matches
for index, (question, score) in enumerate(
    sorted(zip(questions, scores), key=lambda x: x[1], reverse=True)[:20]
):
    print(f"\t{index}: {question}: {score}")

Query question = Locust/Python: Splitting a tasks array with if conditions in a SequentialTaskSet
	0: Locust/Python: Splitting a tasks array with if conditions in a SequentialTaskSet: 0.9999988941994281
	1: Python: Find specific and distinct set of combinations: 0.690856257416196
	2: Aysnc convert object to coroutine. Load object fields: 0.6770379306583949
	3: How to loop through an array and find matches with the values in the array: 0.6720070304466261
	4: How to remove stop words and lemmatize at the same time when using spaCy?: 0.661894678735283
	5: Using graphql, I need to crate a mutation which includes an empty array: 0.6467180591503121
	6: json array into json array - Python: 0.6453590036206271
	7: Learing of if and else: 0.6427257821148757
	8: iterative append previous value in python: 0.6377486452871857
	9: PHP - curl_multi_exec while loop uses 100% of the CPU: 0.636539320971094
	10: Convert string to dict without using split in Python Scrapy: 0.6358473515641552
	11: How to so

In [121]:
import tempfile
from pathlib import Path

# Create temporary file to write embeddings to
embeddings_file_path = Path(tempfile.mkdtemp())

print(f"Embeddings directory: {embeddings_file_path}")

Embeddings directory: /var/tmp/tmppkzjydtx


### formatting vectors for Vertex Vector Search

**embedding_vector**
* Encode the file using UTF-8.
* Make each line a valid `.json` object to be interpreted as a record.
* Include in each record a field named `id` that requires a valid UTF-8 string that is the ID of the vector.
* Include in each record a field named embedding that requires an array of numbers. This is the feature vector.

Filtering with **String Namespaces**

The value of the field `restricts`, if present, should be an array of objects, each is turned into a TokenNamespace in restricts.

For each vector's record, add a field called restricts, to contain an array of objects, each of which is a namespace.

* Each object must have a field named namespace. This field is the TokenNamespace.namespace, namespace.
* The value of the field allow, if present, is an array of strings. This array of strings is the TokenNamespace.string_tokens list.
* The value of the field deny, if present, is an array of strings. This array of strings is the TokenNamespace.string_denylist_tokens list.

```
{
    "id": "42", 
    "embedding": [0.5, 1.0], 
    "restricts": [
        {"namespace": "class","allow": ["cat", "pet"]},
        {"namespace": "category", "allow": ["feline"]}
    ]
}
{
    "id": "43", 
    "embedding": [0.6, 1.0], 
    "restricts": [
        {"namespace": "class", "allow": ["dog", "pet"]},
        {"namespace": "category", "allow": ["canine"]}
    ]
}
```

Filtering with **Numeric namespaces**
For each vector's record, add a field called `numeric_restricts`, to contain an array of objects, each of which is a numeric restrict.

* Each object must have a field named namespace. This field is the NumericRestrictNamespace.namespace, namespace.
* Each object must have one of `value_int`, `value_float`, and `value_double`.
* Each object must not have a field named op. This field is only for query.

```
{
    "id": "42", 
    "embedding": [0.5, 1.0], 
    "numeric_restricts": [
        {"namespace": "size", "value_int": 3},
        {"namespace": "ratio", "value_float": 0.1}
    ]
}
{
    "id": "43", 
    "embedding": [0.6, 1.0], 
    "numeric_restricts": [
        {"namespace":"weight", "value_double": 0.3}
    ]
}
```

**crowding tag**

The value of the field `crowding_tag`, if present, should be a string

```
{
    "id": "43", 
    "embedding": [0.6, 1.0], 
    "numeric_restricts": [
        {"namespace":"weight", "value_double": 0.3}
    ],
    "crowding_tag": "shoes"
}
```

In [122]:
# print(len(question_embeddings))
# question_embeddings[0]

# df_test.head()

In [123]:
import gc
import json

BQ_NUM_ROWS = 1000 #50000
BQ_CHUNK_SIZE = 100
BQ_NUM_CHUNKS = math.ceil(BQ_NUM_ROWS / BQ_CHUNK_SIZE)

START_CHUNK = 0

# Create a rate limit of 300 requests per minute. Adjust this depending on your quota.
API_CALLS_PER_SECOND = 300 / 60
# According to the docs, each request can process 5 instances per request
ITEMS_PER_REQUEST = 5

print(f"BQ_NUM_ROWS          : {BQ_NUM_ROWS}")
print(f"BQ_CHUNK_SIZE        : {BQ_CHUNK_SIZE}")
print(f"BQ_NUM_CHUNKS        : {BQ_NUM_CHUNKS}")
print(f"API_CALLS_PER_SECOND : {API_CALLS_PER_SECOND}")
print(f"ITEMS_PER_REQUEST    : {ITEMS_PER_REQUEST}")


# Loop through each generated dataframe, convert
for i, df in tqdm(
    enumerate(
        query_bigquery_chunks(
            max_rows=BQ_NUM_ROWS, 
            rows_per_chunk=BQ_CHUNK_SIZE, 
            start_chunk=START_CHUNK
        )
    ),
    total=BQ_NUM_CHUNKS - START_CHUNK,
    position=-1,
    desc="Chunk of rows from BigQuery",
):
    print(f"Starting: {i} of {BQ_NUM_CHUNKS} loops")
    
    # Create a unique output file for each chunk
    chunk_path = embeddings_file_path.joinpath(
        f"{embeddings_file_path.stem}_{i+START_CHUNK}.json"
    )
    with open(chunk_path, "a") as f:
        id_chunk = df.id
        scores_chunk = df.score
        tags_chunk = df.tags_split

        # Convert batch to embeddings
        is_successful, question_chunk_embeddings = encode_text_to_embedding_batched(
            # sentences=df.title_with_body,
            sentences=df.title_with_body.tolist(), #[:500]
            api_calls_per_second=API_CALLS_PER_SECOND,
            batch_size=ITEMS_PER_REQUEST,
        )

        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [
                        str(value) for value in embedding
                    ],
                    "restricts": [
                        {"namespace": "tags", "allow": [str(tag)]}
                    ],
                    "numeric_restricts": [
                        {"namespace": "score", "value_int": int(score)}
                    ],
                    "crowding_tag": str(_tag_crowd)
                }
            )
            + "\n"
            # for id, embedding in zip(id_chunk[is_successful], question_chunk_embeddings)
            for id, embedding, tag, score, _tag_crowd in zip(id_chunk[is_successful], question_chunk_embeddings, tags_chunk, scores_chunk, tags_chunk)
        ]
        f.writelines(embeddings_formatted)
        
        # print(f"tags_chunk                : {len(tags_chunk)}")
        # print(f"scores_chunk              : {len(scores_chunk)}")
        # print(f"question_chunk_embeddings : {len(question_chunk_embeddings)}")
        # print(f"id_chunk[is_successful]   : {len(id_chunk[is_successful])}")

        # Delete the DataFrame and any other large data structures
        del df
        gc.collect()

BQ_NUM_ROWS          : 1000
BQ_CHUNK_SIZE        : 100
BQ_NUM_CHUNKS        : 10
API_CALLS_PER_SECOND : 5.0
ITEMS_PER_REQUEST    : 5


Chunk of rows from BigQuery:   0%|          | 0/10 [00:00<?, ?it/s]

Starting: 0 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 1 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 2 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 3 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 4 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 5 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 6 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 7 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 8 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100
Starting: 9 of 10 loops


  0%|          | 0/20 [00:00<?, ?it/s]

tags_chunk                : 100
scores_chunk              : 100
question_chunk_embeddings : 100
id_chunk[is_successful]   : 100


#### example output

a single `embeddings_formatted` entry should resemble:

```
{
    "id": "67574726", 
    "embedding": [
        "0.875, ..., 1.000"
    ], 
    "restricts": [
        {"namespace": "tags", "allow": ["c#"]}
    ], 
    "numeric_restricts": [
        {"namespace": "score", "value_int": 1}
    ],
    "crowding_tag": "c#"
}
```

In [279]:
print(len(embeddings_formatted))
print(len(embeddings_formatted[0]))
# embeddings_formatted[0]

100
18670


In [130]:
REMOTE_GCS_FOLDER = f"{BUCKET_URI}/{PREFIX}/embedding_indexes/{embeddings_file_path.stem}/"
print(f"REMOTE_GCS_FOLDER: {REMOTE_GCS_FOLDER}")

REMOTE_GCS_FOLDER: gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/


In [131]:
! gsutil -m cp -r {embeddings_file_path}/* {REMOTE_GCS_FOLDER}

Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_0.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_1.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_2.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_3.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_4.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_5.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_6.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_7.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_8.json [Content-Type=application/json]...
Copying file:///var/tmp/tmppkzjydtx/tmppkzjydtx_9.json [Content-Type=application/json]...
- [10/10 files][ 17.8 MiB/ 17.8 MiB] 100% Done                                  
Operation completed

In [132]:
! gsutil ls $REMOTE_GCS_FOLDER

gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_0.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_1.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_2.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_3.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_4.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_5.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_6.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_7.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmppkzjydtx/tmppkzjydtx_8.json
gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding

# Create Vertex Vector Search index and endpoint

## Create VS Index

In [133]:
CREATE_NEW_VS_INDEX = True

# !gcloud ai indexes list \
#   --project=$PROJECT_ID \
#   --region=$REGION

In [134]:
# if using exsiting index
if not CREATE_NEW_VS_INDEX:
    EXISTING_INDEX_ID = "..TODO.."
    EXISTING_INDEX_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexes/{EXISTING_INDEX_ID}'
    print(f"EXISTING_INDEX_NAME  : {EXISTING_INDEX_NAME}")

In [135]:
# ANN index config
APPROX_NEIGHBORS           = 150
DISTANCE_MEASURE           = "DOT_PRODUCT_DISTANCE"
LEAF_NODE_EMB_COUNT        = 500
LEAF_NODES_SEARCH_PERCENT  = 80
# DIMENSIONS                 = 768
INDEX_UPDATE_METHOD        = 'batch_update' # accepts: 'batch_update' | 'stream_update'

DISPLAY_NAME = f"soverflow_{PREFIX}".replace("-","_")
DESCRIPTION = "sample index for vectorio demo"

print(f"APPROX_NEIGHBORS          : {APPROX_NEIGHBORS}")
print(f"DISTANCE_MEASURE          : {DISTANCE_MEASURE}")
print(f"LEAF_NODE_EMB_COUNT       : {LEAF_NODE_EMB_COUNT}")
print(f"LEAF_NODES_SEARCH_PERCENT : {LEAF_NODES_SEARCH_PERCENT}")
print(f"DIMENSIONS                : {DIMENSIONS}")
print(f"INDEX_UPDATE_METHOD       : {INDEX_UPDATE_METHOD}")
print(f"DISPLAY_NAME              : {DISPLAY_NAME}")
print(f"DESCRIPTION               : {DESCRIPTION}")

APPROX_NEIGHBORS          : 150
DISTANCE_MEASURE          : DOT_PRODUCT_DISTANCE
LEAF_NODE_EMB_COUNT       : 500
LEAF_NODES_SEARCH_PERCENT : 80
DIMENSIONS                : 768
DISPLAY_NAME              : soverflow_vvs_vectorio_vpc1
DESCRIPTION               : sample index for vectorio demo


In [136]:
if CREATE_NEW_VS_INDEX:
    
    tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=DISPLAY_NAME,
        contents_delta_uri=REMOTE_GCS_FOLDER,
        dimensions=DIMENSIONS,
        approximate_neighbors_count=APPROX_NEIGHBORS,
        distance_measure_type=DISTANCE_MEASURE,
        leaf_node_embedding_count=LEAF_NODE_EMB_COUNT,
        leaf_nodes_to_search_percent=LEAF_NODES_SEARCH_PERCENT,
        description=DESCRIPTION,
        index_update_method=INDEX_UPDATE_METHOD,
        sync=True,
        labels={
            "prefix": f'{PREFIX}',
        },
    )
else:
    tree_ah_index = aiplatform.MatchingEngineIndex(EXISTING_INDEX_NAME)
    
INDEX_RESOURCE_NAME = tree_ah_index.resource_name

print(f"DISPLAY_NAME        : {DISPLAY_NAME}")
print(f"check display_name  : {tree_ah_index.display_name}\n")
print(f"INDEX_RESOURCE_NAME : {INDEX_RESOURCE_NAME}")

DISPLAY_NAME        : soverflow_vvs_vectorio_vpc1
check display_name  : soverflow_vvs_vectorio_vpc1

INDEX_RESOURCE_NAME : projects/934903580331/locations/us-central1/indexes/7417608906186162176


*list all indexes:*

In [278]:
# list_of_indices = tree_ah_index.list()
# list_of_indices[0]
# list_of_indices
# type(list_of_indices[0])
list_of_indices[0].resource_name

google.cloud.aiplatform.matching_engine.matching_engine_index.MatchingEngineIndex

*Using the resource name, you can retrieve an existing MatchingEngineIndex:*

In [140]:
# tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

#### Troubleshooting

If index creation fails, grab `OPERATION_ID` and `FAILED_INDEX_ID` from the operation resource name in the error message, for example:

> `Please check the details in the metadata of operation: projects/934903580331/locations/us-central1/indexes/FAILED_INDEX_ID/operations/OPERATION_ID.`

Then, use `gcloud ai operations describe` to get the error details:

```
!gcloud ai operations describe $OPERATION_ID \
    --index=$FAILED_INDEX_ID \
    --project=$PROJECT_ID \
    --region=$REGION
```

In [137]:
# OPERATION_ID    = "2710742495469240320"
# FAILED_INDEX_ID = "4846053518957608960"

# !gcloud ai operations describe $OPERATION_ID \
#     --index=$FAILED_INDEX_ID \
#     --project=$PROJECT_ID \
#     --region=$REGION

## Create VS Index Endpoint

In [153]:
CREATE_NEW_VS_INDEX_ENDPOINT = True

# !gcloud ai index-endpoints list \
#   --project=$PROJECT_ID \
#   --region=$REGION

In [155]:
# if using exsiting index endpoint
if not CREATE_NEW_VS_INDEX_ENDPOINT:
    EXISTING_ENDPOINT_ID = "..TODO.."
    EXISTING_ENDPOINT_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexEndpoints/{EXISTING_ENDPOINT_ID}'
    print(f"EXISTING_ENDPOINT_NAME  : {EXISTING_ENDPOINT_NAME}")

In [157]:
ENDPOINT_DISPLAY_NAME = f'{DISPLAY_NAME}_endpoint'
ENDPOINT_DESCRIPTION  = "index endpoint for vectorio demo"

# USE_PUBLIC_ENDPOINTS  = False

print(f"ENDPOINT_DISPLAY_NAME : {ENDPOINT_DISPLAY_NAME}")
print(f"ENDPOINT_DESCRIPTION  : {ENDPOINT_DESCRIPTION}")
print(f"USE_PUBLIC_ENDPOINTS  : {USE_PUBLIC_ENDPOINTS}")

ENDPOINT_DISPLAY_NAME : soverflow_vvs_vectorio_vpc1_endpoint
ENDPOINT_DESCRIPTION  : index endpoint for vectorio demo
USE_PUBLIC_ENDPOINTS  : False


In [161]:
if CREATE_NEW_VS_INDEX_ENDPOINT:
    
    my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=ENDPOINT_DISPLAY_NAME,
        description=ENDPOINT_DESCRIPTION,
        network=VPC_NETWORK_FULL,                    # if not USE_PUBLIC_ENDPOINTS else None,
        public_endpoint_enabled=USE_PUBLIC_ENDPOINTS,
        sync=True,
        labels={
            "prefix": f'{PREFIX}',
        },
    )
else:
    my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(EXISTING_ENDPOINT_NAME)
    
ENDPOINT_RESOURCE_NAME = my_index_endpoint.resource_name

print(f"ENDPOINT_DISPLAY_NAME  : {ENDPOINT_DISPLAY_NAME}")
print(f"check display_name     : {my_index_endpoint.display_name}\n")
print(f"VPC_NETWORK_FULL       : {VPC_NETWORK_FULL}")
print(f"check network name     : {my_index_endpoint.network}\n")
print(f"ENDPOINT_RESOURCE_NAME : {ENDPOINT_RESOURCE_NAME}")

ENDPOINT_DISPLAY_NAME  : soverflow_vvs_vectorio_vpc1_endpoint
check display_name     : soverflow_vvs_vectorio_vpc1_endpoint

VPC_NETWORK_FULL       : projects/934903580331/global/networks/ucaip-haystack-vpc-network


In [162]:
ENDPOINT_RESOURCE_NAME = my_index_endpoint.resource_name
print(f"ENDPOINT_RESOURCE_NAME : {ENDPOINT_RESOURCE_NAME}")

ENDPOINT_RESOURCE_NAME : projects/934903580331/locations/us-central1/indexEndpoints/4440729552494264320


## Deploy Indexes

* [src](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/matching_engine/matching_engine_index_endpoint.py#L822)

In [163]:
DEPLOY_NEW_VS_INDEX = True

In [164]:
# if using exsiting deployed index
if not DEPLOY_NEW_VS_INDEX:
    EXISTING_DEPLOYED_INDEX_ID = "..TODO.."
    EXISTING_DEPLOYED_INDEX_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexEndpoints/{EXISTING_DEPLOYED_INDEX_ID}'
    print(f"EXISTING_DEPLOYED_INDEX_NAME  : {EXISTING_DEPLOYED_INDEX_NAME}")

In [167]:
import time 

TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
DEPLOYED_INDEX_ID = f'deployed_{TIMESTAMP}'.replace("-","_")
# MACHINE_TYPE = "XXXX"
MIN_REPLICAS = 1
MAX_REPLICAS = 1

print(f"DEPLOYED_INDEX_ID : {DEPLOYED_INDEX_ID}")
print(f"MIN_REPLICAS      : {MIN_REPLICAS}")
print(f"MAX_REPLICAS      : {MAX_REPLICAS}")

DEPLOYED_INDEX_ID : deployed_20240125_190236
MIN_REPLICAS      : 1
MAX_REPLICAS      : 1


In [168]:
if DEPLOY_NEW_VS_INDEX:
    
    deployed_index = my_index_endpoint.deploy_index(
        index=tree_ah_index, 
        deployed_index_id=DEPLOYED_INDEX_ID,
        min_replica_count=MIN_REPLICAS,
        max_replica_count=MAX_REPLICAS,
        
    )
    
else:
    deployed_index = aiplatform.MatchingEngineIndexEndpoint(EXISTING_DEPLOYED_INDEX_NAME)

DEPLOYED_INDEX_RESOURCE_NAME = deployed_index.resource_name
DEPLOYED_INDEX_DISPLAY_NAME  = deployed_index.display_name

print(f"DEPLOYED_INDEX_RESOURCE_NAME : {DEPLOYED_INDEX_RESOURCE_NAME}")
print(f"DEPLOYED_INDEX_DISPLAY_NAME  : {DEPLOYED_INDEX_DISPLAY_NAME}")

DEPLOYED_INDEX_RESOURCE_NAME : projects/934903580331/locations/us-central1/indexEndpoints/4440729552494264320
DEPLOYED_INDEX_DISPLAY_NAME  : soverflow_vvs_vectorio_vpc1_endpoint


In [169]:
print(f"Deployed indexes on the index endpoint:")
for d in my_index_endpoint.deployed_indexes:
    print(f"    {d.id}")

Deployed indexes on the index endpoint:
    deployed_20240125_190236


In [170]:
DEPLOYED_INDEX_ID_TEST = deployed_index.deployed_indexes[0].id
print(f"DEPLOYED_INDEX_ID_TEST : {DEPLOYED_INDEX_ID_TEST}")

DEPLOYED_INDEX_ID_TEST : deployed_20240125_190236


### Confirm matches

In [172]:
number_of_vectors = sum(
    aiplatform.MatchingEngineIndex(
        deployed_index.index
    )._gca_resource.index_stats.vectors_count
    for deployed_index in my_index_endpoint.deployed_indexes
)

print(f"Expected: {BQ_NUM_ROWS}, Actual: {number_of_vectors}")

Expected: 1000, Actual: 1000


## Create online queries

After you build your indexes, you may query against the deployed index to find nearest neighbors.

Note: For the `DOT_PRODUCT_DISTANCE` distance type, the "distance" property returned with each MatchNeighbor actually refers to the similarity.

In [173]:
test_embeddings = encode_texts_to_embeddings(sentences=["Install GPU for Tensorflow"])

In [174]:
# Test query
NUM_NEIGHBOURS = 10

start = time.time()

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
    return_full_datapoint=True,
)

elapsed_ann_time = time.time() - start
elapsed_ann_time = round(elapsed_ann_time, 4)
print(f'ANN latency: {elapsed_ann_time} seconds')

ANN latency: 0.0129 seconds


In [175]:
response

[[MatchNeighbor(id='43070568', distance=0.7574796676635742),
  MatchNeighbor(id='54122858', distance=0.7128270864486694),
  MatchNeighbor(id='53843711', distance=0.7045555114746094),
  MatchNeighbor(id='56801148', distance=0.6956605911254883),
  MatchNeighbor(id='45467758', distance=0.6887144446372986),
  MatchNeighbor(id='45366523', distance=0.6812765598297119),
  MatchNeighbor(id='61365790', distance=0.6766673922538757),
  MatchNeighbor(id='65438932', distance=0.664444088935852),
  MatchNeighbor(id='58616134', distance=0.663773238658905),
  MatchNeighbor(id='54006694', distance=0.6581679582595825)]]

In [237]:
# Test query
NUM_NEIGHBOURS = 10

start = time.time()

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
    return_full_datapoint=False,
)

elapsed_ann_time = time.time() - start
elapsed_ann_time = round(elapsed_ann_time, 4)
print(f'ANN latency: {elapsed_ann_time} seconds')

ANN latency: 0.0137 seconds


In [238]:
response

[[MatchNeighbor(id='43070568', distance=0.7574796676635742),
  MatchNeighbor(id='54122858', distance=0.7128270864486694),
  MatchNeighbor(id='53843711', distance=0.7045555114746094),
  MatchNeighbor(id='56801148', distance=0.6956605911254883),
  MatchNeighbor(id='45467758', distance=0.6887144446372986),
  MatchNeighbor(id='45366523', distance=0.6812765598297119),
  MatchNeighbor(id='61365790', distance=0.6766673922538757),
  MatchNeighbor(id='65438932', distance=0.664444088935852),
  MatchNeighbor(id='58616134', distance=0.663773238658905),
  MatchNeighbor(id='54006694', distance=0.6581679582595825)]]

In [176]:
for match_index, neighbor in enumerate(response[0]):
    print(f"https://stackoverflow.com/questions/{neighbor.id}")

https://stackoverflow.com/questions/43070568
https://stackoverflow.com/questions/54122858
https://stackoverflow.com/questions/53843711
https://stackoverflow.com/questions/56801148
https://stackoverflow.com/questions/45467758
https://stackoverflow.com/questions/45366523
https://stackoverflow.com/questions/61365790
https://stackoverflow.com/questions/65438932
https://stackoverflow.com/questions/58616134
https://stackoverflow.com/questions/54006694


### Retrieving metadata

In [275]:
read_response = my_index_endpoint.read_index_datapoints(
    deployed_index_id=DEPLOYED_INDEX_ID, 
    ids= ["43070568", "54122858"],
)
# read_response
print(f"crowding_tag       : {read_response[0].crowding_tag}")
print(f"crowding_attribute : {read_response[0].crowding_tag.crowding_attribute}")
print(f"datapoint_id       : {read_response[0].datapoint_id}")
print(f"restricts          : {read_response[0].restricts}")
print(f"allow_list         : {read_response[0].restricts[0].allow_list}")
print(f"deny_list          : {read_response[0].restricts[0].deny_list}")
print(f"namespace          : {read_response[0].restricts[0].namespace}")

crowding_tag       : crowding_attribute: "6395101720579670463"

crowding_attribute : 6395101720579670463
datapoint_id       : 54122858
restricts          : [namespace: "tags"
allow_list: "machine-learning"
]
allow_list         : ['machine-learning']
deny_list          : []
namespace          : tags


# Clean-up

> TODO

In [None]:
# import os

# delete_bucket = False

# # Force undeployment of indexes and delete endpoint
# my_index_endpoint.delete(force=True)

# # Delete indexes
# tree_ah_index.delete()

# if delete_bucket or os.getenv("IS_TESTING"):
#     ! gsutil rm -rf {BUCKET_URI}