In [None]:
!pip install --upgrade --user google-cloud-aiplatform google-cloud-storage 'google-cloud-bigquery[pandas]'

In [2]:
# Newly installed packages only become visible to the environment after a kernel restart
import IPython
import time

# ask the running kernel to shut down with restart=True
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
# get project ID
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1"
if PROJECT_ID == "(unset)":
    print(f"Please set the project ID manually below")


Please set the project ID manually below


In [31]:
from datetime import datetime

# define project information: when gcloud reported no project, enter the ID by hand here
if PROJECT_ID == "(unset)":
    PROJECT_ID = ""  # @param {type:"string"}

# generate a unique id for this session from the current month, day, hour and minute
UID = f"{datetime.now():%m%d%H%M}"

In [3]:
import sys

# When the notebook runs inside a Colab runtime, authenticate the user with Google Cloud
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

In [None]:
# enable the compute, aiplatform, storage and bigquery service APIs for the project
! gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com bigquery.googleapis.com --project {PROJECT_ID}

In [6]:
import pandas as pd
from google.cloud import bigquery

# number of question rows requested from BigQuery
QUESTIONS_SIZE = 1000

# NOTE(review): the ORDER BY sits inside the subquery and the outer LIMIT has no
# ordering of its own — confirm which rows this is expected to return.
QUERY_TEMPLATE = """
        SELECT distinct q.id, q.title
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions`
        where Score > 0 ORDER BY View_Count desc) AS q
        LIMIT {limit} ;
        """

# run the query and load the BQ result into a Pandas DataFrame
bq_client = bigquery.Client(project=PROJECT_ID)
df = (
    bq_client.query(QUERY_TEMPLATE.format(limit=QUESTIONS_SIZE))
    .result()
    .to_dataframe()
)

# examine the data
df.head()

Unnamed: 0,id,title
0,64360615,Why is the hatch not showing?
1,64363095,How to make div elements appear inline with a ...
2,64240586,VBA Excel 365 64-bit Winsock implementation pr...
3,64483009,How to upload/download file into Oracle Cloud ...
4,64377894,Difference between explode and explode_outer


In [7]:
# display the DataFrame itself rather than only its head
df

Unnamed: 0,id,title
0,64360615,Why is the hatch not showing?
1,64363095,How to make div elements appear inline with a ...
2,64240586,VBA Excel 365 64-bit Winsock implementation pr...
3,64483009,How to upload/download file into Oracle Cloud ...
4,64377894,Difference between explode and explode_outer
...,...,...
995,64262064,Bind GUI element property to boolean expressio...
996,64349698,Is my client considered public/confidential if...
997,64387610,How to configure Spring Security OAuth2 to ver...
998,64532085,R maximum distance of a matrix after removing ...


In [8]:
import vertexai

# initialise the vertexai package with the project and location set earlier
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
)


In [10]:
from vertexai.preview.language_models import TextEmbeddingModel

# name of the pretrained text embeddings model to load
EMBEDDING_MODEL_NAME = "textembedding-gecko@001"
model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)


In [11]:
import time
import tqdm  # to show a progress bar

# number of texts sent to the model in a single request
BATCH_SIZE = 5


def get_embeddings_wrapper(texts):
    """Get embeddings for a list of texts.

    The texts are passed to ``model.get_embeddings`` in slices of
    ``BATCH_SIZE`` items, sleeping one second before each request, while a
    tqdm progress bar tracks the batches.

    Args:
        texts: sliceable sequence of texts (this notebook passes lists of
            strings).

    Returns:
        A list containing the ``values`` of every embedding returned by the
        model, concatenated batch by batch; empty when ``texts`` is empty.
    """
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        # extend in place: `embs = embs + [...]` re-copied all earlier vectors on every batch
        embs.extend(e.values for e in result)
    return embs


In [None]:
# embed every question title and attach the vectors as the "embedding" column
question_titles = df["title"].tolist()
df = df.assign(embedding=get_embeddings_wrapper(question_titles))
df.head()


In [15]:
# number of values in the embedding stored on the row labelled 0
len(df.loc[0, "embedding"])

768

In [16]:
import random
import numpy as np

# pick one of them as a key question
# randrange excludes len(df); randint(0, len(df)) could return len(df) itself,
# which is one past the last valid row position
key = random.randrange(len(df))

# calc dot product between the key and other questions
embs = np.array(df.embedding.to_list())
similarities = np.dot(embs[key], embs.T)

# print similarities for the first 5 questions
similarities[:5]


array([0.50433718, 0.56227078, 0.47846575, 0.47568009, 0.45684142])

In [17]:
# print the question
print(f"Key question: {df.title[key]}\n")

# pair each title with its similarity score, order the pairs by score
# (highest first) and keep the 20 best matches
title_scores = list(zip(df.title, similarities))
title_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_questions = title_scores[:20]

for question, similarity in sorted_questions:
    print(f"{similarity:.4f} {question}")


Key question: How to show a Time Series chart by Date + Hour in Data Studio - only showing up as 12AM

1.0000 How to show a Time Series chart by Date + Hour in Data Studio - only showing up as 12AM
0.6996 Time data does not match format '%yyyy-%mm-%dd
0.6975 How to turn datetime to format('%H:%M') but still keep datetime type?
0.6566 DataFrame New Column to split sessions by time difference - pandas
0.6384 Power BI Map Dynamic Buckets
0.6218 how to write regex expression to find date and time
0.6216 How to create dynamic data sets on ng-charts
0.6215 What is wrong with this code to generate new date rows in my Pandas dataframe based on start and end dates?
0.6215 Disable the label text (min, max, average, bins values) in measure annotation while doing MeasureX in highstocks and highcharts
0.6190 in Python Pandas above 1.1.0 InvalidIndexError when slicing MultIndex frame with DatetimeIndex
0.6181 How to show only column with Values in Pandas Groupby
0.6137 Showing arbitrary @Timed metri

In [18]:
# save id and embedding as a json file
jsonl_string = df[["id", "embedding"]].to_json(orient="records", lines=True)
with open("questions.json", "w") as f:
    f.write(jsonl_string)

# show the first few lines of the json file
! head -n 3 questions.json


{"id":64360615,"embedding":[0.0096144881,0.0196080822,0.0315130427,-0.018537879,0.0473369174,0.0251252614,0.0055566989,0.0120723713,0.0206799638,0.0293143075,0.0187999979,0.0227961503,0.0277153514,0.0356685854,-0.0067165233,-0.0430152752,-0.0765612125,0.0130272582,-0.0173531286,0.0272374731,-0.0593699478,-0.0215581749,-0.0154614747,-0.0267537031,-0.0103264945,-0.0544658303,0.001135871,-0.0425284281,-0.0099049332,0.0000936401,0.0284323711,0.003146196,-0.0378895029,-0.0057702251,-0.0422542915,0.0789663941,-0.0096604433,0.0008278319,0.0451143011,-0.0067756623,-0.0320472457,-0.0048065665,0.0031847805,0.0170230176,0.0128254481,0.0196729377,-0.0318697356,-0.008650898,-0.0238228869,0.0288226474,-0.0387511142,-0.0138769774,-0.0046359692,0.0398554839,0.0477813892,0.0459595509,-0.0056817182,-0.0135848373,-0.0223781131,-0.0376843698,0.0448303595,-0.0194943491,0.0300437715,0.0091752345,-0.0028871559,0.0628972501,0.0029386678,-0.031865526,0.016144678,-0.0204657614,0.0229924954,0.0066935876,0.004110

In [None]:
BUCKET_URI = f"gs://{PROJECT_ID}-embvs-tutorial-{UID}"
! gsutil mb -l $LOCATION -p {PROJECT_ID} {BUCKET_URI}
! gsutil cp questions.json {BUCKET_URI}


In [20]:
from google.cloud import aiplatform

# initialise the aiplatform package with the project and location set earlier
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
)


In [None]:
# settings for the tree-AH index built from the files under BUCKET_URI;
# dimensions equals the embedding length shown earlier in the notebook (768)
index_settings = {
    "display_name": f"embvs-tutorial-index-{UID}",
    "contents_delta_uri": BUCKET_URI,
    "dimensions": 768,
    "approximate_neighbors_count": 20,
    "distance_measure_type": "DOT_PRODUCT_DISTANCE",
}

# create index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(**index_settings)


In [None]:
# create the IndexEndpoint with its public endpoint enabled
endpoint_display_name = f"embvs-tutorial-index-endpoint-{UID}"
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=endpoint_display_name, public_endpoint_enabled=True
)


In [23]:
# ID used when deploying the index to the endpoint and when querying it
DEPLOYED_INDEX_ID = "embvs_tutorial_deployed_{}".format(UID)


In [None]:
# attach the Index to the Index Endpoint under DEPLOYED_INDEX_ID
my_index_endpoint.deploy_index(
    index=my_index,
    deployed_index_id=DEPLOYED_INDEX_ID,
)


In [25]:
# embed a single test question; the wrapper expects a list, so wrap it in one
test_question = "How to read JSON with Python?"
test_embeddings = get_embeddings_wrapper([test_question])


100%|██████████| 1/1 [00:01<00:00,  1.55s/it]


In [26]:
# Test query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=20,
)

# show the result
import numpy as np

# response[0] holds the neighbors found for the first query embedding
for neighbor in response[0]:
    # named question_id (not `id`) so the builtin id() is not shadowed in later cells
    question_id = np.int64(neighbor.id)
    similar = df.query("id == @question_id", engine="python")
    # an id that is absent from df would otherwise raise IndexError on values[0]
    title = similar.title.values[0] if len(similar) else "(title not found in df)"
    print(f"{neighbor.distance:.4f} {title}")


0.8132 Python - How To Decode JWT Header?
0.7756 How to extract inner dictionary key/value into a new dictionary in Python?
0.7266 Why can't I read in .conll file with Python (confusing parse-error)?
0.7208 How can I use regex in python to find words between tags?
0.7033 Python CSV, How to append data at the end of a row whilst reading it line by line (row by row)?
0.6976 Why customer python package can not be imported?
0.6946 How to perform the join on multiple fields in django queryset?
0.6942 Why is ZeroMQ poller not receiving messages (python)?
0.6917 How to parse un json string into a List with MOSHI
0.6843 Flutter driver: how to pass json mock to test?
0.6816 Not getting response on fetching an API
0.6789 Howto emulate pipe-like buffer in Python?
0.6761 Django: Why am I getting ValueError when referring to a ForeignKey using a varchar?
0.6736 Fetching JSON which might contain null values
0.6689 How to take the value of a key in mongodb using python
0.6655 console.log the data fro

In [None]:
# wait for a confirmation: nothing below runs until Enter is pressed
input("Press Enter to delete Index Endpoint, Index and Cloud Storage bucket:")

# delete Index Endpoint: undeploy everything from it first, then delete it
my_index_endpoint.undeploy_all()
my_index_endpoint.delete(force=True)

# delete Index (done after the endpoint that served it has been removed)
my_index.delete()

# delete Cloud Storage bucket (recursive rm on the bucket URI)
! gsutil rm -r {BUCKET_URI}