>For Google Colab Only

>>git clone https://github.com/OperationalizingAI/Hackathon-2-22-24.git

# Cosine Similarity from Atlas Vector Search

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth
!pip install -r requirements.txt

### Google Only Code

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth

import os

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive

In [None]:
def load_secrets(secrets_name, project_id, version="latest"):
    """Return one secret from Google Secret Manager as a text string.

    The Colab user is authenticated first, then the requested version of the
    secret is read and its payload decoded as UTF-8.

    Args:
        secrets_name: Name of the secret inside the project.
        project_id: Google Cloud project that owns the secret.
        version: Secret version to read. Defaults to "latest", which is what
            this function always read before the parameter existed.

    Returns:
        The decoded secret payload.
    """
    # Authenticate the Colab user before building the client.
    auth.authenticate_user()
    client = secretmanager.SecretManagerServiceClient()

    # Full resource path of the secret version to read.
    resource_name = f"projects/{project_id}/secrets/{secrets_name}/versions/{version}"

    response = client.access_secret_version(request={"name": resource_name})
    # The payload is bytes; callers expect text.
    return response.payload.data.decode('UTF-8')

In [None]:
# Fetch every credential through load_secrets so that no key is written into
# the notebook itself.
project_id = 'botchagalupep1'

# The OpenAI key is kept in a variable and also exported through the environment.
openai_api_key = load_secrets("openai_api_key", project_id)
os.environ['OPENAI_API_KEY'] = openai_api_key

# MongoDB Atlas connection string (an earlier revision of this cell read the
# secret named "mdb_uri" instead).
MONGODB_ATLAS_CLUSTER_URI = load_secrets("MDB_CLUSTER0_URI", project_id)

langsmith_api_key = load_secrets("langsmith_api_key", project_id)

# Do not print these values: cell outputs are stored with the notebook, so a
# printed secret would be saved and shared along with the file.

In [None]:
# Database name handed to AtlasClient when the client is created.
DB_NAME = "Cluster0"
# Collection that the vector search is run against.
COLLECTION_NAME = "OpenContext0"
# Name of the index passed to the vector search call.
INDEX_NAME = "vector_index"

In [None]:
from AtlasClient import AtlasClient

# Build the client for the configured cluster URI and database name.
atlas_client = AtlasClient(MONGODB_ATLAS_CLUSTER_URI, DB_NAME)
print("Connected to the Mongo Atlas database!")

Connected to the Mongo Atlas database!


In [None]:
from OpenAIClient import OpenAIClient

# Reset the name first -- presumably so that a failed construction below does
# not leave a client from an earlier run of this cell in place.
openAI_client = None

# Build the client with the key loaded earlier.
openAI_client = OpenAIClient(api_key=openai_api_key)
print("OpenAI client initialized")

OpenAI client initialized


In [None]:
# Sample questions that are embedded and then run through the vector search.
queries = [
    'What is a CodeComponent',  # was misspelled 'CodeCompnent'
    'What is a SaaS User',
    'What is a Service',
    'What is a Location',
    'What is a Datacenter',
    'List the different type of entities'
]

In [None]:
# Compute an embedding for every query and keep it, keyed by the query text.
embeddings = {}

for query in queries:
    embedding = openAI_client.get_embedding(query, model='text-embedding-ada-002')
    # Store the vector. Without this assignment the dict stays empty, and the
    # JSON cache file written from `embeddings` contains only "{}".
    embeddings[query] = embedding
    print(f"Embedding for query='{query}', embeddding_length={len(embedding)}, printing first few numbers... :\n", embedding[:10])

# Report only the count: printing the dict itself would dump every full vector.
print(f"Collected embeddings for {len(embeddings)} queries")

Embedding for query='What is a CodeCompnent', embeddding_length=1536, printing first few numbers... :
 [-0.01723318360745907, 0.03324076160788536, -0.0025894613936543465, -0.04337441176176071, -0.011904796585440636, 0.017038879916071892, -0.006382106803357601, -0.019235998392105103, 0.008467127569019794, -0.0264551043510437]
{}
Embedding for query='What is a SaaS User', embeddding_length=1536, printing first few numbers... :
 [0.017121756449341774, -0.007427082397043705, 0.0009337672381661832, -0.005708448123186827, -0.04230925440788269, 0.006239466834813356, -0.02626388520002365, -0.00546087883412838, -0.025431478396058083, -0.029909254983067513]
{}
Embedding for query='What is a Service', embeddding_length=1536, printing first few numbers... :
 [0.0019209387246519327, -0.006987689062952995, -0.015718922019004822, -0.004159499425441027, -0.013509081676602364, 0.00228417688049376, -0.002182808006182313, -0.003595213172957301, -0.00019017208251170814, -0.014853907749056816]
{}
Embedding

In [None]:
import json

# Write the query -> embedding mapping to disk so it can be read back later.
# json.dump serialises straight into the file, and no variable named `str` is
# created, so the built-in `str` type stays usable in the rest of the notebook.
with open("embeddings_openai.json", "w") as f:
    json.dump(embeddings, f)

print("saved to : 'embeddings_openai.json'")

saved to : 'embeddings_openai.json'


In [None]:
import os
import json

# Read back previously saved embeddings when the cache file exists; otherwise
# the cache starts out empty.
cached_embeddings = {}
cached_embedding_file = 'embeddings_openai.json'

if os.path.exists(cached_embedding_file):
    with open(cached_embedding_file, "r") as f:
        # json.load parses the file directly, so the file text is never bound
        # to a variable named `str` (which would shadow the built-in type).
        cached_embeddings = json.load(f)

print("Loaded the following cached embeddings...")
for query in cached_embeddings:
    print(f'- {query}')

Loaded the following cached embeddings...


In [None]:
import time

# Handy function
def do_vector_search(query: str, limit: int = 2) -> None:
    """Run one vector search for `query` and print the hits.

    The embedding is taken from `cached_embeddings` when the exact query text
    is a key there; otherwise it is requested from the OpenAI client and the
    request is timed. The search itself is timed as well.

    Args:
        query: Text to search for.
        limit: Maximum number of hits requested from the search. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        None. Everything is reported through `print`.
    """
    if query in cached_embeddings:
        print("using cached embeddings")
        embedding = cached_embeddings[query]
    else:
        embed_start = time.perf_counter()
        embedding = openAI_client.get_embedding(query)
        embed_end = time.perf_counter()
        print(f"- Getting embeddings from OpenAI took {(embed_end - embed_start)*1000:,.0f} ms")

    search_start = time.perf_counter()
    result = atlas_client.vector_search(
        collection_name=COLLECTION_NAME,
        index_name=INDEX_NAME,
        attr_name='embedding',
        embedding_vector=embedding,
        limit=limit,
    )
    search_end = time.perf_counter()

    print(f"- Altas query returned {len(result)} entries in {(search_end - search_start)*1000:,.0f} ms")
    print()

    # Hits are numbered from 1; only the first 20 characters of each text are shown.
    for position, data in enumerate(result, start=1):
        print(f'### {position}- id: {data["_id"]}\n- text: {data["text"][:20]}\n' +
              f'- search_score:{data["search_score"]}\n- source: {data["source"]}\n')

In [None]:
for query in queries:
  print('## ' +query)
  do_vector_search (query=query)

## What is a CodeCompnent
- Getting embeddings from OpenAI took 208 ms
- Altas query returned 2 entries in 233 ms

### 1- id: 65de73c35d596d909a85b2c8
- text: Kind: CodeComponent
- search_score:0.9399929642677307
- source: /content/gdrive/MyDrive/GAI/catalog-yaml-format/code-component.md

### 2- id: 65de73c35d596d909a85b2ce
- text: kind: CodeComponent 
- search_score:0.9374871850013733
- source: /content/gdrive/MyDrive/GAI/catalog-yaml-format/code-component.md

## What is a SaaS User
- Getting embeddings from OpenAI took 169 ms
- Altas query returned 2 entries in 32 ms

### 1- id: 65de73c35d596d909a85b2ac
- text: user
- search_score:0.8929404020309448
- source: /content/gdrive/MyDrive/GAI/catalog-yaml-format/common.md

### 2- id: 65de73c35d596d909a85b2f9
- text: :::caution SaaS user
- search_score:0.8918938636779785
- source: /content/gdrive/MyDrive/GAI/catalog-yaml-format/user.md

## What is a Service
- Getting embeddings from OpenAI took 93 ms
- Altas query returned 2 entries in 32 m