>For Google Colab Only

>>git clone https://github.com/OperationalizingAI/Hackathon-2-22-24.git

In [None]:
!pip install -r requirements.txt

### Google Only Code

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth

import os

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive

In [None]:
def load_secrets(secrets_name, project_id):
    """Return the latest version of a Secret Manager secret as text.

    Calls ``auth.authenticate_user()``, creates a Secret Manager client, reads
    ``projects/<project_id>/secrets/<secrets_name>/versions/latest`` and
    decodes the payload bytes as UTF-8.

    Args:
        secrets_name: Name of the secret inside the project.
        project_id: Project identifier used to build the resource path.

    Returns:
        The decoded secret payload (str).
    """
    # Authenticate before building the client; this happens on every call.
    auth.authenticate_user()
    sm_client = secretmanager.SecretManagerServiceClient()

    # Resource path that always targets the newest version of the secret.
    latest_version = f"projects/{project_id}/secrets/{secrets_name}/versions/latest"

    payload_bytes = sm_client.access_secret_version(
        request={"name": latest_version}
    ).payload.data
    return payload_bytes.decode('UTF-8')

In [None]:
# Project whose Secret Manager entries are read by load_secrets below.
project_id = 'botchagalupep1'

# OpenAI key: kept in a variable and also exported through the process environment.
openai_api_key = load_secrets(secrets_name="openai_api_key", project_id=project_id)
os.environ['OPENAI_API_KEY'] = openai_api_key

# Atlas connection string. An alternative secret name is kept for reference:
#MONGODB_ATLAS_CLUSTER_URI = load_secrets("mdb_uri",project_id)
MONGODB_ATLAS_CLUSTER_URI = load_secrets(secrets_name="MDB_CLUSTER0_URI", project_id=project_id)

# LangSmith key.
langsmith_api_key = load_secrets(secrets_name="langsmith_api_key", project_id=project_id)

# Debug aids only: uncommenting these writes the secrets into the cell output.
#print(langsmith_api_key )
#print(MONGODB_ATLAS_CLUSTER_URI)

In [None]:
# Atlas database and collection used by the cells below.
DB_NAME, COLLECTION_NAME = 'sample_mflix', 'embedded_movies'

In [None]:
from AtlasClient import AtlasClient

# Build the project's Atlas client for DB_NAME using the cluster URI loaded above.
# NOTE(review): the message below is printed unconditionally once the
# constructor returns; whether the constructor actually contacts the server
# is not visible from this notebook — confirm in AtlasClient.
atlas_client = AtlasClient (MONGODB_ATLAS_CLUSTER_URI, DB_NAME)
print("Connected to the Mongo Atlas database!")

Connected to the Mongo Atlas database!


In [None]:
# Embedding models available for querying, keyed by model name.
# Each entry names the document attribute holding the plot embedding and the
# index passed to the vector search; both are derived from one short suffix.
model_mappings = {
    hf_model_name: {
        'embedding_attr': f'plot_embedding_{suffix}',
        'index_name': f'idx_plot_embedding_{suffix}',
    }
    for hf_model_name, suffix in (
        ('BAAI/bge-small-en-v1.5', 'bge_small'),
        ('sentence-transformers/all-mpnet-base-v2', 'mpnet_base_v2'),
        # ('sentence-transformers/all-MiniLM-L12-v2', 'minilm_l12_v2'),
        ('sentence-transformers/all-MiniLM-L6-v2', 'minilm_l6_v2'),
        ## bge-large takes too long and consumes too much memory!
        # ('BAAI/bge-large-en-v1.5', 'bge_large'),  # disabled entry also carried 'embedding_length' : 1024
    )
}


In [None]:
# LlamaIndex embedding integrations; the Hugging Face one is imported a few
# cells below as llama_index.embeddings.huggingface.
# NOTE(review): the instructor package is not imported in the visible cells —
# confirm it is still needed.
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor

In [None]:
import os
## LlamaIndex downloads embedding models on demand.
## Point its cache at a 'llama-index-cache' directory in the parent of the
## current working directory (the default is the system tmp dir), so the
## downloaded artifacts are easy to inspect.
os.environ['LLAMA_INDEX_CACHE_DIR'] = os.path.join(
    os.path.abspath(''),   # absolute path of the current working directory
    os.pardir,
    'llama-index-cache',
)

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import time

def run_vector_query (query : str, model_name : str, limit : int = 5):
    """Embed `query` with the given model and run a vector search on Atlas.

    Args:
        query: Free-text search string to embed.
        model_name: Key into `model_mappings`; also passed to
            `HuggingFaceEmbedding` as the model to load.
        limit: Forwarded as `limit` to `atlas_client.vector_search`.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        Whatever `atlas_client.vector_search` returns for this query.

    Raises:
        ValueError: If `model_name` has no entry in `model_mappings`.
    """
    model_mapping = model_mappings.get(model_name)
    if model_mapping is None:
        # ValueError is still an Exception, so existing handlers keep working.
        # The f-string also copes with non-string names such as None, which
        # would otherwise fail while building the message.
        raise ValueError(f"Unknown model : {model_name}")
    embedding_attr = model_mapping['embedding_attr']
    index_name = model_mapping['index_name']

    # generate embeddings (the embedding model is constructed on every call)
    embed_model = HuggingFaceEmbedding(model_name=model_name)
    query_embeddings = embed_model.get_text_embedding(query)

    # now let's query Atlas; only the search itself is timed, not the embedding
    t1a = time.perf_counter()
    movies = atlas_client.vector_search(
        collection_name=COLLECTION_NAME,
        index_name=index_name,
        attr_name=embedding_attr,
        embedding_vector=query_embeddings,
        limit=limit,
    )
    t1b = time.perf_counter()
    print (f'Atlas query returned in {(t1b-t1a)*1000} ms')
    return movies

In [None]:
def print_movies(movies):
    """Print a count line followed by one numbered summary per movie.

    Each movie is read through its "_id", "title", "search_score" and "plot"
    keys; a blank line follows every summary.
    """
    print(f"Found {len(movies)} movies")
    for rank, movie in enumerate(movies, start=1):
        summary_lines = [
            f'{rank}',
            f'id: {movie["_id"]}',
            f'title: {movie["title"]}',
            f'search_score(meta):{movie["search_score"]}',
            f'plot: {movie["plot"]}',
            '',  # trailing newline, which print() turns into a blank line
        ]
        print('\n'.join(summary_lines))

In [None]:
# Run the sample query with the bge-small model and list the hits.
query, model_name = 'fatalistic sci-fi movies', 'BAAI/bge-small-en-v1.5'
movies = run_vector_query(query=query, model_name=model_name)

print(f'========== model = {model_name} ======')
print_movies(movies)

In [None]:
# Run the same sample query with the mpnet-base model and list the hits.
query, model_name = 'fatalistic sci-fi movies', 'sentence-transformers/all-mpnet-base-v2'
movies = run_vector_query(query=query, model_name=model_name)

print(f'========== model = {model_name} ======')
print_movies(movies)