# Create additional embeddings in Atlas (for sample_mflix)
## Using LlamaIndex

>For Google Colab Only

>>git clone https://github.com/OperationalizingAI/Hackathon-2-22-24.git

In [None]:
!pip install -r requirements.txt

### Google Only Code

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth

import os

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive

In [None]:
def load_secrets(secrets_name, project_id):
    """Return the latest version of a Google Secret Manager secret as text.

    Args:
        secrets_name: name of the secret inside the project.
        project_id: id of the GCP project that owns the secret.

    Returns:
        The secret payload decoded as UTF-8.

    The Colab user is authenticated on every call before the secret is read.
    """
    auth.authenticate_user()
    manager = secretmanager.SecretManagerServiceClient()

    # "latest" always resolves to the most recent version of the secret.
    version_path = f"projects/{project_id}/secrets/{secrets_name}/versions/latest"
    payload = manager.access_secret_version(request={"name": version_path}).payload
    return payload.data.decode('UTF-8')

In [None]:
# GCP project whose Secret Manager holds the credentials used by this notebook.
project_id = 'botchagalupep1'
openai_api_key = load_secrets("openai_api_key",project_id)
# Exported through the environment; presumably picked up by OpenAI-based
# libraries — confirm which cell actually needs it.
os.environ['OPENAI_API_KEY'] = openai_api_key
# Atlas connection string. An alternative secret name ("mdb_uri") is kept
# below for reference:
#MONGODB_ATLAS_CLUSTER_URI = load_secrets("mdb_uri",project_id)
MONGODB_ATLAS_CLUSTER_URI = load_secrets("MDB_CLUSTER0_URI",project_id)
# NOTE(review): langsmith_api_key is not referenced by any cell visible here.
langsmith_api_key = load_secrets("langsmith_api_key",project_id)
# Debug only. Do not leave these enabled: printed secrets end up in the
# saved notebook output.
#print(langsmith_api_key )
#print(MONGODB_ATLAS_CLUSTER_URI)

In [None]:
# Atlas database and collection that the rest of the notebook reads from
# and writes back to.

DB_NAME = 'sample_mflix'
COLLECTION_NAME = 'embedded_movies'

In [None]:
from AtlasClient import AtlasClient

# Create the client for DB_NAME on the cluster behind MONGODB_ATLAS_CLUSTER_URI.
# NOTE(review): the message below is printed unconditionally; whether the
# AtlasClient constructor actually verifies the connection is not visible
# here — confirm.
atlas_client = AtlasClient (MONGODB_ATLAS_CLUSTER_URI, DB_NAME)
print("Connected to the Mongo Atlas database!")

Connected to the Mongo Atlas database!


In [None]:
# Sanity check: count the documents in the collection using an empty filter.
collection = atlas_client.get_collection(COLLECTION_NAME)
document_count = collection.count_documents({})

print (f"document count = {document_count:,}")

document count = 1,500


In [None]:
# LlamaIndex embedding integrations, installed into the kernel's environment.
# NOTE(review): only the HuggingFace integration is imported in the cells
# visible here; confirm the instructor package is still needed.
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor

In [None]:
import os
## LlamaIndex will download embedding models as needed.
## Point its cache at <current working dir>/../llama-index-cache
## (the original note says the default is the system tmp dir).
## This way, we can easily see downloaded artifacts.
os.environ['LLAMA_INDEX_CACHE_DIR'] = os.path.join(os.path.abspath(''), '..', 'llama-index-cache')

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import time

## handy function to calculate embeddings, given a model
def create_embeddings (movies, embedding_model, embedding_attr):
    """Embed every movie's plot with one model and store the vector on the movie.

    Args:
        movies: iterable of dict-like movie records; each must have a 'plot' entry.
        embedding_model: model name passed to HuggingFaceEmbedding(model_name=...).
        embedding_attr: key under which each movie's embedding is stored.

    Returns:
        None. The movie records are modified in place.

    Raises:
        KeyError: if a movie has no 'plot' entry.
    """
    embed_model = HuggingFaceEmbedding(model_name=embedding_model)

    # Timing is left to the caller; this function only generates the vectors.
    for movie in movies:
        movie[embedding_attr] = embed_model.get_text_embedding(movie['plot'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load every movie that has a 'plot' field into memory, timing the fetch.
# limit=0 is presumably "no limit" (as in pymongo) — confirm against AtlasClient.find.
plot_filter = {'plot':{"$exists": True}}
t1a = time.perf_counter()
movies = list(atlas_client.find (collection_name=COLLECTION_NAME, filter=plot_filter, limit=0))
t1b = time.perf_counter()

print (f'Fetched {len(movies):,} from Atlas in {(t1b-t1a)*1000:,.0f} ms')

Fetched 1,500 from Atlas in 1,389 ms


In [None]:
## Embedding models we want to use.
## Each key is the model name handed to the embedding loader. Its value names
## the movie attribute that will hold that model's vectors ('embedding_attr')
## and the Atlas vector index for that attribute ('index_name').
## Entries that are commented out are not processed.

model_mappings = {
    'BAAI/bge-small-en-v1.5' : {'embedding_attr' : 'plot_embedding_bge_small', 'index_name' : 'idx_plot_embedding_bge_small'},

    'sentence-transformers/all-mpnet-base-v2' : {'embedding_attr' : 'plot_embedding_mpnet_base_v2', 'index_name' : 'idx_plot_embedding_mpnet_base_v2'},

    # 'sentence-transformers/all-MiniLM-L12-v2' : {'embedding_attr' : 'plot_embedding_minilm_l12_v2', 'index_name' : 'idx_plot_embedding_minilm_l12_v2'},

    'sentence-transformers/all-MiniLM-L6-v2' : {'embedding_attr' : 'plot_embedding_minilm_l6_v2', 'index_name' : 'idx_plot_embedding_minilm_l6_v2'},

    ## bge-large takes too long and consumes too much memory!
    # 'BAAI/bge-large-en-v1.5' : {'embedding_attr' : 'plot_embedding_bge_large', 'index_name' : 'idx_plot_embedding_bge_large', 'embedding_length' : 1024},
}

In [None]:
## For the selected embedding models above, we are going to create vectors
## in the movie collection.
## Remember, each embedding model has its own 'plot_embedding' attribute (we don't want to mix them up)

for embedding_model, mapping in model_mappings.items():
    embedding_attr = mapping['embedding_attr']

    print (f'\n------- embedding model = {embedding_model} ---------')
    t1a = time.perf_counter()
    create_embeddings(movies=movies, embedding_model=embedding_model, embedding_attr=embedding_attr)
    t1b = time.perf_counter()
    elapsed_ms = (t1b-t1a)*1000
    # An empty fetch must not crash the timing report with a ZeroDivisionError.
    avg_time_per_movie = elapsed_ms / len(movies) if movies else 0.0
    print (f'model={embedding_model}, created embeddings for {len(movies):,} movies in {elapsed_ms:,.0f} ms, avg_time_per_movie={avg_time_per_movie:,.0f} ms')

In [None]:
import random

# Spot-check one randomly chosen movie: show its identity and the first few
# values of every embedding stored on it.
movie = random.choice(movies)
print ('_id :', movie['_id'])
print ('title :', movie['title'])
print ('plot :', movie['plot'])

# The pre-existing embedding may be absent on some documents; fall back to an
# empty list instead of failing with a KeyError.
existing_embedding = movie.get("plot_embedding", [])
print (f'plot_embeddings (existing openAI generated), len={len(existing_embedding)} , {existing_embedding[:5]}...')

# Drive the report from model_mappings so it always matches the models that
# were actually embedded, even when entries are enabled or commented out.
for mapping in model_mappings.values():
    attr = mapping['embedding_attr']
    print (f'{attr} , len={len(movie[attr])} , {movie[attr][:5]}...')

_id : 65cb71579e5a3532351227dc
title : D.E.B.S.
plot : Plaid-skirted schoolgirls are groomed by a secret government agency to become the newest members of the elite national-defense group, D.E.B.S.
plot_embeddings (existing openAI generated), len=1536 , [-0.027119491, -0.014553583, -0.008263821, -0.025921442, -0.017766535]...
plot_embedding_bge_small , len=384 , [-0.06293287873268127, 0.03485560789704323, 0.02202887274324894, -0.04800337553024292, 0.07878005504608154]...
plot_embedding_mpnet_base_v2 , len=768 , [0.01646951213479042, 0.010799586772918701, 0.02173537015914917, -0.01162173692137003, -0.00999673455953598]...
plot_embedding_minilm_l6_v2 , len=384 , [-0.044605500996112823, 0.03230508416891098, 0.020126136019825935, -0.005326232872903347, 0.041450947523117065]...


In [None]:
## Let's do a bulk update
from pymongo import UpdateOne


collection = atlas_client.get_collection(COLLECTION_NAME)

# Write back only the embedding attributes generated in this notebook.
# Replacing whole documents would re-send every existing field and overwrite
# any change made to a movie since it was fetched.
embedding_attrs = [mapping['embedding_attr'] for mapping in model_mappings.values()]

updates = []
for movie in movies:
    new_fields = {attr: movie[attr] for attr in embedding_attrs if attr in movie}
    # Skip movies with nothing to write rather than issuing an empty $set.
    if new_fields:
        updates.append(UpdateOne({"_id" : movie["_id"]}, {"$set" : new_fields}))

# Perform the bulk update
print (f'About to update {len(updates)} movies in Atlas...')
if updates:
    t1a = time.perf_counter()
    result = collection.bulk_write(updates)
    t1b = time.perf_counter()

    ## Print result
    print(f"Update matched count: {result.matched_count}")
    print(f"Update modified count: {result.modified_count}")
    print (f'Updated {len(updates):,} in Atlas in {(t1b-t1a)*1000:,.0f} ms')
else:
    # bulk_write rejects an empty list of operations, so there is nothing to send.
    print ('No embeddings to write; skipping the bulk update.')

About to update 1500 movies in Atlas...
Update matched count: 1500
Update modified count: 1500
Updated 1,500 in Atlas in 10,795 ms


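We need to add three new Atlas Vector Search indexes, one for each embedding attribute written above. Each index's `numDimensions` must match the vector length produced by its model (see the sample output above):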

idx_plot_embedding_bge_small

```
{
  "fields": [
    {
      "type": "vector",
      "path": "plot_embedding_bge_small",
      "numDimensions": 384,
      "similarity": "euclidean"
    }
  ]
}
```
idx_plot_embedding_mpnet_base_v2

```
{
  "fields": [
    {
      "type": "vector",
      "path": "plot_embedding_mpnet_base_v2",
      "numDimensions": 768,
      "similarity": "euclidean"
    }
  ]
}
```
idx_plot_embedding_minilm_l6_v2

```
{
  "fields": [
    {
      "type": "vector",
      "path": "plot_embedding_minilm_l6_v2",
      "numDimensions": 384,
      "similarity": "euclidean"
    }
  ]
}
```


