In [1]:
# !pip install llama-index llama-index-vector-stores-mongodb llama-index-embeddings-openai pymongo

In [2]:
import getpass, os, pymongo, pprint
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.settings import Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter, FilterOperator
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

In [3]:
from dotenv import load_dotenv
import os
# Read variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Expose the OpenAI key under OPENAI_API_KEY. The notebook's .env is expected to
# store it as OPENAI_KEY; an already-exported OPENAI_API_KEY is accepted as a fallback.
# Assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType", so fail with an actionable message instead.
openai_api_key = os.environ.get("OPENAI_KEY") or os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "No OpenAI API key found: set OPENAI_KEY (or OPENAI_API_KEY) in your "
        "environment or .env file before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# Connection string for the Atlas cluster.
# NOTE(review): this is None when ATLAS_URI is unset; pymongo presumably then falls
# back to its default host rather than failing -- confirm that is acceptable.
ATLAS_CONNECTION_STRING = os.environ.get("ATLAS_URI")

In [4]:
# Build the two OpenAI-backed components first, then register them globally.
default_llm = OpenAI()  # constructed with no arguments
embedding_model = OpenAIEmbedding(model="text-embedding-ada-002")

Settings.llm = default_llm
Settings.embed_model = embedding_model

# Chunking configuration applied when documents are split into nodes:
# chunk_size is set first, then chunk_overlap (same order as sequential assignment).
Settings.chunk_size, Settings.chunk_overlap = 100, 10

In [5]:
# import requests
# import os

# # Define the URL and the file name
# url = 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP'
# file_name = 'data/atlas_best_practices.pdf'

# # Create the 'data' directory if it doesn't exist
# os.makedirs(os.path.dirname(file_name), exist_ok=True)

# # Download the file
# response = requests.get(url)
# if response.status_code == 200:
#     with open(file_name, 'wb') as f:
#         f.write(response.content)
#     print("File downloaded successfully.")
# else:
#     print("Failed to download file. Status code:", response.status_code)

In [6]:
# Point a directory reader at the single sample PDF, then parse it into documents.
pdf_reader = SimpleDirectoryReader(
    input_files=["./data/atlas_best_practices.pdf"],
)
sample_data = pdf_reader.load_data()

# Leave the first document as the cell's last expression so the notebook displays it.
sample_data[0]

Document(id_='19ee74e1-cb21-44aa-aa47-ff7a8385e74c', embedding=None, metadata={'page_label': '1', 'file_name': 'atlas_best_practices.pdf', 'file_path': 'data\\atlas_best_practices.pdf', 'file_type': 'application/pdf', 'file_size': 512653, 'creation_date': '2024-04-26', 'last_modified_date': '2024-04-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Mong oDB Atlas Best P racticesJanuary 20 19A MongoD B White P aper\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [7]:
# Open a client against the cluster identified by ATLAS_CONNECTION_STRING.
mongodb_client = pymongo.MongoClient(ATLAS_CONNECTION_STRING)

# Database, collection and search-index names the vector store is bound to.
vector_store_options = {
    "db_name": "llamaindex_test_db",
    "collection_name": "test",
    "index_name": "vector_index",
}

# Wrap the collection as a LlamaIndex vector store ...
atlas_vector_search = MongoDBAtlasVectorSearch(mongodb_client, **vector_store_options)

# ... and build a storage context around it for use when creating the index.
vector_store_context = StorageContext.from_defaults(
    vector_store=atlas_vector_search,
)

In [8]:
# Build the index from the loaded documents, storing it through the Atlas-backed
# storage context; show_progress=True renders progress bars while it runs.
# NOTE(review): re-running this cell presumably embeds and inserts the same chunks
# again into the collection -- confirm before executing it more than once.
vector_store_index = VectorStoreIndex.from_documents(
    sample_data,
    storage_context=vector_store_context,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 21/21 [00:00<00:00, 311.38it/s]
Generating embeddings: 100%|██████████| 259/259 [00:38<00:00,  6.70it/s]


In [None]:
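# Atlas search index definition (JSON) for the "vector_index" index on llamaindex_test_db.test, referenced by the vector store above: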
# {
#   "mappings": {
#     "dynamic": true,
#     "fields": {
#       "embedding": {
#         "dimensions": 1536,
#         "similarity": "cosine",
#         "type": "knnVector"
#       }
#     }
#   }
# }

In [9]:
# Question to put to the LLM.
question = 'How can I secure my MongoDB Atlas cluster?'

# Retriever over the Atlas-backed index, requesting the 5 most similar nodes per query.
vector_store_retriever = VectorIndexRetriever(
    index=vector_store_index,
    similarity_top_k=5,
)

# Query engine that answers using whatever the retriever returns.
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

# Run the question through the engine and keep the response for display.
response = query_engine.query(question)

In [10]:
# Print the response followed by the source-documents heading; the newline
# separator reproduces the output of two consecutive print() calls.
print(response, "\nSource documents: ", sep="\n")

# Pretty-print the nodes attached to the response as its sources.
source_nodes = response.source_nodes
pprint.pprint(source_nodes)

You can secure your MongoDB Atlas cluster by utilizing security features such as authentication, IP address whitelisting, encryption of data in motion over the network and at rest in persistent storage, and optionally configuring an additional layer of encryption on data at rest using the MongoDB Encrypted Storage Engine with an Atlas-compatible key management service like AWS Key Management Service or Azure Key Vault. Additionally, MongoDB Atlas provides built-in replication for high availability, backups for point-in-time recovery, fine-grained monitoring for scaling, and the ability to set up global clusters across different cloud platforms with ease through the MongoDB Atlas UI.

Source documents: 
[NodeWithScore(node=TextNode(id_='79700022-3e99-4391-8e16-321a1b31e243', embedding=None, metadata={'page_label': '3', 'file_name': 'atlas_best_practices.pdf', 'file_path': 'data\\atlas_best_practices.pdf', 'file_type': 'application/pdf', 'file_size': 512653, 'creation_date': '2024-04-26'