In [None]:
from dotenv import load_dotenv
import os
from langchain_huggingface import HuggingFaceEmbeddings



Config

In [None]:
load_dotenv()

In [6]:
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings_model

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
from langchain_astradb import AstraDBVectorStore

vector_store = AstraDBVectorStore(
    embedding=embeddings_model,
    collection_name="astra_vectors",
    api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
    token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
    namespace=None
)

In [10]:
from langchain_core.documents import Document

document_1 = Document(page_content="I had chocolate chips and scrambled eggs for breakfast this morning.", metadata={"source": "tweet"})
document_2 = Document(page_content="The capital of France is Paris.", metadata={"source": "wiki"})
document_3 = Document(page_content="The Eiffel Tower is located in Paris, France.", metadata={"source": "wiki"})
document_4 = Document(page_content="I love programming in Python!", metadata={"source": "blog"})
document_5 = Document(page_content="The Great Wall of China is one of the Seven Wonders of the World.", metadata={"source": "wiki"})
document_6 = Document(page_content="Artificial Intelligence is transforming the world.", metadata={"source": "article"})
document_7 = Document(page_content="The sun rises in the east and sets in the west.", metadata={"source": "fact"})
document_8 = Document(page_content="Mount Everest is the highest mountain on Earth.", metadata={"source": "geo"})
document_9 = Document(page_content="The Pacific Ocean is the largest ocean on Earth.", metadata={"source": "geo"})
document_10 = Document(page_content="Python is a popular programming language for data science.", metadata={"source": "blog"})

documents = [document_1, document_2, document_3, document_4, document_5, document_6, document_7, document_8, document_9, document_10]

In [11]:
vector_store.add_documents(documents=documents)

['fb36af16fb414903824f94449a4597c1',
 '7bfdf2ed16ac48479a9bb8d4c00cffbb',
 '6db1e8b488b0494987440541878b0ea7',
 '780b917dd805485fb1cd80e1b0ab9b3c',
 '324d095e6ef440c69449f8e5d226fc34',
 '8a92f1a5026349dd87de773b930940e3',
 'd15e260973fb4e89b296a2ec2829ebe1',
 '818da46cd1044049914c9501bc1380be',
 'a37f782ada0d40bfbaf0b44886f9bd90',
 '35b1a02987a04236bf865804b90e75d8']

Search

In [12]:
vector_store.similarity_search("Where is the Eiffel Tower located?")

[Document(id='6db1e8b488b0494987440541878b0ea7', metadata={'source': 'wiki'}, page_content='The Eiffel Tower is located in Paris, France.'),
 Document(id='7bfdf2ed16ac48479a9bb8d4c00cffbb', metadata={'source': 'wiki'}, page_content='The capital of France is Paris.'),
 Document(id='324d095e6ef440c69449f8e5d226fc34', metadata={'source': 'wiki'}, page_content='The Great Wall of China is one of the Seven Wonders of the World.'),
 Document(id='d15e260973fb4e89b296a2ec2829ebe1', metadata={'source': 'fact'}, page_content='The sun rises in the east and sets in the west.')]

In [13]:
results = vector_store.similarity_search("Which is the largest mountain",
                                         k=3,
                                         filter={"source": "geo"})

for result in results:
    print(f"* '{result.page_content}', metadata: {result.metadata}")

* 'Mount Everest is the highest mountain on Earth.', metadata: {'source': 'geo'}
* 'The Pacific Ocean is the largest ocean on Earth.', metadata: {'source': 'geo'}


In [14]:
results = vector_store.similarity_search_with_score("Which is the largest mountain",
                                         k=3,
                                         filter={"source": "geo"})

for res, score in results:
    print(f"* Score: {score} '{res.page_content}', metadata: {res.metadata}")

* Score: 0.8257852 'Mount Everest is the highest mountain on Earth.', metadata: {'source': 'geo'}
* Score: 0.7136688 'The Pacific Ocean is the largest ocean on Earth.', metadata: {'source': 'geo'}


Retriever

In [15]:
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.5}
)

retriever.invoke("Suggest me a good programming language", filter={"source": "blog"})

[Document(id='35b1a02987a04236bf865804b90e75d8', metadata={'source': 'blog'}, page_content='Python is a popular programming language for data science.')]

In [16]:
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1}
)

retriever.invoke("Suggest me a good programming language", filter={"source": "blog"})

[Document(id='35b1a02987a04236bf865804b90e75d8', metadata={'source': 'blog'}, page_content='Python is a popular programming language for data science.')]