#### **Vector DB**

In [1]:
# we'll generally use vector, 

import os 
from huggingface_hub import login
login(token=os.environ["HF_TOKEN"])

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
# lets use a HuggigFace model as encoder
import sentence_transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3")

normalize_embeddings=True makes embeddings:

- unit-length

- stable

- consistent

- cosine-similarity-friendly

In [3]:
# lets try encoding some sentences 
import time
sent1 = "Hey What are you working on?"

start = time.time()
embedding = model.encode(sent1, normalize_embeddings=True)
end = time.time()

print(f"Time takes : {(end-start)/60 :.2f} min")

Time takes : 0.38 min


In [4]:
# lets check the embedings
print(f"Length of embedding : {embedding.shape[0]}")
print(f"Embedding : {embedding}")

Length of embedding : 1024
Embedding : [-0.05459158  0.03189433 -0.04950237 ... -0.00609845 -0.01839233
  0.03193588]


---
**We should also try out huggingface embeddings**

---

In [5]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [6]:
text1 = "This is a test document."
embedding = embedding_model.embed_query(text1)

print(f"Length of embeddings are : {len(embedding)}")
print(f"Embedding : {embedding}")

Length of embeddings are : 768
Embedding : [-0.048951778560876846, -0.03986189514398575, -0.021562768146395683, 0.009908477775752544, -0.038103990256786346, 0.012684357352554798, 0.043494585901498795, 0.07183391600847244, 0.009748563170433044, -0.006987015716731548, 0.0635281428694725, -0.03032267466187477, 0.013839450664818287, 0.025805959478020668, -0.0011362320510670543, -0.014563615433871746, 0.04164029657840729, 0.03622830659151077, -0.02680082619190216, 0.025120696052908897, -0.024978619068861008, -0.0045332517474889755, -0.02666720375418663, 0.004100714344531298, -0.052047986537218094, -0.009930450469255447, -0.05206526443362236, 0.0089920898899436, -0.0383005328476429, -0.0440584234893322, -0.00420440873131156, 0.07047971338033676, 0.005133901257067919, -0.07161537557840347, 1.6975317294054548e-06, -0.00604772474616766, -0.011076384223997593, 0.017513377591967583, -0.022299878299236298, 0.04095499590039253, 0.03379019349813461, 0.05665036663413048, -0.07114934921264648, 0.04097

In [21]:
type(embedding)

list

---
### **Vectorstores**

---

In [15]:
# We'll only try faiss vectorstore 
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [16]:
# vectorstore for 1st model 
embedding_dim = 1024

index = faiss.IndexFlatL2(embedding_dim)

vector_store1 = FAISS(
    embedding_function=model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [31]:
## It is saying that our first model is not instance of Embeddings class so we need to wrap it
from langchain.embeddings.base import Embeddings
import numpy as np

class CustomSentTransformerEmbedding(Embeddings):
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return [list(embed) for embed in model.encode(texts)]
    
    def embed_query(self, query):
        return list(self.model.encode(query))

In [32]:
new_st_model = CustomSentTransformerEmbedding(model)
new_st_model

<__main__.CustomSentTransformerEmbedding at 0x167388a90>

In [33]:
sample_embedding = new_st_model.embed_query("Hey How are you?")

In [34]:
type(sample_embedding)

list

In [35]:
## lets try recreating the vectorstore 
vector_store1 = FAISS(
    embedding_function=new_st_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [36]:
## managing vectorstores 
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

In [37]:
vector_store1.add_documents(documents=documents, ids=uuids)

['05674116-4c0c-4178-8a84-3a91b30ef932',
 '23f707a4-43f2-4132-8321-3f530e1ec6df',
 'bd5f9355-2884-493c-8582-46927053f806',
 '549c476a-cbd6-4804-b41d-bb283d25423c',
 '628d57dc-7ca7-4237-90db-beecbf76cefe',
 'cc7defe5-2994-4a75-a7ed-2ff6aaf8b1a4',
 'f3083f19-385a-4888-ba68-aed8e71bbcb9',
 '84eb739f-6e0c-4bb8-a56a-236693fe5472',
 'a4366683-b949-4c83-a4ca-0628272ff1e7',
 '18b21c91-4c58-402c-97df-401572318b4a']

In [40]:
# making a similarity search to find relevant documents
results = vector_store1.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]
