Hybrid Search Using LangChain


In [1]:
!pip install --upgrade --quiet pinecone-text pinecone-notebooks


In [3]:
# Retriever that combines dense-vector (semantic) search with sparse-vector
# (keyword) search against a single Pinecone index.
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [6]:
import os

from dotenv import load_dotenv
from pinecone import Pinecone

# Read the API key from the environment (or a local .env file) instead of
# hardcoding it in the notebook, where it would leak through version control
# and shared outputs.
load_dotenv()
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to your .env file."
    )

# Initialize Pinecone. The cloud/region for a serverless index is given in the
# index spec below, so no `environment` argument is passed to the client.
pc = Pinecone(api_key=pinecone_api_key)

index_name = "hybrid-search-langchain-pinecone"

# List indexes
indexes = pc.list_indexes()
print("Indexes:", indexes)

# Only create the index if it doesn't exist yet
existing_index_names = {idx["name"] for idx in indexes}
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        # Must equal the embedding size (384 for the all-MiniLM-L6-v2 model
        # used later in this notebook).
        dimension=384,
        # Sparse-dense (hybrid) queries are only supported on dotproduct
        # indexes; without this the index would be created with the default
        # metric and hybrid upserts/queries would be rejected.
        metric="dotproduct",
        spec={
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1",
            }
        },
    )

# Connect to the index
index = pc.Index(index_name)

print("Connected to index:", index_name)


Indexes: [{
    "name": "hybrid-search-langchain-pinecone",
    "metric": "dotproduct",
    "host": "hybrid-search-langchain-pinecone-ulryqfy.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}]
Connected to index: hybrid-search-langchain-pinecone


In [7]:
# Re-open a handle to the same index (already connected in the previous cell)
# and display it to confirm the client object was created.
index=pc.Index(index_name)
index

<pinecone.db_data.index.Index at 0x227d76e5820>

In [8]:
pip install langchain_huggingface

Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
# For standard Python users
!pip install setuptools



In [11]:
## Dense vector embeddings
import os

from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings

load_dotenv()

# Hugging Face access token (create one at https://huggingface.co/settings/tokens).
# The original `os.environ["HF_Token"] = os.getenv("HF_Token")` raised TypeError
# when the variable was missing, because environment values must be strings.
# The token is now optional, and when present it is also exposed as HF_TOKEN,
# the variable name the Hugging Face libraries read.
hf_token = os.getenv("HF_Token") or os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_Token"] = hf_token
    os.environ.setdefault("HF_TOKEN", hf_token)

# Sentence-transformers model that maps text to a 384-dimensional vector;
# this has to match the `dimension` of the Pinecone index.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings






HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
# Re-display the embeddings object (same repr as shown by the previous cell).
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [13]:
!pip install pinecone-text



In [14]:
# BM25 sparse encoder: turns text into sparse term-weight vectors using BM25
# scoring (a relative of TF-IDF, not plain TF-IDF).
from pinecone_text.sparse import BM25Encoder

# `default()` is a factory that returns a ready-made encoder, so it is called
# on the class directly instead of first building a throwaway instance.
# Per the pinecone-text docs it loads pre-computed parameters; the encoder is
# re-fitted on this notebook's own sentences in a later cell.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x227f06478f0>

In [15]:
# Tiny demo corpus: one sentence per (year, city) trip.
sentences = [
    f"In {year},I Visited {city}"
    for year, city in ((2023, "Paris"), (2022, "New York"), (2021, "New Orleans"))
]

# Fit the BM25 encoder on the demo corpus.
bm25_encoder.fit(sentences)

# Persist the fitted values to a JSON file.
bm25_encoder.dump("bm25_values.json")



  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# Hybrid retriever: wires the dense embedding model and the BM25 sparse encoder
# to the Pinecone index and returns the 3 best matches per query.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
    top_k=3,
)

In [17]:
# Display the retriever's configuration: the embedding model, the sparse
# encoder, the Pinecone index handle and top_k. The retriever is the query
# interface; the data itself lives in the Pinecone index it wraps.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x00000227F06478F0>, index=<pinecone.db_data.index.Index object at 0x00000227D76E5820>, top_k=3)

In [18]:
# Add the demo sentences to the Pinecone index through the retriever.
# This only writes data; no search is performed here.
retriever.add_texts(sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Query containing an explicit year. The recorded output ranks the 2023
# sentence first, with a clearly higher score than the other two.
retriever.invoke("What City Did i Visit in 2023?")

[Document(metadata={'score': 0.485442907}, page_content='In 2023,I Visited Paris'),
 Document(metadata={'score': 0.374639153}, page_content='In 2022,I Visited New York'),
 Document(metadata={'score': 0.331566453}, page_content='In 2021,I Visited New Orleans')]

In [20]:
# Relative-time query with no year to match on. In the recorded output the
# 2021 sentence ranks first, so the score ordering should not be read as an
# answer to which trip was the latest.
retriever.invoke("What City Did i Visit in last?")

[Document(metadata={'score': 0.284371018}, page_content='In 2021,I Visited New Orleans'),
 Document(metadata={'score': 0.259336591}, page_content='In 2022,I Visited New York'),
 Document(metadata={'score': 0.236757725}, page_content='In 2023,I Visited Paris')]

In [21]:
# Another relative-time query. The recorded output again ranks the 2021
# sentence first, not the most recent (2023) trip.
retriever.invoke("What City Did i Visit in recently?")

[Document(metadata={'score': 0.290020108}, page_content='In 2021,I Visited New Orleans'),
 Document(metadata={'score': 0.251049638}, page_content='In 2022,I Visited New York'),
 Document(metadata={'score': 0.222349375}, page_content='In 2023,I Visited Paris')]

In [22]:
# The recorded ranking here is the same as for the "last" and "recently"
# queries (2021, 2022, 2023), so the correct-looking top hit should not be
# taken as evidence that the retriever understood "first".
retriever.invoke("What City Did i Visit in first?")

[Document(metadata={'score': 0.252177835}, page_content='In 2021,I Visited New Orleans'),
 Document(metadata={'score': 0.246452466}, page_content='In 2022,I Visited New York'),
 Document(metadata={'score': 0.221383303}, page_content='In 2023,I Visited Paris')]