## Hybrid Search

In [1]:
# Read the Pinecone API key from the environment instead of embedding it in the
# notebook, so the secret never ends up in version control or shared copies.
# NOTE: the key that used to be hardcoded here must be treated as leaked and
# rotated in the Pinecone console.
# api_key is None when PINECONE_API_KEY is unset; export it before running the
# cells below.
import os

api_key = os.environ.get("PINECONE_API_KEY")

In [2]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [3]:
import os
from pinecone import Pinecone, ServerlessSpec
# Name of the Pinecone index used by this hybrid-search demo.
index_name = "hybrid-search-langchain-pinecone"

# Initialise the Pinecone client with the configured API key.
pc = Pinecone(api_key=api_key)

# Create the serverless index only when it is not already listed for this
# account; otherwise the existing index is reused as-is.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # dimensionality of the dense embedding model
        metric="dotproduct",  # sparse values are supported only with dotproduct
        spec=serverless_spec,
    )

In [4]:
# Get a handle to the index by name, then echo it as the cell output.
index = pc.Index(name=index_name)
index

<pinecone.data.index.Index at 0x262ba31c750>

In [8]:
# Dense vector embeddings for the hybrid retriever.
import os

from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Export HF_TOKEN only when one is actually configured: os.getenv() returns
# None for a missing variable, and assigning None into os.environ raises
# TypeError ("str expected, not NoneType").
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

# Sentence-transformer model used for the dense side of the search.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
from pinecone_text.sparse import BM25Encoder

# Build the sparse BM25 encoder through the library's default() factory.
# default() is called on the class itself, so no throwaway BM25Encoder()
# instance has to be constructed first.
bm25_encoder = BM25Encoder.default()
bm25_encoder


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x262d5c8e110>

In [10]:
# Small example corpus; the same list is indexed and queried further down.
sentence = [
    "The quick brown fox jumps over the lazy dog",
    "The five boxing wizards jump quickly at dawn",
    "Pack my box with five dozen liquor jugs",
    "How vexingly quick witted zebras jump",
]

# Fit the BM25 encoder on the corpus above, then write the fitted values to a
# JSON file.
bm25_encoder.fit(sentence).dump("bm25_values.json")

  0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# Combine the dense embeddings, the BM25 sparse encoder and the Pinecone index
# into a single hybrid-search retriever.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [12]:
# Echo the retriever to check which embeddings, sparse encoder and index it holds.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x00000262D5C8E110>, index=<pinecone.data.index.Index object at 0x00000262BA31C750>)

In [13]:
# Add the example sentences through the hybrid retriever so they can be queried below.
retriever.add_texts(texts=sentence)

  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Run a hybrid query; the cell output lists the returned documents with their scores.
query = "Quick brown"
retriever.invoke(query)

[Document(metadata={'score': 0.434366345}, page_content='The quick brown fox jumps over the lazy dog'),
 Document(metadata={'score': 0.199603066}, page_content='How vexingly quick witted zebras jump'),
 Document(metadata={'score': 0.138112545}, page_content='The five boxing wizards jump quickly at dawn'),
 Document(metadata={'score': 0.0531437099}, page_content='Pack my box with five dozen liquor jugs')]

In [16]:
# Second hybrid query with different wording, to compare against the previous results.
query = "Quick wizard"
retriever.invoke(query)

[Document(metadata={'score': 0.40015161}, page_content='The five boxing wizards jump quickly at dawn'),
 Document(metadata={'score': 0.206498235}, page_content='The quick brown fox jumps over the lazy dog'),
 Document(metadata={'score': 0.199130937}, page_content='How vexingly quick witted zebras jump'),
 Document(metadata={'score': 0.0700001195}, page_content='Pack my box with five dozen liquor jugs')]