In [5]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message when a credential is missing.
# The previous `os.environ[k] = os.getenv(k)` form was a no-op when the
# variable existed and raised an opaque "TypeError: str expected, not NoneType"
# when it did not.
for _required_key in ("HUGGINGFACE_API_KEY", "PINECONE_API_KEY"):
    if not os.getenv(_required_key):
        raise RuntimeError(
            f"{_required_key} is not set; add it to your .env file or export it "
            "before running this notebook."
        )

True

In [6]:

index_name = "hybrid-search"

## Initialize the Pinecone client. No api_key is passed here, so it is expected
## to be picked up from the PINECONE_API_KEY environment variable.
pc = Pinecone()

## Create the index only when it does not exist yet
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # dense vector dimension
        metric='dotproduct',  # sparse values are supported only for dotproduct
        spec=serverless_spec,
    )

In [7]:
## Handle to the Pinecone index named by `index_name`
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x7fc5d1b37500>

In [8]:
## Dense vector embeddings
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-l6-v2")

## Alternative configuration kept for reference only (not executed):
# embeddings = HuggingFaceBgeEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-l6-v2",
#     model_kwargs={'device':'cpu'},
#     encode_kwargs={'normalize_embeddings':True}
#     )

## Show the configured embeddings object
embeddings



HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-l6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [31]:
## Sparse vector encoder (BM25)
from pinecone_text.sparse import BM25Encoder

# `default()` is a static factory that returns an encoder with pre-made BM25
# parameters, so call it on the class instead of building a throwaway
# instance first.
bm25_encoder = BM25Encoder.default()

In [46]:
# Display the encoder object to confirm it was created
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7fc370c8aae0>

In [35]:
# Download the NLTK 'punkt_tab' tokenizer data
# (presumably required by the BM25 encoder's tokenizer -- confirm)
import nltk

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /home/user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [38]:
# NOTE(review): "Ivisited" looks like a typo for "I visited"; it is left
# unchanged because this text is the data that gets fitted and indexed.
sentences = [
    "In 2023, I visited Paris",
    "In 2022, Ivisited New York",
    "In 2021, I visited Tokyo",
]

## Fit the BM25 encoder on these sentences
bm25_encoder.fit(sentences)

## Store the fitted values in a JSON file
bm25_encoder.dump("bm25_values.json")

## Load the stored values into a fresh BM25Encoder object
bm25_encoder = BM25Encoder().load("bm25_values.json")

  0%|          | 0/3 [00:00<?, ?it/s]

In [39]:
## Hybrid retriever combining the dense embeddings, the sparse BM25 encoder
## and the Pinecone index
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [40]:
# Display the retriever's configuration
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-l6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x7fc370c8aae0>, index=<pinecone.data.index.Index object at 0x7fc5d1b37500>)

In [41]:
## Index the same `sentences` the BM25 encoder was fitted on. Reusing the
## variable instead of repeating the literal keeps both in sync.
retriever.add_texts(sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

In [51]:
## Run a query through the hybrid retriever and display the returned documents
retriever.invoke("In which year i visited New York?")

[Document(page_content='In 2022, Ivisited New York'),
 Document(page_content='In 2021, I visited Tokyo'),
 Document(page_content='In 2023, I visited Paris')]