In [1]:
!pip install --upgrade --quiet  pinecone-client pinecone-text pinecone-notebooks


[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Never store the Pinecone API key in the notebook itself: read it from the
# PINECONE_API_KEY environment variable, and fall back to a hidden interactive
# prompt when the variable is not set.
import os
from getpass import getpass

api_key = os.getenv("PINECONE_API_KEY") or getpass("Pinecone API key: ")

In [2]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

Pinecone is a fully managed vector database, meaning it takes care of infrastructure, scaling, and maintenance.
<!--  -->
Pinecone runs in the cloud (AWS, GCP, Azure), making it accessible and highly available globally.
<!--  -->
It offers hybrid search capabilities, combining dense (semantic) and sparse (keyword) vectors in a single query.
<!--  -->
It also provides automatic scaling and performance tuning.

In [3]:
# Make sure the Pinecone index exists. The index is the vector database used
# for the similarity-search / information-retrieval steps of this notebook.

import os
from pinecone import Pinecone, ServerlessSpec

index_name = "hybrid-search"
pc = Pinecone(api_key=api_key)

existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    # Only create the index when no index with this name is listed yet.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # the Hugging Face embedding used here produces vectors of size 384
        dimension=384,
        # dotproduct is a good fit for sparse values
        metric="dotproduct",
        spec=serverless_spec,
    )

  from tqdm.autonotebook import tqdm


In [4]:
# Open a handle to the index named by `index_name` through the Pinecone client;
# the bare name on the last line shows the handle as the cell output.
index=pc.Index(index_name)
index

<pinecone.data.index.Index at 0x2016eb00b50>

In [6]:
# vector embedding and sparse matrix
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Re-export HF_TOKEN only when it is actually set. os.environ values must be
# strings, so assigning the None that os.getenv returns for a missing variable
# would raise a TypeError and abort the cell.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

from langchain_huggingface import HuggingFaceEmbeddings

# Dense embedding model; its 384-dimensional output matches the index dimension.
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')



In [7]:
# Display the embeddings object to inspect the loaded model and its settings.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

Set up a BM25 text encoder with default settings. It encodes text into sparse vectors for efficient keyword-based search in Pinecone.
<!--  -->
BM25 (Best Matching 25) is a ranking function used by search engines to estimate the relevance of documents to a given search query. It's part of a family of algorithms called probabilistic information retrieval models.
It builds on TF-IDF (term frequency–inverse document frequency) weighting.

In [14]:
from pinecone_text.sparse import BM25Encoder

# `default` is a factory that returns a ready-made encoder, so call it on the
# class directly instead of building a throwaway BM25Encoder() instance first.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x2011f8ffa10>

In [15]:
# Small corpus the sparse encoder is fitted on.
sentences = [
    "In 2023, I visited Paris",
    "In 2022, I visited New York",
    "In 2021, I visited New Orleans",
]

# JSON file that holds the fitted encoder values.
bm25_values_path = "bm25_values.json"

# Fit the encoder on the sentences above.
bm25_encoder.fit(sentences)

# Persist the fitted values to the JSON file ...
bm25_encoder.dump(bm25_values_path)

# ... and load them back into a fresh BM25Encoder object.
bm25_encoder = BM25Encoder().load(bm25_values_path)


100%|██████████| 3/3 [00:00<00:00,  9.78it/s]


In [16]:
# Combine the dense embedding model, the sparse BM25 encoder and the Pinecone
# index into a single hybrid-search retriever.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [17]:
# Display the retriever to inspect the embeddings, sparse encoder and index it holds.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x000002011F2FFD10>, index=<pinecone.data.index.Index object at 0x000002016EB00B50>)

In [19]:
# Index the same corpus the BM25 encoder was fitted on. Reusing `sentences`
# (defined in the cell that fits the encoder) keeps the fitted corpus and the
# indexed corpus in sync instead of repeating the literal list.
retriever.add_texts(sentences)
# texts get added to pinecone index

100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


In [20]:
# Run a hybrid query; each returned Document carries its score in `metadata`.
retriever.invoke("What city did i visit first?")

[Document(metadata={'score': 0.258085757}, page_content='In 2021, I visited New Orleans'),
 Document(metadata={'score': 0.244448498}, page_content='In 2022, I visited New York'),
 Document(metadata={'score': 0.220180511}, page_content='In 2023, I visited Paris')]

In [21]:
# Query that shares the keyword "2023" with one of the stored sentences.
retriever.invoke("What city did i visit in 2023?")

[Document(metadata={'score': 0.484895855}, page_content='In 2023, I visited Paris'),
 Document(metadata={'score': 0.374192238}, page_content='In 2022, I visited New York'),
 Document(metadata={'score': 0.331765205}, page_content='In 2021, I visited New Orleans')]

In [22]:
# Additional documents about Paris, New York and New Orleans to enrich the index.
city_facts = [
    "The Eiffel Tower in Paris is one of the most iconic landmarks in the world, attracting millions of tourists annually.",
    "The Statue of Liberty in New York symbolizes freedom and is a must-visit for history enthusiasts.",
    "Mardi Gras in New Orleans is known for its vibrant parades, colorful costumes, and lively street performances.",
    "The Louvre Museum in Paris houses some of the world's most famous artworks, including the Mona Lisa.",
    "Central Park in New York offers a serene escape from the city's hustle and bustle, with vast green spaces and scenic views.",
    "Jazz music in New Orleans has deep roots, with countless clubs offering live performances that capture the city’s soul.",
    "The cuisine in Paris is a delightful blend of classic French dishes, with famous pastries like croissants and macarons.",
    "Broadway shows in New York are a major attraction, featuring world-class performances in historic theaters.",
    "New Orleans’ French Quarter is famous for its rich history, vibrant nightlife, and unique Creole architecture.",
    "The Seine River in Paris offers scenic boat tours that provide stunning views of the city’s historic bridges and buildings.",
]

# Add the documents to the Pinecone index through the retriever.
retriever.add_texts(city_facts)

100%|██████████| 1/1 [00:02<00:00,  2.66s/it]


In [24]:
# Query the index now that the additional city texts have been added.
retriever.invoke("Places in Paris?")

[Document(metadata={'score': 0.416581571}, page_content='In 2023, I visited Paris'),
 Document(metadata={'score': 0.334014237}, page_content='The cuisine in Paris is a delightful blend of classic French dishes, with famous pastries like croissants and macarons.'),
 Document(metadata={'score': 0.284285545}, page_content='The Eiffel Tower in Paris is one of the most iconic landmarks in the world, attracting millions of tourists annually.'),
 Document(metadata={'score': 0.266835123}, page_content='The Seine River in Paris offers scenic boat tours that provide stunning views of the city’s historic bridges and buildings.')]

In [25]:
# Topic-style query about jazz music, with no city name in the query text.
retriever.invoke("places to enjoy jazz music")

[Document(metadata={'score': 0.390160829}, page_content='Jazz music in New Orleans has deep roots, with countless clubs offering live performances that capture the city’s soul.'),
 Document(metadata={'score': 0.184022754}, page_content='New Orleans’ French Quarter is famous for its rich history, vibrant nightlife, and unique Creole architecture.'),
 Document(metadata={'score': 0.145032972}, page_content='In 2021, I visited New Orleans'),
 Document(metadata={'score': 0.135576591}, page_content='Broadway shows in New York are a major attraction, featuring world-class performances in historic theaters.')]

If a user searches for "iconic landmarks in Paris," hybrid search can use both the semantic similarity to texts such as the one about the Eiffel Tower and the exact keyword match on "Paris".
<!--  -->
For a query like "places to enjoy jazz music," the text about jazz music in New Orleans ranks first because it matches both the keywords and the meaning, and the other New Orleans texts still rank highly through semantic similarity even though they never mention "jazz music".