<a href="https://colab.research.google.com/github/Saifullah785/langchain-generative-ai-journey/blob/main/Lecture_13_langchain_retrievers/Lecture_13_langchain_retrievers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
# Set the HUGGINGFACE_API_KEY environment variable
os.environ['HUGGINGFACE_API_KEY'] = 'hf_token'

In [3]:
# Install necessary libraries for LangChain, ChromaDB, FAISS, sentence-transformers, tiktoken, HuggingFace, and Wikipedia integration
! pip install langchain chromadb faiss-cpu sentence-transformers tiktoken langchain_HuggingFace langchain_community wikipedia

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain_HuggingFace
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28

# **Wikipedia Retriever**

In [None]:
# Import the WikipediaRetriever class from the langchain_community module
from langchain_community.retrievers import WikipediaRetriever

In [None]:
# Initialize the WikipediaRetriever with a specified number of results and language
retriever = WikipediaRetriever(top_k_results=2, lang='en')

In [None]:
# Define the query string for the Wikipedia search
query = 'the geopolitical history of india and pakistan from the perspective of a chinese'

In [None]:
# Invoke the retriever with the query to get relevant documents from Wikipedia
docs = retriever.invoke(query)
# Display the retrieved documents
docs

[Document(metadata={'title': 'China–India relations', 'summary': "China and India maintained peaceful relations for thousands of years, but their relationship has varied since the Chinese Communist Party's victory in the Chinese Civil War in 1949 and the annexation of Tibet by the People's Republic of China. The two nations have sought economic cooperation with each other, while frequent border disputes and economic nationalism in both countries are major points of contention.\nCultural and economic relations between China and India date back to ancient times. The Silk Road not only served as a major trade route between India and China, but is also credited for facilitating the spread of Buddhism from India to East Asia. During the 19th century, China was involved in a growing opium trade with the East India Company, which exported opium grown in India. During World War II, both British India and the Republic of China (ROC) played a crucial role in halting the progress of Imperial Japa

In [None]:
# Iterate through the retrieved documents and print their content
for i , doc in enumerate(docs):
    print(f'\n--- Result {i+1} --- ')
    print(f'Content:\n{doc.page_content}..')


--- Result 1 --- 
Content:
China and India maintained peaceful relations for thousands of years, but their relationship has varied since the Chinese Communist Party's victory in the Chinese Civil War in 1949 and the annexation of Tibet by the People's Republic of China. The two nations have sought economic cooperation with each other, while frequent border disputes and economic nationalism in both countries are major points of contention.
Cultural and economic relations between China and India date back to ancient times. The Silk Road not only served as a major trade route between India and China, but is also credited for facilitating the spread of Buddhism from India to East Asia. During the 19th century, China was involved in a growing opium trade with the East India Company, which exported opium grown in India. During World War II, both British India and the Republic of China (ROC) played a crucial role in halting the progress of Imperial Japan. After India became independent in 19

#**Vector Store Retriever**

In [None]:
# Import necessary classes from langchain_community and langchain_core for working with vector stores and documents
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

In [None]:
# Define a list of Document objects with example text content
documents = [
    Document(page_content="LangChain helps developers build LLM applications easily"),
    Document(page_content="Chroma is a vector database optimized for LLM-based search"),
    Document(page_content='Embeddings convert text into high-dimensional vectors.'),
    Document(page_content='OpenAI provides powerful embedding models.'),
]

In [None]:
# Initialize the HuggingFaceEmbeddings model for generating embeddings
embeddings_model = HuggingFaceEmbeddings()

# Create a Chroma vector store from the documents and embeddings model
vectorstores = Chroma.from_documents(
    documents =documents,
    embedding = embeddings_model,
    collection_name='my_collection'

)

  embeddings_model = HuggingFaceEmbeddings()


In [None]:
# Convert the vector store into a retriever with a specified number of search results
retriever = vectorstores.as_retriever(search_kwargs={'k': 2})

In [None]:
# Define the query string for the vector store search
query = 'what is Chroma used for?'
# Invoke the retriever with the query to get relevant documents from the vector store
results = retriever.invoke(query)

In [None]:
# Iterate through the search results and print the content of each document
for i, doc in enumerate(results):
  print(f'\n--- Result {i+1} --- ')
  print(f'Content:\n{doc.page_content}..')


--- Result 1 --- 
Content:
Chroma is a vector database optimized for LLM-based search..

--- Result 2 --- 
Content:
Chroma is a vector database optimized for LLM-based search..


In [None]:
# Iterate through the search results again and print the content of each document
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)


--- Result 1 ---
Chroma is a vector database optimized for LLM-based search

--- Result 2 ---
Chroma is a vector database optimized for LLM-based search


# **MMR (Maximum Marginal Relevance)**

In [None]:
# Define a list of Document objects with example text content for MMR
docs = [
    Document(page_content="LangChain makes it easy to work with LLMs."),
    Document(page_content="LangChain is used to build LLM based applications."),
    Document(page_content="Chroma is used to store and search document embeddings."),
    Document(page_content="Embeddings are vector representations of text."),
    Document(page_content="MMR helps you get diverse results when doing similarity search."),
    Document(page_content="LangChain supports Chroma, FAISS, Pinecone, and more."),
]

In [None]:
# Import the FAISS vector store class from the langchain_community module
from langchain_community.vectorstores import FAISS

In [None]:
# Initialize the HuggingFaceEmbeddings model (this will use a default model)
embeddings_model = HuggingFaceEmbeddings()
# Create a FAISS vector store from the documents and embeddings model
vectorstore = FAISS.from_documents(
    documents = docs,
    embedding = embeddings_model
    )

  embeddings_model = HuggingFaceEmbeddings()


In [None]:
# Convert the FAISS vector store into a retriever using MMR search
retriever = vectorstore.as_retriever(
    search_type='mmr', # Specify Maximum Marginal Relevance search
    search_kwargs = {'k': 3, 'lambda_mult': 0.5} # Configure search parameters (k=number of results, lambda_mult=diversity)
)

In [None]:
# Define the query string for the MMR search
query = 'what is langchain?'
# Invoke the retriever with the query to get relevant documents using MMR
results = retriever.invoke(query)

In [None]:
# Iterate through the MMR search results and print the content of each document
for i, doc in enumerate(results):
  print(f'\n --- Result {i+1} --- ')
  print(f'Content:\n{doc.page_content}..')


 --- Result 1 --- 
Content:
LangChain is used to build LLM based applications...

 --- Result 2 --- 
Content:
Embeddings are vector representations of text...

 --- Result 3 --- 
Content:
LangChain supports Chroma, FAISS, Pinecone, and more...


# **Multiquery Retriever**

In [4]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEndpoint,ChatHuggingFace
from langchain.retrievers.multi_query import MultiQueryRetriever

In [5]:
# Initialize the HuggingFaceEndpoint with the Meta-Llama model for text generation.
llm = HuggingFaceEndpoint(
    repo_id ='meta-llama/Meta-Llama-3-8B-Instruct',
    task = 'text-generation'
)

# Wrap the endpoint in a ChatHuggingFace object for chat-style interaction.
model = ChatHuggingFace(llm=llm)

In [6]:
# Define a list of Document objects with example text content and metadata
all_docs = [
    Document(page_content="Regular walking boosts heart health and can reduce symptoms of depression.", metadata={"source": "H1"}),
    Document(page_content="Consuming leafy greens and fruits helps detox the body and improve longevity.", metadata={"source": "H2"}),
    Document(page_content="Deep sleep is crucial for cellular repair and emotional regulation.", metadata={"source": "H3"}),
    Document(page_content="Mindfulness and controlled breathing lower cortisol and improve mental clarity.", metadata={"source": "H4"}),
    Document(page_content="Drinking sufficient water throughout the day helps maintain metabolism and energy.", metadata={"source": "H5"}),
    Document(page_content="The solar energy system in modern homes helps balance electricity demand.", metadata={"source": "I1"}),
    Document(page_content="Python balances readability with power, making it a popular system design language.", metadata={"source": "I2"}),
    Document(page_content="Photosynthesis enables plants to produce energy by converting sunlight.", metadata={"source": "I3"}),
    Document(page_content="The 2022 FIFA World Cup was held in Qatar and drew global energy and excitement.", metadata={"source": "I4"}),
    Document(page_content="Black holes bend spacetime and store immense gravitational energy.", metadata={"source": "I5"}),
]

In [7]:
# Initialize the HuggingFaceEmbeddings model for generating embeddings
embeddings_model = HuggingFaceEmbeddings()

# Create a FAISS vector store from the documents and embeddings model
vectorstore = FAISS.from_documents(documents=all_docs,
                                   embedding=embeddings_model)

  embeddings_model = HuggingFaceEmbeddings()
  embeddings_model = HuggingFaceEmbeddings()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# Convert the FAISS vector store into a similarity retriever with a specified number of search results
similarity_retriever = vectorstore.as_retriever(search_type='similarity',
                                               search_kwargs={'k': 5})

In [14]:
# Create a MultiQueryRetriever from the vector store and the language model
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever = vectorstore.as_retriever(search_kwargs={'k':5}),
    llm = model,
)

In [15]:
# Define the query string for the retrievers
query = 'How to improve energy levels and maintain balance?'

In [16]:
# Invoke the similarity retriever with the query to get relevant documents
similarity_results = similarity_retriever.invoke(query)
# Invoke the multiquery retriever with the query to get relevant documents
multiquery_results = multiquery_retriever.invoke(query)

In [18]:
# Iterate through the similarity search results and print the content of each document
for i, doc in enumerate(similarity_results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)

print("*"*150)

# Iterate through the multiquery search results and print the content of each document
for i, doc in enumerate(multiquery_results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)


--- Result 1 ---
Drinking sufficient water throughout the day helps maintain metabolism and energy.

--- Result 2 ---
Regular walking boosts heart health and can reduce symptoms of depression.

--- Result 3 ---
Mindfulness and controlled breathing lower cortisol and improve mental clarity.

--- Result 4 ---
Consuming leafy greens and fruits helps detox the body and improve longevity.

--- Result 5 ---
The solar energy system in modern homes helps balance electricity demand.
******************************************************************************************************************************************************

--- Result 1 ---
Black holes bend spacetime and store immense gravitational energy.

--- Result 2 ---
Photosynthesis enables plants to produce energy by converting sunlight.

--- Result 3 ---
The 2022 FIFA World Cup was held in Qatar and drew global energy and excitement.

--- Result 4 ---
Python balances readability with power, making it a popular system design lan

# **Contextual Compression Retriever**

In [20]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.documents import Document

In [21]:
# Define a list of Document objects with example text content and metadata for contextual compression
docs = [
    Document(page_content=(
        """The Grand Canyon is one of the most visited natural wonders in the world.
        Photosynthesis is the process by which green plants convert sunlight into energy.
        Millions of tourists travel to see it every year. The rocks date back millions of years."""
    ), metadata={"source": "Doc1"}),

    Document(page_content=(
        """In medieval Europe, castles were built primarily for defense.
        The chlorophyll in plant cells captures sunlight during photosynthesis.
        Knights wore armor made of metal. Siege weapons were often used to breach castle walls."""
    ), metadata={"source": "Doc2"}),

    Document(page_content=(
        """Basketball was invented by Dr. James Naismith in the late 19th century.
        It was originally played with a soccer ball and peach baskets. NBA is now a global league."""
    ), metadata={"source": "Doc3"}),

    Document(page_content=(
        """The history of cinema began in the late 1800s. Silent films were the earliest form.
        Thomas Edison was among the pioneers. Photosynthesis does not occur in animal cells.
        Modern filmmaking involves complex CGI and sound design."""
    ), metadata={"source": "Doc4"})
]

In [25]:
# Initialize the HuggingFaceEmbeddings model for generating embeddings
embeddings_model = HuggingFaceEmbeddings()

# Create a FAISS vector store from the documents and embeddings model
vectorstore = FAISS.from_documents(documents=docs,
                                   embedding=embeddings_model)

  embeddings_model = HuggingFaceEmbeddings()


In [26]:
# Convert the FAISS vector store into a base retriever with a specified number of search results
base_retriever = vectorstore.as_retriever(search_kwargs={'k': 5})

In [27]:
# Initialize the HuggingFaceEndpoint with the Meta-Llama model for text generation.
llm = HuggingFaceEndpoint(
    repo_id ='meta-llama/Meta-Llama-3-8B-Instruct',
    task = 'text-generation'
)

# Wrap the endpoint in a ChatHuggingFace object for chat-style interaction.
model = ChatHuggingFace(llm=llm)

In [28]:
# Create an LLMChainExtractor to compress documents using the language model
compressor = LLMChainExtractor.from_llm(model)

In [29]:
# Create a ContextualCompressionRetriever using the base retriever and the compressor
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)

In [30]:
# Define the query string for the contextual compression retriever
query = 'what is photosynthesis?'
# Invoke the compression retriever with the query to get compressed results
compressed_results = compression_retriever.invoke(query)

In [31]:
# Iterate through the compressed search results and print the content of each document
for i, doc in enumerate(compressed_results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)


--- Result 1 ---
Photosynthesis is the process by which green plants convert sunlight into energy.
        Millions of tourists travel to see the Grand Canyon every year. The rocks date back millions of years.

--- Result 2 ---
Photosynthesis does not occur in animal cells.

--- Result 3 ---
The chlorophyll in plant cells captures sunlight during photosynthesis.
