## Setup

In [1]:
from dotenv import load_dotenv
import chromadb
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
#from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, AIMessage
from langchain_ollama import ChatOllama
import os

load_dotenv()

2026-01-10 09:59:14.193074: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


True

## Load the LLM

In [2]:
# Perplexity LLM ‚Äî Correct model name (December 2025)
perplexity_llm = ChatOpenAI(
    base_url="https://api.perplexity.ai",
    api_key=os.getenv("PERPLEXITY_API_KEY"),
    model="sonar-pro",           # ‚Üê THIS IS THE CORRECT MODEL
    temperature=0.7
)

google_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=os.getenv("GEMINI_API_KEY"),
    temperature=0.3,
    max_output_tokens=8192
)

ollama_llm = ChatOllama(
    model = 'llama3.2:latest',
    temperature=0.0,
    max_output_tokens=300
)

In [3]:
def llm(max_lokens=1000):
    ollama_llm = ChatOllama(
        model = 'llama3.2:latest',
        temperature=0.0,
        max_output_tokens=max_lokens
    )
    return ollama_llm

## Text splitter

In [4]:
# NEW correct import (post LangChain 0.1+)
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
def text_splitter(data, chunk_size, chunk_overlap):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_documents(data)
    return chunks

## Create the embedding model

In [6]:
# OpenAI Embeddings (most popular free alternative)
from langchain_openai import OpenAIEmbeddings

def openai_embedding():
    return OpenAIEmbeddings(
        model="text-embedding-3-small",
        api_key=os.getenv("OPENAI_API_KEY"),
        dimensions=1536  # Match WatsonX vector size
    )

# Google Gemini Embeddings (your existing API key)
from langchain_google_genai import GoogleGenerativeAIEmbeddings

def gemini_embedding():
    return GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=os.getenv("GEMINI_API_KEY")
    )

# Hugging Face (completely free, local)
from langchain_huggingface import HuggingFaceEmbeddings

def hf_embedding():
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}  # or 'cuda'
    )

# Ollama (local, free)
from langchain_community.embeddings import OllamaEmbeddings

def ollama_embedding():
    return OllamaEmbeddings(
        model="nomic-embed-text"
    )


## Retrievers

### Vector Store-Backed Retriever

In [7]:
!wget "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/MZ9z1lm-Ui3YBp3SYWLTAQ/companypolicies.txt"

--2026-01-10 09:59:17--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/MZ9z1lm-Ui3YBp3SYWLTAQ/companypolicies.txt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 198.23.119.245
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|198.23.119.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15660 (15K) [text/plain]
Saving to: ‚Äòcompanypolicies.txt.2‚Äô


2026-01-10 09:59:22 (179 MB/s) - ‚Äòcompanypolicies.txt.2‚Äô saved [15660/15660]



In [8]:
from langchain_community.document_loaders import TextLoader

In [9]:
loader = TextLoader("companypolicies.txt")
txt_data = loader.load()

In [10]:
len(txt_data)

1

In [11]:
chunks_txt = text_splitter(txt_data, 200, 25)
#hunks_txt

#### Store the embeddings into a `ChromaDB`.

In [12]:
from langchain_community.vectorstores import Chroma
#from `langchain_chroma import Chroma`

In [13]:
vectordb = Chroma.from_documents(chunks_txt, hf_embedding())

##### Simple similarity search

In [14]:
retriever = vectordb.as_retriever()

In [15]:
query = "email policy"
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy aims to promote safe, responsible usage of digital communication tools that align with our values and legal obligations. Each employee is expected to understand and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy is established to guide the responsible and secure use of these essential tools within our organization. We recognize their significance in daily business operations and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and sensitive customer data only when encryption is applied. Exercise discretion when discussing')]

In [16]:
#You can also specify `search kwargs` like `k` to limit the retrieval results.

retriever = vectordb.as_retriever(search_kwargs={"k": 1})
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy')]

##### MMR search

MMR in vector stores is a technique used to balance the relevance and diversity of retrieved results. It selects documents that are both highly relevant to the query and minimally similar to previously selected documents. This approach helps to avoid redundancy and ensures a more comprehensive coverage of different aspects of the query.


In [17]:
retriever = vectordb.as_retriever(search_type="mmr")
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and sensitive customer data only when encryption is applied. Exercise discretion when discussing'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Review of Policy: This policy will be reviewed periodically to ensure its alignment with evolving legal requirements and best practices for maintaining a healthy and safe workplace.'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='any individual found to be in violation of this policy.')]

##### Similarity score threshold retrieval

In [18]:
retriever = vectordb.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.4}
)
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy aims to promote safe, responsible usage of digital communication tools that align with our values and legal obligations. Each employee is expected to understand and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy is established to guide the responsible and secure use of these essential tools within our organization. We recognize their significance in daily business operations and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and sensitive customer data only when encryption is applied. Exercise discretion when discussing')]

### Multi-Query Retriever

Distance-based vector database retrieval represents queries in high-dimensional space and finds similar embedded documents based on "distance". However, retrieval results may vary with subtle changes in query wording or if the embeddings do not accurately capture the data's semantics.

The `MultiQueryRetriever` addresses this by using an LLM to generate multiple queries from different perspectives for a given user input query. For each query, it retrieves a set of relevant documents and then takes the unique union of these results to form a larger set of potentially relevant documents. By generating multiple perspectives on the same question, the `MultiQueryRetriever` can potentially overcome some limitations of distance-based retrieval, resulting in a richer and more diverse set of results.

The following picture shows the difference between retrievers solely based on distance and the Multi-Query Retriever.

<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/NCZCJ26bp3uKTa0gp8Agwg/multiquery.png" width="40%" alt="multiquery"/>


In [19]:
from langchain_community.document_loaders import PyPDFLoader

In [20]:
loader = PyPDFLoader("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf")
pdf_data = loader.load()

In [21]:
#pdf_data[1]

In [22]:
# Split
chunks_pdf = text_splitter(pdf_data, 500, 20)

# VectorDB
ids = vectordb.get()["ids"]
vectordb.delete(ids) # We need to delete existing embeddings from previous documents and then store current document embeddings in.
vectordb = Chroma.from_documents(documents=chunks_pdf, embedding=hf_embedding())

In [23]:
from langchain_classic.retrievers.multi_query import MultiQueryRetriever

query = "What does the paper say about langchain?"

retriever = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(search_kwargs={"k": 2}), 
    llm=llm() # ‚úÖ Correct: using Chat LLM
)

In [24]:
docs = retriever.invoke(query)
docs

[Document(metadata={'title': 's8329 final', 'page_label': '5', 'total_pages': 6, 'moddate': '2023-12-31T03:52:06+00:00', 'creator': 'Microsoft Word', 'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf', 'producer': 'PyPDF', 'author': 'IEEE', 'creationdate': '2023-12-31T03:50:13+00:00', 'page': 4}, page_content="question (Fig. 4b). \n‚Ä¢ MindGuide Chatbot's AI response to the \nsubsequent human message, followed by another \nmental health question from the human (Fig. 4c). \n‚Ä¢ MindGuide Chatbot's AI response after \nanalyzing the latest human message (Fig. 4d). \n \n   s \n                                                         (a)      (b) \n      \n                                                         (c)      (d)"),
 Document(metadata={'page_label': '5', 'title': 's8329 final', 'creator': 'Microsoft Word', 'page': 4, 'creationdate': '2023-12-31T03:50:13+00:00', 'source': 'https://cf-courses-data.s3.us.cloud-o

In [25]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

def create_multi_query_retriever(vectordb, llm):
    """Custom MultiQueryRetriever - works with your exact setup"""
    
    def retrieve_with_multi_query(query):
        # Generate multiple queries using LLM
        prompt = ChatPromptTemplate.from_template("""
        Generate 3 different versions of this query for better document retrieval:
        Original: {question}
        
        Return 3 queries separated by "|||":
        """)
        
        chain = prompt | llm | StrOutputParser()
        response = chain.invoke({"question": query})
        
        # Parse queries
        queries = [q.strip() for q in response.split("|||") if q.strip()]
        print(f"üîç Generated {len(queries)} queries:")
        for i, q in enumerate(queries[:3], 1):
            print(f"  {i}. {q}")
        
        # Retrieve documents for each query
        all_docs = []
        for query_variant in queries[:3]:
            docs = vectordb.similarity_search(query_variant, k=3)
            all_docs.extend(docs)
        
        # Remove duplicates and return top results
        unique_docs = []
        seen_sources = set()
        for doc in all_docs:
            source_key = f"{doc.metadata.get('source', '')}-{doc.metadata.get('page', 0)}"
            if source_key not in seen_sources:
                unique_docs.append(doc)
                seen_sources.add(source_key)
        
        return unique_docs[:8]
    
    return retrieve_with_multi_query

# ‚úÖ Use it exactly like MultiQueryRetriever
query = "What does the paper say about langchain?"
multi_retriever = create_multi_query_retriever(vectordb, llm())

# Works exactly the same!
docs = multi_retriever(query)
print(f"\n‚úÖ Retrieved {len(docs)} unique documents")
for i, doc in enumerate(docs):
    print(f"{i+1}: {doc.page_content[:150]}...")

üîç Generated 1 queries:
  1. Here are three alternative queries that can help improve document retrieval:

1. What is discussed in the paper regarding LangChain?
2. Can you summarize the content related to LangChain in this paper?
3. How does LangChain relate to the topics covered in this research paper?

These revised queries use different wordings and phrasings to capture more specific information about what the user is looking for, which can help improve the accuracy of document retrieval results.

‚úÖ Retrieved 1 unique documents
1: LangChain helps us to unlock the ability to harness the 
LLM‚Äôs immense potential in tasks such as document analysis, 
chatbot development, code analys...


### Self-Querying Retriever

A Self-Querying Retriever, as the name suggests, has the ability to query itself. Specifically, given a natural language query, the retriever uses a query-constructing LLM chain to generate a structured query. It then applies this structured query to its underlying vector store. This enables the retriever to not only use the user-input query for semantic similarity comparison with the contents of stored documents but also to extract and apply filters based on the metadata of those documents.


In [26]:
from langchain_core.documents import Document
#from langchain.chains.query_constructor.base import AttributeInfo
from langchain_classic.chains.query_constructor.base import AttributeInfo
from langchain_classic.retrievers.self_query.base import SelfQueryRetriever
from lark import lark

In [27]:
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "thriller",
            "rating": 9.9,
        },
    ),
]

In [28]:
metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie. One of ['science fiction', 'comedy', 'drama', 'thriller', 'romance', 'action', 'animated']",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the movie was released",
        type="integer",
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director",
        type="string",
    ),
    AttributeInfo(
        name="rating", description="A 1-10 rating for the movie", type="float"
    ),
]

In [29]:
vectordb = Chroma.from_documents(docs, hf_embedding())

In [30]:
document_content_description = "Brief summary of a movie."

retriever = SelfQueryRetriever.from_llm(
    llm(),
    vectordb,
    document_content_description,
    metadata_field_info,
)

In [31]:
# This example only specifies a filter
retriever.invoke("I want to watch a movie rated higher than 8.5")

[Document(metadata={'year': 2006, 'director': 'Satoshi Kon', 'rating': 8.6}, page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea'),
 Document(metadata={'genre': 'thriller', 'rating': 9.9, 'year': 1979, 'director': 'Andrei Tarkovsky'}, page_content='Three men walk into the Zone, three men walk out of the Zone')]

In [56]:
from langchain_core.exceptions import OutputParserException

# Solution 2: Use proper exception handling
try:
    results = retriever.invoke("Has Greta Gerwig directed any movies about women")
    print(f"Found {len(results)} results")
except OutputParserException as e:
    print(f"Output parsing error: {e}")
    # Fallback: try a simpler query
    try:
        results = retriever.invoke("Greta Gerwig women movies")
        print(f"Fallback query returned {len(results)} results")
    except Exception as fallback_error:
        print(f"Fallback also failed: {fallback_error}")
except Exception as e:
    print(f"General error: {e}")

Found 1 results


In [57]:
results

[Document(id='02a80ad5-f843-4946-8475-1b44da921d47', metadata={'rating': 8.3, 'year': 2019, 'director': 'Greta Gerwig'}, page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them')]

In [34]:
# This example specifies a composite filter
retriever.invoke("What's a highly rated (above 8.5) science fiction film?")

[Document(metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction'}, page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose')]

### Parent Document Retriever

When splitting documents for retrieval, there are often conflicting desires:

1. You may want to have small documents so that their embeddings can most accurately reflect their meaning. If the documents are too long, the embeddings can lose meaning.
2. You want to have long enough documents so that the context of each chunk is retained.

The `ParentDocumentRetriever` strikes that balance by splitting and storing small chunks of data. During retrieval, it first fetches the small chunks but then looks up the parent IDs for those chunks and returns those larger documents.


In [38]:
from langchain_classic.retrievers import ParentDocumentRetriever
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.stores import InMemoryStore  # ‚úÖ CORRECT BaseStore
#from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma

In [39]:
# Set two splitters. One is with big chunk size (parent) and one is with small chunk size (child)
parent_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separator='\n')
child_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=30, separator='\n')

In [40]:
vectordb = Chroma(
    collection_name="split_parents", embedding_function=hf_embedding()
)
#vectordb = Chroma.from_documents(documents=chunks_pdf, embedding=watsonx_embedding())


In [41]:
# The storage layer for the parent documents
store = InMemoryStore()

In [42]:
retriever = ParentDocumentRetriever(
    vectorstore=vectordb,  # Your existing Chroma
    docstore=store,        # ‚úÖ InMemoryStore (KeyValue)
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [43]:
retriever.add_documents(txt_data)

Created a chunk of size 223, which is longer than the specified 200
Created a chunk of size 274, which is longer than the specified 200
Created a chunk of size 262, which is longer than the specified 200
Created a chunk of size 282, which is longer than the specified 200
Created a chunk of size 262, which is longer than the specified 200
Created a chunk of size 270, which is longer than the specified 200
Created a chunk of size 224, which is longer than the specified 200
Created a chunk of size 325, which is longer than the specified 200
Created a chunk of size 300, which is longer than the specified 200
Created a chunk of size 216, which is longer than the specified 200
Created a chunk of size 242, which is longer than the specified 200
Created a chunk of size 235, which is longer than the specified 200
Created a chunk of size 300, which is longer than the specified 200
Created a chunk of size 294, which is longer than the specified 200
Created a chunk of size 234, which is longer tha

In [44]:
len(list(store.yield_keys()))

19

In [45]:
sub_docs = vectordb.similarity_search("smoking policy")

In [46]:
print(sub_docs[0].page_content)

5.	Smoking Policy


In [47]:
retrieved_docs = retriever.invoke("smoking policy")
print(retrieved_docs[0].page_content)

The Mobile Phone Policy is aimed at promoting the responsible and secure use of mobile devices in line with legal and ethical standards. Every employee is expected to comprehend and abide by these guidelines. Regular reviews of the policy ensure its ongoing alignment with evolving technology and security best practices.
5.	Smoking Policy
Policy Purpose: The Smoking Policy has been established to provide clear guidance and expectations concerning smoking on company premises. This policy is in place to ensure a safe and healthy environment for all employees, visitors, and the general public.
Designated Smoking Areas: Smoking is only permitted in designated smoking areas, as marked by appropriate signage. These areas have been chosen to minimize exposure to secondhand smoke and to maintain the overall cleanliness of the premises.


### Retrieve Top 2 Results Using a Vector Store-Backed Retriever

In [48]:
vectordb = Chroma.from_documents(documents=chunks_txt, embedding=hf_embedding())
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
query = "Drinking Policy"
docs = retriever.invoke(query)
docs

[Document(id='99d767bf-3604-4f4e-8d06-b57134aedfa1', metadata={'source': 'companypolicies.txt'}, page_content='6.\tDrug and Alcohol Policy'),
 Document(id='90a643f5-fa43-49ce-a63f-a546de8e93f8', metadata={'source': 'companypolicies.txt'}, page_content='Policy Objective: The Drug and Alcohol Policy is established to establish clear expectations and guidelines for the responsible use of drugs and alcohol within the organization. This policy aims to')]

### Self-Querying Retriever for a Query

In [49]:
retriever = SelfQueryRetriever.from_llm(
    llm(),
    vectordb,
    document_content_description,
    metadata_field_info,
)

retriever.invoke(
    "Name the best mystry movie."
)

[Document(id='5735a942-35aa-4370-ae52-20395ba50f95', metadata={'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'}, page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea'),
 Document(id='d1392d4a-b217-49ce-b69b-b5e31da815a0', metadata={'source': 'companypolicies.txt'}, page_content='online activity or potential security breaches.'),
 Document(id='64d4e58a-3552-44cb-8a36-8c18f47cb0b6', metadata={'source': 'companypolicies.txt'}, page_content='concerns or suspicious activities related to your mobile device.'),
 Document(id='0cc9ab8e-cd62-4516-97df-5d89233c5762', metadata={'director': 'Christopher Nolan', 'rating': 8.2, 'year': 2010}, page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...')]