In [1]:
from langchain_chroma.vectorstores import Chroma

from langchain_ollama.llms import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings

from langchain.document_loaders.pdf import PyPDFDirectoryLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.schema.document import Document

from langchain_openai import OpenAIEmbeddings

from langchain.prompts import ChatPromptTemplate

In [2]:
import os
import re
import sys

from dotenv import load_dotenv
load_dotenv()

# os.environ['HF_HOME']="/Users/nikhil20.sharma/Desktop/langchain/.cache"

# Print all environment variables loaded from .env
print("Loaded Environment Variables:")
for key, value in os.environ.items():
    if key in ['OPENAI_API_KEY', 'LANGSMITH_AIP_KEY', 'HUGGINGFACE_TOKEN']:
        # Mask sensitive values for security
        masked_value = value[:8] + "..." + value[-4:] if value else value
        print(f"- {key}: {masked_value}")

Loaded Environment Variables:
- OPENAI_API_KEY: sk-proj-...znMA
- HUGGINGFACE_TOKEN: hf_Xbepg...LTAM
- LANGSMITH_AIP_KEY: lsv2_pt_...4996


In [3]:
def load_documents(document_path):
    """Load documents from ``document_path`` with ``PyPDFDirectoryLoader``.

    Returns whatever the loader's ``load()`` call produces.
    """
    return PyPDFDirectoryLoader(document_path).load()

In [4]:
def split_documents(
    documents: list[Document],
    chunk_size: int = 800,
    chunk_overlap: int = 80,
):
    """Split documents into overlapping chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            800, the value this notebook originally hard-coded.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``. Defaults to 80, the original hard-coded value.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [5]:
def get_embedding_from_ollama():
    """Build an ``OllamaEmbeddings`` instance for the 'nomic-embed-text' model."""
    return OllamaEmbeddings(model="nomic-embed-text")

In [6]:
# OpenAI embedding client using the "text-embedding-3-small" model.
# Options the author left disabled:
#   - model="text-embedding-3-large"
#   - dimensions=1024 (with the `text-embedding-3` class of models, you can
#     specify the size of the embeddings you want returned)
# NOTE(review): the cells visible in this notebook build their vector store
# with get_embedding_function() (Ollama), not with this object - confirm
# whether it is still needed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [7]:
def get_embedding_function():
    """Return the embedding object used for the Chroma store.

    Currently an ``OllamaEmbeddings`` for the "nomic-embed-text" model. The
    author left a Bedrock alternative disabled:
    ``BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1")``.
    """
    return OllamaEmbeddings(model="nomic-embed-text")

In [8]:
def calculate_chunk_ids(chunks):
    """Assign an ID of the form ``source:page:index`` to every chunk.

    This creates IDs like "data/monopoly.pdf:6:2"
    (page source : page number : chunk index within that page).

    The index is tracked per page rather than reset whenever the page
    changes, so chunks of the same page still get distinct IDs even when
    they are not adjacent in ``chunks``. For input where each page's chunks
    are contiguous, the IDs are the same as a simple running counter.

    Args:
        chunks: Iterable of objects with a ``metadata`` dict; the "source"
            and "page" entries are read with ``.get`` (missing entries
            appear as ``None`` in the ID).

    Returns:
        The same ``chunks`` object; each chunk's ``metadata["id"]`` is set
        in place.
    """
    # Next free chunk index for each "source:page" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Add it to the page meta-data.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [9]:
def add_to_chroma(chunks: list[Document], CHROMA_PATH):
    """Insert into the Chroma store only the chunks whose IDs are not there yet.

    Args:
        chunks: Chunks to store; IDs are assigned via ``calculate_chunk_ids``.
        CHROMA_PATH: Directory passed to Chroma as ``persist_directory``.

    Prints how many IDs already exist and how many chunks are added.
    """
    # Open the store persisted at CHROMA_PATH.
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function()
    )

    # Tag every chunk with its "source:page:index" ID.
    identified_chunks = calculate_chunk_ids(chunks)

    # include=[] asks for no payload fields; per the original author's note,
    # IDs are always included by default.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only the chunks the store does not know about yet.
    pending = [
        candidate
        for candidate in identified_chunks
        if candidate.metadata["id"] not in known_ids
    ]

    if not pending:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    pending_ids = [candidate.metadata["id"] for candidate in pending]
    store.add_documents(pending, ids=pending_ids)

In [10]:
# Root directory holding the PDFs and the Chroma store. Set the
# RAG_PROJECT_ROOT environment variable to run the notebook on another
# machine; the default keeps the original author's location, so existing
# values (and the chunk IDs derived from them) are unchanged.
PROJECT_ROOT = os.environ.get("RAG_PROJECT_ROOT", "/Users/nikhil20.sharma/Desktop/langchain")

# Directory of source PDFs to ingest.
DOCUMENT_PATH = f"{PROJECT_ROOT}/data/pdfs"
# Directory where the Chroma vector store is persisted.
CHROMA_PATH = f"{PROJECT_ROOT}/DB/chroma"

In [11]:
# Create (or update) the data store.
# Pipeline: load the PDFs under DOCUMENT_PATH, split them into chunks, then
# add only the chunks whose IDs are not yet in the Chroma store at CHROMA_PATH.
# `documents` and `chunks` stay available as notebook globals for inspection.
documents = load_documents(document_path=DOCUMENT_PATH)
chunks = split_documents(documents)
add_to_chroma(chunks, CHROMA_PATH)

Number of existing documents in DB: 0
👉 Adding new documents: 40


In [12]:
# Prompt used for answering: `{context}` receives the retrieved chunk texts
# and `{question}` the user query (both filled in via
# ChatPromptTemplate.from_template(...).format(...) in later cells).
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [13]:
# Sample question used for the manual retrieval/answer walkthrough below.
QUERY_TEXT = "How to come out of jail?"

In [14]:
# Prepare the DB.
# Re-open the Chroma store persisted at CHROMA_PATH using the embedding
# object from get_embedding_function(), the same one add_to_chroma() uses
# when writing to the store.
embedding_function = get_embedding_function()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

In [15]:
# Search the DB.
# Ask for the top k=5 matches for QUERY_TEXT. Each entry of `results` is
# unpacked as a (doc, score) pair by the cells that follow.
results = db.similarity_search_with_score(QUERY_TEXT, k=5)

In [16]:
# Join the retrieved chunk texts with a "---" separator line, then fill the
# {context} and {question} placeholders of PROMPT_TEMPLATE.
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=QUERY_TEXT)

In [17]:
# Show the fully formatted prompt (retrieved context + question) for inspection.
print(prompt)

Human: 
Answer the question based only on the following context:

instructions and return the card facedown to the bottom of the deck. 
The "Get Out of Jail Free" card is held until used and then returned to 
the bottom of the deck. If the player who draws it does not wish to use 
it, helshe may sell it, at any time, to another player at a price agreeable 
to both. 
"INCOME TAX": If you land here you have two options: You may 
estimate your tax at $900 and pay the Bank, or you may pay 10% of 
your total worth to the Bank. Your total worth is all your cash on hand, 
printed prices of mortgaged and unmortgaged properties and cost 
price of all buildings you own. 
You must decide which option you will take before you add up 
your total worth. 
"JAIL": You land in Jail when. ..(I) your token lands on the space

---

BANKRUPTCY.. You are declared bankrupt if you owe more than you 
can pay either to another player or to the Bank. If your 
 , 
debt is to another player, you must tum over to t

In [18]:
# MODEL CALL
# Send the formatted prompt to the Ollama "mistral" model and keep its reply.
# NOTE(review): the recorded output of this cell shows LangSmith trace uploads
# failing with 401 Unauthorized - check the LangSmith API key configuration
# or disable tracing.
model = OllamaLLM(model="mistral")
response_text = model.invoke(prompt)

Failed to multipart ingest runs: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Unauthorized"}\n')trace=c17d5322-7edd-45bf-8338-399510f0c4f5,id=c17d5322-7edd-45bf-8338-399510f0c4f5


Failed to send compressed multipart ingest: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Unauthorized"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Unauthorized"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Unauthorized"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multi

In [19]:
# Collect the chunk IDs of the retrieved documents (None when a chunk has no "id").
sources = [doc.metadata.get("id", None) for doc, _score in results]
# Build the bullet list outside the f-string: a backslash inside an f-string
# expression is a SyntaxError before Python 3.12. Formatting each entry with
# an f-string also gives the first source its bullet and tolerates None IDs,
# which str.join would reject.
source_lines = "\n".join(f"\t- {source}" for source in sources)
formatted_response = f"\nResponse: {response_text}\n\nSources:\n{source_lines}"
print(formatted_response)


Response: 1. Throw doubles on any of your next three turns.
2. Use the "Get Out of Jail Free" card if you have it.
3. Purchase the "Get Out of Jail Free" card from another player and play it.
4. Pay a fine of $50 before you roll the dice on either of your next two turns.

Sources:
 /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:4:0
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:7:0
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:4:1
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:4:2
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:5:2


### TEST LLM RAG PIPELINE OUTPUT

In [20]:
# Prompt for the LLM-as-judge step: `{expected_response}` and
# `{actual_response}` are filled in by query_and_validate(), which then looks
# for 'true' or 'false' in the model's reply.
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 
"""

In [21]:
def query_rag(query_text: str):
    """Answer ``query_text`` from the Chroma store and print the answer with its sources.

    Retrieves the top k=5 matches from the store at ``CHROMA_PATH``, fills
    ``PROMPT_TEMPLATE`` with their text and the question, and sends the
    prompt to the Ollama "mistral" model.

    Args:
        query_text: The question to answer.

    Returns:
        The value returned by the model's ``invoke`` call.
    """
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_score(query_text, k=5)

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = OllamaLLM(model="mistral")
    response_text = model.invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    # Build the bullet list outside the f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12. Formatting each
    # entry also gives the first source its bullet and tolerates None IDs.
    source_lines = "\n".join(f"\t- {source}" for source in sources)
    formatted_response = f"\nResponse: {response_text}\n\nSources:\n{source_lines}"
    print(formatted_response)
    return response_text

In [22]:
def _parse_evaluation_verdict(evaluation_text: str):
    """Return True/False for the first standalone 'true'/'false' word, else None."""
    match = re.search(r"\b(true|false)\b", evaluation_text)
    if match is None:
        return None
    return match.group(1) == "true"


def query_and_validate(question: str, expected_response: str):
    """Ask the RAG pipeline a question and have the LLM judge the answer.

    The answer from ``query_rag`` and ``expected_response`` are inserted into
    ``EVAL_PROMPT`` and sent to the Ollama "mistral" model. The verdict is
    the first standalone word 'true' or 'false' in the lower-cased reply, so
    a reply such as "false, not true" is read as False and words that merely
    contain "true" (e.g. "untrue") are ignored.

    NOTE: the verdict is only as reliable as the judging model; the recorded
    Monopoly run in this notebook shows expected "$1500" vs actual "1600"
    being judged 'true'.

    Args:
        question: Question passed to ``query_rag``.
        expected_response: Reference answer the model compares against.

    Returns:
        True if the judge answered 'true', False if it answered 'false'.

    Raises:
        ValueError: If the reply contains neither 'true' nor 'false'.
    """
    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    model = OllamaLLM(model="mistral")
    evaluation_results_str = model.invoke(prompt)
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()

    print(prompt)

    verdict = _parse_evaluation_verdict(evaluation_results_str_cleaned)
    if verdict is None:
        raise ValueError(
            "Invalid evaluation result. Cannot determine if 'true' or 'false'."
        )

    # Green when the judge says the answer matches, red otherwise.
    colour = "\033[92m" if verdict else "\033[91m"
    print(colour + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
    return verdict

In [23]:
# NOTE(review): this re-declares EVAL_PROMPT with the same text as the earlier
# cell in this notebook; the duplicate looks removable - confirm before deleting.
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 
"""

In [24]:
def test_monopoly_rules():
    """Check the Monopoly starting-money answer; fails if the judge does not say 'true'."""
    judged_correct = query_and_validate(
        expected_response="$1500",
        question="How much total money does a player start with in Monopoly? (Answer with the number only)",
    )
    assert judged_correct


def test_ticket_to_ride_rules():
    """Check the Ticket to Ride longest-train answer; fails if the judge does not say 'true'."""
    judged_correct = query_and_validate(
        expected_response="10 points",
        question="How many points does the longest continuous train get in Ticket to Ride? (Answer with the number only)",
    )
    assert judged_correct

In [25]:
# Run the Monopoly check; an AssertionError means the judge answered 'false'.
# NOTE(review): the recorded output below shows actual "1600" vs expected
# "$1500" judged as 'true', so a pass here does not prove the answer is right.
test_monopoly_rules()


Response: 1600

Sources:
 /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:2:0
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:0:0
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:3:0
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:1:2
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/monopoly-instruction.pdf:2:1

Expected Response: $1500
Actual Response: 1600
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 

[92mResponse: true[0m


In [26]:
# Run the Ticket to Ride check; an AssertionError means the judge answered 'false'.
test_ticket_to_ride_rules()


Response: 10 (The player who has the Longest Continuous Path of routes receives a 10 point bonus.)

Sources:
 /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/ticket-to-ride-rulebook.pdf:3:2
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/ticket-to-ride-rulebook.pdf:1:3
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/ticket-to-ride-rulebook.pdf:3:3
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/ticket-to-ride-rulebook.pdf:0:1
	- /Users/nikhil20.sharma/Desktop/langchain/data/pdfs/ticket-to-ride-rulebook.pdf:1:2

Expected Response: 10 points
Actual Response: 10 (The player who has the Longest Continuous Path of routes receives a 10 point bonus.)
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 

[92mResponse: true[0m
