# 1. load data

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader, UnstructuredFileLoader, \
    JSONLoader, TextLoader, CSVLoader


# 1.load data
def load_all_documents(folder_path: str = "data"):
    """
    Load all documents (PDF, DOCX, JSON, TXT, CSV, etc.) from a folder into LangChain Document objects.
    """

    docs = []
    supported_exts = {'.pdf', '.docx', '.json', '.txt', '.csv'}
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            ext = os.path.splitext(file)[-1].lower()

            try:
                if ext == '.pdf':
                    loader = PyPDFLoader(file_path)
                elif ext == '.docx':
                    loader = UnstructuredWordDocumentLoader(file_path)
                elif ext == '.json':
                    loader = JSONLoader(
                        file_path,
                        jq_schema=".",  # You can change this if JSON has a specific key
                        text_content=True
                    )
                elif ext == '.txt':
                    loader = TextLoader(file_path, encoding='utf-8')
                elif ext == '.csv':
                    loader = CSVLoader(file_path)
                else:
                    # fallback for other formats
                    loader = UnstructuredFileLoader(file_path)

                docs.extend(loader.load())

            except Exception as e:
                print(f"❌ Failed to load {file_path}: {e}")

    print(f"✅ Loaded {len(docs)} documents from {folder_path}")
    return docs


#running the step
documents = load_all_documents()
print(f"total {len(documents)} documents")
for doc in documents:
    print(f"doc : {doc}\n================================END======================================")

✅ Loaded 1 documents from data
total 1 documents
doc : page_content='	REQUEST FOR PROPOSALS      

(RFP)

ISSUE DATE:			August 3, 2018					

TITLE:					 Administrative Services and Fully Insured Health Benefits Plans

Number: 		OHB19-01

ISSUING AGENCY:		Commonwealth of Virginia

Department of Human Resource Management

James Monroe Building, 13th Floor

101 North 14th Street

Richmond, Virginia 23219

PERIOD OF CONTRACT:	From July 1, 2019 through June 30, 2024, with five one-year renewal options as described within.

Sealed proposals for furnishing services described herein will be received subject to the conditions cited herein until 2:00 p.m., September 18, 2018.

All Inquiries Must Be In Writing (the cut-off date for all questions is September 1, 2018) and Should Be Directed To:

Mr. Richard Whitfield

Department of Human Resource Management

James Monroe Building, 13th Floor

101 North 14th Street

Richmond, Virginia 23219

e-mail:  richard.whitfield@dhrm.virginia.gov

SEND ALL PR

# 2. split data


In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


# 2. split data
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"✂️ Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks


#running the step
chunks = split_documents(documents)
for chunk in chunks:
    print(f"doc : {chunk}\n================================END======================================")

✂️ Split 1 documents into 356 chunks.
doc : page_content='REQUEST FOR PROPOSALS      

(RFP)

ISSUE DATE:			August 3, 2018					

TITLE:					 Administrative Services and Fully Insured Health Benefits Plans

Number: 		OHB19-01

ISSUING AGENCY:		Commonwealth of Virginia

Department of Human Resource Management

James Monroe Building, 13th Floor

101 North 14th Street

Richmond, Virginia 23219

PERIOD OF CONTRACT:	From July 1, 2019 through June 30, 2024, with five one-year renewal options as described within.

Sealed proposals for furnishing services described herein will be received subject to the conditions cited herein until 2:00 p.m., September 18, 2018.

All Inquiries Must Be In Writing (the cut-off date for all questions is September 1, 2018) and Should Be Directed To:

Mr. Richard Whitfield

Department of Human Resource Management' metadata={'source': 'data\\sample-rfp.docx'}
doc : page_content='Mr. Richard Whitfield

Department of Human Resource Management

James Monroe Building, 1

# 3. Generating Unique IDs for Each Chunk

In [3]:
# 3. Generating Unique IDs for Each Chunk
def calculate_chunk_ids(chunks):
    # This will create IDs like "data/sample.docx:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    print(f"🔢 Assigned unique IDs to {len(chunks)} chunk(s) ✅\n")
    return chunks


#running the step
for chunk in calculate_chunk_ids(chunks):
    print(f"chunk : {chunk.metadata['id']}\n===============================END=======================================")


🔢 Assigned unique IDs to 356 chunk(s) ✅

chunk : data\sample-rfp.docx:None:0
chunk : data\sample-rfp.docx:None:1
chunk : data\sample-rfp.docx:None:2
chunk : data\sample-rfp.docx:None:3
chunk : data\sample-rfp.docx:None:4
chunk : data\sample-rfp.docx:None:5
chunk : data\sample-rfp.docx:None:6
chunk : data\sample-rfp.docx:None:7
chunk : data\sample-rfp.docx:None:8
chunk : data\sample-rfp.docx:None:9
chunk : data\sample-rfp.docx:None:10
chunk : data\sample-rfp.docx:None:11
chunk : data\sample-rfp.docx:None:12
chunk : data\sample-rfp.docx:None:13
chunk : data\sample-rfp.docx:None:14
chunk : data\sample-rfp.docx:None:15
chunk : data\sample-rfp.docx:None:16
chunk : data\sample-rfp.docx:None:17
chunk : data\sample-rfp.docx:None:18
chunk : data\sample-rfp.docx:None:19
chunk : data\sample-rfp.docx:None:20
chunk : data\sample-rfp.docx:None:21
chunk : data\sample-rfp.docx:None:22
chunk : data\sample-rfp.docx:None:23
chunk : data\sample-rfp.docx:None:24
chunk : data\sample-rfp.docx:None:25
chunk :

# 4. embed data


In [9]:
from langchain_ollama import OllamaEmbeddings


# 4. embed data
def get_embedding_function():
    embeddings = OllamaEmbeddings(model="llama3")
    print(f"🚀 Embedding model initialized: llama3")
    return embeddings


#running the step
print(get_embedding_function())

🚀 Embedding model initialized: llama3
model='llama3' validate_model_on_init=False base_url=None client_kwargs={} async_client_kwargs={} sync_client_kwargs={} mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None keep_alive=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None



# 5. reset db(optional)

In [5]:
import os
import shutil


# 5. reset db(optional)
def clear_database():
    paths_to_clear = ["chroma", "data"]  # List of directories to clear

    for path in paths_to_clear:
        if os.path.exists(path):
            shutil.rmtree(path)
            print(f"Deleted: {path}")
        else:
            print(f"Path does not exist: {path}")
    print("🧹 Database cleared")


#running the step
clear_database()

Deleted: chroma
Deleted: data
🧹 Database cleared


# 6. store in db

In [6]:
from langchain_chroma import Chroma


# 6. store in db
def add_to_chroma(chunks: list[Document]):
    # Load the existing database.
    db = Chroma(
        persist_directory="chroma", embedding_function=get_embedding_function()
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
    else:
        print("✅ No new documents to add")


#running the step
add_to_chroma(chunks)

🔢 Assigned unique IDs to 356 chunk(s) ✅

Number of existing documents in DB: 0
👉 Adding new documents: 356


----------

# 7. ask


In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import OllamaLLM


# 1. ask
def query_rag(query_text: str):
    # 2. Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory="chroma", embedding_function=embedding_function)

    # 3. template var
    PROMPT_TEMPLATE = """
    You are an expert AI assistant specialized in analyzing Pharmacy Benefits Management (PBM) RFP documents.

    Your task is to answer questions *strictly and only* based on the information provided in the following context.
    Do not use any outside knowledge, assumptions, or general information unless explicitly stated in the context.

    If the answer is not clearly stated or cannot be derived from the context, respond with:
    "I cannot find this information in the provided context."

    ---

    Context:
    {context}

    ---

    Question:
    {question}

    ---

    Answer (concise, factual, and supported by the context):
    """

    # 4. Search the DB.
    results = db.similarity_search_with_score(query_text, k=5)

    # 5. generate complete prompt
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    # print(prompt)

    # 6.invoke llm
    model = OllamaLLM(model="llama3")
    response_text = model.invoke(prompt)

    # 7. get the original source
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    # print(formatted_response)
    return response_text


#running the step
query_rag("What is this doc ?")

'This document appears to be a regulation or code section related to public funding for faith-based organizations or their contracts with government agencies. It outlines specific guidelines and requirements regarding accounting, auditing, and payment obligations.'

# 8. unit test

In [8]:
from langchain_community.llms.ollama import Ollama

# 1. prompt template
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response?
"""


def test_monopoly_rules():
    assert query_and_validate(
        question="How much total money does a player start with in Monopoly? (Answer with the number only)",
        expected_response="$1500",
    )


def test_ticket_to_ride_rules():
    assert query_and_validate(
        question="How many points does the longest continuous train get in Ticket to Ride? (Answer with the number only)",
        expected_response="10 points",
    )


# 2. ask
def query_and_validate(question: str, expected_response: str):
    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    # 3. invoke llm
    model = Ollama(model="llama3")
    evaluation_results_str = model.invoke(prompt)

    # 4. clean
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()

    print(prompt)

    # 5. check
    if "true" in evaluation_results_str_cleaned:
        # Print response in Green if it is correct.
        print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return True
    elif "false" in evaluation_results_str_cleaned:
        # Print response in Red if it is incorrect.
        print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return False
    else:
        raise ValueError(
            f"Invalid evaluation result. Cannot determine if 'true' or 'false'."
        )

#running the step
# test_monopoly_rules()
# test_ticket_to_ride_rules()

  model = Ollama(model="llama3")



Expected Response: $1500
Actual Response: I cannot find this information in the provided context.
---
(Answer with 'true' or 'false') Does the actual response match the expected response?

[91mResponse: false[0m


AssertionError: 

In [None]:
# from rag_pipeline import run_pipeline, clear_database
# clear_database()
# run_pipeline()