# 1. load data

In [1]:
# from langchain_community.document_loaders import PyPDFDirectoryLoader
#
# # 1.load data v1
# DATA_PATH = "data"
#
#
# def load_documents():
#     document_loader = PyPDFDirectoryLoader(DATA_PATH)
#     return document_loader.load()
#
#
# documents = load_documents()
# print(f"total {len(documents)} documents")

In [2]:
# from langchain_community.document_loaders import UnstructuredWordDocumentLoader
#
#
# # 1.load data v2
# def load_documents():
#     document_loader = UnstructuredWordDocumentLoader("data/sample-rfp.docx")
#     return document_loader.load()
#
#
# #running the step
# documents = load_documents()
# print(f"total {len(documents)} documents")
# print(documents[0])

In [13]:
import os
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredFileLoader,
    JSONLoader,
    TextLoader,
    CSVLoader,
)


# 1. load data v3 (multi-format folder loader)
def load_all_documents(folder_path: str = "data"):
    """
    Load all documents (PDF, DOCX, JSON, TXT, CSV, etc.) from a folder into LangChain Document objects.

    The folder is walked recursively and a loader is picked from each file's
    extension (compared case-insensitively). Extensions without a dedicated
    loader fall back to UnstructuredFileLoader. Loading is best-effort: when a
    loader raises, the failure is printed and the file is skipped so that one
    bad file does not abort the whole run.

    Args:
        folder_path: Root directory to scan. A path that does not exist simply
            yields no files.

    Returns:
        A list with the documents produced by every loader that succeeded
        (empty when nothing could be loaded).
    """
    docs = []
    for root, dirs, files in os.walk(folder_path):
        # os.walk reports entries in file-system order; sorting (in place for
        # the sub-directories) makes the document order reproducible.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            ext = os.path.splitext(file)[1].lower()

            try:
                if ext == '.pdf':
                    loader = PyPDFLoader(file_path)
                elif ext == '.docx':
                    loader = UnstructuredWordDocumentLoader(file_path)
                elif ext == '.json':
                    loader = JSONLoader(
                        file_path,
                        jq_schema=".",  # Whole document; change this if JSON has a specific key
                        # "." usually selects an object or array, and
                        # text_content=True makes JSONLoader reject anything
                        # that is not already a string. Let it serialize the
                        # selected value instead.
                        text_content=False,
                    )
                elif ext == '.txt':
                    loader = TextLoader(file_path, encoding='utf-8')
                elif ext == '.csv':
                    loader = CSVLoader(file_path)
                else:
                    # fallback for other formats
                    loader = UnstructuredFileLoader(file_path)

                docs.extend(loader.load())

            except Exception as e:
                # Deliberately broad: report the file and keep going.
                print(f"❌ Failed to load {file_path}: {e}")

    print(f"✅ Loaded {len(docs)} documents from {folder_path}")
    return docs


# Run the step: load everything under the default "data" folder, then dump
# each document so its extracted text and metadata can be inspected.
documents = load_all_documents()
print(f"total {len(documents)} documents")
for doc in documents:
    print(f"doc : {doc}\n================================END======================================")

✅ Loaded 1 documents from data
total 1 documents
doc : page_content='

 Project Plan: Pharmacy Benefits Management RFP Analysis

 Objective: Develop a system to parse, analyze, and create a chatbot that can interact based on the information from a pharmacy benefits  management RFP document.

 
 Project Phases and Tasks:

Phase 1: Initial Setup and Requirements Gathering

         Task 1: Set Up Environment
         Description: Ensure development environments are set up with necessary tools (Python 3.x, NLP libraries, etc.)

         Task 2: Understand RFP Requirements
         Description: Read and summarize the RFP document to extract key requirements and objectives.
 
Phase 2: Parsing RFP Document

         Task 3: Develop RFP Parsing Module
         Description: Create or utilize existing OCR/NLP tools to parse the RFP document.

         Task 4: Data Extraction and Structuring
         Description: Develop scripts to extract structured data from the parsed text (e.g., questions, c

# 2. split data


In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


# 2. split data
def split_documents(
    documents: list[Document],
    chunk_size: int = 800,
    chunk_overlap: int = 80,
) -> list[Document]:
    """
    Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Lengths are measured with ``len`` (characters), and the splitter's
    separators are treated as plain strings rather than regular expressions.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length. Defaults to 800.
        chunk_overlap: Length shared between neighbouring chunks. Defaults
            to 80.

    Returns:
        The chunks produced by the splitter's ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


# Run the step: split the loaded documents and preview the first chunk.
chunks = split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
print('----------------------')
# Nothing to preview when no document was loaded; chunks[0] would raise.
if chunks:
    print(chunks[0])

Split 1 documents into 4 chunks.
----------------------
page_content='Project Plan: Pharmacy Benefits Management RFP Analysis

 Objective: Develop a system to parse, analyze, and create a chatbot that can interact based on the information from a pharmacy benefits  management RFP document.

 
 Project Phases and Tasks:

Phase 1: Initial Setup and Requirements Gathering

         Task 1: Set Up Environment
         Description: Ensure development environments are set up with necessary tools (Python 3.x, NLP libraries, etc.)

         Task 2: Understand RFP Requirements
         Description: Read and summarize the RFP document to extract key requirements and objectives.
 
Phase 2: Parsing RFP Document

         Task 3: Develop RFP Parsing Module
         Description: Create or utilize existing OCR/NLP tools to parse the RFP document.' metadata={'source': 'data\\Project_Req.txt'}


# 3. Generating Unique IDs for Each Chunk

In [5]:
# 3. Generating Unique IDs for Each Chunk
def calculate_chunk_ids(chunks):
    """
    Attach a "source:page:index" ID to every chunk's metadata.

    The ID is built from the chunk's "source" and "page" metadata plus a
    running index within that source/page pair, e.g. "data/sample.docx:6:2".
    Missing metadata is rendered as "None" (a text file without pages gives
    "file.txt:None:0").

    The index is tracked per source/page pair, so IDs stay unique even when
    chunks of the same page are not adjacent in the input. For input where
    they are adjacent, the IDs are the same as with a simple "reset on page
    change" counter.

    Args:
        chunks: Iterable of objects with a ``metadata`` dict. They are
            modified in place.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on each item.
    """
    # Next free index for every "source:page" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Add it to the chunk meta-data.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks


# Run the step: assign the IDs and print each chunk's metadata to check them.
for chunk in calculate_chunk_ids(chunks):
    print(f"chunk : {chunk.metadata}\n======================================================================")


chunk : {'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Codemonk Universal Doc Bot - Project', 'author': 'ChatGPT Canvas', 'source': 'data\\Codemonk Universal Doc Bot - Project.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'id': 'data\\Codemonk Universal Doc Bot - Project.pdf:0:0'}
chunk : {'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Codemonk Universal Doc Bot - Project', 'author': 'ChatGPT Canvas', 'source': 'data\\Codemonk Universal Doc Bot - Project.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'id': 'data\\Codemonk Universal Doc Bot - Project.pdf:0:1'}
chunk : {'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Codemonk Universal Doc Bot - Project', 'author': 'ChatGPT Canvas', 'source': 'data\\Codemonk Universal Doc Bot - Project.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'id': 'data\\Codemonk Universal Doc Bot - Project.pdf:0:2'}
chunk : {'produc

# 4. embed data


In [6]:
from langchain_ollama import OllamaEmbeddings


# 4. embed data
def get_embedding_function():
    """Return a new OllamaEmbeddings instance configured for the "llama3" model.

    The same function is used when storing chunks and when querying, so both
    sides embed text with the same model.
    """
    return OllamaEmbeddings(model="llama3")


# Run the step: print the embeddings object to check its configuration.
print(get_embedding_function())

model='llama3' validate_model_on_init=False base_url=None client_kwargs={} async_client_kwargs={} sync_client_kwargs={} mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None keep_alive=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None



# 5. reset db(optional)

In [7]:
import os
import shutil


# 5. reset db(optional)
def clear_database(persist_directory: str = "chroma") -> None:
    """
    Delete the on-disk vector store so the next run starts from scratch.

    Args:
        persist_directory: Directory to remove, including everything in it.
            Defaults to "chroma", the directory used by the rest of this
            file. Nothing happens when the path does not exist.
    """
    if os.path.exists(persist_directory):
        shutil.rmtree(persist_directory)

#running the step
# clear_database()

# 6. store in db

In [8]:
from langchain_chroma import Chroma


# 6. store in db
def add_to_chroma(chunks: list[Document]):
    """
    Store the chunks that are not yet in the persistent "chroma" database.

    Every chunk gets an ID from ``calculate_chunk_ids``; a chunk is added only
    when its ID is not already stored. Chunks whose ID already exists are
    skipped, not updated.

    Args:
        chunks: Chunks to index. Their metadata receives an "id" entry.
    """
    # Open (or create) the database in the "chroma" directory.
    store = Chroma(
        persist_directory="chroma", embedding_function=get_embedding_function()
    )

    identified = calculate_chunk_ids(chunks)

    # An empty include list fetches no payload; the IDs are still returned.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    pending = [item for item in identified if item.metadata["id"] not in known_ids]
    if not pending:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    pending_ids = [item.metadata["id"] for item in pending]
    store.add_documents(pending, ids=pending_ids)


# Run the step: index the chunks; only chunks whose ID is not stored yet are added.
add_to_chroma(chunks)

Number of existing documents in DB: 764
✅ No new documents to add


----------

# 7. ask


In [12]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import OllamaLLM


# 1. ask
def query_rag(query_text: str):
    """
    Answer a question from the chunks stored in the "chroma" database.

    The five chunks most similar to the question are retrieved and joined
    into the context of a PBM-RFP prompt. The completed prompt is printed and
    sent to the "llama3" Ollama model.

    Args:
        query_text: The question. It is used both as the similarity-search
            query and as the question placed in the prompt.

    Returns:
        The model's answer, as returned by ``model.invoke``.
    """
    # 2. Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory="chroma", embedding_function=embedding_function)

    # 3. prompt template
    PROMPT_TEMPLATE = """
    You are an expert AI assistant specialized in analyzing Pharmacy Benefits Management (PBM) RFP documents.

    Your task is to answer questions *strictly and only* based on the information provided in the following context.
    Do not use any outside knowledge, assumptions, or general information unless explicitly stated in the context.

    If the answer is not clearly stated or cannot be derived from the context, respond with:
    "I cannot find this information in the provided context."

    ---

    Context:
    {context}

    ---

    Question:
    {question}

    ---

    Answer (concise, factual, and supported by the context):
    """

    # 4. Search the DB.
    # Each hit is a (document, score) pair. The chunk ID written at indexing
    # time is in doc.metadata["id"] should source citations be needed.
    results = db.similarity_search_with_score(query_text, k=5)

    # 5. generate complete prompt
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    # 6.invoke llm
    model = OllamaLLM(model="llama3")
    return model.invoke(prompt)


# Run the step. Kept as a bare last expression so that the notebook displays
# the returned answer below the printed prompt.
query_rag("What is this doc ?")

Human: 
    Answer the question based only on the following context:

    show standards, if applicable, YTD statistics, and most recent two quarters of activity broken by month.

---

and the title of the document shall be the most prominent features on the first page of each document.

---

Generating New Content
Mehmet Ozkaya 46
Inputs or prompts guide content creation: Model pulls from the 
patterns it has learned and produces something unique
Sampling from learned patterns: Write a short story about a 
journey to Mars or A futuristic city floating above the clouds
Composer creating new music
'Write a story about a 
journey to Mars'

---

of race, age, color, gender or national origin and shall be subject to the same rules as other organizations that contract with public bodies to account for the use of the funds provided; however, if the faith-based organization segregates public funds into separate accounts, only the accounts and programs funded with public funds shall be subject

'Based on the provided context, I would say that this document appears to be a standard report or template for providing information about a project or organization. The title of the document suggests that it might be related to "Generating New Content", but the actual content seems to be discussing various topics such as show standards, YTD statistics, and recent activity broken down by month.'

# 8. unit test

In [10]:
from langchain_community.llms.ollama import Ollama

# 1. prompt template
# Grading prompt: query_and_validate fills both placeholders with str.format
# and asks the model for a 'true'/'false' verdict on whether they match.
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response?
"""


def test_monopoly_rules():
    """Check that the RAG answer about Monopoly's starting money matches "$1500"."""
    question = "How much total money does a player start with in Monopoly? (Answer with the number only)"
    expected = "$1500"
    assert query_and_validate(question=question, expected_response=expected)


def test_ticket_to_ride_rules():
    """Check that the RAG answer about the longest-train bonus matches "10 points"."""
    question = "How many points does the longest continuous train get in Ticket to Ride? (Answer with the number only)"
    expected = "10 points"
    assert query_and_validate(question=question, expected_response=expected)


# 2. ask
def query_and_validate(question: str, expected_response: str):
    """
    Ask the RAG pipeline a question and let the LLM grade the answer.

    The answer from ``query_rag`` and the expected response are inserted into
    ``EVAL_PROMPT``. The "llama3" model is asked whether they match, and its
    reply is searched for "true" / "false".

    Args:
        question: Question passed to ``query_rag``.
        expected_response: The answer the pipeline is expected to give.

    Returns:
        True when the grader's verdict is "true", False when it is "false".
        If the reply contains both words, the one that appears first is taken
        as the verdict.

    Raises:
        ValueError: If the reply contains neither "true" nor "false".
    """
    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    # 3. invoke llm
    # Same Ollama wrapper as query_rag; the langchain_community one is deprecated.
    model = OllamaLLM(model="llama3")
    evaluation_results_str = model.invoke(prompt)

    # 4. clean
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()

    print(prompt)

    # 5. check
    true_at = evaluation_results_str_cleaned.find("true")
    false_at = evaluation_results_str_cleaned.find("false")
    if true_at == -1 and false_at == -1:
        raise ValueError(
            "Invalid evaluation result. Cannot determine if 'true' or 'false'."
        )

    # A reply such as "false - it is not true" must not count as a pass, so
    # the verdict is whichever word comes first.
    is_match = false_at == -1 or (true_at != -1 and true_at < false_at)

    # Print response in Green if it is correct, in Red if it is incorrect.
    color = "\033[92m" if is_match else "\033[91m"
    print(color + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
    return is_match


In [11]:
# test_monopoly_rules()
# test_ticket_to_ride_rules()