# Retrieval Augmented Generation (RAG):
Chat with a Local LLM About Your PDFs

In [1]:
import langchain
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings

from langchain_community.vectorstores import Chroma
import warnings
warnings.filterwarnings("ignore")

from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

In [2]:
# Directory handed to the PDF directory loader.
DATA_PATH = "data"
# Directory used as the Chroma persistence location.
CHROMA_PATH = "chroma"
# Prompt skeleton; the {context} and {question} placeholders are filled in
# when a query is answered.
PROMPT_TEMPLATE = (
    "\n"
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "---\n"
    "\n"
    "Answer the question based on the above context: {question}\n"
)

### Load and Split the documents:

In [3]:
def load_docs():
    """Load the documents from the ``DATA_PATH`` directory.

    Returns whatever ``PyPDFDirectoryLoader(DATA_PATH).load()`` produces.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [4]:
def split_documents(documents: list[Document]):
    """Split ``documents`` into smaller chunks and return the chunks.

    Uses a ``RecursiveCharacterTextSplitter`` with a chunk size of 800 and an
    overlap of 80, both measured with ``len``.
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

### Get the embedding function:

In [5]:
def get_embedding_function():
    """Return the embeddings object used by the vector store.

    Currently an ``OllamaEmbeddings`` instance for the ``nomic-embed-text``
    model.
    """
    # Alternative when using AWS Bedrock instead of Ollama:
    #     BedrockEmbeddings(
    #         credentials_profile_name="default",
    #         region_name="us-east-1")
    return OllamaEmbeddings(model="nomic-embed-text")
    

### Create the database:

In [6]:
def calculate_chunk_ids(chunks):
    """Assign an ``id`` to every chunk's metadata and return the chunks.

    IDs look like ``"data/monopoly.pdf:6:2"``, i.e.
    ``<page source>:<page number>:<chunk index within that page>``.

    The index is tracked per page, so IDs stay unique even when chunks that
    belong to the same page are not adjacent in ``chunks``. For input where
    each page's chunks are contiguous, the IDs are the same as a simple
    "reset on page change" counter would give.

    Args:
        chunks: Iterable of objects with a ``metadata`` dict; the ``source``
            and ``page`` keys are read (missing keys are rendered as
            ``None``).

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on each
        element (the chunks are modified in place).
    """
    # Next free chunk index for each "<source>:<page>" seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [7]:
def add_to_chroma(chunks: list[Document]):
    """Add to the Chroma store the chunks whose IDs are not already in it.

    Opens the store at ``CHROMA_PATH``, tags each chunk with an ID via
    ``calculate_chunk_ids``, and adds only the chunks whose ID is not yet in
    the store. Progress is reported with ``print``.
    """
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )
    tagged_chunks = calculate_chunk_ids(chunks)

    # An empty ``include`` skips the payload; the IDs are still returned.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only the chunks the store has not seen yet.
    pending = [c for c in tagged_chunks if c.metadata["id"] not in known_ids]
    if not pending:
        print("No new documents to add!")
        return

    print(f"Adding new documents: {len(pending)}")
    store.add_documents(pending, ids=[c.metadata["id"] for c in pending])
    store.persist()

In [8]:
# Build or refresh the index: load the documents, split them into chunks,
# and add any chunks that are not yet in the Chroma store.
documents = load_docs()
chunks = split_documents(documents)
add_to_chroma(chunks)

Number of existing documents in DB: 41
No new documents to add!


### Query the data:

In [9]:
def query_rag(query_text: str):
    """Answer ``query_text`` using context retrieved from the Chroma store.

    Runs a similarity search (``k=5``) against the store at ``CHROMA_PATH``,
    joins the retrieved page contents into the prompt built from
    ``PROMPT_TEMPLATE``, and sends it to the Ollama ``llama3`` model. The
    response and the IDs of the source chunks are printed.

    Returns:
        The model's response text.
    """
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Retrieve the best-matching chunks; the scores are not used.
    hits = store.similarity_search_with_score(query_text, k=5)
    retrieved_docs = [doc for doc, _score in hits]

    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )

    response_text = Ollama(model="llama3").invoke(prompt)

    sources = [doc.metadata.get("id") for doc in retrieved_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text

In [10]:
# Entry point: when run as the main module, read a question from standard
# input and answer it with query_rag.
if __name__ == "__main__":
    query= input("Please enter the query text: ")
    query_rag(query)

Human: 
Answer the question based only on the following context:

MONOPOLY 
Property Trading Game from Parker Brothers" 
AGES 8+ 
2 to 8 Players 
Contents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance 
and Community Chest cards, Title Deed cards, play money and a Banker's tray. 
Now there's a faster way to play MONOPOLY. Choose to play by 
the classic rules for buying, renting and selling properties or use the 
Speed Die to get into the action faster. If you've never played the classic 
MONOPOLY game, refer to the Classic Rules beginning on the next page. 
If you already know how to play and want to use the Speed Die, just 
read the section below for the additional Speed Die rules. 
SPEED DIE RULES 
Learnins how to Play with the S~eed Die IS as 
/ 
fast as playing with i't. 
1. When starting the game, hand out an extra $1,000 to each player

---

Bus: This lets you "get off the bus early." Look at the two white 
dice. You can move the value of one die, the other die, or the 