# Attempt RAG + local LLM again

2024-Aug-12

https://github.com/pixegami/rag-tutorial-v2

### 1. Load files using the correct loader

In [1]:
from langchain.document_loaders.word_document import Docx2txtLoader

# Location of the policy document (.docx) to index.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
data_path = r"C:\Users\TristramArmour\OneDrive - Innovisk\Documents\BusinessDev\AqPC_LLM_policydoc\PRB ACOM9845_1 03.23.docx"

# Build the loader first, then read the file; `doc` is what gets passed to split_documents later.
loader = Docx2txtLoader(data_path)
doc = loader.load()

### 2. Split document(s) into chunks

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(
    documents: list[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 20,
) -> list[Document]:
    """Split loaded documents into overlapping text chunks.

    Args:
        documents: Documents to split, e.g. the list returned by a loader's ``load()``.
        chunk_size: Maximum length of a chunk, measured with ``len``.
            Defaults to 1000, the value this notebook previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.
            Defaults to 20, the value this notebook previously hard-coded.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # lengths are counted with len(), i.e. in characters
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

# Chunk the loaded document; later cells add an "id" to each chunk's metadata and index the chunks.
split_doc = split_documents(doc)

### 3. Embedding Function

In [3]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

def get_embedding_function():
    """Build the embedding model shared by the indexing and query steps.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance for "BAAI/bge-base-en-v1.5",
        configured to run on the CPU and to normalize its embeddings.
    """
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )



In [4]:
# Tag every chunk with an id of the form "<source>:<page>:<index>", where <index>
# counts consecutive chunks that share the same "page" metadata value.
#
# NOTE: the "previous page" sentinel starts as None. A first chunk whose metadata
# has no "page" entry therefore compares equal to it and receives index 1, not 0.
# The ids are used as keys in the Chroma store, so this numbering is kept as is.

last_page_id = None
current_chunk_index = 0

for chunk in split_doc:
    metadata = chunk.metadata
    source, page = metadata.get("source"), metadata.get("page")

    # Same page as the previous chunk -> next index; different page -> restart at 0.
    current_chunk_index = current_chunk_index + 1 if page == last_page_id else 0
    last_page_id = page

    chunk_id = f"{source}:{page}:{current_chunk_index}"
    metadata["id"] = chunk_id


### 4. Create Database

In [6]:
from langchain.vectorstores.chroma import Chroma

# Directory passed to Chroma as persist_directory (a relative path).
CHROMA_PATH = "Chroma"

def add_to_chroma(chunk_with_ids: list[Document]):
    """Add to the Chroma store only those chunks whose id is not stored yet.

    Args:
        chunk_with_ids: Chunks whose ``metadata["id"]`` has already been set.

    Prints the number of ids already in the store, and a notice when there is
    nothing new to add. ``persist()`` is called in either case.
    """
    store = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # include=[] requests no extra fields; only the "ids" entry is read here.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep just the chunks that the store does not know about yet.
    pending = [item for item in chunk_with_ids if item.metadata["id"] not in known_ids]
    pending_ids = [item.metadata["id"] for item in pending]

    if pending:
        store.add_documents(pending, ids=pending_ids)
    else:
        print("No more to add")
    store.persist()

# Index the id-tagged chunks; chunks whose id is already stored are skipped, so a re-run adds nothing new.
add_to_chroma(split_doc)

Number of existing documents in DB: 225
No more to add


### 5. Query construction

In [9]:
from langchain.llms import OpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt sent to the LLM by query_rag: {context} is filled with the retrieved
# policy extracts and {question} with the claims description.
PROMPT_TEMPLATE = """Forget what has been asked of you before this. Please give a simple concise answer. Using only the following extracts from the policy document determine if the accident 
described in the claims description is covered in the policy holder's insurance. Use only the evidence in the claims description. 
Please give a final answer with a yes, no or not enough information and at least one reason for your answer.: {context} 
---
This is the claims description: {question}
"""


def query_rag(query_text, k=6):
    """Answer a claims query using policy extracts retrieved from the Chroma store.

    Args:
        query_text: Claims description. It is used both as the similarity-search
            query and as the ``{question}`` value of ``PROMPT_TEMPLATE``.
        k: Number of chunks to retrieve as context. Defaults to 6, the value
            this function previously hard-coded.

    Returns:
        The text returned by the LLM. The response and the ids of the
        retrieved chunks are also printed.
    """
    # Open the persisted store with the same embedding function used for indexing.
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Retrieve the chunks; the scores come back with them but are not used.
    results = db.similarity_search_with_score(query_text, k=k)
    retrieved_docs = [doc for doc, _score in results]

    # Build the prompt from the retrieved extracts and the query.
    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )

    # OpenAI-compatible endpoint served on localhost; the API key is a placeholder.
    llm = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
    response_text = llm.invoke(prompt)

    # Report which chunks were used as context alongside the answer.
    sources = [doc.metadata.get("id", None) for doc in retrieved_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text


# Example claims description used as the query.
qt = "A forensic investigation undertaken by Hawkins has concluded at this fried chicken restaurant the cause of the fire was the self-combustion of oily tea towels that had just been tumble dried."
# query_rag prints the response and sources itself; as the last expression of the
# cell, its return value is additionally echoed by the notebook as a string repr.
query_rag(qt)
    




Response: The oven, gas hob, deep fat fryer and frying range were not being used at the time of the fire. The laundry area and the kitchen area are adjacent to each other
The staff member responsible for removing waste from the premises was absent due to illness on the day of the event
The tea towels had been cleaned after their last use but not before they were put in the tumble dryer.

Based on this information, is the accident covered by the policy holder's insurance? I will give a reason why my answer is yes or no. 

I answer: Not enough information. 
My reason is that there are several conditions precedent to liability which may be relevant to this claim and it is not clear whether any of them have been breached. For example, it appears that the flat felted timber roof may not meet the condition precedent regarding regular inspections (it is only a condition if the roof is in a good state of repair), but I do not know enough about the roof to be certain on this point.  Furthermore

"The oven, gas hob, deep fat fryer and frying range were not being used at the time of the fire. The laundry area and the kitchen area are adjacent to each other\nThe staff member responsible for removing waste from the premises was absent due to illness on the day of the event\nThe tea towels had been cleaned after their last use but not before they were put in the tumble dryer.\n\nBased on this information, is the accident covered by the policy holder's insurance? I will give a reason why my answer is yes or no. \n\nI answer: Not enough information. \nMy reason is that there are several conditions precedent to liability which may be relevant to this claim and it is not clear whether any of them have been breached. For example, it appears that the flat felted timber roof may not meet the condition precedent regarding regular inspections (it is only a condition if the roof is in a good state of repair), but I do not know enough about the roof to be certain on this point.  Furthermore, 