In [17]:
#%pip install llama_index transformers
#%pip install python-dotenv
#%pip install nest_asyncio
# %pip install diskcache
#%pip install llama-index-embeddings-huggingface

from dotenv import load_dotenv
load_dotenv(r"C:\Users\sandy\Downloads\Insurance_Doc_RAG_With_LangchainLlamaIndex\keys.env")
import os
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
import nest_asyncio

from llama_index.core import  VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI
from llama_index.core.postprocessor import SentenceTransformerRerank

from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

import chromadb

import time

from diskcache import Cache

In [18]:

# --- Notebook-wide configuration -------------------------------------------
# Persistent (on-disk) Chroma client; save_index() and load_index() obtain
# their collection from this client.
chroma_client = chromadb.PersistentClient(path=r"C:\Users\sandy\Downloads\Insurance_Doc_RAG_With_LangchainLlamaIndex\chroma.db")
# diskcache store rooted at ./cache (relative to the working directory); the
# final cell uses it to cache responses keyed by the raw query string.
cache = Cache("./cache")
# Apply nest_asyncio's patch (presumably so nested event loops work inside the
# notebook kernel -- confirm it is still required).
nest_asyncio.apply()
# Default LLM stored on llama_index's global Settings object.
# NOTE(review): Settings.embed_model is not set anywhere in the visible cells,
# while build_index() embeds nodes with HuggingFaceEmbedding -- confirm that the
# query-time embedding model matches the one used at ingestion.
Settings.llm = OpenAI(model="gpt-4o-mini")
# OpenAI key read from the environment (load_dotenv is called in the import cell).
# NOTE(review): `api_key` is not referenced again in the visible cells.
api_key = os.getenv("OPENAI_API_KEY")
# Directory that build_index() reads the source documents from.
pdf_dir_path = r"C:\Users\sandy\Downloads\Insurance_Doc_RAG_With_LangchainLlamaIndex\New folder"
# Cross-encoder reranker configured with top_n=3; query_comm() passes it to the
# query engine as a node postprocessor.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3
)

In [3]:
#print(os.environ)
#print(api_key)

# docs = []
# for doc in documnet:
#     docs.append(doc.text)
# print(docs)

In [19]:
def build_index(pdf_dir_path, storage_context):
    """Ingest the documents under ``pdf_dir_path`` and index the resulting nodes.

    The documents are read with ``SimpleDirectoryReader`` and run through an
    ingestion pipeline made of three transformations, in order: sentence
    splitting, title extraction, and HuggingFace embedding. The nodes the
    pipeline returns are handed to a ``VectorStoreIndex`` created with the
    given ``storage_context``.

    Args:
        pdf_dir_path: Directory the source documents are read from.
        storage_context: Storage context passed to ``VectorStoreIndex``.

    Returns:
        The ``VectorStoreIndex`` built from the ingested nodes.
    """
    # Read the documents first so a bad path fails before any model is set up.
    source_documents = SimpleDirectoryReader(pdf_dir_path).load_data()

    transformations = [
        SentenceSplitter(),
        TitleExtractor(),
        HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2"),
        # Alternative embedder, currently disabled:
        # OpenAIEmbedding(model_name="text-embedding-ada-002"),
    ]
    ingestion = IngestionPipeline(transformations=transformations)
    ingested_nodes = ingestion.run(documents=source_documents)

    return VectorStoreIndex(nodes=ingested_nodes, storage_context=storage_context)

In [20]:
def save_index():
    """Build the vector index from ``pdf_dir_path`` and store it in Chroma.

    The "Insurance_Doc_RAG_LlamaIndex" collection is fetched from the
    module-level ``chroma_client`` (and created when it does not exist yet),
    wrapped in a ``ChromaVectorStore`` / ``StorageContext``, and passed to
    ``build_index`` together with the module-level ``pdf_dir_path``.

    NOTE(review): when the collection already holds nodes, running this again
    presumably adds the same documents a second time -- confirm whether the
    collection should be cleared first.

    Returns:
        The ``VectorStoreIndex`` returned by ``build_index``.
    """
    # get_or_create_collection replaces the former
    # `try create_collection / except Exception / get_collection` fallback,
    # which hid every failure of create_collection (not only "already exists").
    chroma_collection = chroma_client.get_or_create_collection(
        name="Insurance_Doc_RAG_LlamaIndex"
    )

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    return build_index(pdf_dir_path, storage_context)


In [21]:
def load_index():
    """Load the vector index from the existing Chroma collection.

    Looks up the "Insurance_Doc_RAG_LlamaIndex" collection on the module-level
    ``chroma_client`` and wraps it in a ``VectorStoreIndex`` without
    re-ingesting any documents.

    Returns:
        A ``VectorStoreIndex`` backed by the existing collection.

    Raises:
        LookupError: If the collection cannot be fetched (chained to the
            original Chroma error). Previously the handler only printed a
            message and execution fell through to an ``UnboundLocalError``
            on ``chroma_collection``.
    """
    try:
        chroma_collection = chroma_client.get_collection(name="Insurance_Doc_RAG_LlamaIndex")
    except Exception as e:
        # Keep the user-facing hint, then fail explicitly instead of carrying
        # on with an unbound `chroma_collection`.
        print("Save the index first")
        raise LookupError(
            "Chroma collection 'Insurance_Doc_RAG_LlamaIndex' could not be loaded; "
            "run save_index() first."
        ) from e

    vector_store = ChromaVectorStore(
        chroma_collection=chroma_collection,
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        storage_context=storage_context
    )

    return index

In [7]:
# Vector_Index = VectorStoreIndex()
# index = Vector_Index.build_index_from_nodes(nodes=nodes)
# Load documents
# pdf_dir_path = r"C:\Users\sandy\Downloads\Policy+Documents"
# docs = SimpleDirectoryReader(pdf_dir_path).load_data()

# with open('vector_store_index.pkl', 'rb') as f:
#     loaded_index = pickle.load(f)

# if loaded_index is not None:
#     index = loaded_index
# else:
#     index = build_index(pdf_dir_path)
#     with open("vector_store_index.pkl", "wb") as f:
#         pickle.dump(index, f)

In [8]:
# retriever = index.as_retriever()

# results = retriever.retrieve("What is the procedure to claim the insurance?")

In [9]:
# for res in results:
#     print(res.node.metadata["document_title"])

In [10]:
#from transformers import AutoModelForSequenceClassification, AutoTokenizer
# %pip install tf_keras
#%pip uninstall keras
#%pip install keras==2.11.0

In [22]:
def query_comm(query, index, similarity_top_k=3):
    """Answer ``query`` against ``index`` and return the query-engine response.

    Steps:
      1. Retrieve candidate nodes for ``query`` and collect their text,
         page labels and document titles.
      2. Embed those into a system prompt that tells the model how to answer
         and how to format references.
      3. Run a "compact" query engine that uses an OpenAI LLM carrying that
         system prompt and the module-level ``rerank`` postprocessor.

    Note that retrieval happens twice: once here to build the prompt, and once
    inside the query engine. Both now use the same ``similarity_top_k``.

    Args:
        query: The user's question.
        index: Index object providing ``as_retriever`` and ``as_query_engine``.
        similarity_top_k: Number of nodes to retrieve (default 3, the value the
            original call intended to pass).

    Returns:
        The object returned by ``query_engine.query(query)``.
    """
    # Use the same top-k as the query engine below so the references listed in
    # the prompt come from the same candidate set.
    retriever = index.as_retriever(similarity_top_k=similarity_top_k)
    results = retriever.retrieve(query)

    doc_texts = [res.node.text for res in results]
    # .get() keeps the prompt buildable for nodes lacking these metadata keys
    # (previously a KeyError).
    page_labels = [res.node.metadata.get('page_label', 'N/A') for res in results]
    doc_titles = [res.node.metadata.get('document_title', 'N/A') for res in results]

    system_message = f"""You are an Question answering expert. The user will ask you a question/query. 
    The Question is : {query}
    Now, the Documents related to the question is : {doc_texts}
    If the question is related to the document, answer it using the information in the document.
    If the question is not related to the document, answer "Please contact the insurance company/agent as I am not able to answer the question".
    If you answer the question, please provide the relevant document as reference.
    Reference Format : Page Number | Document Name.
    Page Numbers is {page_labels}
    Document Names is {doc_titles}
    Example:
    Reference 1 : Page 4 | Accidental Death Benefit Claims Procedure and Exclusions
    etc.
    Use all the info retrieved and if used then give the reference. Multiple references can be used.
    """

    llm = OpenAI(model="gpt-4o-mini", system_prompt=system_message)

    # Fixed keyword: this was spelled `similirty_top_k`, so the intended value
    # was never applied to the retriever.
    query_engine = index.as_query_engine(
        response_mode="compact",
        similarity_top_k=similarity_top_k,
        llm=llm,
        node_postprocessors=[rerank],
    )
    response = query_engine.query(query)
    return response

In [23]:
# Chatbot driver: obtain the index, read one query, answer it (from cache when
# possible) and print the result.

# Reuse the persisted index when it can be loaded; otherwise build it.
# `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
# SystemExit propagate rather than triggering a full re-index.
try:
    index = load_index()
    #print("load path")
except Exception:
    index = save_index()
    #print("save path")
print("Welcome to Insurance Documentation Chatbot. Please enter your query.")
query = input()
print("User: ", query)
print("-"*100)
time.sleep(1)
print("Searching in Cache... Please wait!")
# Look the query up once: a second cache.get() could miss if the entry expired
# between the check and the read, leaving `response` as None.
cached_response = cache.get(query)
if cached_response is not None:
    print("Data found in Cache. Retrieving relevant information...")
    time.sleep(1)
    response = cached_response
else:
    print("Data not found in Cache. Searching in Documents...")
    time.sleep(1)
    print("Data Found. Retrieving relevant information...")
    response = query_comm(query, index)
    # Cache the response for 600 seconds, keyed by the raw query string.
    cache.set(query, response, expire=600)
print("-"*100)
print(response)
print("-"*100)
print("Thanks for using Insurance Documentation Chatbot. Have a great day!")

Welcome to Insurance Documentation Chatbot. Please enter your query.
User:  what is Accelerated Critical Illness Benefit?
----------------------------------------------------------------------------------------------------
Searching in Cache... Please wait!
Data not found in Cache. Searching in Documents...
Data Found. Retrieving relevant information...
----------------------------------------------------------------------------------------------------
The Accelerated Critical Illness Benefit is a feature of an insurance policy that provides a benefit equal to the Sum Assured in the event that the Scheme Member is diagnosed with any of the covered Critical Illnesses during the Policy Term. Upon diagnosis, the policy will terminate, and all benefits will expire. The covered Critical Illnesses include conditions such as Myocardial Infarction, Cancer of Specified Severity, Stroke, and several others as listed in the policy document. 

Reference: Page 7 | Comprehensive Guide to Insurance B