# Import libraries

In [1]:
%pip install -qU "pinecone[grpc]" llama-index llama-parse llama-index-embeddings-ollama llama_index-llms-ollama llama-index-vector-stores-pinecone llama-index-vector-stores-mongodb --no-cache-dir

Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
from pathlib import Path
from dotenv import load_dotenv
import os
load_dotenv("../.env")

from pymongo import MongoClient
client = MongoClient(os.environ["MONGODB_URI"])

from pinecone.grpc import PineconeGRPC
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = PineconeGRPC(api_key=pinecone_api_key)


  from tqdm.autonotebook import tqdm


# Data acquisition:   
Fetch all relevant documents

## **If you have a json copy of the data** ***(DEPRECATED)***: 
- Each top-level object should be a collection//table
- the data should contain a vector targets top-level object
    - specifies the field used for each table for vectorization//embedding
- load the json file as data

In [None]:
data  = json.loads(Path("../data/sources/data.json").read_text())

## **If you directly use a pdf as data**:
- You need access to llama index 
    - Specifically, you need a llama-cloud account as well as your own llama-cloud API key
- Once you have your own API key
    - You can save it in a `.env` file according to `../.env.example`

In [2]:
llamaparse_api_key = os.getenv("LLAMA_PARSE_API_KEY")

### Setup llama parse from llama index
- A freemium document parser/data converter that can help parse **66** pages per day FOR FREE 

In [3]:
import nest_asyncio
nest_asyncio.apply()
from llama_parse import LlamaParse  # pip install llama-parse
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,Document  # pip install llama-index

results_encoding = 'markdown'
parser = LlamaParse(
    api_key=llamaparse_api_key,
    result_type=results_encoding,
    premium_mode=True,
    disable_image_extraction=True,
    take_screenshot=False,
    parsing_instruction= "This is an insurance document. SOME tables have structural issues like headers as columns or multi-line headers. RESTRUCTURE those tables, leave the rest as is.")

### If you had already parsed your document in llamaCloud's online Dashboard or with python
Get the job id and the `result_type` that specifies the encoding used when the given document was parsed

#### Get history of parsing requests

In [4]:
history = None
async with parser.client_context() as client:
    headers = {"Authorization": f"Bearer {parser.api_key}"}
    history = await client.get("https://api.cloud.llamaindex.ai//api/v1/parsing/history", headers=headers)
    history = history.json()
print("latest records:")
for record in history:
    if record['expired'] == False: print(f"day: {record['day']} \t job: {record['job_id']}")
print("\nexpired records:")
for record in history:
    if record['expired'] == True: print(f"day: {record['day']} \t job: {record['job_id']}")

latest records:
day: 2024-11-25 	 job: 31bc561d-6f41-4f46-ab0c-fef9f6330280

expired records:
day: 2024-11-22 	 job: 4bda4c29-1727-4167-9236-0f40859eadd9
day: 2024-11-22 	 job: a63efdcc-c254-40bc-aea9-76a8a4ae199f
day: 2024-11-22 	 job: cae1c64e-ce7f-4968-971d-4d2927da05b4
day: 2024-11-22 	 job: dcb4645d-3506-4c83-9e9b-8b49ba154856
day: 2024-11-22 	 job: ce8c8945-0f06-4dc2-b399-8cbe820aae2a
day: 2024-11-22 	 job: d5a03f7a-d888-4c5b-b9a1-f2ff19212faf
day: 2024-11-22 	 job: 188e4ac2-491f-4aee-a3db-f3a1ae527baa
day: 2024-11-22 	 job: edee6082-0c5b-4f6c-9cac-4e313ee03ad3
day: 2024-11-22 	 job: 16ffaadd-3dc6-4786-919d-bee6c116af8b
day: 2024-11-21 	 job: e1ce8746-ed05-4830-8c12-ad9911dabf9e
day: 2024-11-21 	 job: 6d7e117e-df3f-474a-95dd-92780d304b74
day: 2024-11-21 	 job: e543c3c6-6584-4459-86cf-3ae475c1adf5
day: 2024-11-21 	 job: f786759d-a9f3-4e9f-a345-9b7f8ed7ad60
day: 2024-11-20 	 job: 040b9401-8042-4467-b20e-22a5e4c54670


#### Save it for further use

In [None]:
async with parser.client_context() as client:
    headers = {"Authorization": f"Bearer {parser.api_key}"}
    history = await client.get("https://api.cloud.llamaindex.ai//api/v1/parsing/history", headers=headers)
    history = history.json()

In [5]:
job_id = '31bc561d-6f41-4f46-ab0c-fef9f6330280'
response = await parser._get_job_result(job_id=job_id,result_type=results_encoding)
if results_encoding == 'json':
    results = response['pages']
    for i, pages in enumerate(response['pages']):
        with open(f'./outputs/brochure_{i}.json', 'w') as f:
            f.write(json.dumps(pages['items']))
else:
    results = response['markdown']
    with open("../data/outputs/data.md", 'w') as f:
        f.write(response['markdown'])
    documents = Document(text=response['markdown'])
    print(documents.get_content())


# AXIATA DIGITAL ECODE SDN BHD (121497-T)
# GREAT EASTERN LIFE ASSURANCE (MALAYSIA) BERHAD (93745-A)
# GROUP MULTIPLE BENEFITS INSURANCE SCHEME GSS315
# (SIMPLIFIED ISSUANCE OFFER) SPIF

Great Eastern Life Assurance (Malaysia) Berhad (93745-A) together with Axiata Digital Ecode Sdn Bhd (121497-T) has launched a Group Multiple Benefit Insurance Scheme with investment based on the Dana Gemilang as per the illustration below. This scheme covers your employees/members, their legal spouse and children. This scheme also provides coverage against 45 Critical Illnesses.

## AS LOW AS RM 30 – RM 45 / MONTH

| Benefits | Coverage provided under this scheme |  |
|----------|:-----------------------------------:|:--:|
|          | Sum Assured |  |
|          | Premium of RM 30 /month | Premium of RM 45 /month |
| 45 Critical Illnesses* | RM20,000 | RM30,000 |
| Death (due to illness or Natural Cause) | RM20,000 | RM30,000 |
| Accidental Death | RM40,000 | RM60,000 |
| Total and Permanent Disabili

### Or you can parse a document that you have via the API

In [35]:
file_extractor = {".pdf": parser}

reader = SimpleDirectoryReader(input_files=['../data/sources/BROCHURE.pdf'], file_extractor=file_extractor)
documents = await reader.aload_data()

Started parsing the file under job_id 7dccc4b4-06e5-4659-94a2-7a529b1c2e2e


# Data ingestion:   

## Option 1: Mongo & Pinecone (DEPRECATED)
We use mongoDB as the document database and pinecone as the vector database
since it is better to supply the full context of structured and unstructured data fetched in the future,   
we will store the raw text in mongodb first   
and apply preprocessing later when we need semantically/meaning accurate search results (important for vectorization)

In [None]:
# Connect to your MongoDB Atlas(Cloud) cluster
from pymongo import MongoClient
pinecone_api_key = os.getenv("PINECONE_API_KEY")
client = MongoClient(os.environ["MONGODB_URI"])
db = client[os.environ["MONGODB_DB"]]

In [3]:
tables = [
    {"name":"premium_plans", "field_containing_data":"all"},
    {"name":"critical_illnesses", "field_containing_data":"illnesses"},
    {"name":"schedule_of_compensation","field_containing_data":"all"},
    {"name":"total_investment_estimations", "field_containing_data":"all"},
    {"name":"premium_allocations","field_containing_data":"all"},
    {"name":"claims","field_containing_data":"claim"},
    {"name":"funds","field_containing_data":"fund_performance"},
    ]
for table in tables:
    print(data[table['name']])
    table['table'] = data[table['name']]
    

[{'currency': 'RM', 'premium_per_month': 30, 'coverage': {'critical_illnesses': 20000, 'death_natural': 20000, 'accidental_death': 40000, 'TPD_illness': 20000, 'TPD_accident': 40000, 'hospitalisation_income_per_day': 20, 'funeral_expenses': 5000}}, {'currency': 'RM', 'premium_per_month': 45, 'coverage': {'critical_illnesses': 30000, 'death_natural': 30000, 'accidental_death': 60000, 'TPD_illness': 30000, 'TPD_accident': 60000, 'hospitalisation_income_per_day': 30, 'funeral_expenses': 5000}}]
{'disclaimer': 'Upon payment of claim for Angioplasty and other invasive treatments for coronary artery disease, the sum assured will be reduced by the quantum of the payment for Angioplasty and other invasive treatments for coronary artery disease. However, the premium shall remain unchanged. This benefit is subject to a limit of RM1,000,000 under the Policy and all other non-credit-related group policies (including supplementary contract and endorsement, if any) issued by the Company by any name 

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.chat_models import ChatOllama as Ollama
from langchain_core.output_parsers import StrOutputParser as parser


# pipe = HuggingFacePipeline.from_model_id(model_id="meta-llama/Llama-3.2-3B",task = "text-generation")

def tables_summarize(tables):
    """Ask a local llama3.2 chat model for a sentence-style summary of each element.

    Every element of ``tables`` is substituted into the ``{element}`` slot of the
    prompt and the batch is run with at most 5 concurrent requests.

    Args:
        tables: the table/text chunks to summarise, one summary is produced per element.

    Returns:
        The batch output of the chain: one parsed string per input element,
        in input order.
    """
    summary_instructions = """You are an assistant tasked with summarizing tables and text. \
                    Give a concise summary of the table or text as sentences. \
                    Do not include text like "here is a summary of the table" in your responses \
                    Table or text chunk: {element} """

    # temperature=0 is passed to the model for every summary request
    llm = Ollama(temperature=0, model="llama3.2")
    template = ChatPromptTemplate.from_template(summary_instructions)

    # `parser` here is the StrOutputParser alias imported at the top of this cell
    identity_input = {"element": lambda x: x}
    pipeline = identity_input | template | llm | parser()

    return pipeline.batch(tables, {"max_concurrency": 5})
# table_summaries = tables_summarize(tables)



In [9]:
# Attach each generated summary to its table inside `data`.
# `table_summaries` is matched to `tables` by position.
# NOTE(review): `table_summaries` must already exist in the kernel; the call that
# produces it is commented out in the cell that defines `tables_summarize`.
summaries = {tables[position]["name"]: summary for position, summary in enumerate(table_summaries)}

for key, value in summaries.items():
    if isinstance(data[key], list):
        # A list cannot carry a "summary" key, so it is wrapped in a dict.
        # This now also covers lists with zero or one element, which previously
        # fell through to the dict branch and failed on the item assignment.
        collection = {"table": data[key]}
    else:
        collection = data[key]
    collection["summary"] = value
    data[key] = collection
    print(data[key])



{'table': [{'currency': 'RM', 'premium_per_month': 30, 'coverage': {'critical_illnesses': 20000, 'death_natural': 20000, 'accidental_death': 40000, 'TPD_illness': 20000, 'TPD_accident': 40000, 'hospitalisation_income_per_day': 20, 'funeral_expenses': 5000}}, {'currency': 'RM', 'premium_per_month': 45, 'coverage': {'critical_illnesses': 30000, 'death_natural': 30000, 'accidental_death': 60000, 'TPD_illness': 30000, 'TPD_accident': 60000, 'hospitalisation_income_per_day': 30, 'funeral_expenses': 5000}}], 'summary': 'The table contains information on premium plans for a life insurance policy. The two plans have different monthly premiums and coverage amounts for various benefits such as critical illnesses, death due to natural causes, accidental deaths, and hospitalization income per day. The plans also include funeral expenses coverage.'}
{'disclaimer': 'Upon payment of claim for Angioplasty and other invasive treatments for coronary artery disease, the sum assured will be reduced by the

In [5]:
# Every top-level key of `data` is used as a MongoDB collection name and the
# corresponding value is inserted into that collection.
collection_names = list(data)

# Collections that already exist in the database are skipped, so re-running the
# cell does not duplicate records.
ignored_and_existing_collections = db.list_collection_names()

# Collections that are never created from `data`, even if they do not exist yet.
for ignored_name in ("questions_and_answers2", "exclusions2"):
    if ignored_name not in ignored_and_existing_collections:
        ignored_and_existing_collections.append(ignored_name)

for collection_name in collection_names:
    if collection_name in ignored_and_existing_collections:
        continue
    collection = db[collection_name]
    if isinstance(data[collection_name], list):
        print(f"""
inserting many records of {data[collection_name]} into {collection_name} collection
""")
        collection.insert_many(data[collection_name])
    else:
        print(f"""
inserting one record of {data[collection_name]} into {collection_name} collection
""")
        collection.insert_one(data[collection_name])

# Separately make sure the vector targets are present.
collection = db["vector_targets"]
if collection.find_one() is None:
    # The message names the collection explicitly instead of reusing the loop
    # variable above, which pointed at an unrelated collection (or was undefined
    # when `data` had no keys).
    print(f"""
inserting many records of {data["vector_targets"]} into vector_targets collection
""")
    collection.insert_many(data["vector_targets"])
else:
    print("vector_targets already exists in the database, skipping insertion")

print("insertion complete")

# Drop the temporaries and the loaded source data from the kernel namespace.
# NOTE: because `data` is deleted, this cell cannot be re-run without reloading it.
del ignored_and_existing_collections
del collection_names
del data


vector_targets already exists in the database, skipping insertion
insertion complete


further preprocessing:   
lowercasing   
special character removal

stop words removal

lemmatization

In [3]:
#use nltk to lowercase, remove special characters, tokenize, remove stopwords, and lemmatize the values in the json data
#fetch all data cleaning resources
import nltk
import re
# Make sure the WordNet corpus used by the lemmatizer is available locally.
try:
    nltk.data.find('corpora/wordnet.zip')
except LookupError:
    # nltk.data.find raises LookupError when the resource is missing; only that
    # case should trigger a download (a bare except would also swallow
    # KeyboardInterrupt and unrelated errors).
    print("sam ting wong")
    nltk.download('wordnet')
    
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
#initialize the stopwords
stop_words = set(stopwords.words('english'))
#allow specific stopwords
stop_words.remove("both")


def preprocess_text(text):
    """Clean a piece of text for embedding.

    The text is tokenized and POS-tagged; tokens whose lowercase form is a stop
    word, or that start with one of the characters ``; @ # & * ! ( ) [ ]``, are
    dropped. The remaining tokens are lowercased and lemmatized using the part
    of speech derived from the first letter of their tag.

    Args:
        text (str): the raw text.

    Returns:
        str: the surviving lemmas joined by single spaces.
    """
    special_start = re.compile(r"[;@#&*!()\[\]]")
    # first letter of the POS tag -> name of the matching WordNet constant
    pos_attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'R': 'ADV'}

    tagged_tokens = nltk.pos_tag(word_tokenize(text))

    lemmas = []
    for token, tag in tagged_tokens:
        lowered = token.lower()
        if lowered in stop_words or special_start.match(token):
            continue
        # anything that is not an adjective, verb or adverb is treated as a noun
        wordnet_pos = getattr(wordnet, pos_attr_by_initial.get(tag[0], 'NOUN'))
        lemmas.append(lemmatizer.lemmatize(lowered, wordnet_pos))

    return ' '.join(lemmas)

#setup a function to recursively preprocess the json data
def traverse_json(data, json_text_action, target_key=None):
    """Apply ``json_text_action`` to string values of a nested dict/list structure, in place.

    Without ``target_key`` every string found anywhere in the structure is
    replaced by ``json_text_action(string)``.

    With ``target_key`` only values stored under that key are processed: a
    string value is replaced directly, a dict/list value has *all* of its
    strings processed. Values under other keys are left untouched, but nested
    containers are still searched for the target key. (Previously the key was
    dropped on recursion, so every nested container was processed regardless of
    the key it was stored under.)

    Args:
        data: the structure to process; dicts and lists are mutated in place,
            anything else is returned unchanged.
        json_text_action: callable taking a string and returning its replacement.
        target_key: optional dict key restricting which values are processed.

    Returns:
        The same ``data`` object that was passed in.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            selected = target_key is None or key == target_key
            if isinstance(value, str):
                if selected:
                    data[key] = json_text_action(value)
            elif isinstance(value, (dict, list)):
                # Inside a selected subtree every string is processed
                # (target_key=None); otherwise keep looking for the key deeper down.
                traverse_json(value, json_text_action, None if selected else target_key)
    elif isinstance(data, list):
        for i, element in enumerate(data):
            if isinstance(element, str):
                # A bare string in a list has no key of its own, so it is only
                # processed when no key filter is active.
                if target_key is None:
                    data[i] = json_text_action(element)
            elif isinstance(element, (dict, list)):
                traverse_json(element, json_text_action, target_key)
    return data

# preprocess_text("i need information on performing funds. how much my premium is allocated? the allocation rate, the pricing plan, how much will i be paying per premium payment, what is the coverage performance, what are the fundings, testing the word post-apocalyptic, 80 90% 20$")

In [4]:
# Connect to your pinecone vector database and specify the working index
from pinecone import Pinecone, ServerlessSpec
# Connect to Pinecone and make sure the working index exists.
pc = Pinecone(os.environ["PINECONE_API_KEY"])
index_name = os.environ["PINECONE_INDEX_NAME"]

existing_index_names = [existing.name for existing in pc.list_indexes()]
if index_name not in existing_index_names:
    # 768 matches the length of the nomic-embed-text vectors produced in this notebook.
    pc.create_index(name = index_name,
                    dimension = 768,
                    metric = "cosine",
                    spec = ServerlessSpec(cloud = "aws", region = "us-east-1")
                    )

# create_index does not hand back an object that can upsert/query, so the
# handle is always obtained explicitly, whether or not the index was just created.
index = pc.Index(index_name)

  from tqdm.autonotebook import tqdm


In [5]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 51}},
 'total_vector_count': 51}

data embedding//vectorization

target descriptive values and generate embeddings for semantic search

In [None]:
# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1.5")
# %pip install sentence-transformers scipy
# from sentence_transformers import SentenceTransformer
# from ollama import embeddings 
# global_embedding_model = SentenceTransformer(global_embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True))
# global_embedding_model = embeddings(model = 'nomic-embed-text')

# number_of_dimensions = 768 #nomic-embed-text-v1.5 has 768 dimensions


In [31]:
#specify the target fields to embed
import ollama
from copy import deepcopy
# from langchain_core.documents import Document

def vectorize(texts, model='nomic-embed-text'):
    """Embed a piece of text with an Ollama embedding model.

    Args:
        texts: the text to embed; it is sent as the ``prompt`` of a single
            ``ollama.embeddings`` request.
        model: name of the Ollama model to use. Defaults to 'nomic-embed-text',
            the model this function previously hard-coded.

    Returns:
        The ``ollama.embeddings`` response; callers in this notebook read the
        vector from its ``'embedding'`` entry.
    """
    return ollama.embeddings(model=model, prompt=texts)

ignore_collections = ["vector_targets","test2", "test","questions_and_answers2","exclusions2"]

def vectorize_and_index(mongo_database=db, index=index, ignore_collections=ignore_collections, debug=False, batch_size=100):
    """Vectorize the target fields and store the embeddings in the pinecone index.

    For every collection not listed in ``ignore_collections`` the
    ``vector_targets`` collection is queried for entries whose ``name`` equals
    the collection name; each entry's ``field`` names the record field to embed.
    The field is cleaned with ``preprocess_text`` and embedded with
    ``vectorize`` (both applied through ``traverse_json``, which mutates the
    fetched record in memory only).

    Vector ids are ``<_id>-<collection>-<field>`` for string fields and
    ``<_id>-<collection>-<field>-<position>`` for list fields. The metadata key
    ``original`` always holds the uncleaned text.

    Current implementation does not support converting mongo collection names
    to pinecone namespaces.

    Args:
        mongo_database: the MongoDB database; it must contain a
            ``vector_targets`` collection. Defaults to ``db``.
        index: the Pinecone index to upsert into. Defaults to ``index``.
        ignore_collections: collection names that are never vectorized.
        debug (bool): print progress information.
        batch_size (int): number of vectors sent per upsert request.

    Raises:
        ValueError: if ``mongo_database`` is None or ``index`` is falsy.
    """
    if mongo_database is None:
        raise ValueError("mongo_database is required, the mongo database should also have a vector_targets collection specifying the target fields to vectorize")
    if not index:
        raise ValueError("index is required")

    vectors = []
    for collection_name in mongo_database.list_collection_names():
        if collection_name in ignore_collections:
            continue                                            # skip collections that should not be vectorized
        if debug: print(f"\nIn {collection_name}")

        # The targets are the same for every record of the collection, so they
        # are looked up once before the records are fetched.
        targets = list(mongo_database.vector_targets.find({"name": collection_name}))
        if not targets:
            if debug:
                print("no matching target, skipping vectorization")
            continue

        for record in mongo_database[collection_name].find():
            for target in targets:
                field = target['field']
                if field not in record:
                    # not every record necessarily carries every target field
                    if debug: print(f"record {record['_id']} has no field {field}, skipping")
                    continue

                old_data = deepcopy(record[field])               # original, uncleaned value
                traverse_json(record, preprocess_text, field)
                cleaned_data = deepcopy(record[field])           # cleaned text
                traverse_json(record, vectorize, field)
                vector = record[field]                           # embedding response(s)
                if debug: print(f"original: {old_data}\ncleaned : {cleaned_data}")

                base_id = str(record['_id'])+"-"+collection_name+"-"+field
                if isinstance(cleaned_data, list):
                    # list field: one vector per element, suffixed with its position
                    for j, sub_vector in enumerate(vector):
                        sub_vector = sub_vector['embedding']
                        if debug: print(f"vectorizing item {j+1} : {cleaned_data[j]} \nlength {len(sub_vector)}")
                        vectors.append({"id": base_id+"-"+str(j)
                                        ,"values": sub_vector
                                        ,"metadata": {"original": old_data[j]}
                                        })
                else:
                    vector = vector['embedding']
                    if debug: print(f"vectorizing string\nlength {len(vector)}")
                    # store the uncleaned text, consistent with the list branch
                    # (the cleaned text used to be stored here by mistake)
                    vectors.append({"id": base_id
                                    ,"values": vector
                                    ,"metadata": {"original": old_data}
                                    })

    # Upsert in batches; nothing is sent when no vector was produced.
    # NOTE(review): confirm the batch size against Pinecone's request-size limits.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
            
    

vectorize_and_index(debug=True)



In questions_and_answers
original: Yes. They can participate until they reach the maximum expiry age of sixty-five (65) years age next birthday.
cleaned : yes . participate reach maximum expiry age sixty-five 65 year age next birthday .
vectorizing string
length 768
original: Can the spouse/children continue to participate in this scheme if the Assured Member dead/ disabled/ contracted one of the 45 Critical Illnesses before attaining age sixty-five (65) years age next birthday
cleaned : spouse/children continue participate scheme assured member dead/ disabled/ contract one 45 critical illness attain age sixty-five 65 year age next birthday
vectorizing string
length 768
original: Yes. The Assured Member/spouse/children will be required to reapply by completing a standard Group Proposal form, subject to approval by the Company and up to the maximum benefit allowed
cleaned : yes . assured member/spouse/children require reapply complete standard group proposal form , subject approval com

In [11]:
#using local(ollama) base model as chatbot model
intent_classifier = Ollama(model='llama3.2',#:1B
                           temperature=0,
                           num_predict=4,
                           request_timeout=10,
                           verbose=False )
chatbot_model = Ollama(model='llama3.2', request_timeout=60)

In [14]:
#setup the semantic search function
def semantic_search(question, vector_store=vector_store, index=index, top_k=6, verbose=False):
    """Run a similarity search for ``question`` once per namespace reported by the index.

    NOTE(review): the namespace name is not passed to the search, so every
    iteration issues the same query; with more than one namespace the result
    lists are presumably duplicates of each other.

    Args:
        question (str): the text to search for.
        vector_store: object exposing ``similarity_search(query=..., k=...)``.
            Defaults to the notebook-level ``vector_store``.
        index: object exposing ``describe_index_stats()``; only used to
            enumerate namespaces. Defaults to the notebook-level ``index``.
        top_k (int): number of results requested per search. Defaults to 6.
        verbose (bool): print a progress message. Defaults to False.

    Returns:
        list: one ``similarity_search`` result per namespace, or None (after
        printing a message) when ``index`` is falsy.
    """
    if not index:
        print("Index not found, please specify your pinecone or mongo search index")
        return None

    if verbose:
        print("Encoding question...")

    namespaces = index.describe_index_stats()['namespaces']
    return [
        vector_store.similarity_search(query=question, k=top_k)
        for _namespace, _stats in namespaces.items()
    ]

# Example usage
response = semantic_search("who manages this product")
response

[[Document(id='67286e50987ce18efd0c3ee4-questions_and_answers-answer-2', metadata={'original': 'Fund Management Charge is 0.50% per annum. Note: The fees and charges levied may change from time to time.'}, page_content='fund management charge 0.50 % per annum . note : fee charge levy may change time time .'),
  Document(id='67286e50987ce18efd0c3ee1-questions_and_answers-answer', metadata={'original': 'Yes. The Assured Member/spouse/children will be required to reapply by completing a standard Group Proposal form, subject to approval by the Company and up to the maximum benefit allowed'}, page_content='yes . assured member/spouse/children require reapply complete standard group proposal form , subject approval company maximum benefit allow'),
  Document(id='67286e50987ce18efd0c3ec7-advantages-advantage', metadata={'original': 'Automatic premium remittance via credit card / bank deduction / JomPay ensures continuous protection.'}, page_content='automatic premium remittance via credit car

In [15]:

#setup a function to receive semantic search results and return the collection name and ids
def get_collection_matches(response : list, verbose=False) -> list:
    """Extract the MongoDB collection name and document id encoded in the id of each search match.

    Match ids are expected to look like ``<document id>-<collection>-...``;
    only the first two ``-``-separated parts are used, so collection names
    containing ``-`` are not supported.

    Each entry of ``response`` may be:
      * a list of matches (e.g. the documents returned by one similarity search),
      * a mapping with a ``'matches'`` entry (presumably a Pinecone query
        response; verify against the caller), or
      * a single match.
    A match is either subscriptable with ``'id'`` or an object with an ``id``
    attribute.

    Args:
        response (list): the search results described above.
        verbose (bool, optional): flag to use debug mode. Defaults to False.

    Returns:
        list: de-duplicated ``{'collection': ..., 'id': ...}`` dicts in
        first-seen order.
    """
    def extract_info(match) -> dict:
        try:
            raw_id = match['id']
        except TypeError:  # not subscriptable: probably a document object
            raw_id = match.id
        data_from_id = raw_id.split("-")
        return {'collection': data_from_id[1], 'id': data_from_id[0]}

    def iter_matches(entry):
        if isinstance(entry, list):
            if verbose:
                print("no pinecone namespace found, checking id or additional metadata")
            return entry
        try:
            return entry['matches']
        except (KeyError, TypeError, AttributeError):
            # no 'matches' container: the entry itself is a single match
            return [entry]

    if verbose:
        print("Getting collection matches...")

    document_metadata = []
    seen = set()
    for entry in response:
        for match in iter_matches(entry):
            document = extract_info(match)
            fingerprint = (document['collection'], document['id'])
            if fingerprint in seen:
                if verbose:
                    print(f"Duplicate id {document['id']} for {document['collection']} found in fetch list, ignoring")
                continue
            seen.add(fingerprint)
            document_metadata.append(document)
    return document_metadata

document_metadata = get_collection_matches(response, verbose=True)
# print(document_metadata)

Getting collection matches...
no pinecone namespace found, checking id or additional metadata


In [None]:
#find the mongo documents based on their collection and ids
from bson.objectid import ObjectId as oid 
def find_documents(collection_matches : list, verbose=False, database=db) -> list:
    """Look up the MongoDB documents referenced by a list of collection matches.

    Args:
        collection_matches (list): dicts with a 'collection' name and either a
            list of id strings under 'ids' or a single id string under 'id'.
        verbose (bool, optional): print how many references are processed. Defaults to False.
        database (pymongo.synchronous.database.Database, optional): the mongodb database to use. Defaults to the db variable.

    Returns:
        list: the result of one ``find_one`` per referenced id, in input order;
        each result is appended as-is.
    """
    if verbose:
        print(f"Finding documents using {len(collection_matches)} references")

    found = []
    for reference in collection_matches:
        mongo_collection = database[reference['collection']]
        # a reference carries either several ids or exactly one
        id_strings = reference['ids'] if 'ids' in reference else [reference['id']]
        for id_string in id_strings:
            found.append(mongo_collection.find_one({"_id": oid(id_string)}))
    return found

documents = find_documents(document_metadata)
documents

[{'_id': ObjectId('67286e50987ce18efd0c3ee4'),
  'question': 'What are the current fees and charges?',
  'answer': ['Insurance charges are Applicable to the sum assured, and vary depending on the average age profile and claim experience of the scheme.',
   'Monthly Policy Fee is RM5.00',
   'Fund Management Charge is 0.50% per annum. Note: The fees and charges levied may change from time to time.']},
 {'_id': ObjectId('67286e50987ce18efd0c3ee1'),
  'question': 'Can the Assured Member/spouse/children apply to contribute more',
  'answer': 'Yes. The Assured Member/spouse/children will be required to reapply by completing a standard Group Proposal form, subject to approval by the Company and up to the maximum benefit allowed'},
 {'_id': ObjectId('67286e50987ce18efd0c3ec7'),
  'advantage': 'Automatic premium remittance via credit card / bank deduction / JomPay ensures continuous protection.'},
 {'_id': ObjectId('67286e50987ce18efd0c3ecc'),
  'demographic': 'general',
  'description': 'Assu

The previous three functions only give access to unstructured data; structured data is accessed through the table summaries that were set up earlier.

In [None]:
#setup the chatbot model for tool use
import re

def get_context(question: str, verbose: bool = False) -> "Iterator[str]":
    """Retrieves text-based information for an insurance product only based on the user query.
does not answer questions about the chat agent

    The question is run through ``semantic_search``, the matches are resolved to
    MongoDB documents with ``get_collection_matches`` and ``find_documents``,
    and the documents are handed to ``chatbot_model`` as context.

    Args:
        question (str): the user query.
        verbose (bool): forwarded to the three retrieval helpers.

    Returns:
        The stream returned by ``chain.stream(question)``; iterate it to receive
        the answer piece by piece.
    """
    # Imported here because the notebook only binds StrOutputParser under the
    # alias `parser`, and RunnablePassthrough is not imported in any visible cell.
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.runnables import RunnablePassthrough

    print(question)
    matches = semantic_search(question, verbose=verbose)
    document_data = get_collection_matches(matches, verbose=verbose)
    context = find_documents(document_data, verbose=verbose)

    # The doubled braces survive the f-string as {question} / {context}
    # placeholders for the prompt template.
    template = ChatPromptTemplate.from_template(f"""fullfill the query with the provided information
Query      :{{question}}
Information:{{context}}""")

    # RAG pipeline: the retrieved documents are injected as "context", the
    # incoming question is passed through unchanged.
    chain = {
        "context": lambda x: context , "question": RunnablePassthrough()
        } | template | chatbot_model | StrOutputParser()
    return chain.stream(question)

# tools = [fetch_metrics,
#         get_context
#         ]
# toolPicker = Ollama(model='llama3.2').bind_tools(tools)
# for group in fetch_metrics("i need information on performing funds, how much my premium is allocated, the allocation rate, the pricing plan, how much will i be paying per premium payment, what is the coverage performance, what are the fundings"):
#     for document in group:
#         print(document)

# fetch_metrics("i need information on performing funds. how much my premium is allocated? the allocation rate, the pricing plan, how much will i be paying per premium payment, what is the coverage performance, what are the fundings, testing the word post-apocalyptic, 80 90% 20$")

In [None]:
#setup a simple intent classifier
def classify_intent(user_input: str) -> str:
    """Classify the intent of the user input.

    Sends a few-shot prompt to ``intent_classifier`` (configured elsewhere in the
    notebook; the original note describes it as a lightweight llama3.2:1B model)
    and returns the label it answers with.

    The few-shot examples use four labels: 'normal', 'register', 'RAG' and 'verify'.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        The model's answer with surrounding whitespace removed, so callers can
        compare it to a label with ``==``.
    """
    response = intent_classifier.invoke(f"""Classify the given input, use RAG if it is asking about insurance products,answer only with 'normal','register','RAG','verify':

example:

Input: "Is there a contact number", Intent: RAG
Input: "How do I create a new account", Intent: register
Input: "How do I make a claim", Intent: RAG
Input: "Search the web for cat videos", Intent: normal
Input: "Help me register for this service", Intent: register
Input: "Where can I get started", Intent: register
Input: "What is your name", Intent: normal
Input: "What entities are attached to this service", Intent: RAG
Input: "What is your purpose", Intent: normal
Input: "Can I see some fund performance metrics", Intent: RAG
Input: "What is the weather in your country", Intent: normal
Input: "How can i register for an account", Intent: register
Input: "Who owns the product", Intent: RAG
Input: "Tell me how to subscribe", Intent: register
Input: "Guide me through the registration process", Intent: register
Input: "How do I sign up for the trial", Intent: register
Input: "Can you explain your features", Intent: normal
Input: "Sign me up", Intent: register
Input: "Can you assist me with enrolling", Intent: register
Input: "What services does the product provide", Intent: RAG
Input: "Where can i wash my dog", Intent: normal
Input: "Goodbye", Intent: normal
Input: "How do i pay for the service", Intent: RAG
Input: "how is my premium allocated", Intent: RAG
Input: "Hello", Intent: normal
Input: "What are the coverage options", Intent: RAG
Input: "What's the first step to register", Intent: register
Input: "What funds are involved", Intent: RAG
Input: "Where are you located", Intent: normal
Input: "Who do I contact for help", Intent: RAG
Input: "Can I pay with a credit card", Intent: RAG
Input: "Why should i trust you", Intent: normal
Input: "What should i get ready for enrollment", Intent: register
Input: "How are you", Intent: normal
Input: "What company distributes this service", Intent: RAG
Input: "how do i know you are not a scam", Intent: verify
Input: "Are there any additional charges", Intent: RAG
Input: "What can you tell me about the available insurance plans", Intent: RAG
Input: "how much do i need to pay for the insurance scheme", Intent: RAG
Input: "tell me about the available insurance plans", Intent: RAG
Input: "how did you get my number", Intent: verify
Input: "show me verification so i know this isn't a scam", Intent: verify
Input: "how do i know you are not scamming me", Intent: verify
Input: "Why should I sign up for this plan", Intent: RAG

Input: {user_input}""")
    # Small models often pad the label with whitespace/newlines; strip it so the
    # exact-match comparisons done by callers keep working.
    return response.content.strip()

# Quick sanity check of the classifier on a conversational (non-insurance) input.
classify_intent("who are you")

'normal'

In [28]:
def chat(user_input: str):
    """Route a user message by intent and return an iterable of response chunks.

    - 'RAG'      -> answer from the insurance knowledge base via ``get_context``.
    - 'register' -> placeholder message (registration is not implemented yet).
    - otherwise  -> plain chat with ``chatbot_model``.

    Every branch returns something iterable, so callers can always write
    ``for chunk in chat(...)``.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        An iterable of string chunks making up the reply.
    """
    # Strip so a label padded with whitespace still matches the comparisons below.
    intent = classify_intent(user_input).strip()
    print(intent)

    if intent == "RAG":
        return get_context(user_input)

    if intent == "register":
        # Previously this branch only printed and returned None, which made
        # iterating over the result raise a TypeError.
        return iter(["register function work in progress"])

    chain = chatbot_model | StrOutputParser()
    return chain.stream(user_input)
    


In [30]:
# Print each streamed chunk as soon as it arrives, with no separator between chunks.
for chunk in chat("what is your name"):
    print(chunk, end="", flush=True)

normal
I don't have a personal name. I'm an AI designed to provide information and assist with tasks, but I don't have a personal identity or emotions. I exist solely to help users like you with their queries.

If you'd like, I can suggest some alternative names that might be helpful in our conversation. For example, I could be referred to as "Assistant" or "AI Companion." Let me know if there's anything else I can do to make our interaction more comfortable!

## Option 2: Pinecone Only
We can use Pinecone on its own as the vector store for semantic search.   
We embed the parsed markdown of the document with an embedding model served by Ollama and store the resulting vectors in Pinecone.

### Setup pinecone environment
we need
1. Index
2. Vectorstore 
3. Ingestion pipeline
    - will specify a workflow for feeding documents into our pinecone index
4. embedding model
    - to vectorize the markdown version of our document data for storage and indexing later.

In [6]:
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
# Embedding model and LLM, both accessed through the LlamaIndex Ollama integrations.
embed_model = OllamaEmbedding(model_name='nomic-embed-text', ollama_additional_kwargs={"mirostat": 0})
llm = Ollama(model='llama3.2', request_timeout=30.0)

# Register both on the global LlamaIndex Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
from pinecone import ServerlessSpec

from llama_index.vector_stores.pinecone import PineconeVectorStore
# Initialize connection to Pinecone
index_name = "llama-integration-example"

# Create the index only when it does not exist yet (safe to re-run).
if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
        index_name,
        dimension=768,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Always open the data-plane handle with `pc.Index`: the value returned by
# `create_index` is not an Index object, so it cannot be used for
# upserts/queries or handed to PineconeVectorStore.
pinecone_index = pc.Index(index_name)

# Initialize VectorStore
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [None]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.ingestion import IngestionPipeline
# Define the ingestion pipeline. Transformations run in list order:
#   1. SemanticSplitterNodeParser splits the documents into nodes, using
#      `embed_model` to decide where to break (97th-percentile breakpoint threshold).
#   2. `embed_model` is then applied as a transformation of its own.
# The pipeline is attached to the Pinecone-backed `vector_store`.
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=97, 
            embed_model=embed_model,
            ),
        embed_model,
        ],
    vector_store=vector_store)

In [None]:
# Reset the index if it is not empty, so re-running this cell does not
# accumulate duplicate vectors.
if pinecone_index.describe_index_stats().total_vector_count > 0:
    pinecone_index.delete(delete_all=True)

# `documents` must be passed by keyword: the first positional parameter of
# IngestionPipeline.run is `show_progress`, so `pipeline.run(documents)` would
# run the pipeline without any input documents.
ingested_nodes = pipeline.run(documents=documents)

# Display how many nodes were produced and stored.
len(ingested_nodes)

11

In [18]:
# Show the index statistics (dimension, namespaces, vector counts) to confirm ingestion.
print(pinecone_index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 11}},
 'total_vector_count': 11}


## Option 3: MongoDB only

[mongodb pipeline setup](https://medium.com/@abdulsomad.me/how-to-build-rag-app-with-mongodb-atlas-database-llama-index-gemini-llm-and-embedding-and-8e82df16d6bf)   
the cheapest but most limited option

as for setup, we need
1. Mongodb client
    - for setting up a client connection to the storage or project cluster
    - uses a cluster-specific connection string provided in mongodb Atlas
2. A storage context
    - A llamaindex container to prepare data for storage
3. Embedding model
    - Can come from a hosted provider (e.g. `OpenAI`) or be self-hosted (e.g. `ollama`)

In [3]:
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
# Embedding model and LLM, both accessed through the LlamaIndex Ollama integrations.
# This option uses a 60 s request timeout for the LLM.
embed_model = OllamaEmbedding(model_name='nomic-embed-text', ollama_additional_kwargs={"mirostat": 0})
llm = Ollama(model='llama3.2', request_timeout=60.0)

# Register both on the global LlamaIndex Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
# Specify the collection for which to create the index
from pymongo.operations import SearchIndexModel

collection = client["product1Chunked"]["llamaIndexChunks"]
index_name = 'vector_index'

# Create the search index only when no existing search index carries this name.
if all(index['name'] != index_name for index in collection.list_search_indexes()):
    # Vector field used for similarity search, plus a filterable metadata field.
    search_index_model = SearchIndexModel(
        definition={
            "fields": [
                {
                    "type": "vector",
                    "path": "embedding",
                    "numDimensions": 768,
                    "similarity": "cosine",
                },
                {
                    "type": "filter",
                    "path": "metadata.page_label",
                },
            ]
        },
        name=index_name,
        type="vectorSearch",
    )
    collection.create_search_index(model=search_index_model)

In [21]:
# Inspect the search indexes on the collection (name, status, definition).
list(collection.list_search_indexes())

[{'id': '674426cc05bafd0e3943651c',
  'name': 'vector_index',
  'type': 'vectorSearch',
  'status': 'READY',
  'queryable': True,
  'latestDefinitionVersion': {'version': 0,
   'createdAt': datetime.datetime(2024, 11, 25, 7, 27, 8, 200000)},
  'latestDefinition': {'fields': [{'type': 'vector',
     'path': 'embedding',
     'numDimensions': 768,
     'similarity': 'cosine'},
    {'type': 'filter', 'path': 'metadata.page_label'}]},
  'statusDetail': [{'hostname': 'atlas-11ot6b-shard-00-02',
    'status': 'READY',
    'queryable': True,
    'mainIndex': {'status': 'READY',
     'queryable': True,
     'definitionVersion': {'version': 0,
      'createdAt': datetime.datetime(2024, 11, 25, 7, 27, 8)},
     'definition': {'fields': [{'type': 'vector',
        'path': 'embedding',
        'numDimensions': 768,
        'similarity': 'cosine'},
       {'type': 'filter', 'path': 'metadata.page_label'}]}}},
   {'hostname': 'atlas-11ot6b-shard-00-00',
    'status': 'READY',
    'queryable': True,


In [6]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core import StorageContext

# Instantiate the vector store on the same database/collection the search index
# was created on; `index_name` must hold the name of that search index.
atlas_vector_store = MongoDBAtlasVectorSearch(
    mongodb_client=client,
    db_name = "product1Chunked",
    collection_name = "llamaIndexChunks",
    vector_index_name = index_name
)
# Wrap the vector store in a LlamaIndex StorageContext.
vector_store_context = StorageContext.from_defaults(vector_store=atlas_vector_store)

reset the index if it is not empty


In [46]:
from llama_index.core.schema import MetadataMode
from tqdm import tqdm
# Imported in this cell as well so the MongoDB option can be run on its own;
# otherwise the splitter is only imported by the Pinecone option's cells.
from llama_index.core.node_parser import SemanticSplitterNodeParser

# Split the documents into nodes at semantic breakpoints.
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=97, embed_model=embed_model
)
nodes = semantic_splitter.get_nodes_from_documents(documents)

# Embed each node's content (rendered in EMBED metadata mode) and attach the
# vector to the node. Wrapping the iterable in tqdm handles progress updates
# and closes the bar even if an embedding call fails.
for node in tqdm(nodes, desc="Embedding Progress", unit="node"):
    node.embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode=MetadataMode.EMBED)
    )

Embedding Progress: 100%|██████████| 11/11 [00:01<00:00,  7.86node/s]


In [57]:
# Store the embedded nodes in the Atlas collection; the displayed list is the
# value returned by `add` (presumably the ids of the stored nodes — TODO confirm).
atlas_vector_store.add(nodes=nodes)

['5d084375-797a-4e7c-8e2e-4bff5747a6e6',
 '3b32013e-07d3-4a81-9e51-dbcc23ae0f1b',
 '0a633a3d-5938-4afe-aedc-3267c6c11109',
 'f9a51f31-9002-4d5a-a7cc-241ab16843f9',
 '165d3e76-ccc5-4407-a2e6-4fa5e914278b',
 'e5c3d019-1369-4dd2-9281-a5279ff1d9b0',
 'daef95d4-924b-412b-b462-87a3d9d9ff13',
 '1c354bca-0083-4985-9b3e-674500983926',
 'fc057b1c-5dfe-4def-a71e-0d36e0c0639d',
 'ea074e98-0a6e-4327-834b-dfab21ddcd80',
 '0f48e833-cd13-4797-9c3f-d3bab1149d80']

Instantiate a vector store and Store your data into it

# Data inference
To query our data, we need to configure our storage and indexes into objects that can be used to infer our data,
for example:


## Option 2: Pinecone Only

In [20]:
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever

# Build a VectorStoreIndex on top of the existing Pinecone-backed vector_store
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Retriever configured with similarity_top_k=5 (the five closest matches)
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)

# Query the vector DB; this step only retrieves stored text, no LLM answer is generated
answer = retriever.retrieve('what are the covered medical conditions')

# Inspect results: print the text of each retrieved item, followed by a separator

for i in answer:
    print(i.text)
    print("\n ------------- NEW NODE")

• Sustained as a result of any form of flying except as a passenger on a regular scheduled flight.
• Existed before the effective date of the coverage under the plan.
• Resulted from war, whether declared or not declared.
• Resulted from Life Assured driving a motor vehicle without possessing a valid driving licence. This exclusion will not apply if the Life Assured has an expired licence but is not disqualified from holding or obtaining such driving licence under any laws, by-laws or regulations
• Resulted from provoked assault, drugs, scuba-diving, any form of racing (other than on foot).

CRITICAL ILLNESS
• Existed before the effective date of the coverage under the plan.
• Diagnosed within the first sixty (60) days from the date of which the assurance on the Life Assured has become effective.
• Related to pre-existing illnesses.

Notes:
(1) This list is non-exhaustive. Please refer to the Master Policy for full list of exclusions under this plan.
(2) The Life Assured's assurance is

In [25]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer
# Response synthesizer created with streaming enabled.
synth = get_response_synthesizer(streaming=True)
# Pass in your retriever from above, which is configured to return the top 5 results
query_engine = RetrieverQueryEngine(retriever=retriever,response_synthesizer=synth)

# Now you query:
llm_query = query_engine.query('what are the covered illnesses')

# Print the streamed response (see the cell output for the generated answer).
llm_query.print_response_stream()

Based on the provided context, the following illnesses are explicitly mentioned as being covered under this Group Multiple Benefit Insurance Scheme:

1. 45 Critical Illnesses:
    - Heart Attack – of specified severity
    - Stroke – resulting in Permanent neurological deficit with persisting clinical symptoms
    - Coronary Artery By-Pass Surgery
    - Cancer – of specified severity and does not cover very early cancers
    - Kidney Failure – requiring dialysis or kidney transplant*
    - Fulminant Viral Hepatitis
    - Major Organ/Bone Marrow Transplant
    - Paralysis of Limbs
    - Multiple Sclerosis
    - Primary Pulmonary Arterial Hypertension – of specified severity
    - Blindness – Permanent and Irreversible
    - Heart Valve Surgery
    - Deafness – Permanent and Irreversible
    - Surgery To Aorta
    - Loss of Speech
    - Alzheimer's Disease / Severe Dementia
    - Third Degree Burns – of specified severity
    - Coma – resulting in Permanent neurological deficit with pers

## Option 3: MongoDB only

Load the vector store as an index for semantic search,
then wrap its query engine as a tool that our base language model can call later.

In [14]:
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core import get_response_synthesizer
# Response synthesizer created with streaming enabled.
synth = get_response_synthesizer(streaming=True)

# Expose the Atlas vector store as an index and build a query engine on top of
# it (similarity_top_k=5).
index = VectorStoreIndex.from_vector_store(atlas_vector_store)
query_engine = index.as_query_engine(similarity_top_k=5, llm=llm, response_synthesizer=synth)

# Wrap the query engine as a tool. The description is what the LLM reads when
# deciding whether to call it; note the trailing space on the first fragment,
# since adjacent string literals are joined with no separator.
query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name="knowledge_base",
        description=(
            "Provides information about Group Multiple Benefits Insurance Scheme (GMBIS). "
            "Use a detailed plain text question as input to the tool."
        ),
    ),
)

In [15]:
# Query the knowledge base and print the streamed response.
query_engine.query("what are the covered medical conditions").print_response_stream()


Based on the provided context information, some of the covered medical conditions under this scheme include:

1. Heart Attack - of specified severity
2. Stroke - resulting in Permanent neurological deficit with persisting clinical symptoms
3. Coronary Artery By-Pass Surgery
4. Cancer - of specified severity (with a note that it does not cover very early cancers)
5. Kidney Failure - requiring dialysis or kidney transplant*
6. Fulminant Viral Hepatitis
7. Major Organ/Bone Marrow Transplant
8. Paralysis of Limbs
9. Multiple Sclerosis
10. Primary Pulmonary Arterial Hypertension - of specified severity
11. Blindness - Permanent and Irreversible
12. Heart Valve Surgery
13. Deafness - Permanent and Irreversible
14. Surgery To Aorta
15. Loss of Speech
16. Alzheimer's Disease / Severe Dementia
17. Third Degree Burns - of specified severity
18. Coma - resulting in Permanent neurological deficit with persisting clinical symptoms
19. Cardiomyopathy - of specified severity
20. Motor Neuron Disease 

In [None]:
from llama_index.core.agent import FunctionCallingAgentWorker

# Build a function-calling agent whose only tool is the knowledge-base query engine.
agent_worker = FunctionCallingAgentWorker.from_tools(
    [query_engine_tool], llm=llm, verbose=False
)
agent = agent_worker.as_agent()
# NOTE: the recorded output of this cell shows that `AgentChatResponse` has no
# `print_response_stream` attribute, so the reply is shown through the cell's
# last-expression display rather than streamed.
agent.chat("What are the covered illnesses")


AttributeError: 'AgentChatResponse' object has no attribute 'print_response_stream'