# RAG USING HDFC POLICY DOCUMENTS

Using HDFC Health insurance policy documents to create a RAG system with features like cahce storage search and reranking functionality to make a scalable and robust model using llama generative model.

In [3]:
## Importing necessary libraries
import os
import pdfplumber
import openai
from langchain.schema import Document
from typing import List
import chromadb
from langchain_openai import OpenAIEmbeddings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from langchain.retrievers.document_compressors import CrossEncoderReranker
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
from sentence_transformers import CrossEncoder, util
from langchain_community.llms import Together


import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## DATA LOADING AND PREPROCESSING

In [5]:
## Adding path of the PDF Documents
folder_path = "/content/drive/MyDrive/Colab Notebooks/Policy documents"

In [6]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables
def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [7]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list


def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [8]:
#Applying function sto PDF files.
# Define the directory containing the PDF files
pdf_directory = Path(folder_path)

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):

    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

...Processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
Finished processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
...Processing HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1).pdf




Finished processing HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1).pdf
...Processing HDFC-Life-Group-Term-Life-Policy.pdf




Finished processing HDFC-Life-Group-Term-Life-Policy.pdf
...Processing HDFC-Surgicare-Plan-101N043V01.pdf
Finished processing HDFC-Surgicare-Plan-101N043V01.pdf
...Processing HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf
Finished processing HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf
...Processing HDFC-Life-Smart-Pension-Plan-Policy-Document-Online.pdf




Finished processing HDFC-Life-Smart-Pension-Plan-Policy-Document-Online.pdf
...Processing HDFC-Life-Sanchay-Plus-Life-Long-Income-Option-101N134V19-Policy-Document.pdf




Finished processing HDFC-Life-Sanchay-Plus-Life-Long-Income-Option-101N134V19-Policy-Document.pdf
All PDFs have been processed.


In [9]:
policy_data = pd.concat(data, ignore_index=True)

In [10]:
policy_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
1,Page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
2,Page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
3,Page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
4,Page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...


In [11]:
policy_data.shape

(217, 3)

In [12]:
## Liiking into some of the loaded data.
policy_data.Page_Text[2]

'POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique Identification Number: <<101N110V03>> Your Policy is a Single Premium paying non participating non linked fixed benefit health plan. This document is the evidence of a contract between HDFC Life Insurance Company Limited and the Policyholder as described in the Policy Schedule given below. This Policy is based on the Proposal made by the within named Policyholder and submitted to the Company along with the required documents, declarations, statements, any response given to the Short Medical Questionnaire (SMQ) by the Life Assured, and other information received by the Company from the Policyholder, Life Assured or on behalf of the Policyholder. This Policy is effective upon receipt and realisation, by the Company, of the consideration payable as Premium under the Policy. This Policy is written under and will be governed by the applicable laws in force in India and all Premiums and Benefits are expressed and payable in Indian Rupees. POLICY

In [13]:
## Checking word lenght of each document.
policy_data['Text_Length'] = policy_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [14]:
policy_data['Text_Length'].head()

Unnamed: 0,Text_Length
0,508
1,85
2,298
3,63
4,514


In [15]:
# Retain only the rows with a text length of at least 10

policy_data = policy_data.loc[policy_data['Text_Length'] >= 10]
policy_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length
0,Page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,508
1,Page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,85
2,Page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,298
3,Page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,63
4,Page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,514
...,...,...,...,...
212,Page 23,HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,HDFC-Life-Sanchay-Plus-Life-Long-Income-Option...,793
213,Page 24,HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,HDFC-Life-Sanchay-Plus-Life-Long-Income-Option...,789
214,Page 25,HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,HDFC-Life-Sanchay-Plus-Life-Long-Income-Option...,65
215,Page 26,HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,HDFC-Life-Sanchay-Plus-Life-Long-Income-Option...,670


In [16]:
# Creating a metadata column in dataframe
policy_data['Metadata'] = policy_data.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)

In [17]:
policy_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length,Metadata
0,Page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,508,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
1,Page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,85,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
2,Page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,298,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
3,Page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,63,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
4,Page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,514,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...


In [18]:
policy_data.describe()

Unnamed: 0,Text_Length
count,210.0
mean,429.82381
std,243.310624
min,11.0
25%,255.25
50%,467.5
75%,619.5
max,994.0


In [19]:
## we have maximum of 994 words in a document.

## EMBEDDING CREATION AND DATABASE CREATION

In [20]:
from google.colab import userdata
openai.api_key = userdata.get('API_Key')

In [21]:
# Converting dataframes to list for storing in vectordatabase.
documents_list = policy_data["Page_Text"].tolist()
metadata_list = policy_data['Metadata'].tolist()

In [22]:
## Using OpenAI embeddingfunction
embedding_function = OpenAIEmbeddingFunction(
    api_key=userdata.get('API_Key'),
    model_name="text-embedding-ada-002"
)
# Creating persistant client and creating a collection.
client = chromadb.PersistentClient()
insurance_collection = client.get_or_create_collection(
    name="RAG_on_Insurance",
    embedding_function=embedding_function
)

In [23]:
# Adding data and metadata in collection.
insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [24]:
# Checking some instances
insurance_collection.get(
    ids = '0',
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0'],
 'embeddings': array([[ 0.006099  ,  0.01558264, -0.00211374, ..., -0.00770505,
         -0.0159012 , -0.04735849]]),
 'documents': ['Part A <<Date>> <<Policyholder’s Name>> <<Policyholder’s Address>> <<Policyholder’s Contact Number>> Dear <<Policyholder’s Name>>, Sub: Your Policy no. << >> We are glad to inform you that your proposal has been accepted and the HDFC Life Easy Health (“Policy”) being this document, has been issued. We have made every effort to design your Policy in a simple format. We have highlighted items of importance so that you may recognize them easily. Policy document: As an evidence of the insurance contract between HDFC Life Insurance Company Limited and you, the Policy is enclosed herewith. Please preserve this document safely and also inform your nominees about the same. A copy of your proposal form and other relevant documents submitted by you is also enclosed for your information and record. Cancellation in the Free-Look Period: << In case you

In [25]:
# Creating a cache collection for repeatative queries.
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [26]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': []}

In [27]:
query = input()

What is policy on eye issue?


In [29]:
# Checking cache results
cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [30]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[]],
 'distances': [[]]}

In [31]:
## matching query in main collection
results = insurance_collection.query(
query_texts=query,
n_results=10
)
results.items()

dict_items([('ids', [['29', '15', '12', '140', '208', '42', '17', '114', '132', '178']]), ('embeddings', None), ('documents', [['Annexure III Provisions regarding Policy not being called into question in terms of Section 45 of the Insurance Act, 1938, as amended by Insurance Laws (Amendment) Act, 2015 dated 23.03.2015 are as follows: (1) No Policy of Life Insurance shall be called in question on any ground whatsoever after expiry of 3 yrs from a. the date of issuance of Policy or b. the date of commencement of risk or c. the date of revival of Policy or d. the date of rider to the Policy whichever is later. (2) On the ground of fraud, a Policy of Life Insurance may be called in question within 3 years from a. the date of issuance of Policy or b. the date of commencement of risk or c. the date of revival of Policy or d. the date of rider to the Policy whichever is later. For this, the insurer should communicate in writing to the insured or legal representative or nominee or assignees of

In [32]:
# Implementing Cache in Semantic Search

def search_with_cache(query, threshold=0.2):
    cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
    )
    ids = []
    documents = []
    distances = []
    metadatas = []
    results_df = pd.DataFrame()

    # If the distance is greater than the threshold, fetch from main collection
    if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        results = insurance_collection.query(
            query_texts=query,
            n_results=10
        )

        # Store the query and results into the cache
        Keys = []
        Values = []
        for key, val in results.items():
            if val is None:
                continue
            for i in range(min(10, len(val[0]) if isinstance(val[0], list) else len(val))):
              Keys.append(str(key) + str(i))
              if isinstance(val[0], list):
                Values.append(str(val[0][i]))
              else:
                Values.append(str(val[i]))

        cache_collection.add(
            documents=[query],
            ids=[query],
            metadatas=dict(zip(Keys, Values))
        )

        print("Not found in cache. Found in main collection.")

        result_dict = {
            'Metadatas': results['metadatas'][0],
            'Documents': results['documents'][0],
            'Distances': results['distances'][0],
            "IDs": results["ids"][0]
        }
        results_df = pd.DataFrame.from_dict(result_dict)

    # If distance is within threshold, fetch from cache
    elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        print("Found in cache!")

        results_df = pd.DataFrame({
            'IDs': ids,
            'Documents': documents,
            'Distances': distances,
            'Metadatas': metadatas
        })

    return results_df

In [34]:
# Applying function to query to get best 10 results.
results_df = search_with_cache(query)
results_df

Found in cache!


Unnamed: 0,IDs,Documents,Distances,Metadatas
0,15,Annexure III Provisions regarding Policy not b...,0.4701406061649322,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
1,114,In case you are not agreeable to any of the pr...,0.4605374634265899,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
2,132,Annexure III Section 45 – Policy shall not be ...,0.4728600680828094,{'Policy_Name': 'HDFC-Life-Group-Poorna-Suraks...
3,12,(2) We reserve the right to change any of thes...,0.4790466427803039,"{'Page_No.': 'Page 33', 'Policy_Name': 'HDFC-L..."
4,140,HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,0.4655665755271911,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
5,208,intimation of change of address lies with the ...,0.476059079170227,"{'Page_No.': 'Page 14', 'Policy_Name': 'HDFC-L..."
6,42,PART D Policy Servicing Related Aspects D.1. F...,0.4664654731750488,{'Policy_Name': 'HDFC-Life-Sanchay-Plus-Life-L...
7,17,"7. Routine eye tests, any Dental Treatment or ...",0.4812735915184021,"{'Page_No.': 'Page 26', 'Policy_Name': 'HDFC-L..."
8,29,HDFC Standard Life Insurance Company Limited H...,0.4678828716278076,"{'Page_No.': 'Page 15', 'Policy_Name': 'HDFC-S..."
9,178,HDFC Life Smart Pension Plan 101L164V02 – Term...,0.4745998382568359,{'Policy_Name': 'HDFC-Life-Sampoorna-Jeevan-10...


## RE-RANKING USING CROSS ENCODER

In [35]:
# Creating cross encoder object
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

In [36]:
#Creating a function to get top-3 most relevant results using cross-encoder.
def top_3_results(query,result_df):
  cross_input = [[query,response] for response in result_df['Documents']]
  result_df['ranked_score'] = cross_encoder.predict(cross_input)
  new_df = result_df.sort_values(by='ranked_score',  ascending=False)
  return new_df[["Documents", "Metadatas"]][:3]

In [37]:
new_result_df = top_3_results(query,results_df)
new_result_df

Unnamed: 0,Documents,Metadatas
7,"7. Routine eye tests, any Dental Treatment or ...","{'Page_No.': 'Page 26', 'Policy_Name': 'HDFC-L..."
2,Annexure III Section 45 – Policy shall not be ...,{'Policy_Name': 'HDFC-Life-Group-Poorna-Suraks...
4,HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...


## GENERATIVE SEARCH

In [38]:
## Creating a function to perform generative search based on query taking reference from top-3 results.
def generate_response(query, results_df):
    context = ""
    for i, row in results_df.iterrows():
        context += f"\n\n---\nDocument {i+1}:\n{row['Documents']}\n(Source: {row['Metadatas']})"

    full_prompt = f"""
You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.

The user asked the following question:
"{query}"

You have some relevant search results from insurance documents:
{context}

Instructions:
1. Use the documents to answer the question clearly.
2. Include citations using the metadata (policy name and page).
3. Reformat tables (if any) into a readable format.
4. Be brief, accurate, and customer-friendly.
5. If the query is irrelevant to the documents, state that clearly.

Provide the final answer first, then include citations at the end.
"""
#Usinf langchain's together AI api to use llama model.
    llm = Together(
        model="meta-llama/Llama-3-8b-chat-hf",
        temperature=0.3,
        max_tokens=512,
        together_api_key=userdata.get("Together_key")
    )

    response = llm(full_prompt)

    return response.split('\n')

In [39]:
response = generate_response(query, new_result_df)

In [40]:
response

['**Answer:** The policy does not cover eye issues related to cosmetic purposes, such as elective eye surgery for cosmetic reasons.',
 '',
 '**Citations:**',
 '',
 '1. Document 8: "7. Routine eye tests, any Dental Treatment or Surgery of cosmetic nature, extraction of impacted tooth/teeth, orthodontics or orthognathic surgery, or tempero-mandibular joint disorder except as necessitated by an accidental injury and warranting Hospitalization..." (Page 26, HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document)',
 '2. Document 8: "12. Experimental or unproven procedures or treatments, devices or pharmacological regimens of any description (not recognized by Indian Medical Council) or Hospitalization for treatment under any system other than allopathy..." (Page 26, HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document)',
 '3. Document 3: "4. Mere silence is not fraud unless, depending on circumstances of the case, it is the duty of the insured or his agent keeping silence to speak

In [41]:
## Function to provide top_3_results from chromadb and generated output.
def query_pipeline(query):
    results_df = search_with_cache(query)
    if results_df.empty:
        return None, "Unable to find your data here, anything else I can help you with?"

    top_3_df = top_3_results(query, results_df)
    final_answer = generate_response(query, top_3_df)

    return  final_answer

## MODEL EVALUATION USING SOME TEST CASES

In [42]:
query_pipeline("How much covergae can wwe expect for accident?")

Not found in cache. Found in main collection.


['**Answer:** The amount of coverage for an accident can vary depending on the specific policy and the circumstances of the accident. However, according to the documents, the Accidental Death Benefit Exclusions (Document 3) state that "Accidental Death Benefit will not be payable if the death of the Scheme Member occurs after 180 days from the date of Accident." Additionally, the Policy Document (Document 3) defines an Accident as "a sudden, unforeseen and involuntary event caused by external, visible and violent means."',
 '',
 '**Citations:**',
 '',
 '* Document 3, Page 14',
 '* Document 3, Page 4',
 '',
 'Please note that the answer provided is based on the given documents and may not be applicable to all insurance documents or policies. It is recommended to consult the policy documents or insurance provider for specific information on accident coverage.']

In [44]:
query_pipeline("if i got into an accident, how much coverage can i get?")

Found in cache!


['Final Answer:',
 'If you get into an accident, you can get coverage up to 180 days from the date of the accident, as per Document 8, and the Accidental Death Benefit Exclusions in Document 5. However, the coverage will not be payable if the death occurs after 180 days from the date of the accident.',
 '',
 'Citations:',
 "Document 5: (Source: {'Policy_Name': 'HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay', 'Page_No.': 'Page 5'})",
 'Document 8: (Source: {\'Policy_Name\': \'HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay\', \'Page_No.\': \'Page 16\'})""',
 '',
 'I hope this helps! Let me know if you have any further questions.']

In [45]:
query_pipeline("what about cancer?")

Not found in cache. Found in main collection.


['Answer: ',
 'The policy does not cover cancer treatment unless it is a "Cancer of specified severity" as defined in the policy document. According to the policy, cancer includes leukemia, lymphoma, and sarcoma, but excludes certain types of cancer such as carcinoma in situ, benign, pre-malignant, borderline malignant, low malignant potential, neoplasm of unknown behavior, or non-invasive. The policy also excludes non-melanoma skin carcinoma unless there is evidence of metastases to lymph nodes or beyond, and non-invasive papillary cancer of the bladder histologically described as TaN0M0 or of a lesser classification.',
 '',
 '[Citations]',
 'Document 1: HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay, Page 17',
 'Document 2: HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document, Page 27',
 'Document 5: HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay, Page 5',
 'Document 1: HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay, Page 17',
 'Document 2: HDFC-Life-

In [46]:
query_pipeline("in what cases can we not get coverage, tell me in brief and lamen terms")

Not found in cache. Found in main collection.


['Answer:',
 'In what cases can we not get coverage? In brief and simple terms, you cannot get coverage in the following cases:',
 '',
 '* If you intentionally self-inflict an injury or attempt suicide.',
 '* If you have a pre-existing condition that is not disclosed in the proposal form.',
 '* If you are diagnosed with a sexually transmitted disease or HIV/AIDS.',
 '* If you fail to take medical advice unreasonably.',
 '* If you undergo experimental, investigational, or unproven treatment.',
 '* If you are treated by non-allopathic or western methods.',
 '* If you are treated outside India, except in cases of emergency.',
 '* If you provide incorrect information or fail to disclose material facts in the proposal form.',
 '',
 'Citations:',
 '1. HDFC Standard Life Insurance Company Limited, HDFC SurgiCare Plan, Page 14.',
 '2. HDFC Standard Life Insurance Company Limited, HDFC SurgiCare Plan, Page 14.',
 '3. HDFC Standard Life Insurance Company Limited, HDFC SurgiCare Plan, Page 14.',
