In [1]:
# prompt: count to mount drive

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
single_pdf_path = "/content/drive/MyDrive/HelpMateAI_V1/Policy+Documents (1)/HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf"

In [3]:
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb

In [4]:
with pdfplumber.open(single_pdf_path) as pdf:
    single_page = pdf.pages[2]
    text = single_page.extract_text()

    tables = single_page.extract_tables()

    print(text)

POLICY DOCUMENT- HDFC LIFE EASY HEALTH
Unique Identification Number: <<101N110V03>>
Your Policy is a Single Premium paying non participating non linked fixed benefit health plan. This document is
the evidence of a contract between HDFC Life Insurance Company Limited and the Policyholder as described in
the Policy Schedule given below. This Policy is based on the Proposal made by the within named Policyholder
and submitted to the Company along with the required documents, declarations, statements, any response given
to the Short Medical Questionnaire (SMQ) by the Life Assured, and other information received by the Company
from the Policyholder, Life Assured or on behalf of the Policyholder. This Policy is effective upon receipt and
realisation, by the Company, of the consideration payable as Premium under the Policy. This Policy is written
under and will be governed by the applicable laws in force in India and all Premiums and Benefits are expressed
and payable in Indian Rupees.
POLICY 

In [5]:
tables

[[['Name', '<< >>'], ['Address', '<< >>']],
 [['Name', '<< >>'],
  ['Date of Birth', '<< dd/mm/yyyy >>'],
  ['Age on the Date of Risk\nCommencement', '<< >> years'],
  ['Age Admitted', '<<Yes/No>>']],
 [['Date of Commencement of Policy', '<<Date>>'],
  ['Date of Risk Commencement', '<< Risk Commencement Date >>'],
  ['Date of Issue/Inception of Policy', '<< Issue Date>>'],
  ['Plan Option', '<<>>'],
  ['Sum Insured', '<< >>'],
  ['Single Premium', 'Rs. << >>'],
  ['Premium Paying Term', 'Single'],
  ['Policy Term', '5 years'],
  ['Cover Ceasing Date', '<< dd/mm/yyyy >>']],
 [['Nominee’s Name', '<<Nominee-1 >>', '<<Nominee-2 >>'],
  ['Date of Birth of Nominee', '<< dd/mm/yyyy >>', '<< dd/mm/yyyy >>'],
  ['Nomination Percentage', '<< >> %', '<< >> %'],
  ["Nominee's Address", '<< >>', '<< >>'],
  ['Appointee’s Name\n(Applicable where the nominee is a\nminor)',
   '<< >>',
   None]]]

In [6]:
pdf_path = "/content/drive/MyDrive/HelpMateAI_V1/Policy+Documents (1)"

In [7]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [8]:
tables

[[['Name', '<< >>'], ['Address', '<< >>']],
 [['Name', '<< >>'],
  ['Date of Birth', '<< dd/mm/yyyy >>'],
  ['Age on the Date of Risk\nCommencement', '<< >> years'],
  ['Age Admitted', '<<Yes/No>>']],
 [['Date of Commencement of Policy', '<<Date>>'],
  ['Date of Risk Commencement', '<< Risk Commencement Date >>'],
  ['Date of Issue/Inception of Policy', '<< Issue Date>>'],
  ['Plan Option', '<<>>'],
  ['Sum Insured', '<< >>'],
  ['Single Premium', 'Rs. << >>'],
  ['Premium Paying Term', 'Single'],
  ['Policy Term', '5 years'],
  ['Cover Ceasing Date', '<< dd/mm/yyyy >>']],
 [['Nominee’s Name', '<<Nominee-1 >>', '<<Nominee-2 >>'],
  ['Date of Birth of Nominee', '<< dd/mm/yyyy >>', '<< dd/mm/yyyy >>'],
  ['Nomination Percentage', '<< >> %', '<< >> %'],
  ["Nominee's Address", '<< >>', '<< >>'],
  ['Appointee’s Name\n(Applicable where the nominee is a\nminor)',
   '<< >>',
   None]]]

In [9]:
tables = pdf.pages[2].find_tables()
[i.bbox for i in tables]

[(65.92800059999999, 313.019996375, 525.5999906249999, 346.5599975),
 (65.92800059999999, 372.17999993750004, 525.5999906249999, 441.599986625),
 (65.92800059999999, 467.2200010625, 525.5999906249999, 600.83999225),
 (58.43999887500001, 667.5599973125, 525.5999906249999, 767.2800025625)]

In [10]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [11]:
# Define the directory containing the PDF files
pdf_directory = Path("/content/drive/MyDrive/HelpMateAI_V1/Policy+Documents (1)")

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):

    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

...Processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
Finished processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
...Processing HDFC-Life-Group-Term-Life-Policy.pdf
Finished processing HDFC-Life-Group-Term-Life-Policy.pdf
...Processing HDFC-Surgicare-Plan-101N043V01.pdf
Finished processing HDFC-Surgicare-Plan-101N043V01.pdf
...Processing HDFC-Life-Sanchay-Plus-Life-Long-Income-Option-101N134V19-Policy-Document.pdf
Finished processing HDFC-Life-Sanchay-Plus-Life-Long-Income-Option-101N134V19-Policy-Document.pdf
...Processing HDFC-Life-Smart-Pension-Plan-Policy-Document-Online.pdf
Finished processing HDFC-Life-Smart-Pension-Plan-Policy-Document-Online.pdf
...Processing HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1).pdf
Finished processing HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1).pdf
...Processing HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf
Finished processing HDFC-Life-Group-Poorna-Suraksha-101N137V02-

In [12]:
# Concatenate all the DFs in the list 'data' together

insurance_pdfs_data = pd.concat(data, ignore_index=True)

In [13]:
insurance_pdfs_data.head(10)

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
1,Page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
2,Page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
3,Page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
4,Page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
5,Page 6,"[["""", """", ""iii. A rise in cardiac biomarkers o...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
6,Page 7,"[["""", """", ""apart; and\nii. Requiring continuou...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
7,Page 8,"[["""", ""Sclerosis\nwith\npersisting\nsymptoms"",...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
8,Page 9,(16) Intensive Care Unit (ICU) - means an iden...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...
9,Page 10,"[[""Plan option"", ""Benefits covered""], [""A"", ""D...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...


In [14]:
# Check one of the extracted page texts to ensure that the text has been correctly read

insurance_pdfs_data.Page_Text[2]

'POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique Identification Number: <<101N110V03>> Your Policy is a Single Premium paying non participating non linked fixed benefit health plan. This document is the evidence of a contract between HDFC Life Insurance Company Limited and the Policyholder as described in the Policy Schedule given below. This Policy is based on the Proposal made by the within named Policyholder and submitted to the Company along with the required documents, declarations, statements, any response given to the Short Medical Questionnaire (SMQ) by the Life Assured, and other information received by the Company from the Policyholder, Life Assured or on behalf of the Policyholder. This Policy is effective upon receipt and realisation, by the Company, of the consideration payable as Premium under the Policy. This Policy is written under and will be governed by the applicable laws in force in India and all Premiums and Benefits are expressed and payable in Indian Rupees. POLICY

In [15]:
 #Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop

insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [16]:
insurance_pdfs_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length
0,Page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,508
1,Page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,85
2,Page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,298
3,Page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,63
4,Page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,514


In [17]:
# Retain only the rows with a text length of at least 10

insurance_pdfs_data = insurance_pdfs_data.loc[insurance_pdfs_data['Text_Length'] >= 10]
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length
0,Page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,508
1,Page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,85
2,Page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,298
3,Page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,63
4,Page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,514
...,...,...,...,...
212,Page 27,Annexure IV Definitions of covered Critical Il...,HDFC-Life-Group-Poorna-Suraksha-101N137V02-Pol...,582
213,Page 28,6. Alzheimer's Disease - Deterioration or loss...,HDFC-Life-Group-Poorna-Suraksha-101N137V02-Pol...,625
214,Page 29,13. Third Degree Burns - There must be third-d...,HDFC-Life-Group-Poorna-Suraksha-101N137V02-Pol...,642
215,Page 30," One of the following human organs: heart, lu...",HDFC-Life-Group-Poorna-Suraksha-101N137V02-Pol...,586


In [18]:
# Store the metadata for each page in a separate column

insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)

In [19]:
insurance_pdfs_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length,Metadata
0,Page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,508,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
1,Page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,85,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
2,Page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,298,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
3,Page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,63,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...
4,Page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,514,{'Policy_Name': 'HDFC-Life-Easy-Health-101N110...


In [20]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [21]:
from google.colab import userdata
upgrad_key = userdata.get('upgrad_key')

In [22]:
# Define the path where chroma collections will be stored

chroma_data_path = '/content/drive/MyDrive/HelpMateAI_V1/Embeddings'

In [23]:
import chromadb
client = chromadb.PersistentClient()

In [24]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=upgrad_key, model_name=model)

In [25]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

In [26]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = insurance_pdfs_data["Page_Text"].tolist()
metadata_list = insurance_pdfs_data['Metadata'].tolist()

In [27]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.

insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [28]:
# Let's take a look at the first few entries in the collection

insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[ 0.0062511 ,  0.01526276, -0.00229605, ..., -0.00790345,
         -0.01601926, -0.04714202],
        [-0.00160233,  0.00841467, -0.02306825, ..., -0.01373609,
         -0.00756927, -0.04763069],
        [-0.01633095,  0.00773179,  0.00610208, ..., -0.01236002,
         -0.00221925, -0.03615848]]),
 'documents': ['Part A <<Date>> <<Policyholder’s Name>> <<Policyholder’s Address>> <<Policyholder’s Contact Number>> Dear <<Policyholder’s Name>>, Sub: Your Policy no. << >> We are glad to inform you that your proposal has been accepted and the HDFC Life Easy Health (“Policy”) being this document, has been issued. We have made every effort to design your Policy in a simple format. We have highlighted items of importance so that you may recognize them easily. Policy document: As an evidence of the insurance contract between HDFC Life Insurance Company Limited and you, the Policy is enclosed herewith. Please preserve this document safely and also 

In [29]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [30]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [31]:
query = input()

what are the death benefits


In [32]:
# Searh the Cache collection first
# Query the collection against the user query and return the top 20 results

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [33]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [34]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)

In [35]:
# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()

In [36]:
if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = insurance_collection.query(
      query_texts=query,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      for key, val in results.items():
        if val is None:
          continue
        for i in range(len(val[0])):
          Keys.append(str(key)+str(i))
          Values.append(str(val[0][i]))


      cache_collection.add(
          documents= [query],
          ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

elif cache_results['distances'][0][0] <= threshold:
      cache_result_dict = cache_results['metadatas'][0][0]

      # Loop through each inner list and then through the dictionary
      for key, value in cache_result_dict.items():
          if 'ids' in key:
              ids.append(value)
          elif 'documents' in key:
              documents.append(value)
          elif 'distances' in key:
              distances.append(value)
          elif 'metadatas' in key:
              metadatas.append(value)

      print("Found in cache!")

      # Create a DataFrame
      results_df = pd.DataFrame({
        'IDs': ids,
        'Documents': documents,
        'Distances': distances,
        'Metadatas': metadatas
      })


Not found in cache. Found in main collection.


In [37]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Page_No.': 'Page 6', 'Policy_Name': 'HDFC-Li...",PART C Product Core Benefits BENEFITS PAYABLE ...,0.315423,146
1,"{'Page_No.': 'Page 15', 'Policy_Name': 'HDFC-L...","Note: For the purpose of waiting period, Date ...",0.360409,193
2,"{'Page_No.': 'Page 8', 'Policy_Name': 'HDFC-Li...",HDFC Life Smart Pension Plan 101L164V02 – Term...,0.360493,111
3,"{'Page_No.': 'Page 8', 'Policy_Name': 'HDFC-Li...","[[""21. Progressive\nScleroderma"", ""22. Muscula...",0.360673,186
4,"{'Page_No.': 'Page 11', 'Policy_Name': 'HDFC-L...",PART C PRODUCT CORE BENEFITS BENEFITS PAYABLE ...,0.368441,42
5,"{'Page_No.': 'Page 7', 'Policy_Name': 'HDFC-Li...",Part C 1. Benefits: (1) Benefits on Death or d...,0.380166,185
6,"{'Page_No.': 'Page 23', 'Policy_Name': 'HDFC-L...",HDFC Life Smart Pension Plan 101L164V02 – Term...,0.388141,126
7,"{'Page_No.': 'Page 13', 'Policy_Name': 'HDFC-L...",HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,0.389983,89
8,"{'Page_No.': 'Page 19', 'Policy_Name': 'HDFC-L...","(i) Death Certificate, in original, issued by ...",0.398692,159
9,"{'Page_No.': 'Page 8', 'Policy_Name': 'HDFC-Li...",HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,0.409921,84


In [38]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [39]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [40]:
# Test the cross encoder model

scores = cross_encoder.predict([['Does the insurance cover diabetic patients?', 'The insurance policy covers some pre-existing conditions including diabetes, heart diseases, etc. The policy does not howev'],
                                ['Does the insurance cover diabetic patients?', 'The premium rates for various age groups are given as follows. Age group (<18 years): Premium rate']])

In [41]:
scores

array([  3.8467617, -11.252879 ], dtype=float32)

In [42]:
# Input (query, response) pairs for each of the top 20 responses received from the semantic search to the cross encoder
# Generate the cross_encoder scores for these pairs

cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)

In [43]:
cross_rerank_scores

array([ 3.249363 ,  2.1121192,  1.3146131, -1.7923437, -0.3002551,
        2.7251625, -0.7524959, -2.072061 , -1.7593794,  1.0998142],
      dtype=float32)

In [44]:
results_df['Reranked_scores'] = cross_rerank_scores

In [45]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 6', 'Policy_Name': 'HDFC-Li...",PART C Product Core Benefits BENEFITS PAYABLE ...,0.315423,146,3.249363
1,"{'Page_No.': 'Page 15', 'Policy_Name': 'HDFC-L...","Note: For the purpose of waiting period, Date ...",0.360409,193,2.112119
2,"{'Page_No.': 'Page 8', 'Policy_Name': 'HDFC-Li...",HDFC Life Smart Pension Plan 101L164V02 – Term...,0.360493,111,1.314613
3,"{'Page_No.': 'Page 8', 'Policy_Name': 'HDFC-Li...","[[""21. Progressive\nScleroderma"", ""22. Muscula...",0.360673,186,-1.792344
4,"{'Page_No.': 'Page 11', 'Policy_Name': 'HDFC-L...",PART C PRODUCT CORE BENEFITS BENEFITS PAYABLE ...,0.368441,42,-0.300255
5,"{'Page_No.': 'Page 7', 'Policy_Name': 'HDFC-Li...",Part C 1. Benefits: (1) Benefits on Death or d...,0.380166,185,2.725163
6,"{'Page_No.': 'Page 23', 'Policy_Name': 'HDFC-L...",HDFC Life Smart Pension Plan 101L164V02 – Term...,0.388141,126,-0.752496
7,"{'Page_No.': 'Page 13', 'Policy_Name': 'HDFC-L...",HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,0.389983,89,-2.072061
8,"{'Page_No.': 'Page 19', 'Policy_Name': 'HDFC-L...","(i) Death Certificate, in original, issued by ...",0.398692,159,-1.759379
9,"{'Page_No.': 'Page 8', 'Policy_Name': 'HDFC-Li...",HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,0.409921,84,1.099814


In [46]:
top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 6', 'Policy_Name': 'HDFC-Li...",PART C Product Core Benefits BENEFITS PAYABLE ...,0.315423,146,3.249363
1,"{'Page_No.': 'Page 15', 'Policy_Name': 'HDFC-L...","Note: For the purpose of waiting period, Date ...",0.360409,193,2.112119
2,"{'Page_No.': 'Page 8', 'Policy_Name': 'HDFC-Li...",HDFC Life Smart Pension Plan 101L164V02 – Term...,0.360493,111,1.314613


In [47]:
# Return the top 3 results after reranking

top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 6', 'Policy_Name': 'HDFC-Li...",PART C Product Core Benefits BENEFITS PAYABLE ...,0.315423,146,3.249363
5,"{'Page_No.': 'Page 7', 'Policy_Name': 'HDFC-Li...",Part C 1. Benefits: (1) Benefits on Death or d...,0.380166,185,2.725163
1,"{'Page_No.': 'Page 15', 'Policy_Name': 'HDFC-L...","Note: For the purpose of waiting period, Date ...",0.360409,193,2.112119


In [48]:
top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]

In [49]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
0,PART C Product Core Benefits BENEFITS PAYABLE ...,"{'Page_No.': 'Page 6', 'Policy_Name': 'HDFC-Li..."
5,Part C 1. Benefits: (1) Benefits on Death or d...,"{'Page_No.': 'Page 7', 'Policy_Name': 'HDFC-Li..."
1,"Note: For the purpose of waiting period, Date ...","{'Page_No.': 'Page 15', 'Policy_Name': 'HDFC-L..."


In [80]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model
from openai import OpenAI

def generate_response(query, top_3_RAG):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """},
              ]
    client = OpenAI(api_key=upgrad_key)
    response =  client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,max_tokens=500
    )

    return response.choices[0].message.content.split('\n')

In [83]:
response = generate_response(query, top_3_RAG)

In [84]:
print("\n".join(response))

The death benefits provided in the insurance policy include benefits payable in the event of the policyholder's death. Specific details regarding the death benefits, including the coverage amount and any additional conditions, are outlined in the policy document available. 

**Policy Name:** HDFC-Life Insurance

**Page Numbers:** Page 6, Page 7

Here are the death benefits summarized from the policy document:

| Benefits                | Details                                            |
|-------------------------|----------------------------------------------------|
| Benefits on Death       | - Amount payable to nominee/beneficiary           |
|                         | - Conditions for eligibility                        |
|                         | - Any exclusions or limitations                    |

For more detailed information on the death benefits, please refer to the sections on 'Benefits on Death' in the HDFC-Life Insurance policy document on Page 6 and Page 7.


In [89]:
def res_conv(query_1):
  results = insurance_collection.query(query_texts=query_1,n_results=10)
  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = insurance_collection.query(
      query_texts=query_1,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      for key, val in results.items():
        if val is None:
          continue
        for i in range(len(val[0])):
          Keys.append(str(key)+str(i))
          Values.append(str(val[0][i]))


      cache_collection.add(
          documents= [query_1],
          ids = [query_1],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        print("Found in cache!")

        # Create a DataFrame
        results_df = pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })
  cross_inputs = [[query_1, response] for response in results_df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)

  results_df['Reranked_scores'] = cross_rerank_scores
  top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
  top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]

  response_1 = generate_response(query_1, top_3_RAG)
  return top_3_rerank, response_1


In [99]:
top_3 , out = res_conv("is dental treatment included?")
top_3[:3]




Not found in cache. Found in main collection.


Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 17', 'Policy_Name': 'HDFC-L...","7. Routine eye tests, any Dental Treatment or ...",0.366818,15,-1.284417
2,"{'Page_No.': 'Page 14', 'Policy_Name': 'HDFC-S...",HDFC Standard Life Insurance Company Limited H...,0.451784,75,-8.883227
7,"{'Page_No.': 'Page 5', 'Policy_Name': 'HDFC-Su...",HDFC Standard Life Insurance Company Limited H...,0.487562,66,-9.4087


In [92]:
out

['Yes, dental treatment is included as part of the insurance coverage based on the information extracted from the insurance document. Specifically, the document states, "Routine eye tests, any Dental Treatment or ..." indicating that dental treatment is covered under the policy.',
 '',
 'Here is the relevant information extracted from the insurance document in a tabular format:',
 '',
 '| Dental Treatment |',
 '|------------------|',
 '| Included         |',
 '',
 'Citations:',
 '1. Policy Name: HDFC-Life Insurance Policy',
 '   Page Number: Page 17',
 '',
 '2. Policy Name: HDFC-Standard Life Insurance Policy',
 '   Page Number: Page 14']

In [94]:
top_3 , out = res_conv("For reimbursing my paid amount what is the procedure?")
top_3[:3]



Not found in cache. Found in main collection.


Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
2,"{'Page_No.': 'Page 10', 'Policy_Name': 'HDFC-S...",HDFC Standard Life Insurance Company Limited H...,0.409746,71,-7.513788
1,"{'Page_No.': 'Page 10', 'Policy_Name': 'HDFC-L...",C.4. Payment of Premiums: This Policy is issue...,0.408268,150,-7.696627
7,"{'Page_No.': 'Page 9', 'Policy_Name': 'HDFC-Li...",HDFC Life Sanchay Plus (UIN – 101N134V19) – Ap...,0.426467,85,-7.729802


In [95]:
out

['To reimburse your paid amount, please follow the procedure outlined in the policy documents:',
 '',
 '1. **Policy Name:** HDFC Standard Life Insurance Company Limited',
 '2. **Page Number:** Page 10',
 '',
 'Here is the relevant information extracted from the document:',
 '',
 '| Procedure | Details                                      |',
 '|-----------|----------------------------------------------|',
 '| Step 1    | Fill out the reimbursement claim form.       |',
 '| Step 2    | Attach all relevant bills and receipts.     |',
 '| Step 3    | Submit the completed form and documents to the designated claims department. |',
 '| Step 4    | Wait for the claim to be processed and approved. |',
 '| Step 5    | Once approved, reimbursement will be initiated to the provided account within X business days. |',
 '',
 'This information should guide you through the reimbursement process.',
 '',
 '**Citations:**',
 '- Policy Name: HDFC Standard Life Insurance Company Limited',
 '- Page Number

In [96]:
top_3 , out = res_conv("what is the grace period in sampoorn jeevan insurance?")
top_3[:3]

Not found in cache. Found in main collection.


Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 18', 'Policy_Name': 'HDFC-L...",Please communicate any changes in your mailing...,0.325777,158,2.936436
7,"{'Page_No.': 'Page 4', 'Policy_Name': 'HDFC-Li...",11. Guaranteed Surrender Value (GSV)means the ...,0.372417,144,2.385218
2,"{'Page_No.': 'Page 2', 'Policy_Name': 'HDFC-Li...",A.1. Policy Preamble HDFC Life Sampoorna Jeeva...,0.346557,142,-0.363125


In [98]:
out

['The grace period in Sampoorn Jeevan insurance is 30 days. This means you have 30 days after the premium due date to pay your premium without the policy lapsing.',
 '',
 '**Complete Response:**',
 'The grace period in Sampoorn Jeevan insurance is 30 days. This allows policyholders a period of 30 days after the premium due date to pay their premium without the policy lapsing. ',
 '',
 '**Citations:**',
 '1. Policy Name: HDFC Life Sampoorna Jeevan  ',
 '   Page Number: Page 2',
 '',
 '']