In [1]:
import os
import sys

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Make the parent directory (Auditbot_backend) importable so the `utils`
# package used below can be resolved. The path is derived from the current
# working directory, i.e. the notebook is assumed to be run from its own folder.
backend_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guard the append so re-running this cell does not keep adding duplicates
# of the same entry to sys.path.
if backend_root not in sys.path:
    sys.path.append(backend_root)

# import constants
from utils.initialisations import (PARSED_DOCUMENT_DIR, 
                                   save_inverted_tree_path,
                                   HTTP_AUTH)

# import custom helper functions
from utils.json_parser import json_file_to_dict
from utils.db_utils import elastic_reset


In [2]:
# Build the Elasticsearch client for the node hosted in the local Docker
# container; HTTP_AUTH is supplied as the client's basic-auth credentials.
client = Elasticsearch(hosts="http://localhost:9200", basic_auth=HTTP_AUTH)


In [3]:
# Connectivity check: show basic cluster information.
# The client was created with basic_auth, so credentials are not repeated here
# (passing transport options such as http_auth per call is deprecated in the
# 8.x Python client in favour of Elasticsearch.options()).
client.info()

{'name': '50cd72118574',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'l-puMSQDTvqL7nQDfYreOg',
 'version': {'number': '8.14.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0',
  'build_date': '2024-07-07T22:04:49.882652950Z',
  'build_snapshot': False,
  'lucene_version': '9.10.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [4]:
# Load the data structures required for indexing.

# The inverted tree is parsed by json_file_to_dict from save_inverted_tree_path.
inverted_tree = json_file_to_dict(save_inverted_tree_path)

# The tree's keys are the chunks to index; collect them in iteration order.
chunks = [*inverted_tree.keys()]
print("Number of unique chunks:", len(chunks))

Number of unique chunks: 8210


In [5]:
# Create the target index only if it does not exist yet, so this cell can be
# re-run without raising an "already exists" error.
# The client already carries basic_auth credentials, so they are not passed
# again per request (per-call transport options are deprecated in the 8.x client).
index_name = 'chromadb_documents'
if not client.indices.exists(index=index_name):
    client.indices.create(index=index_name)

In [6]:
# Start from a clean slate: when `reset` is enabled, hand the client to the
# project's elastic_reset helper before any documents are indexed.
# NOTE(review): this runs after the index-creation cell; what elastic_reset
# does to the index (empty it vs. delete it) is not visible here -- confirm
# that the index still exists with the intended settings afterwards.
reset = True

if reset:
    elastic_reset(client, http_auth=HTTP_AUTH)

reset index: chromadb_documents


In [7]:
# Index the documents.
# Use the bulk helper instead of one HTTP round trip per chunk; each chunk is
# stored under the `text` field with its position in `chunks` as the document id
# (ids are sent as strings, which is how Elasticsearch stores them anyway).
actions = (
    {'_index': index_name, '_id': str(doc_id), '_source': {'text': chunk}}
    for doc_id, chunk in enumerate(chunks)
)
# With the helper's default raise_on_error=True a failed document raises
# BulkIndexError, so reaching the next line means every action succeeded.
indexed_count, _ = bulk(client, actions)
assert indexed_count == len(chunks), "not every chunk was indexed"

# Force a refresh so all documents are visible to the search that follows;
# otherwise recently indexed documents may be missing until the next
# periodic refresh.
client.indices.refresh(index=index_name)

In [8]:
print("Documents indexed.")

# Step 3: Perform Searches Using Elasticsearch
# Search query
query = "What are the findings pertaining to grant?"
top_k = 15  # maximum number of hits to return

# Perform the full-text search: a `match` query on the `text` field, scored by
# the index's similarity (BM25 unless the index is configured otherwise).
# `index_name` is reused so the search always targets the index populated above,
# and no per-request credentials are passed because the client already holds them.
response = client.search(
    index=index_name,
    body={
        'query': {
            'match': {
                'text': query
            }
        },
        'size': top_k
    },
)

# Print search results and collect the matched chunk texts, best score first
# (the order in which Elasticsearch returned the hits).
good_chunks = []
for idx, hit in enumerate(response['hits']['hits']):
    good_chunk = hit['_source']['text']
    score = hit['_score']
    print(f"idx: {idx}")
    print(f"Score: {score}\n")
    print(f"Document: {good_chunk}")
    print("------------------------------------------------------------------")
    good_chunks.append(good_chunk)

Documents indexed.
idx: 0
Score: 11.637025

Document: Details of the lapses pertaining to the enforcement of SDL collections are in the 
 
following paragraphs
------------------------------------------------------------------
idx: 1
Score: 10.4453335

Document: Audit findings are conveyed by AGO to the ministries and statutory boards audited 
by way of “management letters”
------------------------------------------------------------------
idx: 2
Score: 10.312286

Document: Pertaining to the lack of checks on declarations by grant recipients, EDB 
explained that there were specific controls in place to ensure that grant recipients take 
ownership for accurate and credible reporting
------------------------------------------------------------------
idx: 3
Score: 10.262637

Document: Audit findings are conveyed to the Government ministries, statutory boards and other 
entities audited by way of “management letters”
------------------------------------------------------------------
idx: 4

In [9]:
# Display the raw search response (timing, shard info and all returned hits).
response

{'took': 30,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7525, 'relation': 'eq'},
  'max_score': 11.637025,
  'hits': [{'_index': 'chromadb_documents',
    '_id': '5370',
    '_score': 11.637025,
    '_source': {'text': 'Details of the lapses pertaining to the enforcement of SDL collections are in the \n \nfollowing paragraphs'}},
   {'_index': 'chromadb_documents',
    '_id': '3921',
    '_score': 10.4453335,
    '_source': {'text': 'Audit findings are conveyed by AGO to the ministries and statutory boards audited \nby way of “management letters”'}},
   {'_index': 'chromadb_documents',
    '_id': '7649',
    '_score': 10.312286,
    '_source': {'text': 'Pertaining to the lack of checks on declarations by grant recipients, EDB \nexplained that there were specific controls in place to ensure that grant recipients take \nownership for accurate and credible reporting'}},
   {'_index': 'chromadb_documents',
    '_i