In [1]:
%load_ext autoreload
%autoreload 2

# NOTE: A different environment is used for this notebook because it conflicts with the RAG modules.

In [2]:
import sys
import os

# Add the parent directory of the current working directory (Auditbot_backend)
# to the import path so the `utils` package below can be resolved.
# The membership check keeps this cell idempotent: re-running it (for example
# while iterating with autoreload) no longer appends a duplicate entry to
# sys.path on every execution.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# import custom modules
from utils.preprocessing import *
from utils.json_parser import *
from utils.content_page_parser import *
from utils.retriever import *
from utils.custom_print import *
from utils.prompt_engineering import *
from utils.db_utils import *
from utils.langsmith_trace import *
from utils.initialisations import *
from utils.llama_index_utils import *

# constants
yearly_data_path = "../data/parsed_documents/yearly_data.json"

# Single source of truth for the embedding model name.
# `openai_ef` is the embedding function handed to the Chroma collections and
# `embed_model` is handed to the llama-index agents built on those same
# collections later in the notebook, so both are kept on one model name
# instead of two independent string literals that could drift apart.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"

# models
# llama-index supported model
embed_model = llama_OpenAIEmbedding(
    model=EMBEDDING_MODEL_NAME,
    api_key=OPENAI_API_KEY,
)

# chromadb supported model
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name=EMBEDDING_MODEL_NAME,
)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# ============================ RAG input parameters ===========================

# Shorter variant of the first question (currently unused):
# query = "extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report."
question = (
    "extract finding on weakness in access controls from FY2018/19 to "
    "FY2020/21 AGO's report. Tabulate the output with row heading as "
    "Year of Report and details of findings."
)
question2 = "What are the findings pertaining to grant?"

# ============================== HYPERPARAMETERS ==============================

# --- preprocessing -----------------------------------------------------------

# Chunking unit: 's' for sentences, 'p' for paragraphs.
chunking = 's'

# Group smaller chunks into a bigger chunk.
grouping = 1

# Controls the minimum chunk size.
min_chunk_size = 100

# --- vector store ------------------------------------------------------------

# Entries are added to the database in batches of this size.
batch_size = 1000

In [4]:
# RUN ONCE
# Generates all required data structures; the next cell loads them back from
# the paths written here.

# Flag forwarded to generate_inverted_tree.
# NOTE(review): presumably indicates that the documents have a contents page — confirm.
has_content_page = True

# Step 1: generate chunks using the chunking hyperparameters defined above.
generate_chunks(
    DOCUMENT_DIR,
    chunks_path,
    chunk_pageNum_pairs_path,
    s_p_pairs_path,
    chunking,
    grouping,
    min_chunk_size,
    DOC_IDENTIFIER,
)

# Step 2: generate the inverted tree from the chunk / page-number pairs.
generate_inverted_tree(
    chunk_pageNum_pairs_path,
    has_content_page,
    save_inverted_tree_path,
    tree_path,
)

2.591806173324585 seconds
number of chunks: 9127


In [5]:
# RUN ONCE
# Retrieve all required data structures from disk.

# Inverted tree: its keys are used as the chunks, its values as raw metadata.
inverted_tree = json_file_to_dict(save_inverted_tree_path)

chunks = list(inverted_tree)
print("Number of unique chunks:", len(chunks))

# Chunk / page-number pairs; the keys are used as the report years below.
chunk_pageNum_pairs = json_file_to_dict(chunk_pageNum_pairs_path)

# Metadata for the general chromadb collection.
pre_metadata = list(inverted_tree.values())
metadata = chroma_preprocess_metadata(pre_metadata)

# Metadata for the yearly chromadb collections: written to yearly_data_path,
# then read straight back.
years = chunk_pageNum_pairs.keys()

generate_yearly_data(years, metadata, chunks, yearly_data_path)
yearly_data = json_file_to_dict(yearly_data_path)

# Sentence/paragraph pairs are only loaded for 's' or 'f' chunking with a
# grouping of 1; otherwise an empty mapping is used.
load_s_p_pairs = chunking in ('s', 'f') and grouping == 1
if load_s_p_pairs:
    print("s_p_pairs will be filled")
    s_p_pairs = json_file_to_dict(s_p_pairs_path)
else:
    s_p_pairs = {}

Number of unique chunks: 8210
s_p_pairs will be filled


In [6]:
# General database: one persistent Chroma client used for every collection.
client_dense = chromadb.PersistentClient(
    path="../data/db",
    settings=Settings(allow_reset=True),
)

# Collection holding the chunks of all years (requested with reset=True).
collection = chroma_get_or_create_collection(
    client_dense,
    name="audit",
    embedding_function=openai_ef,
    reset=True,
)

# Fill the all-years collection and report how many embeddings it holds.
chroma_fill_db(collection, chunks, metadata, batch_size)
print("number of embeddings in database:", collection.count())
print()

# ---------------------------------------------------------------------------

# Yearly data: one collection per year, named "audit_<year>".
for year in years:
    year_entry = yearly_data[year]
    year_chunks = year_entry["chunks"]
    year_metadata = year_entry["metadata"]

    print(year, len(year_metadata), len(year_chunks))

    # Original note: chromadb's embedding function needs streaming.
    # NOTE(review): unclear what "streaming" refers to here — confirm.
    year_collection = chroma_get_or_create_collection(
        client_dense,
        name=f"audit_{year}",
        embedding_function=openai_ef,
        reset=False,
    )

    chroma_fill_db(year_collection, year_chunks, year_metadata, batch_size)

    print("number of embeddings in database:", year_collection.count())
    print()


number of embeddings in database: 8210

2008_09 321 321
number of embeddings in database: 321

2020_21 898 898
number of embeddings in database: 898

2018_19 635 635
number of embeddings in database: 635

2011_12 549 549
number of embeddings in database: 549

2022_23 781 781
number of embeddings in database: 781

2013_14 485 485
number of embeddings in database: 485

2009_10 255 255
number of embeddings in database: 255

2015_16 442 442
number of embeddings in database: 442

2012_13 442 442
number of embeddings in database: 442

2021_22 744 744
number of embeddings in database: 744

2019_20 743 743
number of embeddings in database: 743

2010_11 335 335
number of embeddings in database: 335

2017_18 519 519
number of embeddings in database: 519

2016_17 436 436
number of embeddings in database: 436

2014_15 625 625
number of embeddings in database: 625



In [7]:
# Build the agents dictionary.
# The Chroma collections are transferred to llama index through
# llama_get_agent (original note: "using inbuilt llama index functions").

# Fetch the all-years collection again, this time without resetting it.
collection = chroma_get_or_create_collection(
    client_dense,
    name="audit",
    embedding_function=openai_ef,
    reset=False,
)

agents = {
    "all years": llama_get_agent(
        db=collection,
        embed_model=embed_model,
        description="Useful for retrieving context from all years",
        openai_api_key=OPENAI_API_KEY,
    ),
}

# One agent per year, stored under the year key.
for year in years:
    year_collection = chroma_get_or_create_collection(
        client_dense,
        name=f"audit_{year}",
        embedding_function=openai_ef,
        reset=False,
    )

    print("number of embeddings in database:", year_collection.count())

    year_description = f"Useful for retrieving specific context from {year}"
    agents[year] = llama_get_agent(
        db=year_collection,
        embed_model=embed_model,
        description=year_description,
        openai_api_key=OPENAI_API_KEY,
    )

number of embeddings in database: 321
number of embeddings in database: 898
number of embeddings in database: 635
number of embeddings in database: 549
number of embeddings in database: 781
number of embeddings in database: 485
number of embeddings in database: 255
number of embeddings in database: 442
number of embeddings in database: 442
number of embeddings in database: 744
number of embeddings in database: 743
number of embeddings in database: 335
number of embeddings in database: 519
number of embeddings in database: 436
number of embeddings in database: 625


In [8]:
# Build the agents dictionary, manual variant:
# built manually using TextNode and including metadata (via llama_get_agent_manual).
# `collection` is the all-years collection bound in an earlier cell.

agents_manual = {
    "all years": llama_get_agent_manual(
        db=collection,
        embed_model=embed_model,
        description="Useful for retrieving context from all years",
        openai_api_key=OPENAI_API_KEY,
    ),
}

# One manually built agent per year, stored under the year key.
for year in years:
    year_collection = chroma_get_or_create_collection(
        client_dense,
        name=f"audit_{year}",
        embedding_function=openai_ef,
        reset=False,
    )

    print("number of embeddings in database:", year_collection.count())

    year_description = f"Useful for retrieving specific context from {year}"
    agents_manual[year] = llama_get_agent_manual(
        db=year_collection,
        embed_model=embed_model,
        description=year_description,
        openai_api_key=OPENAI_API_KEY,
    )

number of embeddings in database: 321
number of embeddings in database: 898
number of embeddings in database: 635
number of embeddings in database: 549
number of embeddings in database: 781
number of embeddings in database: 485
number of embeddings in database: 255
number of embeddings in database: 442
number of embeddings in database: 442
number of embeddings in database: 744
number of embeddings in database: 743
number of embeddings in database: 335
number of embeddings in database: 519
number of embeddings in database: 436
number of embeddings in database: 625


In [9]:
# Author's observation: recursive retrieval is the wrong approach here because it
# produces an LLM output for every chunk found, then puts those LLM responses
# together and creates a final LLM response.
# Build the top-level query engine from the `agents` dictionary.
top_query_engine = get_top_level_retriever(agents, years, embed_model)

# Run the first (tabulation) question through the top-level engine.
response = top_query_engine.query(question)

# Show the response text as markdown output.
display_markdown(response.response, raw=True)

[1;3;38;2;11;159;203mRetrieval entering 2018_19: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report. Tabulate the output with row heading as Year of Report and details of findings.
[0mAdded user message to memory: extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report. Tabulate the output with row heading as Year of Report and details of findings.
=== Calling Function ===
Calling function: vector_tool with args: {"input": "weakness in access controls in AGO report FY2018/19"}
Got output: The weakness identified in the AGO report for the fiscal year 2018/19 pertains to the logging and review of privileged user activities within the Accountant-General's Department (AGD) of the Ministry of Finance.

=== Calling Function ===
Calling function: vector_tool with args: {"input": "weakness in access controls in AGO report FY2019/20"}
Got out

Year of Report | Details of Findings  
FY2018/19 | Weaknesses in access controls were identified in both the Ministry of Trade and Industry's Enterprise Singapore Board and the Ministry of Manpower's Workforce Singapore Agency. These weaknesses increased the risk of unauthorized activities and changes not being detected.  
FY2019/20 | The identified weaknesses in access controls increased the risk of unauthorized activities and changes not being detected.  
FY2020/21 | The weaknesses in access controls could potentially increase the risk of unauthorized activities and changes not being detected.

In [10]:
# should use Boston agent -> vector tool
# NOTE(review): "Boston agent" does not match any agent built in this notebook
# (agents are keyed by "all years" and by year) — looks like a leftover from
# example code; confirm what was intended.
# Run the second question through the same top-level engine.
response2 = top_query_engine.query(question2)

# Show the response text as markdown output.
display_markdown(response2.response, raw=True)


# Quoted message: "This query engine does not support retrieve, use query directly"
# Author's conclusion: this means prompt engineering is not possible.

# Author's observation: this is wrong. It does not look through the "all years" nodes!

[1;3;38;2;11;159;203mRetrieval entering 2010_11: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query What are the findings pertaining to grant?
[0mAdded user message to memory: What are the findings pertaining to grant?
=== Calling Function ===
Calling function: vector_tool with args: {"input":"findings pertaining to grant"}
Got output: The findings pertain to purchase commitments exceeding approved budgets, inappropriate use of term contracts, gross overcharging for materials, materials not delivered at the time of payment, delivered materials not meeting specifications, and possible falsification of documents provided as proof of delivery of goods and services.

[1;3;38;2;11;159;203mRetrieval entering 2012_13: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query What are the findings pertaining to grant?
[0mAdded user message to memory: What are the findings pertaining to grant?
=== Calling Function ===
Calling functi

The findings related to grants include instances of purchase commitments exceeding approved budgets, inappropriate use of term contracts, gross overcharging for materials, payments made for undelivered materials, delivered materials not meeting specifications, and possible falsification of documents.

In [11]:
# Author's observation: recursive retrieval is the wrong approach here because it
# produces an LLM output for every chunk found, then puts those LLM responses
# together and creates a final LLM response.
# Build the top-level query engine from the `agents_manual` dictionary.
top_query_engine_manual = get_top_level_retriever(agents_manual, years, embed_model)

# Run the first (tabulation) question; note that this rebinds `response`.
response = top_query_engine_manual.query(question)

# Show the response text as markdown output.
display_markdown(response.response, raw=True)

[1;3;38;2;11;159;203mRetrieval entering 2018_19: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report. Tabulate the output with row heading as Year of Report and details of findings.
[0mAdded user message to memory: extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report. Tabulate the output with row heading as Year of Report and details of findings.
=== Calling Function ===
Calling function: vector_tool with args: {"input": "weakness in access controls in FY2018/19 AGO's report"}
Got output: The weakness in access controls in the FY2018/19 AGO's report was related to the logging and review of privileged user activities in the Accountant-General’s Department (AGD) within the Government accounting and financial system, NFS@Gov. This issue was also noted to be prevalent across various public sector entities audited by AGO in recent years

| Year of Report | Details of Findings |
|----------------|---------------------|
| FY2018/19      | Weaknesses in access controls were related to logging and review of privileged user activities in the Accountant-General’s Department (AGD) within the Government accounting and financial system, NFS@Gov. This issue was also noted to be prevalent across various public sector entities audited by AGO in recent years, highlighting the ongoing need for improvement in IT controls. Weaknesses in access controls were identified in both the Ministry of Trade and Industry (Enterprise Singapore Board) and the Ministry of Manpower (Workforce Singapore Agency). |
| FY2019/20      | Weaknesses in access controls were highlighted in the audit of controls over access rights granted to the Ministry of Defence (MINDEF)'s Enterprise Human Resource (E-HR) system. Weaknesses in access controls increased the risk of unauthorized activities and changes not being detected. |
| FY2020/21      | Weaknesses in access controls could be related to issues identified in the logging and review of privileged user activities within the Accountant-General’s Department (AGD) or weaknesses in the management of access rights in the Ministry of Defence's Enterprise Human Resource (E-HR) system. Weaknesses in access controls could potentially increase the risk of unauthorized activities and changes not being detected. |

In [12]:
# Author's observation: recursive retrieval is the wrong approach here because it
# produces an LLM output for every chunk found, then puts those LLM responses
# together and creates a final LLM response.

# Run the second question through the manual top-level engine; this rebinds `response`.
response = top_query_engine_manual.query(question2)

# Show the response text as markdown output.
display_markdown(response.response, raw=True)

[1;3;38;2;11;159;203mRetrieval entering 2010_11: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query What are the findings pertaining to grant?
[0mAdded user message to memory: What are the findings pertaining to grant?
=== Calling Function ===
Calling function: vector_tool with args: {"input": "grant findings"}
Got output: The findings include purchase commitment exceeding approved budget, inappropriate use of term contracts, overcharging for materials, delayed delivery of materials, non-compliance with specifications for delivered materials, and potential falsification of documents related to goods and services delivery. Additionally, inaccurate or incomplete information was provided during three quotation exercises totaling $120,885.

[1;3;38;2;11;159;203mRetrieval entering 2012_13: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query What are the findings pertaining to grant?
[0mAdded user message to memory: What ar

The findings related to the grant include instances of purchase commitments exceeding the approved budget, inappropriate use of term contracts, overcharging for materials, delayed delivery of materials, non-compliance with specifications, potential falsification of documents, and inaccurate or incomplete information provided during three quotation exercises totaling $120,885.