In [12]:
# import useful python libraries
import sys
import os
import chromadb
from chromadb.utils import embedding_functions
from langchain_openai import OpenAIEmbeddings
from rank_bm25 import BM25Okapi
from openai import OpenAI
from IPython.display import display_markdown
from IPython.utils import io
from chromadb.config import Settings


# Add the parent directory (Auditbot_backend) to the system path.
# dirname() of "<cwd>/chroma_db.ipynb" is simply the current working
# directory, so this resolves to the parent of the kernel's working directory.
_project_root = os.path.abspath(
    os.path.join(
        os.path.dirname(f"{os.getcwd()}/chroma_db.ipynb"),
        '..'
    )
)

# Guard the append: re-running this cell would otherwise push a duplicate
# entry onto sys.path on every execution.
if _project_root not in sys.path:
    sys.path.append(_project_root)

# import custom modules
from utils.preprocessing import *
from utils.json_parser import *
from utils.content_page_parser import *
from utils.retriever import *
from utils.custom_print import *
from utils.prompt_engineering import *
from utils.db_utils import *
from utils.initialisations import *

In [8]:
# ============================================================================
# RAG input parameters
# ============================================================================

# Question put to the retrieval pipeline
query = "What are the findings pertaining to grant?"

# ============================================================================
# HYPERPARAMETERS
# ============================================================================

# --- preprocessing ----------------------------------------------------------

# Chunking unit: sentences ('s') or paragraphs ('p').
# NOTE(review): a later cell also tests for 'f'; that value is not documented
# here -- confirm what it stands for.
chunking = 's'

# Number of smaller chunks grouped into one bigger chunk
grouping = 1

# Minimum chunk size
min_chunk_size = 100

# --- ranking ----------------------------------------------------------------

# Top-k matches for the first ranking pass.
# The sparse and the dense search each return top_k matches, so the hybrid
# search yields at least top_k and at most 2 * top_k matches.
top_k = 30

# Top-n matches kept after reranking
top_n = 20

# Cross-encoder model used for reranking.
# Alternative: claimed to be deprecated because it is bad, but seems to still
# work fine.
# model_name = "cross-encoder/stsb-roberta-base"

# Noted as the best performer on Microsoft tests
model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"

In [3]:
# Load every data structure the later cells rely on.

# Inverted tree, read from its JSON file
inverted_tree = json_file_to_dict(save_inverted_tree_path)

# The tree's keys are the chunks
chunks = [*inverted_tree.keys()]
print("Number of unique chunks:", len(chunks))

# The tree's values are preprocessed into metadata for chromadb
pre_metadata = [*inverted_tree.values()]
metadata = chroma_preprocess_metadata(pre_metadata)

# Sentence/paragraph pairs are only loaded for ungrouped 's' or 'f' chunking;
# in every other configuration the mapping stays empty.
if grouping != 1 or chunking not in ('s', 'f'):
    s_p_pairs = {}
else:
    print("s_p_pairs will be filled")
    s_p_pairs = json_file_to_dict(s_p_pairs_path)

Number of unique chunks: 8210
s_p_pairs will be filled


In [4]:
# Embedding function shipped with chromadb, using OpenAI's
# "text-embedding-3-small" model
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small",
)

# Persistent database client stored under ../data/db, with resets allowed
client = chromadb.PersistentClient(
    path="../data/db",
    settings=Settings(allow_reset=True),
)

# Original author's note: chromadb's embedding function needs streaming.
# NOTE(review): reset=True presumably discards any existing "audit" data
# before the collection is (re)created -- confirm in utils.db_utils.
collection = chroma_get_or_create_collection(
    client,
    name="audit",
    embedding_function=openai_ef,
    reset=True,
)



In [5]:
# Populate the collection with the chunks and their metadata (batch_size=1000)
chroma_fill_db(collection, chunks, metadata, batch_size=1000)

# Report how many embeddings the collection now holds
embedding_count = collection.count()
print("number of embeddings in database:", embedding_count)

number of embeddings in database: 8210


In [9]:

# Whitespace-tokenize every chunk and build the BM25 index from the tokens
tokenized_chunks = [text.split() for text in chunks]
bm25_db = BM25Okapi(tokenized_chunks)

# Sparse search: BM25 over the tokenized chunks
bm25_results = bm25_search(bm25_db, chunks, query, top_k)

# Dense search: embedding lookup in the chromadb collection
embedding_results = chromadb_embedding_search(collection, query, top_k)

# Reciprocal rank fusion (RRF) of the sparse and dense results.
# NOTE(review): [0.5, 0.5] presumably weights the two result lists equally and
# k=60 is presumably the RRF constant -- confirm in utils.retriever.
good_chunks = reciprocal_rank_fusion(
    bm25_results,
    embedding_results,
    [0.5, 0.5],
    k=60,
)

pretty_print_list(good_chunks)

idx: 0

Stage 1: Grant Design and Setup
– whether there were processes and controls in place to ensure that 
grant programmes were authorised and administered in accordance 
with the objective(s) of the grant

----------------------------------------------------------

idx: 1

Stage 1: Grant Design and Setup
– whether processes were in place to ensure that grant programmes 
were authorised and reviewed for relevance
b

----------------------------------------------------------

idx: 2

Stage 2: Grant Evaluation and Approval
– whether there were processes and controls in place to ensure that 
grant applications were properly evaluated and approved

----------------------------------------------------------

idx: 3

Stage 1 – Grant Design and Setup
AGO observed that the grant eligibility criteria and operational requirements for 
the administration of the grant schemes were properly laid down in legislation or 
implementation documents

---------------------------------------------------

In [10]:
# Rerank the fused candidates with the cross-encoder and keep the top_n
reranked = reranking(model_name, good_chunks, query, top_n)
best_chunks, scores = reranked

# Banner first, then each chunk with its score
print("RERANKING-----------------------------------------------------------\n")
pretty_print_rank(best_chunks, scores)

RERANKING-----------------------------------------------------------

RANK: 1

39
II: Audit of Statutory Boards
Key Audit Observations
Amount 
of Grants 
Disbursed with 
Lapses Noted3
($)
Estimated 
Overpayment
 
($)
Inadequate Monitoring and Lapses on 
Checks for Grant Eligibility
2,587,800
2,587,800
Inadequate Checks by Service Provider on 
Grant Claims of Training Providers
953,600
953,600
Grants Disbursed for Individuals and 
Companies which were Disallowed Funding
269,100
269,100
Grants Disbursed for Individuals with 
 
Overlapping Attendance Records for 
Synchronous Courses 
3,290,800
13,300
Inadequate  Checks/Supporting Documents 
for Absentee Payroll Funding
615,100
393,700
Total
7,716,400
4,217,500
14

SCORE: 0.4422704

----------------------------------------------------------

RANK: 2

Stage 2: Grant Evaluation and Approval
– whether there were processes and controls in place to ensure that 
grant applications were properly evaluated and approved

SCORE: 0.34653497

--------

In [11]:
# Assemble the LLM prompt from the query, the inverted tree, the reranked
# chunks, the chunking mode and the sentence/paragraph pairs
prompt = generate_prompt(query, inverted_tree, best_chunks, chunking, s_p_pairs)

print(prompt)

Role:
You are a specialist who uses the context provided to answer the query.

Instruction:
Your response should cite sources' year and page number.
If possible, make ministries or government agencies the headings.
If you are unable to provide an answer, state "Unable to find, submit prompt again."

Background:
The context is taken from audit reports from the Auditor-General's Office (AGO) of Singapore. 
AGO is an independent organ of state and the national auditor. They play an important role in enhancing public accountability in the management and use of public funds and resources through their audits.

They audit
    government ministries and departments
    organs of state
    statutory boards
    government funds
    other public authorities and bodies administering public funds (upon their request for audit), e.g. government-owned companies.

They report their audit observations to the President, Parliament and the public through the Annual Report of the Auditor-General managemen

In [13]:

# Get the model's response to the assembled prompt.
# The OpenAI client gets its own name: `client` already refers to the chromadb
# PersistentClient created earlier in this notebook, and rebinding it here
# would break any later or re-run cell that calls chromadb methods on `client`.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

completion = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
)

# Convert the completion to text with the project helper, then render it as
# Markdown in the cell output
response = openai_completion_to_text(completion)
display_markdown(response, raw=True)

### Ministry of Education

**SkillsFuture Singapore Agency**

The audit of 2021_22 highlighted multiple lapses in the management of grants by SkillsFuture Singapore Agency. The findings can be seen in the disbursement of grants with noted lapses amounting to $7,716,400, with an estimated overpayment of $4,217,500. Specific issues included:
- Inadequate monitoring and lapses on checks for grant eligibility.
- Inadequate checks by service providers on grant claims of training providers.
- Grants disbursed to individuals and companies that were disallowed funding.
- Grants disbursed to individuals with overlapping attendance records for synchronous courses.
- Inadequate checks/supporting documents for absentee payroll funding.
(Source: Year 2021_22, Page 39)

### Ministry of Transport

**Civil Aviation Authority of Singapore**

In the audit of 2022_23, it was found that certain eligibility criteria were either inaccurately stated or not stated in the grant agreements with two companies.
(Source: Year 2022_23, Page 42)

### Prime Minister's Office

**National Research Foundation**

The findings in 2017_18 included:
- Lack of framework leading to inconsistencies in grant management (Page 52).
- Laxity in verification of fund requests (Page 53).
- Laxity in monitoring progress and final reports (Page 55).

### Ministry of Health and Ministry of Social and Family Development

The thematic audit of 2018_19 uncovered gaps in the management of social grant programmes. A significant amount of $1.59 billion was disbursed by the two ministries. The audit revealed gaps across the five stages of grant management:
- Grant design and setup.
- Grant evaluation and approval.
- Disbursement of grants.
- Monitoring and review of grants.
- Cessation of grants.
(Source: Year 2018_19, Pages 7-8)

### Ministry of Trade and Industry

**Economic Development Board**

The audit of 2016_17 noted lapses in the administration of grants. In 47 grant projects tested, seven projects had no evidence that EDB followed up with the grant recipients to ensure that project conditions and milestones were met by the due dates.
(Source: Year 2016_17, Page 46)

### Ministry of Manpower

**Singapore Workforce Development Agency**

In the audit of 2014_15, it was found that the programme partner committed grants amounting to $0.98 million without obtaining all necessary supporting documents from the applicants to verify eligibility.
(Source: Year 2014_15, Page 49)

### General Themes Across Audits

Recurring themes across various years and ministries include:
- The inadequacy of processes and controls in grant evaluation, approval, disbursement, and monitoring stages.
(Source: Year 2019_20, Pages 53-54; Year 2022_23, Pages 48-49; Year 2017_18, Page 45; Year 2018_19, Page 54) 
- Lack of proper frameworks leading to inconsistent grant management.
(Source: Year 2017_18, Page 52; Year 2019_20, Page 54)
- Insufficient monitoring of grant conditions, leading to improper use of funds.
(Source: Year 2018_19, Page 54; Year 2016_17, Page 46)

These findings illustrate the necessity for stricter control mechanisms and consistent monitoring to ensure proper use of public funds in grant management.

In [None]:
# DO NOT RUN THIS CELL. It is an ALTERNATIVE SOLUTION that uses a CUSTOM
# EMBEDDING FUNCTION.
#
# The alternative is langchain's OpenAI embedding function.
# Per the original author, this custom embedding function does not require
# streaming.

# Embedding model
embedding_fn = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=OPENAI_API_KEY,
)

# create embeddings
def get_openai_embeddings(openai_model, text_list):
    """Embed a list of texts and print a short summary of the result.

    Args:
        openai_model: Object exposing ``embed_documents(texts)``; in this
            notebook it is the langchain ``OpenAIEmbeddings`` instance.
        text_list: Texts to embed.

    Returns:
        Whatever ``openai_model.embed_documents(text_list)`` returns
        (presumably one embedding vector per input text).
    """
    text_embeddings = openai_model.embed_documents(text_list)
    print("Number of text embeddings:", len(text_embeddings))
    # Only report the vector length when there is a first vector to measure;
    # indexing [0] on an empty result would raise IndexError.
    if len(text_embeddings) > 0:
        print("length of text embedding:", len(text_embeddings[0]))
    return text_embeddings

# Embed all chunks, then the query (wrapped in a one-element list)
chunk_embeddings = get_openai_embeddings(embedding_fn, chunks)
query_embeddings = get_openai_embeddings(embedding_fn, [query])

# `client` must be the chromadb PersistentClient here
collection = client.get_or_create_collection(name="audit")

# One id per chunk: "id0", "id1", ...
chunk_ids = [f"id{position}" for position, _ in enumerate(chunks)]

# Capture whatever collection.add() prints instead of showing it in the cell
with io.capture_output() as captured:
    collection.add(
        ids=chunk_ids,
        embeddings=chunk_embeddings,
        metadatas=metadata,
        documents=chunks,
    )


# embedding search
def custom_embedding_search(database, query_embedding, top_k=5):
    """Query ``database`` with precomputed embeddings and index the hits.

    Args:
        database: Object exposing ``query(query_embeddings=..., n_results=...)``
            whose result has a ``'documents'`` entry.
        query_embedding: Passed through unchanged as ``query_embeddings``.
        top_k: Number of results requested from the database.

    Returns:
        A list of ``(document, index)`` tuples built from the first entry of
        ``results['documents']``, where ``index`` is the document's position
        in that entry.
    """
    response = database.query(
        query_embeddings=query_embedding,
        n_results=top_k,
    )
    ranked = []
    for position, document in enumerate(response['documents'][0]):
        ranked.append((document, position))
    return ranked